From: Nick Pollitt <npollitt@sgi.com>
To: linux-ia64@vger.kernel.org
Subject: [Linux-ia64] Help with Ingo scheduler on IA64
Date: Sat, 12 Jan 2002 02:23:59 +0000
Message-ID: <marc-linux-ia64-105590698805816@msgid-missing>
[-- Attachment #1: Type: text/plain, Size: 941 bytes --]
I'm trying to get Ingo's scheduler working on IA64, but I've hit a
dead end with the head.S code. Ingo's patch removes init_tasks, so
I've modified the assembly in head.S to point at the booting CPU's
idle task in the runqueues array (runqueues[cpu].idle) instead - at
least I think that's what it does. The kernel dies very early in
boot, and I'm not familiar enough with ia64 assembly to see why.
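
In C terms, what the new assembly is supposed to compute is roughly
the following - just a sketch of the intent, with boot_idle_task() a
made-up name for illustration. runqueues, runqueue_t, idle and
cpucount are taken from the attached patch; the real head.S has to do
the arithmetic with the IA64_RUNQUEUE_SIZE and
IA64_RUNQUEUE_IDLE_OFFSET constants, since struct layouts aren't
visible from assembly:

	/* Find the idle task of the CPU that is currently booting,
	 * now that init_tasks[] is gone.  cpucount is what head.S
	 * already loads into r3 as smp_processor_id(). */
	struct task_struct *boot_idle_task(void)
	{
		/* base of runqueues + cpu * IA64_RUNQUEUE_SIZE */
		runqueue_t *rq = &runqueues[cpucount];

		/* the final ld8 at IA64_RUNQUEUE_IDLE_OFFSET */
		return rq->idle;
	}
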
Other issues: I had to build offsets.h by hand, and I moved some
definitions from sched.c into sched.h. Other than that, it's H6 plus
the ia64 changes.
Does anyone have any feedback on getting this to boot?
Thanks
Nick
On Fri, Jan 11, 2002 at 06:49:28PM +0100, Ingo Molnar wrote:
>
> the -H6 patch is available:
>
> http://redhat.com/~mingo/O(1)-scheduler/sched-O1-2.5.2-pre11-H6.patch
> http://redhat.com/~mingo/O(1)-scheduler/sched-O1-2.4.17-H6.patch
>
--
Nick Pollitt               phone: 650.933.7406
Scalable Linux Project     fax:   650.932.0317
Silicon Graphics, Inc.     npollitt@engr.sgi.com
[-- Attachment #2: ingo-all.3.patch --]
[-- Type: text/plain, Size: 114492 bytes --]
diff -X dontdiff -Nur origlinux/arch/i386/kernel/apic.c mylinux/arch/i386/kernel/apic.c
--- origlinux/arch/i386/kernel/apic.c Fri Jan 11 14:39:21 2002
+++ mylinux/arch/i386/kernel/apic.c Fri Jan 11 14:46:44 2002
@@ -785,8 +785,7 @@
*/
slice = clocks / (smp_num_cpus+1);
- printk("cpu: %d, clocks: %d, slice: %d\n",
- smp_processor_id(), clocks, slice);
+ printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice);
/*
* Wait for IRQ0's slice:
@@ -809,8 +808,7 @@
__setup_APIC_LVTT(clocks);
- printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n",
- smp_processor_id(), t0, t1, delta, slice, clocks);
+ printk("CPU%d<T0:%d,T1:%d,D:%d,S:%d,C:%d>\n", smp_processor_id(), t0, t1, delta, slice, clocks);
__restore_flags(flags);
}
diff -X dontdiff -Nur origlinux/arch/i386/kernel/nmi.c mylinux/arch/i386/kernel/nmi.c
--- origlinux/arch/i386/kernel/nmi.c Fri Jan 11 14:39:21 2002
+++ mylinux/arch/i386/kernel/nmi.c Fri Jan 11 14:46:44 2002
@@ -283,7 +283,7 @@
* to get a message out.
*/
bust_spinlocks(1);
- printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
+ printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
show_registers(regs);
printk("console shuts up ...\n");
console_silent();
diff -X dontdiff -Nur origlinux/arch/i386/kernel/process.c mylinux/arch/i386/kernel/process.c
--- origlinux/arch/i386/kernel/process.c Fri Jan 11 14:39:21 2002
+++ mylinux/arch/i386/kernel/process.c Fri Jan 11 14:46:44 2002
@@ -123,15 +123,12 @@
void cpu_idle (void)
{
/* endless idle loop with no priority at all */
- init_idle();
- current->nice = 20;
- current->counter = -100;
while (1) {
void (*idle)(void) = pm_idle;
if (!idle)
idle = default_idle;
- while (!current->need_resched)
+ if (!current->need_resched)
idle();
schedule();
check_pgt_cache();
diff -X dontdiff -Nur origlinux/arch/i386/kernel/smp.c mylinux/arch/i386/kernel/smp.c
--- origlinux/arch/i386/kernel/smp.c Fri Jan 11 14:39:21 2002
+++ mylinux/arch/i386/kernel/smp.c Fri Jan 11 14:46:44 2002
@@ -105,7 +105,7 @@
/* The 'big kernel lock' */
spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }};
+struct tlb_state cpu_tlbstate[NR_CPUS] __cacheline_aligned = {[0 ... NR_CPUS-1] = { &init_mm, 0, }};
/*
* the following functions deal with sending IPIs between CPUs.
@@ -490,10 +490,20 @@
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
-
void smp_send_reschedule(int cpu)
{
send_IPI_mask(1 << cpu, RESCHEDULE_VECTOR);
+}
+
+/*
+ * this function sends a reschedule IPI to all (other) CPUs.
+ * This should only be used if some 'global' task became runnable,
+ * such as a RT task, that must be handled now. The first CPU
+ * that manages to grab the task will run it.
+ */
+void smp_send_reschedule_all(void)
+{
+ send_IPI_allbutself(RESCHEDULE_VECTOR);
}
/*
diff -X dontdiff -Nur origlinux/arch/i386/kernel/smpboot.c mylinux/arch/i386/kernel/smpboot.c
--- origlinux/arch/i386/kernel/smpboot.c Fri Jan 11 14:39:21 2002
+++ mylinux/arch/i386/kernel/smpboot.c Fri Jan 11 14:46:44 2002
@@ -308,14 +308,14 @@
if (tsc_values[i] < avg)
realdelta = -realdelta;
- printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
- i, realdelta);
+ printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", i, realdelta);
}
sum += delta;
}
if (!buggy)
printk("passed.\n");
+ ;
}
static void __init synchronize_tsc_ap (void)
@@ -365,7 +365,7 @@
* (This works even if the APIC is not enabled.)
*/
phys_id = GET_APIC_ID(apic_read(APIC_ID));
- cpuid = current->processor;
+ cpuid = cpu();
if (test_and_set_bit(cpuid, &cpu_online_map)) {
printk("huh, phys CPU#%d, CPU#%d already present??\n",
phys_id, cpuid);
@@ -471,6 +471,8 @@
*/
local_flush_tlb();
+ init_idle();
+ printk("cpu %d has done init idle, doing cpu_idle().\n", cpu());
return cpu_idle();
}
@@ -803,16 +805,13 @@
if (!idle)
panic("No idle process for CPU %d", cpu);
- idle->processor = cpu;
- idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */
+ idle->cpu = cpu;
map_cpu_to_boot_apicid(cpu, apicid);
idle->thread.eip = (unsigned long) start_secondary;
- del_from_runqueue(idle);
unhash_process(idle);
- init_tasks[cpu] = idle;
/* start_eip had better be page-aligned! */
start_eip = setup_trampoline();
@@ -1020,8 +1019,7 @@
map_cpu_to_boot_apicid(0, boot_cpu_apicid);
global_irq_holder = 0;
- current->processor = 0;
- init_idle();
+ current->cpu = 0;
smp_tune_scheduling();
/*
diff -X dontdiff -Nur origlinux/arch/i386/mm/fault.c mylinux/arch/i386/mm/fault.c
--- origlinux/arch/i386/mm/fault.c Fri Jan 11 14:39:22 2002
+++ mylinux/arch/i386/mm/fault.c Fri Jan 11 14:46:44 2002
@@ -86,8 +86,7 @@
out_of_memory:
if (current->pid == 1) {
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
goto survive;
}
goto bad_area;
@@ -342,8 +341,7 @@
out_of_memory:
up_read(&mm->mmap_sem);
if (tsk->pid == 1) {
- tsk->policy |= SCHED_YIELD;
- schedule();
+ yield();
down_read(&mm->mmap_sem);
goto survive;
}
diff -X dontdiff -Nur origlinux/arch/ia64/kernel/head.S mylinux/arch/ia64/kernel/head.S
--- origlinux/arch/ia64/kernel/head.S Fri Jan 11 14:39:23 2002
+++ mylinux/arch/ia64/kernel/head.S Fri Jan 11 14:41:18 2002
@@ -124,6 +124,7 @@
#define isAP p2 // are we an Application Processor?
#define isBP p3 // are we the Bootstrap Processor?
+
#ifdef CONFIG_SMP
/*
* Find the init_task for the currently booting CPU. At poweron, and in
@@ -132,9 +133,14 @@
movl r3=cpucount
;;
ld4 r3=[r3] // r3 <- smp_processor_id()
- movl r2=init_tasks
+ movl r2=runqueues
+ movl r4=IA64_RUNQUEUE_SIZE
;;
- shladd r2=r3,3,r2
+1: add r2=r2,r4
+ ;;
+ br.cloop.sptk.many 1b
+ ;;
+ addl r2=IA64_RUNQUEUE_IDLE_OFFSET,r2
;;
ld8 r2=[r2]
#else
diff -X dontdiff -Nur origlinux/arch/ia64/kernel/process.c mylinux/arch/ia64/kernel/process.c
--- origlinux/arch/ia64/kernel/process.c Fri Jan 11 14:39:23 2002
+++ mylinux/arch/ia64/kernel/process.c Fri Jan 11 14:37:23 2002
@@ -125,9 +125,6 @@
cpu_idle (void *unused)
{
/* endless idle loop with no priority at all */
- init_idle();
- current->nice = 20;
- current->counter = -100;
while (1) {
@@ -136,7 +133,7 @@
min_xtp();
#endif
- while (!current->need_resched) {
+ if (!current->need_resched) {
#ifdef CONFIG_IA64_SGI_SN
snidle();
#endif
diff -X dontdiff -Nur origlinux/arch/ia64/kernel/setup.c mylinux/arch/ia64/kernel/setup.c
--- origlinux/arch/ia64/kernel/setup.c Fri Jan 11 14:39:23 2002
+++ mylinux/arch/ia64/kernel/setup.c Fri Jan 11 15:09:46 2002
@@ -375,10 +375,10 @@
{
#ifdef CONFIG_SMP
# define lpj c->loops_per_jiffy
-# define cpu c->processor
+# define cpum c->processor
#else
# define lpj loops_per_jiffy
-# define cpu 0
+# define cpum 0
#endif
char family[32], features[128], *cp;
struct cpuinfo_ia64 *c = v;
@@ -417,7 +417,7 @@
"cpu MHz : %lu.%06lu\n"
"itc MHz : %lu.%06lu\n"
"BogoMIPS : %lu.%02lu\n\n",
- cpu, c->vendor, family, c->model, c->revision, c->archrev,
+ cpum, c->vendor, family, c->model, c->revision, c->archrev,
features, c->ppn, c->number,
c->proc_freq / 1000000, c->proc_freq % 1000000,
c->itc_freq / 1000000, c->itc_freq % 1000000,
diff -X dontdiff -Nur origlinux/arch/ia64/kernel/smp.c mylinux/arch/ia64/kernel/smp.c
--- origlinux/arch/ia64/kernel/smp.c Fri Jan 11 14:39:23 2002
+++ mylinux/arch/ia64/kernel/smp.c Fri Jan 11 14:37:23 2002
@@ -186,6 +186,12 @@
}
void
+smp_send_reschedule_all(void)
+{
+ send_IPI_all(IA64_IPI_RESCHEDULE);
+}
+
+void
smp_flush_tlb_all (void)
{
smp_call_function ((void (*)(void *))__flush_tlb_all,0,1,1);
diff -X dontdiff -Nur origlinux/arch/ia64/kernel/smpboot.c mylinux/arch/ia64/kernel/smpboot.c
--- origlinux/arch/ia64/kernel/smpboot.c Fri Jan 11 14:39:23 2002
+++ mylinux/arch/ia64/kernel/smpboot.c Fri Jan 11 14:37:23 2002
@@ -23,6 +23,7 @@
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
+#include <linux/sched.h>
#include <asm/atomic.h>
#include <asm/bitops.h>
@@ -323,7 +324,7 @@
extern void perfmon_init_percpu(void);
#endif
- cpuid = smp_processor_id();
+ cpuid = cpu();
phys_id = hard_smp_processor_id();
if (test_and_set_bit(cpuid, &cpu_online_map)) {
@@ -416,13 +417,11 @@
if (!idle)
panic("No idle process for CPU %d", cpu);
- task_set_cpu(idle, cpu); /* we schedule the first task manually */
+ idle->cpu = cpu();
ia64_cpu_to_sapicid[cpu] = sapicid;
- del_from_runqueue(idle);
unhash_process(idle);
- init_tasks[cpu] = idle;
Dprintk("Sending wakeup vector %u to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid);
@@ -481,7 +480,7 @@
printk("Boot processor id 0x%x/0x%x\n", 0, boot_cpu_id);
global_irq_holder = 0;
- current->processor = 0;
+ current->cpu = 0;
init_idle();
/*
diff -X dontdiff -Nur origlinux/arch/ia64/mm/fault.c mylinux/arch/ia64/mm/fault.c
--- origlinux/arch/ia64/mm/fault.c Fri Jan 11 14:39:24 2002
+++ mylinux/arch/ia64/mm/fault.c Fri Jan 11 14:37:23 2002
@@ -194,8 +194,6 @@
out_of_memory:
up_read(&mm->mmap_sem);
if (current->pid == 1) {
- current->policy |= SCHED_YIELD;
- schedule();
down_read(&mm->mmap_sem);
goto survive;
}
diff -X dontdiff -Nur origlinux/arch/ia64/tools/print_offsets.c mylinux/arch/ia64/tools/print_offsets.c
--- origlinux/arch/ia64/tools/print_offsets.c Fri Jan 11 14:39:25 2002
+++ mylinux/arch/ia64/tools/print_offsets.c Fri Jan 11 14:37:23 2002
@@ -50,11 +50,12 @@
{ "IA64_CPU_SIZE", sizeof (struct cpuinfo_ia64) },
{ "SIGFRAME_SIZE", sizeof (struct sigframe) },
{ "UNW_FRAME_INFO_SIZE", sizeof (struct unw_frame_info) },
+ { "IA64_RUNQUEUE_SIZE", sizeof (struct runqueue) },
{ "", 0 }, /* spacer */
{ "IA64_TASK_PTRACE_OFFSET", offsetof (struct task_struct, ptrace) },
{ "IA64_TASK_SIGPENDING_OFFSET", offsetof (struct task_struct, sigpending) },
{ "IA64_TASK_NEED_RESCHED_OFFSET", offsetof (struct task_struct, need_resched) },
- { "IA64_TASK_PROCESSOR_OFFSET", offsetof (struct task_struct, processor) },
+ { "IA64_TASK_PROCESSOR_OFFSET", offsetof (struct task_struct, cpu) },
{ "IA64_TASK_THREAD_OFFSET", offsetof (struct task_struct, thread) },
{ "IA64_TASK_THREAD_KSP_OFFSET", offsetof (struct task_struct, thread.ksp) },
#ifdef CONFIG_PERFMON
@@ -62,6 +63,7 @@
#endif
{ "IA64_TASK_PID_OFFSET", offsetof (struct task_struct, pid) },
{ "IA64_TASK_MM_OFFSET", offsetof (struct task_struct, mm) },
+ { "IA64_RUNQUEUE_IDLE_OFFSET", offsetof (struct runqueue, idle) },
{ "IA64_PT_REGS_CR_IPSR_OFFSET", offsetof (struct pt_regs, cr_ipsr) },
{ "IA64_PT_REGS_CR_IIP_OFFSET", offsetof (struct pt_regs, cr_iip) },
{ "IA64_PT_REGS_CR_IFS_OFFSET", offsetof (struct pt_regs, cr_ifs) },
diff -X dontdiff -Nur origlinux/drivers/block/loop.c mylinux/drivers/block/loop.c
--- origlinux/drivers/block/loop.c Fri Jan 11 14:39:52 2002
+++ mylinux/drivers/block/loop.c Fri Jan 11 14:46:44 2002
@@ -570,9 +570,6 @@
flush_signals(current);
spin_unlock_irq(&current->sigmask_lock);
- current->policy = SCHED_OTHER;
- current->nice = -20;
-
spin_lock_irq(&lo->lo_lock);
lo->lo_state = Lo_bound;
atomic_inc(&lo->lo_pending);
diff -X dontdiff -Nur origlinux/drivers/ide/ataraid.c mylinux/drivers/ide/ataraid.c
--- origlinux/drivers/ide/ataraid.c Fri Jan 11 14:40:02 2002
+++ mylinux/drivers/ide/ataraid.c Fri Jan 11 14:46:44 2002
@@ -121,11 +121,8 @@
void *ptr = NULL;
while (!ptr) {
ptr=kmalloc(sizeof(struct buffer_head),GFP_NOIO);
- if (!ptr) {
- __set_current_state(TASK_RUNNING);
- current->policy |= SCHED_YIELD;
- schedule();
- }
+ if (!ptr)
+ yield();
}
return ptr;
}
@@ -137,11 +134,8 @@
void *ptr = NULL;
while (!ptr) {
ptr=kmalloc(sizeof(struct ataraid_bh_private),GFP_NOIO);
- if (!ptr) {
- __set_current_state(TASK_RUNNING);
- current->policy |= SCHED_YIELD;
- schedule();
- }
+ if (!ptr)
+ yield();
}
return ptr;
}
diff -X dontdiff -Nur origlinux/drivers/md/md.c mylinux/drivers/md/md.c
--- origlinux/drivers/md/md.c Fri Jan 11 14:40:09 2002
+++ mylinux/drivers/md/md.c Fri Jan 11 14:46:44 2002
@@ -2930,8 +2930,6 @@
* bdflush, otherwise bdflush will deadlock if there are too
* many dirty RAID5 blocks.
*/
- current->policy = SCHED_OTHER;
- current->nice = -20;
md_unlock_kernel();
complete(thread->event);
@@ -3381,11 +3379,6 @@
"(but not more than %d KB/sec) for reconstruction.\n",
sysctl_speed_limit_max);
- /*
- * Resync has low priority.
- */
- current->nice = 19;
-
is_mddev_idle(mddev); /* this also initializes IO event counters */
for (m = 0; m < SYNC_MARKS; m++) {
mark[m] = jiffies;
@@ -3463,16 +3456,13 @@
currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
if (currspeed > sysctl_speed_limit_min) {
- current->nice = 19;
-
if ((currspeed > sysctl_speed_limit_max) ||
!is_mddev_idle(mddev)) {
current->state = TASK_INTERRUPTIBLE;
md_schedule_timeout(HZ/4);
goto repeat;
}
- } else
- current->nice = -20;
+ }
}
printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
err = 0;
diff -X dontdiff -Nur origlinux/drivers/net/slip.c mylinux/drivers/net/slip.c
--- origlinux/drivers/net/slip.c Fri Jan 11 14:40:21 2002
+++ mylinux/drivers/net/slip.c Fri Jan 11 14:46:44 2002
@@ -1393,10 +1393,8 @@
/* First of all: check for active disciplines and hangup them.
*/
do {
- if (busy) {
- current->counter = 0;
- schedule();
- }
+ if (busy)
+ sys_sched_yield();
busy = 0;
local_bh_disable();
diff -X dontdiff -Nur origlinux/fs/binfmt_elf.c mylinux/fs/binfmt_elf.c
--- origlinux/fs/binfmt_elf.c Fri Jan 11 14:40:54 2002
+++ mylinux/fs/binfmt_elf.c Fri Jan 11 14:46:44 2002
@@ -1143,7 +1143,7 @@
psinfo.pr_state = i;
psinfo.pr_sname = (i < 0 || i > 5) ? '.' : "RSDZTD"[i];
psinfo.pr_zomb = psinfo.pr_sname == 'Z';
- psinfo.pr_nice = current->nice;
+ psinfo.pr_nice = current->__nice;
psinfo.pr_flag = current->flags;
psinfo.pr_uid = NEW_TO_OLD_UID(current->uid);
psinfo.pr_gid = NEW_TO_OLD_GID(current->gid);
diff -X dontdiff -Nur origlinux/fs/buffer.c mylinux/fs/buffer.c
--- origlinux/fs/buffer.c Fri Jan 11 14:40:54 2002
+++ mylinux/fs/buffer.c Fri Jan 11 14:46:44 2002
@@ -725,9 +725,7 @@
wakeup_bdflush();
try_to_free_pages(zone, GFP_NOFS, 0);
run_task_queue(&tq_disk);
- current->policy |= SCHED_YIELD;
- __set_current_state(TASK_RUNNING);
- schedule();
+ sys_sched_yield();
}
void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
diff -X dontdiff -Nur origlinux/fs/jbd/journal.c mylinux/fs/jbd/journal.c
--- origlinux/fs/jbd/journal.c Fri Jan 11 14:40:58 2002
+++ mylinux/fs/jbd/journal.c Fri Jan 11 14:46:44 2002
@@ -460,8 +460,7 @@
printk (KERN_NOTICE __FUNCTION__
": ENOMEM at get_unused_buffer_head, "
"trying again.\n");
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
}
} while (!new_bh);
/* keep subsequent assertions sane */
@@ -1539,8 +1538,7 @@
last_warning = jiffies;
}
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
}
}
@@ -1598,8 +1596,7 @@
last_warning = jiffies;
}
while (ret == 0) {
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
}
}
diff -X dontdiff -Nur origlinux/fs/jbd/revoke.c mylinux/fs/jbd/revoke.c
--- origlinux/fs/jbd/revoke.c Fri Jan 11 14:40:58 2002
+++ mylinux/fs/jbd/revoke.c Fri Jan 11 14:46:44 2002
@@ -137,8 +137,7 @@
if (!journal_oom_retry)
return -ENOMEM;
jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
goto repeat;
}
diff -X dontdiff -Nur origlinux/fs/jbd/transaction.c mylinux/fs/jbd/transaction.c
--- origlinux/fs/jbd/transaction.c Fri Jan 11 14:40:58 2002
+++ mylinux/fs/jbd/transaction.c Fri Jan 11 14:46:44 2002
@@ -1377,8 +1377,7 @@
do {
old_handle_count = transaction->t_handle_count;
set_current_state(TASK_RUNNING);
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
} while (old_handle_count != transaction->t_handle_count);
}
diff -X dontdiff -Nur origlinux/fs/jffs2/background.c mylinux/fs/jffs2/background.c
--- origlinux/fs/jffs2/background.c Fri Jan 11 14:40:58 2002
+++ mylinux/fs/jffs2/background.c Fri Jan 11 14:46:44 2002
@@ -106,9 +106,6 @@
sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index);
- /* FIXME in the 2.2 backport */
- current->nice = 10;
-
for (;;) {
spin_lock_irq(&current->sigmask_lock);
siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT));
diff -X dontdiff -Nur origlinux/fs/locks.c mylinux/fs/locks.c
--- origlinux/fs/locks.c Fri Jan 11 14:40:59 2002
+++ mylinux/fs/locks.c Fri Jan 11 14:46:44 2002
@@ -445,8 +445,7 @@
/* Let the blocked process remove waiter from the
* block list when it gets scheduled.
*/
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
} else {
/* Remove waiter from the block list, because by the
* time it wakes up blocker won't exist any more.
diff -X dontdiff -Nur origlinux/fs/nfs/pagelist.c mylinux/fs/nfs/pagelist.c
--- origlinux/fs/nfs/pagelist.c Fri Jan 11 14:40:59 2002
+++ mylinux/fs/nfs/pagelist.c Fri Jan 11 14:46:44 2002
@@ -96,8 +96,7 @@
continue;
if (signalled() && (server->flags & NFS_MOUNT_INTR))
return ERR_PTR(-ERESTARTSYS);
- current->policy = SCHED_YIELD;
- schedule();
+ yield();
}
/* Initialize the request struct. Initially, we assume a
diff -X dontdiff -Nur origlinux/fs/proc/array.c mylinux/fs/proc/array.c
--- origlinux/fs/proc/array.c Fri Jan 11 14:41:03 2002
+++ mylinux/fs/proc/array.c Fri Jan 11 14:46:44 2002
@@ -335,9 +335,12 @@
/* scale priority and nice values from timeslices to -20..20 */
/* to make it look like a "normal" Unix priority/nice value */
- priority = task->counter;
- priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER;
- nice = task->nice;
+ priority = task->prio;
+ if (priority >= MAX_RT_PRIO)
+ priority -= MAX_RT_PRIO;
+ else
+ priority = priority-100;
+ nice = task->__nice;
read_lock(&tasklist_lock);
ppid = task->pid ? task->p_opptr->pid : 0;
@@ -387,7 +390,7 @@
task->nswap,
task->cnswap,
task->exit_signal,
- task->processor);
+ task->cpu);
if(mm)
mmput(mm);
return res;
diff -X dontdiff -Nur origlinux/fs/proc/proc_misc.c mylinux/fs/proc/proc_misc.c
--- origlinux/fs/proc/proc_misc.c Fri Jan 11 14:41:03 2002
+++ mylinux/fs/proc/proc_misc.c Fri Jan 11 14:46:44 2002
@@ -85,11 +85,11 @@
a = avenrun[0] + (FIXED_1/200);
b = avenrun[1] + (FIXED_1/200);
c = avenrun[2] + (FIXED_1/200);
- len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
+ len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
LOAD_INT(a), LOAD_FRAC(a),
LOAD_INT(b), LOAD_FRAC(b),
LOAD_INT(c), LOAD_FRAC(c),
- nr_running, nr_threads, last_pid);
+ nr_running(), nr_threads, last_pid);
return proc_calc_metrics(page, start, off, count, eof, len);
}
@@ -101,7 +101,7 @@
int len;
uptime = jiffies;
- idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime;
+ idle = init_task.times.tms_utime + init_task.times.tms_stime;
/* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but
that would overflow about every five days at HZ == 100.
@@ -303,10 +303,10 @@
}
len += sprintf(page + len,
- "\nctxt %u\n"
+ "\nctxt %lu\n"
"btime %lu\n"
"processes %lu\n",
- kstat.context_swtch,
+ nr_context_switches(),
xtime.tv_sec - jif / HZ,
total_forks);
diff -X dontdiff -Nur origlinux/fs/reiserfs/buffer2.c mylinux/fs/reiserfs/buffer2.c
--- origlinux/fs/reiserfs/buffer2.c Fri Jan 11 14:41:04 2002
+++ mylinux/fs/reiserfs/buffer2.c Fri Jan 11 14:46:44 2002
@@ -33,8 +33,7 @@
buffer_journal_dirty(bh) ? ' ' : '!');
}
run_task_queue(&tq_disk);
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
}
if (repeat_counter > 30000000) {
reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ;
@@ -52,11 +51,11 @@
struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size)
{
struct buffer_head *result;
- PROC_EXP( unsigned int ctx_switches = kstat.context_swtch );
+ PROC_EXP( unsigned int ctx_switches = nr_context_switches(); );
result = bread (super -> s_dev, n_block, n_size);
PROC_INFO_INC( super, breads );
- PROC_EXP( if( kstat.context_swtch != ctx_switches )
+ PROC_EXP( if( nr_context_switches() != ctx_switches )
PROC_INFO_INC( super, bread_miss ) );
return result;
}
diff -X dontdiff -Nur origlinux/fs/reiserfs/journal.c mylinux/fs/reiserfs/journal.c
--- origlinux/fs/reiserfs/journal.c Fri Jan 11 14:41:04 2002
+++ mylinux/fs/reiserfs/journal.c Fri Jan 11 14:46:44 2002
@@ -149,8 +149,7 @@
}
bn = allocate_bitmap_node(p_s_sb) ;
if (!bn) {
- current->policy |= SCHED_YIELD ;
- schedule() ;
+ yield();
goto repeat ;
}
return bn ;
diff -X dontdiff -Nur origlinux/fs/ufs/truncate.c mylinux/fs/ufs/truncate.c
--- origlinux/fs/ufs/truncate.c Fri Jan 11 14:41:05 2002
+++ mylinux/fs/ufs/truncate.c Fri Jan 11 14:46:44 2002
@@ -448,10 +448,7 @@
if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
ufs_sync_inode (inode);
run_task_queue(&tq_disk);
- current->policy |= SCHED_YIELD;
- schedule ();
-
-
+ yield();
}
offset = inode->i_size & uspi->s_fshift;
if (offset) {
diff -X dontdiff -Nur origlinux/include/asm-i386/bitops.h mylinux/include/asm-i386/bitops.h
--- origlinux/include/asm-i386/bitops.h Fri Jan 11 14:41:12 2002
+++ mylinux/include/asm-i386/bitops.h Fri Jan 11 14:46:44 2002
@@ -75,6 +75,14 @@
:"=m" (ADDR)
:"Ir" (nr));
}
+
+static __inline__ void __clear_bit(int nr, volatile void * addr)
+{
+ __asm__ __volatile__(
+ "btrl %1,%0"
+ :"=m" (ADDR)
+ :"Ir" (nr));
+}
#define smp_mb__before_clear_bit() barrier()
#define smp_mb__after_clear_bit() barrier()
diff -X dontdiff -Nur origlinux/include/asm-i386/mmu_context.h mylinux/include/asm-i386/mmu_context.h
--- origlinux/include/asm-i386/mmu_context.h Fri Jan 11 14:41:13 2002
+++ mylinux/include/asm-i386/mmu_context.h Fri Jan 11 14:46:44 2002
@@ -7,6 +7,28 @@
#include <asm/pgalloc.h>
/*
+ * Every architecture must define this function. It's the fastest
+ * way of searching a 168-bit bitmap where the first 128 bits are
+ * unlikely to be set. It's guaranteed that at least one of the 168
+ * bits is cleared.
+ */
+#if MAX_RT_PRIO != 128 || MAX_PRIO != 168
+# error update this function.
+#endif
+
+static inline int sched_find_first_zero_bit(unsigned long *b)
+{
+ unsigned int rt;
+
+ rt = b[0] & b[1] & b[2] & b[3];
+ if (unlikely(rt != 0xffffffff))
+ return find_first_zero_bit(b, MAX_RT_PRIO);
+
+ if (b[4] != ~0)
+ return ffz(b[4]) + MAX_RT_PRIO;
+ return ffz(b[5]) + 32 + MAX_RT_PRIO;
+}
+/*
* possibly do the LDT unload here?
*/
#define destroy_context(mm) do { } while(0)
diff -X dontdiff -Nur origlinux/include/asm-i386/pgalloc.h mylinux/include/asm-i386/pgalloc.h
--- origlinux/include/asm-i386/pgalloc.h Fri Jan 11 14:41:13 2002
+++ mylinux/include/asm-i386/pgalloc.h Fri Jan 11 14:46:44 2002
@@ -224,6 +224,7 @@
{
struct mm_struct *active_mm;
int state;
+ char __cacheline_padding[24];
};
extern struct tlb_state cpu_tlbstate[NR_CPUS];
diff -X dontdiff -Nur origlinux/include/asm-i386/smp.h mylinux/include/asm-i386/smp.h
--- origlinux/include/asm-i386/smp.h Fri Jan 11 14:41:13 2002
+++ mylinux/include/asm-i386/smp.h Fri Jan 11 14:46:44 2002
@@ -63,6 +63,7 @@
extern void smp_flush_tlb(void);
extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
extern void smp_send_reschedule(int cpu);
+extern void smp_send_reschedule_all(void);
extern void smp_invalidate_rcv(void); /* Process an NMI */
extern void (*mtrr_hook) (void);
extern void zap_low_mappings (void);
@@ -104,7 +105,7 @@
* so this is correct in the x86 case.
*/
-#define smp_processor_id() (current->processor)
+#define smp_processor_id() (current->cpu)
static __inline int hard_smp_processor_id(void)
{
@@ -121,18 +122,6 @@
#endif /* !__ASSEMBLY__ */
#define NO_PROC_ID 0xFF /* No processor magic marker */
-
-/*
- * This magic constant controls our willingness to transfer
- * a process across CPUs. Such a transfer incurs misses on the L1
- * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My
- * gut feeling is this will vary by board in value. For a board
- * with separate L2 cache it probably depends also on the RSS, and
- * for a board with shared L2 cache it ought to decay fast as other
- * processes are run.
- */
-
-#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
#endif
#endif
diff -X dontdiff -Nur origlinux/include/asm-ia64/bitops.h mylinux/include/asm-ia64/bitops.h
--- origlinux/include/asm-ia64/bitops.h Fri Jan 11 14:41:14 2002
+++ mylinux/include/asm-ia64/bitops.h Fri Jan 11 15:29:34 2002
@@ -368,6 +368,7 @@
#ifdef __KERNEL__
+#define __clear_bit(nr, addr) clear_bit(nr, addr)
#define ext2_set_bit test_and_set_bit
#define ext2_clear_bit test_and_clear_bit
#define ext2_test_bit test_bit
diff -X dontdiff -Nur origlinux/include/asm-ia64/mmu_context.h mylinux/include/asm-ia64/mmu_context.h
--- origlinux/include/asm-ia64/mmu_context.h Fri Jan 11 14:41:14 2002
+++ mylinux/include/asm-ia64/mmu_context.h Fri Jan 11 15:40:00 2002
@@ -118,6 +118,7 @@
reload_context(next);
}
+#define sched_find_first_zero_bit(bitmap) ffz(bitmap)
#define switch_mm(prev_mm,next_mm,next_task,cpu) activate_mm(prev_mm, next_mm)
# endif /* ! __ASSEMBLY__ */
diff -X dontdiff -Nur origlinux/include/asm-ia64/smp.h mylinux/include/asm-ia64/smp.h
--- origlinux/include/asm-ia64/smp.h Fri Jan 11 14:41:14 2002
+++ mylinux/include/asm-ia64/smp.h Fri Jan 11 15:37:41 2002
@@ -27,7 +27,7 @@
#define SMP_IRQ_REDIRECTION (1 << 0)
#define SMP_IPI_REDIRECTION (1 << 1)
-#define smp_processor_id() (current->processor)
+#define smp_processor_id() (current->cpu)
extern struct smp_boot_data {
int cpu_count;
@@ -109,12 +109,6 @@
}
#define NO_PROC_ID 0xffffffff /* no processor magic marker */
-
-/*
- * Extra overhead to move a task from one cpu to another (due to TLB and cache misses).
- * Expressed in "negative nice value" units (larger number means higher priority/penalty).
- */
-#define PROC_CHANGE_PENALTY 20
extern void __init init_smp_config (void);
extern void smp_do_timer (struct pt_regs *regs);
diff -X dontdiff -Nur origlinux/include/linux/kernel_stat.h mylinux/include/linux/kernel_stat.h
--- origlinux/include/linux/kernel_stat.h Fri Jan 11 14:41:36 2002
+++ mylinux/include/linux/kernel_stat.h Fri Jan 11 15:37:41 2002
@@ -32,10 +32,11 @@
unsigned int ipackets, opackets;
unsigned int ierrors, oerrors;
unsigned int collisions;
- unsigned int context_swtch;
};
extern struct kernel_stat kstat;
+
+extern unsigned long nr_context_switches(void);
#if !defined(CONFIG_ARCH_S390)
/*
diff -X dontdiff -Nur origlinux/include/linux/list.h mylinux/include/linux/list.h
--- origlinux/include/linux/list.h Fri Jan 11 14:41:36 2002
+++ mylinux/include/linux/list.h Fri Jan 11 15:37:41 2002
@@ -19,6 +19,8 @@
struct list_head *next, *prev;
};
+typedef struct list_head list_t;
+
#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define LIST_HEAD(name) \
diff -X dontdiff -Nur origlinux/include/linux/sched.h mylinux/include/linux/sched.h
--- origlinux/include/linux/sched.h Fri Jan 11 14:41:39 2002
+++ mylinux/include/linux/sched.h Fri Jan 11 15:39:46 2002
@@ -6,6 +6,7 @@
extern unsigned long event;
#include <linux/config.h>
+#include <linux/compiler.h>
#include <linux/binfmts.h>
#include <linux/threads.h>
#include <linux/kernel.h>
@@ -72,8 +73,9 @@
#define CT_TO_SECS(x) ((x) / HZ)
#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ)
-extern int nr_running, nr_threads;
+extern int nr_threads;
extern int last_pid;
+extern unsigned long nr_running(void);
#include <linux/fs.h>
#include <linux/time.h>
@@ -116,12 +118,6 @@
#define SCHED_FIFO 1
#define SCHED_RR 2
-/*
- * This is an additional bit set when we want to
- * yield the CPU for one re-schedule..
- */
-#define SCHED_YIELD 0x10
-
struct sched_param {
int sched_priority;
};
@@ -139,7 +135,6 @@
* a separate lock).
*/
extern rwlock_t tasklist_lock;
-extern spinlock_t runqueue_lock;
extern spinlock_t mmlist_lock;
extern void sched_init(void);
@@ -150,6 +145,7 @@
extern void update_process_times(int user);
extern void update_one_process(struct task_struct *p, unsigned long user,
unsigned long system, int cpu);
+extern void scheduler_tick(struct task_struct *p);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
extern signed long FASTCALL(schedule_timeout(signed long timeout));
@@ -278,6 +274,55 @@
extern struct user_struct root_user;
#define INIT_USER (&root_user)
+#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))
+
+/*
+ * RT priorites go from 0 to 99, but internally we max
+ * them out at 128 to make it easier to search the
+ * scheduler bitmap.
+ */
+#define MAX_RT_PRIO 128
+/*
+ * The lower the priority of a process, the more likely it is
+ * to run. Priority of a process goes from 0 to 167. The 0-99
+ * priority range is allocated to RT tasks, the 128-167 range
+ * is for SCHED_OTHER tasks.
+ */
+#define MAX_PRIO (MAX_RT_PRIO+40)
+#define DEF_USER_NICE 0
+
+typedef struct task_struct task_t;
+typedef struct prio_array prio_array_t;
+typedef struct runqueue runqueue_t;
+
+struct prio_array {
+ int nr_active;
+ spinlock_t *lock;
+ runqueue_t *rq;
+ unsigned long bitmap[BITMAP_SIZE];
+ list_t queue[MAX_PRIO];
+};
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: those places that want to lock multiple runqueues
+ * (such as the load balancing or the process migration code), lock
+ * acquire operations must be ordered by the runqueue's cpu id.
+ *
+ * The RT event id is used to avoid calling into the the RT scheduler
+ * if there is a RT task active in an SMP system but there is no
+ * RT scheduling activity otherwise.
+ */
+struct runqueue {
+ spinlock_t lock;
+ unsigned long nr_running, nr_switches;
+ task_t *curr, *idle;
+ prio_array_t *active, *expired, arrays[2];
+ int prev_nr_running[NR_CPUS];
+} ____cacheline_aligned;
+
+
struct task_struct {
/*
* offsets of these are hardcoded elsewhere - touch with care
@@ -295,35 +340,51 @@
int lock_depth; /* Lock depth */
-/*
- * offset 32 begins here on 32-bit platforms. We keep
- * all fields in a single cacheline that are needed for
- * the goodness() loop in schedule().
- */
- long counter;
- long nice;
- unsigned long policy;
- struct mm_struct *mm;
- int processor;
/*
- * cpus_runnable is ~0 if the process is not running on any
- * CPU. It's (1 << cpu) if it's running on a CPU. This mask
- * is updated under the runqueue lock.
- *
- * To determine whether a process might run on a CPU, this
- * mask is AND-ed with cpus_allowed.
+ * offset 32 begins here on 32-bit platforms.
*/
- unsigned long cpus_runnable, cpus_allowed;
+ unsigned int cpu;
+ int prio;
+ long __nice;
+ list_t run_list;
+ prio_array_t *array;
+
+ unsigned int time_slice;
+ unsigned long sleep_timestamp, run_timestamp;
+
/*
- * (only the 'next' pointer fits into the cacheline, but
- * that's just fine.)
+ * A task's four 'sleep history' entries.
+ *
+ * We track the last 4 seconds of time. (including the current second).
+ *
+ * A value of '0' means it has spent no time sleeping in that
+ * particular past second. The maximum value of 'HZ' means that
+ * the task spent all its time running in that particular second.
+ *
+ * 'hist_idx' points to the current second, which, unlike the other
+ * 3 entries, is only partially complete. This means that a value of
+ * '25' does not mean the task slept 25% of the time in the current
+ * second, it means that it spent 25 timer ticks sleeping in the
+ * current second.
+ *
+ * All this might look a bit complex, but it can be maintained very
+ * small overhead and it gives very good statistics, based on which
+ * the scheduler can decide whether a task is 'interactive' or a
+ * 'CPU hog'. See sched.c for more details.
*/
- struct list_head run_list;
- unsigned long sleep_time;
+ #define SLEEP_HIST_SIZE 4
+
+ int hist_idx;
+ int hist[SLEEP_HIST_SIZE];
+
+ unsigned long policy;
+ unsigned long cpus_allowed;
struct task_struct *next_task, *prev_task;
- struct mm_struct *active_mm;
+
+ struct mm_struct *mm, *active_mm;
struct list_head local_pages;
+
unsigned int allocation_order, nr_local_pages;
/* task state */
@@ -446,10 +507,51 @@
*/
#define _STK_LIM (8*1024*1024)
-#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */
-#define MAX_COUNTER (20*HZ/100)
-#define DEF_NICE (0)
+/*
+ * Scales user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ 24 ... 63 (MAX_PRIO-1) ]
+ *
+ * User-nice value of -20 == static priority 24, and
+ * user-nice value 19 == static priority 63. The lower
+ * the priority value, the higher the task's priority.
+ *
+ * Note that while static priority cannot go below 24,
+ * the priority of a process can go as low as 0.
+ */
+#define NICE_TO_PRIO(n) (MAX_PRIO-1 + (n) - 19)
+#define DEF_PRIO NICE_TO_PRIO(DEF_USER_NICE)
+
+/*
+ * Default timeslice is 90 msecs, maximum is 150 msecs.
+ * Minimum timeslice is 30 msecs.
+ */
+#define MIN_TIMESLICE ( 30 * HZ / 1000)
+#define MAX_TIMESLICE (150 * HZ / 1000)
+
+#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
+#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+
+/*
+ * PRIO_TO_TIMESLICE scales priority values [ 100 ... 139 ]
+ * to initial time slice values [ MAX_TIMESLICE (150 msec) ... 2 ]
+ *
+ * The higher a process's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority process gets MIN_TIMESLICE worth of execution time.
+ */
+#define PRIO_TO_TIMESLICE(p) \
+ ((( (MAX_USER_PRIO-1-USER_PRIO(p))*(MAX_TIMESLICE-MIN_TIMESLICE) + \
+ MAX_USER_PRIO-1) / MAX_USER_PRIO) + MIN_TIMESLICE)
+
+#define RT_PRIO_TO_TIMESLICE(p) \
+ ((( (MAX_RT_PRIO-(p)-1)*(MAX_TIMESLICE-MIN_TIMESLICE) + \
+ MAX_RT_PRIO-1) / MAX_RT_PRIO) + MIN_TIMESLICE)
+
+extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
+extern void set_user_nice(task_t *p, long nice);
+asmlinkage long sys_sched_yield(void);
+#define yield() sys_sched_yield()
/*
* The default (Linux) execution domain.
@@ -468,14 +570,13 @@
addr_limit: KERNEL_DS, \
exec_domain: &default_exec_domain, \
lock_depth: -1, \
- counter: DEF_COUNTER, \
- nice: DEF_NICE, \
+ __nice: DEF_USER_NICE, \
policy: SCHED_OTHER, \
+ cpus_allowed: -1, \
mm: NULL, \
active_mm: &init_mm, \
- cpus_runnable: -1, \
- cpus_allowed: -1, \
run_list: LIST_HEAD_INIT(tsk.run_list), \
+ time_slice: PRIO_TO_TIMESLICE(DEF_PRIO), \
next_task: &tsk, \
prev_task: &tsk, \
p_opptr: &tsk, \
@@ -551,19 +652,6 @@
return p;
}
-#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
-
-static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
-{
- tsk->processor = cpu;
- tsk->cpus_runnable = 1UL << cpu;
-}
-
-static inline void task_release_cpu(struct task_struct *tsk)
-{
- tsk->cpus_runnable = ~0UL;
-}
-
/* per-UID process charging. */
extern struct user_struct * alloc_uid(uid_t);
extern void free_uid(struct user_struct *);
@@ -591,6 +679,7 @@
extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
signed long timeout));
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
@@ -785,6 +874,7 @@
extern void reparent_to_init(void);
extern void daemonize(void);
+extern task_t *child_reaper;
extern int do_execve(char *, char **, char **, struct pt_regs *);
extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
@@ -793,6 +883,9 @@
extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+extern void wait_task_inactive(task_t * p);
+extern void kick_if_running(task_t * p);
+
#define __wait_event(wq, condition) \
do { \
wait_queue_t __wait; \
@@ -873,22 +966,8 @@
#define next_thread(p) \
list_entry((p)->thread_group.next, struct task_struct, thread_group)
-static inline void del_from_runqueue(struct task_struct * p)
-{
- nr_running--;
- p->sleep_time = jiffies;
- list_del(&p->run_list);
- p->run_list.next = NULL;
-}
-
-static inline int task_on_runqueue(struct task_struct *p)
-{
- return (p->run_list.next != NULL);
-}
-
static inline void unhash_process(struct task_struct *p)
{
- if (task_on_runqueue(p)) BUG();
write_lock_irq(&tasklist_lock);
nr_threads--;
unhash_pid(p);
diff -X dontdiff -Nur origlinux/include/linux/smp.h mylinux/include/linux/smp.h
--- origlinux/include/linux/smp.h Fri Jan 11 14:41:40 2002
+++ mylinux/include/linux/smp.h Fri Jan 11 15:37:41 2002
@@ -77,6 +77,14 @@
#define cpu_number_map(cpu) 0
#define smp_call_function(func,info,retry,wait) ({ 0; })
#define cpu_online_map 1
+static inline void smp_send_reschedule(int cpu) { }
+static inline void smp_send_reschedule_all(void) { }
#endif
+
+/*
+ * Common definitions:
+ */
+#define cpu() smp_processor_id()
+
#endif
diff -X dontdiff -Nur origlinux/init/main.c mylinux/init/main.c
--- origlinux/init/main.c Fri Jan 11 14:41:43 2002
+++ mylinux/init/main.c Fri Jan 11 14:46:44 2002
@@ -507,18 +507,10 @@
/* Get other processors into their bootup holding patterns. */
smp_boot_cpus();
wait_init_idle = cpu_online_map;
- clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */
+ clear_bit(cpu(), &wait_init_idle); /* Don't wait on me! */
smp_threads_ready=1;
smp_commence();
-
- /* Wait for the other cpus to set up their idle processes */
- printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle);
- while (wait_init_idle) {
- cpu_relax();
- barrier();
- }
- printk("All processors have done init_idle\n");
}
#endif
@@ -534,9 +526,8 @@
{
kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
unlock_kernel();
- current->need_resched = 1;
- cpu_idle();
-}
+ cpu_idle();
+}
/*
* Activate the first processor.
@@ -617,14 +608,23 @@
ipc_init();
#endif
check_bugs();
- printk("POSIX conformance testing by UNIFIX\n");
- /*
- * We count on the initial thread going ok
- * Like idlers init is an unlocked kernel thread, which will
- * make syscalls (and thus be locked).
+ /*
+ * We count on the initial thread going ok
+ * Like idlers init is an unlocked kernel thread, which will
+ * make syscalls (and thus be locked).
*/
smp_init();
+
+ /*
+ * Finally, we wait for all other CPU's, and initialize this
+ * thread that will become the idle thread for the boot CPU.
+ * After this, the scheduler is fully initialized, and we can
+ * start creating and running new threads.
+ */
+ init_idle();
+
+ /* Do the rest non-__init'ed, we're now alive */
rest_init();
}
@@ -785,12 +785,9 @@
int i, pid;
pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
- if (pid > 0) {
- while (pid != wait(&i)) {
- current->policy |= SCHED_YIELD;
- schedule();
- }
- }
+ if (pid > 0)
+ while (pid != wait(&i))
+ yield();
if (MAJOR(real_root_dev) != RAMDISK_MAJOR
|| MINOR(real_root_dev) != 0) {
error = change_root(real_root_dev,"/initrd");
diff -X dontdiff -Nur origlinux/kernel/capability.c mylinux/kernel/capability.c
--- origlinux/kernel/capability.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/capability.c Fri Jan 11 14:46:44 2002
@@ -8,6 +8,8 @@
#include <linux/mm.h>
#include <asm/uaccess.h>
+unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
+
kernel_cap_t cap_bset = CAP_INIT_EFF_SET;
/* Note: never hold tasklist_lock while spinning for this one */
diff -X dontdiff -Nur origlinux/kernel/exit.c mylinux/kernel/exit.c
--- origlinux/kernel/exit.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/exit.c Fri Jan 11 14:46:44 2002
@@ -27,49 +27,39 @@
static void release_task(struct task_struct * p)
{
- if (p != current) {
+ unsigned long flags;
+
+ if (p == current)
+ BUG();
#ifdef CONFIG_SMP
- /*
- * Wait to make sure the process isn't on the
- * runqueue (active on some other CPU still)
- */
- for (;;) {
- task_lock(p);
- if (!task_has_cpu(p))
- break;
- task_unlock(p);
- do {
- cpu_relax();
- barrier();
- } while (task_has_cpu(p));
- }
- task_unlock(p);
+ wait_task_inactive(p);
#endif
- atomic_dec(&p->user->processes);
- free_uid(p->user);
- unhash_process(p);
-
- release_thread(p);
- current->cmin_flt += p->min_flt + p->cmin_flt;
- current->cmaj_flt += p->maj_flt + p->cmaj_flt;
- current->cnswap += p->nswap + p->cnswap;
- /*
- * Potentially available timeslices are retrieved
- * here - this way the parent does not get penalized
- * for creating too many processes.
- *
- * (this cannot be used to artificially 'generate'
- * timeslices, because any timeslice recovered here
- * was given away by the parent in the first place.)
- */
- current->counter += p->counter;
- if (current->counter >= MAX_COUNTER)
- current->counter = MAX_COUNTER;
- p->pid = 0;
- free_task_struct(p);
- } else {
- printk("task releasing itself\n");
- }
+ atomic_dec(&p->user->processes);
+ free_uid(p->user);
+ unhash_process(p);
+
+ release_thread(p);
+ current->cmin_flt += p->min_flt + p->cmin_flt;
+ current->cmaj_flt += p->maj_flt + p->cmaj_flt;
+ current->cnswap += p->nswap + p->cnswap;
+ /*
+ * Potentially available timeslices are retrieved
+ * here - this way the parent does not get penalized
+ * for creating too many processes.
+ *
+ * (this cannot be used to artificially 'generate'
+ * timeslices, because any timeslice recovered here
+ * was given away by the parent in the first place.)
+ */
+ __save_flags(flags);
+ __cli();
+ current->time_slice += p->time_slice;
+ if (current->time_slice > MAX_TIMESLICE)
+ current->time_slice = MAX_TIMESLICE;
+ __restore_flags(flags);
+
+ p->pid = 0;
+ free_task_struct(p);
}
/*
@@ -147,6 +137,79 @@
}
read_unlock(&tasklist_lock);
return retval;
+}
+
+/**
+ * reparent_to_init() - Reparent the calling kernel thread to the init task.
+ *
+ * If a kernel thread is launched as a result of a system call, or if
+ * it ever exits, it should generally reparent itself to init so that
+ * it is correctly cleaned up on exit.
+ *
+ * The various task state such as scheduling policy and priority may have
+ * been inherited from a user process, so we reset them to sane values here.
+ *
+ * NOTE that reparent_to_init() gives the caller full capabilities.
+ */
+void reparent_to_init(void)
+{
+ write_lock_irq(&tasklist_lock);
+
+ /* Reparent to init */
+ REMOVE_LINKS(current);
+ current->p_pptr = child_reaper;
+ current->p_opptr = child_reaper;
+ SET_LINKS(current);
+
+ /* Set the exit signal to SIGCHLD so we signal init on exit */
+ current->exit_signal = SIGCHLD;
+
+ current->ptrace = 0;
+ if ((current->policy == SCHED_OTHER) && (current->__nice < DEF_USER_NICE))
+ set_user_nice(current, DEF_USER_NICE);
+ /* cpus_allowed? */
+ /* rt_priority? */
+ /* signals? */
+ current->cap_effective = CAP_INIT_EFF_SET;
+ current->cap_inheritable = CAP_INIT_INH_SET;
+ current->cap_permitted = CAP_FULL_SET;
+ current->keep_capabilities = 0;
+ memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim)));
+ current->user = INIT_USER;
+
+ write_unlock_irq(&tasklist_lock);
+}
+
+/*
+ * Put all the gunge required to become a kernel thread without
+ * attached user resources in one place where it belongs.
+ */
+
+void daemonize(void)
+{
+ struct fs_struct *fs;
+
+
+ /*
+ * If we were started as result of loading a module, close all of the
+ * user space pages. We don't need them, and if we didn't close them
+ * they would be locked into memory.
+ */
+ exit_mm(current);
+
+ current->session = 1;
+ current->pgrp = 1;
+ current->tty = NULL;
+
+ /* Become as one with the init task */
+
+ exit_fs(current); /* current->fs->count--; */
+ fs = init_task.fs;
+ current->fs = fs;
+ atomic_inc(&fs->count);
+ exit_files(current);
+ current->files = init_task.files;
+ atomic_inc(&current->files->count);
}
/*
diff -X dontdiff -Nur origlinux/kernel/fork.c mylinux/kernel/fork.c
--- origlinux/kernel/fork.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/fork.c Fri Jan 11 14:46:44 2002
@@ -28,7 +28,6 @@
/* The idle threads do not count.. */
int nr_threads;
-int nr_running;
int max_threads;
unsigned long total_forks; /* Handle normal Linux uptimes. */
@@ -36,6 +35,8 @@
struct task_struct *pidhash[PIDHASH_SZ];
+rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
+
void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;
@@ -563,6 +564,7 @@
struct pt_regs *regs, unsigned long stack_size)
{
int retval;
+ unsigned long flags;
struct task_struct *p;
struct completion vfork;
@@ -611,8 +613,7 @@
copy_flags(clone_flags, p);
p->pid = get_pid(clone_flags);
- p->run_list.next = NULL;
- p->run_list.prev = NULL;
+ INIT_LIST_HEAD(&p->run_list);
p->p_cptr = NULL;
init_waitqueue_head(&p->wait_chldexit);
@@ -638,14 +639,16 @@
#ifdef CONFIG_SMP
{
int i;
- p->cpus_runnable = ~0UL;
- p->processor = current->processor;
+
+ p->cpu = cpu();
+
/* ?? should we just memset this ?? */
for(i = 0; i < smp_num_cpus; i++)
p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
spin_lock_init(&p->sigmask_lock);
}
#endif
+ p->array = NULL;
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
@@ -677,15 +680,28 @@
p->pdeath_signal = 0;
/*
- * "share" dynamic priority between parent and child, thus the
- * total amount of dynamic priorities in the system doesnt change,
- * more scheduling fairness. This is only important in the first
- * timeslice, on the long run the scheduling behaviour is unchanged.
+ * Share the timeslice between parent and child, thus the
+ * total amount of pending timeslices in the system doesnt change,
+ * resulting in more scheduling fairness.
*/
- p->counter = (current->counter + 1) >> 1;
- current->counter >>= 1;
- if (!current->counter)
- current->need_resched = 1;
+ __save_flags(flags);
+ __cli();
+ if (!current->time_slice)
+ BUG();
+ p->time_slice = (current->time_slice + 1) >> 1;
+ current->time_slice >>= 1;
+ if (!current->time_slice) {
+ /*
+ * This case is rare, it happens when the parent has only
+ * a single jiffy left from its timeslice. Taking the
+ * runqueue lock is not a problem.
+ */
+ current->time_slice = 1;
+ scheduler_tick(current);
+ }
+ p->sleep_timestamp = p->run_timestamp = jiffies;
+ p->hist[0] = p->hist[1] = p->hist[2] = p->hist[3] = 0;
+ __restore_flags(flags);
/*
* Ok, add it to the run-queues and make it
@@ -722,10 +738,23 @@
if (p->ptrace & PT_PTRACED)
send_sig(SIGSTOP, p, 1);
+#define RUN_CHILD_FIRST 1
+#if RUN_CHILD_FIRST
+ wake_up_forked_process(p); /* do this last */
+#else
wake_up_process(p); /* do this last */
+#endif
++total_forks;
if (clone_flags & CLONE_VFORK)
wait_for_completion(&vfork);
+#if RUN_CHILD_FIRST
+ else
+ /*
+ * Let the child process run first, to avoid most of the
+ * COW overhead when the child exec()s afterwards.
+ */
+ current->need_resched = 1;
+#endif
fork_out:
return retval;
diff -X dontdiff -Nur origlinux/kernel/ksyms.c mylinux/kernel/ksyms.c
--- origlinux/kernel/ksyms.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/ksyms.c Fri Jan 11 14:46:44 2002
@@ -437,6 +437,9 @@
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
EXPORT_SYMBOL(schedule);
EXPORT_SYMBOL(schedule_timeout);
+EXPORT_SYMBOL(sys_sched_yield);
+EXPORT_SYMBOL(set_user_nice);
+EXPORT_SYMBOL(set_cpus_allowed);
EXPORT_SYMBOL(jiffies);
EXPORT_SYMBOL(xtime);
EXPORT_SYMBOL(do_gettimeofday);
@@ -448,6 +451,7 @@
EXPORT_SYMBOL(kstat);
EXPORT_SYMBOL(nr_running);
+EXPORT_SYMBOL(nr_context_switches);
/* misc */
EXPORT_SYMBOL(panic);
diff -X dontdiff -Nur origlinux/kernel/printk.c mylinux/kernel/printk.c
--- origlinux/kernel/printk.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/printk.c Fri Jan 11 14:49:33 2002
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/interrupt.h> /* For in_interrupt() */
+#include <linux/delay.h>
#include <asm/uaccess.h>
diff -X dontdiff -Nur origlinux/kernel/ptrace.c mylinux/kernel/ptrace.c
--- origlinux/kernel/ptrace.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/ptrace.c Fri Jan 11 14:46:44 2002
@@ -31,20 +31,7 @@
if (child->state != TASK_STOPPED)
return -ESRCH;
#ifdef CONFIG_SMP
- /* Make sure the child gets off its CPU.. */
- for (;;) {
- task_lock(child);
- if (!task_has_cpu(child))
- break;
- task_unlock(child);
- do {
- if (child->state != TASK_STOPPED)
- return -ESRCH;
- barrier();
- cpu_relax();
- } while (task_has_cpu(child));
- }
- task_unlock(child);
+ wait_task_inactive(child);
#endif
}
diff -X dontdiff -Nur origlinux/kernel/sched.c mylinux/kernel/sched.c
--- origlinux/kernel/sched.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/sched.c Fri Jan 11 15:36:13 2002
@@ -12,333 +12,328 @@
* 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
*/
-/*
- * 'sched.c' is the main kernel file. It contains scheduling primitives
- * (sleep_on, wakeup, schedule etc) as well as a number of simple system
- * call functions (type getpid()), which just extract a field from
- * current-task
- */
-
-#include <linux/config.h>
#include <linux/mm.h>
+#include <linux/nmi.h>
#include <linux/init.h>
+#include <asm/uaccess.h>
#include <linux/smp_lock.h>
-#include <linux/nmi.h>
#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-#include <linux/completion.h>
-#include <linux/prefetch.h>
-#include <linux/compiler.h>
-
-#include <asm/uaccess.h>
#include <asm/mmu_context.h>
-extern void timer_bh(void);
-extern void tqueue_bh(void);
-extern void immediate_bh(void);
+struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
+
+#define cpu_rq(cpu) (runqueues + (cpu))
+#define this_rq() cpu_rq(smp_processor_id())
+#define task_rq(p) cpu_rq((p)->cpu)
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define rq_cpu(rq) ((rq) - runqueues)
+#define rt_task(p) ((p)->policy != SCHED_OTHER)
+
+
+#define lock_task_rq(rq,p,flags) \
+do { \
+repeat_lock_task: \
+ rq = task_rq(p); \
+ spin_lock_irqsave(&rq->lock, flags); \
+ if (unlikely(rq_cpu(rq) != (p)->cpu)) { \
+ spin_unlock_irqrestore(&rq->lock, flags); \
+ goto repeat_lock_task; \
+ } \
+} while (0)
+
+#define unlock_task_rq(rq,p,flags) \
+ spin_unlock_irqrestore(&rq->lock, flags)
/*
- * scheduler variables
+ * Adding/removing a task to/from a priority array:
*/
+static inline void dequeue_task(struct task_struct *p, prio_array_t *array)
+{
+ array->nr_active--;
+ list_del_init(&p->run_list);
+ if (list_empty(array->queue + p->prio))
+ __set_bit(p->prio, array->bitmap);
+}
-unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */
-
-extern void mem_use(void);
+static inline void enqueue_task(struct task_struct *p, prio_array_t *array)
+{
+ list_add_tail(&p->run_list, array->queue + p->prio);
+ __clear_bit(p->prio, array->bitmap);
+ array->nr_active++;
+ p->array = array;
+}
/*
- * Scheduling quanta.
+ * This is the per-process load estimator. Processes that generate
+ * more load than the system can handle get a priority penalty.
*
- * NOTE! The unix "nice" value influences how long a process
- * gets. The nice value ranges from -20 to +19, where a -20
- * is a "high-priority" task, and a "+10" is a low-priority
- * task.
+ * The estimator uses a 4-entry load-history ringbuffer which is
+ * updated whenever a task is moved to/from the runqueue. The load
+ * estimate is also updated from the timer tick to get an accurate
+ * estimation of currently executing tasks as well.
*
- * We want the time-slice to be around 50ms or so, so this
- * calculation depends on the value of HZ.
+ * The 4-entry p->hist[4] array holds the 'sleep history' of
+ * every task. Every entry holds the number of time ticks spent
+ * sleeping in the past 4 seconds. Three of the entries belong to
+ * one-one second in the past, the fourth entry belongs to the current
+ * second. (the p->hist_idx index is used in fact as a rotating index
+ * to reduce overhead.)
+ *
+ * The array elements are integers in the range of 0-HZ. If HZ is 100,
+ * then '100' means a process has spent 100% of it's time sleeping, in
+ * that particular second of time. '0' means the process has spent all
+ * its time on the runqueue - ie. it was a CPU hog in that second.
+ *
+ * For RAM usage and algorithmic overhead reasons we do not want a too
+ * big history buffer. It's also usually not interesting to the scheduler
+ * to know whether a task was idle or not 10 minutes ago. 'Recent behavior'
+ * is what matters, if a task was mostly sleeping recently then it's a
+ * 'good' interactive task. If it has spent most (or all) of its time
+ * running then it's a 'bad' CPU-hog that gets a priority penalty.
+ *
+ * The load estimator itself was written to be fast as well in every
+ * circumstance. Eg. if a task is context switching heavily then we do
+ * not call into the estimator, only about once per timer tick, on average.
*/
-#if HZ < 200
-#define TICK_SCALE(x) ((x) >> 2)
-#elif HZ < 400
-#define TICK_SCALE(x) ((x) >> 1)
-#elif HZ < 800
-#define TICK_SCALE(x) (x)
-#elif HZ < 1600
-#define TICK_SCALE(x) ((x) << 1)
-#else
-#define TICK_SCALE(x) ((x) << 2)
-#endif
-
-#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1)
-
/*
- * Init task must be ok at boot for the ix86 as we will check its signals
- * via the SMP irq return path.
+ * The 'history index' goes forward in time, if one second passes then
+ * the index is increased by 1 via this function. We wrap around the
+ * index if it reaches 4. (The modulo is fast with the current
+ * SLEEP_HIST_SIZE of 4.)
*/
-
-struct task_struct * init_tasks[NR_CPUS] = {&init_task, };
+static inline void new_second(task_t *p)
+{
+ p->hist_idx = (p->hist_idx + 1) % SLEEP_HIST_SIZE;
+}
/*
- * The tasklist_lock protects the linked list of processes.
- *
- * The runqueue_lock locks the parts that actually access
- * and change the run-queues, and have to be interrupt-safe.
- *
- * If both locks are to be concurrently held, the runqueue_lock
- * nests inside the tasklist_lock.
- *
- * task->alloc_lock nests inside tasklist_lock.
+ * process load-history tick length. Right now it's 1 second:
*/
-spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */
-rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */
-
-static LIST_HEAD(runqueue_head);
+#define HHZ (HZ)
/*
- * We align per-CPU scheduling data on cacheline boundaries,
- * to prevent cacheline ping-pong.
+ * This function clears the load-history entries when a task has spent
+ * more than 4 seconds running.
*/
-static union {
- struct schedule_data {
- struct task_struct * curr;
- cycles_t last_schedule;
- } schedule_data;
- char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
-
-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
-
-struct kernel_stat kstat;
-extern struct task_struct *child_reaper;
-
-#ifdef CONFIG_SMP
-
-#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
-#define can_schedule(p,cpu) \
- ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))
-
-#else
-
-#define idle_task(cpu) (&init_task)
-#define can_schedule(p,cpu) (1)
-
-#endif
-
-void scheduling_functions_start_here(void) { }
+static inline void clear_hist(task_t *p)
+{
+ p->hist[0] = p->hist[1] = p->hist[2] = p->hist[3] = 0;
+}
/*
- * This is the function that decides how desirable a process is..
- * You can weigh different processes against each other depending
- * on what CPU they've run on lately etc to try to handle cache
- * and TLB miss penalties.
- *
- * Return values:
- * -1000: never select this
- * 0: out of time, recalculate counters (but it might still be
- * selected)
- * +ve: "goodness" value (the larger, the better)
- * +1000: realtime process, select this.
+ * This function fills in the load-history entries with the maximum
+ * values when a task has spent more than 4 seconds sleeping.
*/
+static inline void fill_hist(task_t *p)
+{
+ p->hist[0] = p->hist[1] = p->hist[2] = p->hist[3] = HHZ;
+}
-static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+/*
+ * This function is called when a task goes sleeping, ie. when the task
+ * has potentially spent alot of time on the runqueue. p->run_timestamp
+ * is the time the task has started running, 'now' is the time when the
+ * task goes to sleep.
+ */
+static inline void update_sleep_avg_deactivate(task_t *p)
{
- int weight;
+ int idx;
+ unsigned long now = jiffies,
+ seconds_passed = now/HHZ - p->run_timestamp/HHZ;
/*
- * select the current process after every other
- * runnable process, but before the idle thread.
- * Also, dont trigger a counter recalculation.
+ * Do we have to update the history entries because a
+ * 'new second' has been started? If a new second has
+ * been started then we have to clear all the 'full'
+ * seconds that have passed during the time the
+ * task was running, and the new current entry has
+ * to be cleared as well.
+ *
+ * Otherwise we only have to update the sleep timestamp.
*/
- weight = -1;
- if (p->policy & SCHED_YIELD)
- goto out;
+ if (unlikely(seconds_passed)) {
+ if (seconds_passed < SLEEP_HIST_SIZE)
+ for (idx = 0; idx < seconds_passed; idx++) {
+ new_second(p);
+ p->hist[p->hist_idx] = 0;
+ }
+ else
+ clear_hist(p);
+ }
+ p->sleep_timestamp = now;
+}
- /*
- * Non-RT process - normal case first.
+/*
+ * This is called when a task becomes runnable and is moved to the runqueue,
+ * i.e. when the task has potentially spent a lot of time sleeping.
+ * p->sleep_timestamp is the time the task started sleeping, 'now' is
+ * the time when it goes back onto the runqueue.
+ */
+static inline void update_sleep_avg_activate(task_t *p, unsigned long now)
+{
+ int idx;
+ unsigned long sleep_ticks,
+ seconds_passed = now/HHZ - p->sleep_timestamp/HHZ;
+
+ /*
+ * Do we have to update the history entries because a
+ * 'new second' has been started? This is slightly more
+ * complex than the deactivate path, because in the deactivate
+ * path history entries are simply cleared, but here we have
+ * to add any potential time spent sleeping in the current
+ * second. This value is 'sleep_ticks' - it can be anywhere
+ * between 0 and HZ-1. (it cannot be HZ because that would mean
+ * that the current second is over and we'd have to go to the
+ * next history entry.) Another detail is that we might
+ * have gone to sleep in this second, or in any previous second.
+ *
+ * Otherwise we only have to update the run timestamp and the
+ * current history entry.
*/
- if (p->policy == SCHED_OTHER) {
- /*
- * Give the process a first-approximation goodness value
- * according to the number of clock-ticks it has left.
- *
- * Don't do any other calculations if the time slice is
- * over..
- */
- weight = p->counter;
- if (!weight)
- goto out;
-
-#ifdef CONFIG_SMP
- /* Give a largish advantage to the same processor... */
- /* (this is equivalent to penalizing other processors) */
- if (p->processor == this_cpu)
- weight += PROC_CHANGE_PENALTY;
-#endif
+ if (unlikely(seconds_passed)) {
+ if (seconds_passed < SLEEP_HIST_SIZE) {
+ /*
+ * Update the "last partially-slept" second's entry:
+ */
+ p->hist[p->hist_idx] += HHZ - (p->sleep_timestamp % HHZ);
+ new_second(p);
- /* .. and a slight advantage to the current MM */
- if (p->mm == this_mm || !p->mm)
- weight += 1;
- weight += 20 - p->nice;
- goto out;
- }
+ /*
+ * Clear any (optional) interim seconds that were
+ * spent fully sleeping:
+ */
+ for (idx = 1; idx < seconds_passed; idx++) {
+ new_second(p);
+ p->hist[p->hist_idx] = HHZ;
+ }
+ } else
+ /*
+ * We slept more than 4 seconds, fill in the
+ * history:
+ */
+ fill_hist(p);
+ /* Clear the new current entry: */
+ p->hist[p->hist_idx] = 0;
+ sleep_ticks = now % HHZ;
+ } else
+ sleep_ticks = now - p->sleep_timestamp;
/*
- * Realtime process, select the first one on the
- * runqueue (taking priorities within processes
- * into account).
+ * Update the current entry with the amount of
+ * ticks the task spent sleeping:
*/
- weight = 1000 + p->rt_priority;
-out:
- return weight;
+ p->hist[p->hist_idx] += sleep_ticks;
+ p->run_timestamp = now;
}
/*
- * the 'goodness value' of replacing a process on a given CPU.
- * positive value means 'replace', zero or negative means 'dont'.
+ * Get the current 'load average' of the task.
+ *
+ * Naively one would divide the sum by 4. But in fact the current entry
+ * is just a partial history, so we have to divide by the actual portion
+ * we recorded, which is somewhere between 3.0 and 4.0 seconds.
*/
-static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
+static inline unsigned int get_run_avg(task_t *p, unsigned long new)
{
- return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
+ return HHZ - (p->hist[0] + p->hist[1] + p->hist[2] +
+ p->hist[3]) * HHZ / ((SLEEP_HIST_SIZE-1)*HHZ + (new % HHZ));
}
-/*
- * This is ugly, but reschedule_idle() is very timing-critical.
- * We are called with the runqueue spinlock held and we must
- * not claim the tasklist_lock.
- */
-static FASTCALL(void reschedule_idle(struct task_struct * p));
-
-static void reschedule_idle(struct task_struct * p)
+static inline void activate_task(task_t *p, runqueue_t *rq)
{
-#ifdef CONFIG_SMP
- int this_cpu = smp_processor_id();
- struct task_struct *tsk, *target_tsk;
- int cpu, best_cpu, i, max_prio;
- cycles_t oldest_idle;
+ prio_array_t *array = rq->active;
+ unsigned long now = jiffies;
+ unsigned int penalty;
- /*
- * shortcut if the woken up task's last CPU is
- * idle now.
- */
- best_cpu = p->processor;
- if (can_schedule(p, best_cpu)) {
- tsk = idle_task(best_cpu);
- if (cpu_curr(best_cpu) == tsk) {
- int need_resched;
-send_now_idle:
- /*
- * If need_resched == -1 then we can skip sending
- * the IPI altogether, tsk->need_resched is
- * actively watched by the idle thread.
- */
- need_resched = tsk->need_resched;
- tsk->need_resched = 1;
- if ((best_cpu != this_cpu) && !need_resched)
- smp_send_reschedule(best_cpu);
- return;
- }
- }
+ if (likely(p->run_timestamp == now))
+ goto enqueue;
+ update_sleep_avg_activate(p, now);
/*
- * We know that the preferred CPU has a cache-affine current
- * process, lets try to find a new idle CPU for the woken-up
- * process. Select the least recently active idle CPU. (that
- * one will have the least active cache context.) Also find
- * the executing process which has the least priority.
- */
- oldest_idle = (cycles_t) -1;
- target_tsk = NULL;
- max_prio = 0;
+ * Give the process a priority penalty if it has not slept often
+ * enough in the past. We scale the priority penalty according
+ * to the current load of the runqueue, and the 'load history'
+ * this process has. E.g. if the CPU has 3 processes running
+ * right now then a process that has slept more than two-thirds
+ * of the time is considered to be 'interactive'. The higher
+ * the CPU load is, the easier it is for a process to
+ * get a non-interactivity penalty.
+ *
+ * the return value of get_run_avg() is an integer between 0 and HZ.
+ * We scale this 'load value' to between 0 and MAX_USER_PRIO/3.
+ * A task that generates 100% load gets the maximum penalty.
+ */
+ penalty = MAX_USER_PRIO * get_run_avg(p, now) / (3 * HHZ);
+ if (!rt_task(p)) {
+ p->prio = NICE_TO_PRIO(p->__nice) + penalty;
+ if (p->prio > MAX_PRIO-1)
+ p->prio = MAX_PRIO-1;
+ }
+enqueue:
+ enqueue_task(p, array);
+ rq->nr_running++;
+}
- for (i = 0; i < smp_num_cpus; i++) {
- cpu = cpu_logical_map(i);
- if (!can_schedule(p, cpu))
- continue;
- tsk = cpu_curr(cpu);
- /*
- * We use the first available idle CPU. This creates
- * a priority list between idle CPUs, but this is not
- * a problem.
- */
- if (tsk == idle_task(cpu)) {
-#if defined(__i386__) && defined(CONFIG_SMP)
- /*
- * Check if two siblings are idle in the same
- * physical package. Use them if found.
- */
- if (smp_num_siblings == 2) {
- if (cpu_curr(cpu_sibling_map[cpu]) ==
- idle_task(cpu_sibling_map[cpu])) {
- oldest_idle = last_schedule(cpu);
- target_tsk = tsk;
- break;
- }
-
- }
-#endif
- if (last_schedule(cpu) < oldest_idle) {
- oldest_idle = last_schedule(cpu);
- target_tsk = tsk;
- }
- } else {
- if (oldest_idle == -1ULL) {
- int prio = preemption_goodness(tsk, p, cpu);
+static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
+{
+ rq->nr_running--;
+ dequeue_task(p, p->array);
+ p->array = NULL;
+ update_sleep_avg_deactivate(p);
+}
- if (prio > max_prio) {
- max_prio = prio;
- target_tsk = tsk;
- }
- }
- }
- }
- tsk = target_tsk;
- if (tsk) {
- if (oldest_idle != -1ULL) {
- best_cpu = tsk->processor;
- goto send_now_idle;
- }
- tsk->need_resched = 1;
- if (tsk->processor != this_cpu)
- smp_send_reschedule(tsk->processor);
- }
- return;
-
+static inline void resched_task(task_t *p)
+{
+ int need_resched;
-#else /* UP */
- int this_cpu = smp_processor_id();
- struct task_struct *tsk;
-
- tsk = cpu_curr(this_cpu);
- if (preemption_goodness(tsk, p, this_cpu) > 0)
- tsk->need_resched = 1;
-#endif
+ need_resched = p->need_resched;
+ wmb();
+ p->need_resched = 1;
+ if (!need_resched && (p->cpu != smp_processor_id()))
+ smp_send_reschedule(p->cpu);
}
+#ifdef CONFIG_SMP
+
/*
- * Careful!
- *
- * This has to add the process to the _beginning_ of the
- * run-queue, not the end. See the comment about "This is
- * subtle" in the scheduler proper..
+ * Wait for a process to unschedule. This is used by the exit() and
+ * ptrace() code.
*/
-static inline void add_to_runqueue(struct task_struct * p)
+void wait_task_inactive(task_t * p)
{
- list_add(&p->run_list, &runqueue_head);
- nr_running++;
-}
+ unsigned long flags;
+ runqueue_t *rq;
-static inline void move_last_runqueue(struct task_struct * p)
-{
- list_del(&p->run_list);
- list_add_tail(&p->run_list, &runqueue_head);
+repeat:
+ rq = task_rq(p);
+ while (unlikely(rq->curr == p)) {
+ cpu_relax();
+ barrier();
+ }
+ lock_task_rq(rq, p, flags);
+ if (unlikely(rq->curr == p)) {
+ unlock_task_rq(rq, p, flags);
+ goto repeat;
+ }
+ unlock_task_rq(rq, p, flags);
}
-static inline void move_first_runqueue(struct task_struct * p)
+/*
+ * Kick the remote CPU if the task is currently running;
+ * this is used by the signal code to signal tasks
+ * which are in user mode as quickly as possible.
+ *
+ * (Note that we do this lockless - if the task does anything
+ * while the message is in flight then it will notice the
+ * sigpending condition anyway.)
+ */
+void kick_if_running(task_t * p)
{
- list_del(&p->run_list);
- list_add(&p->run_list, &runqueue_head);
+ if (p == task_rq(p)->curr)
+ resched_task(p);
}
+#endif
/*
* Wake up a process. Put it on the run-queue if it's not
@@ -348,392 +343,470 @@
* "current->state = TASK_RUNNING" to mark yourself runnable
* without the overhead of this.
*/
-static inline int try_to_wake_up(struct task_struct * p, int synchronous)
+static int try_to_wake_up(task_t * p, int synchronous)
{
unsigned long flags;
int success = 0;
+ runqueue_t *rq;
- /*
- * We want the common case fall through straight, thus the goto.
- */
- spin_lock_irqsave(&runqueue_lock, flags);
+ lock_task_rq(rq, p, flags);
p->state = TASK_RUNNING;
- if (task_on_runqueue(p))
- goto out;
- add_to_runqueue(p);
- if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
- reschedule_idle(p);
- success = 1;
-out:
- spin_unlock_irqrestore(&runqueue_lock, flags);
+ if (!p->array) {
+ activate_task(p, rq);
+ if ((rq->curr == rq->idle) || (p->prio < rq->curr->prio))
+ resched_task(rq->curr);
+ success = 1;
+ }
+ unlock_task_rq(rq, p, flags);
return success;
}
-inline int wake_up_process(struct task_struct * p)
+inline int wake_up_process(task_t * p)
{
return try_to_wake_up(p, 0);
}
-static void process_timeout(unsigned long __data)
+void wake_up_forked_process(task_t * p)
{
- struct task_struct * p = (struct task_struct *) __data;
+ runqueue_t *rq = this_rq();
- wake_up_process(p);
+ spin_lock_irq(&rq->lock);
+ p->state = TASK_RUNNING;
+ if (!rt_task(p)) {
+ p->prio += MAX_USER_PRIO/10;
+ if (p->prio > MAX_PRIO-1)
+ p->prio = MAX_PRIO-1;
+ }
+ activate_task(p, rq);
+ spin_unlock_irq(&rq->lock);
}
-/**
- * schedule_timeout - sleep until timeout
- * @timeout: timeout value in jiffies
- *
- * Make the current task sleep until @timeout jiffies have
- * elapsed. The routine will return immediately unless
- * the current task state has been set (see set_current_state()).
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
- * pass before the routine returns. The routine will return 0
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task. In this case the remaining time
- * in jiffies will be returned, or 0 if the timer expired in time
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- *
- * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
- * the CPU away without a bound on the timeout. In this case the return
- * value will be %MAX_SCHEDULE_TIMEOUT.
- *
- * In all cases the return value is guaranteed to be non-negative.
- */
-signed long schedule_timeout(signed long timeout)
+asmlinkage void schedule_tail(task_t *prev)
{
- struct timer_list timer;
- unsigned long expire;
+ spin_unlock_irq(&this_rq()->lock);
+}
- switch (timeout)
- {
- case MAX_SCHEDULE_TIMEOUT:
- /*
- * These two special cases are useful to be comfortable
- * in the caller. Nothing more. We could take
- * MAX_SCHEDULE_TIMEOUT from one of the negative value
- * but I' d like to return a valid offset (>=0) to allow
- * the caller to do everything it want with the retval.
- */
- schedule();
- goto out;
- default:
- /*
- * Another bit of PARANOID. Note that the retval will be
- * 0 since no piece of kernel is supposed to do a check
- * for a negative retval of schedule_timeout() (since it
- * should never happens anyway). You just have the printk()
- * that will tell you if something is gone wrong and where.
- */
- if (timeout < 0)
- {
- printk(KERN_ERR "schedule_timeout: wrong timeout "
- "value %lx from %p\n", timeout,
- __builtin_return_address(0));
- current->state = TASK_RUNNING;
- goto out;
- }
+static inline void context_switch(task_t *prev, task_t *next)
+{
+ struct mm_struct *mm = next->mm;
+ struct mm_struct *oldmm = prev->active_mm;
+
+ prepare_to_switch();
+
+ if (!mm) {
+ next->active_mm = oldmm;
+ atomic_inc(&oldmm->mm_count);
+ enter_lazy_tlb(oldmm, next, smp_processor_id());
+ } else
+ switch_mm(oldmm, mm, next, smp_processor_id());
+
+ if (!prev->mm) {
+ prev->active_mm = NULL;
+ mmdrop(oldmm);
}
- expire = timeout + jiffies;
+ /*
+ * Here we just switch the register state and the stack. There are
+ * 3 processes affected by a context switch:
+ *
+ * prev ==> .... ==> (last => next)
+ *
+ * It's the 'much more previous' 'prev' that is on next's stack,
+ * but prev is set to (the just run) 'last' process by switch_to().
+ * This might sound slightly confusing but makes tons of sense.
+ */
+ switch_to(prev, next, prev);
+}
+
+unsigned long nr_running(void)
+{
+ unsigned long i, sum = 0;
- init_timer(&timer);
- timer.expires = expire;
- timer.data = (unsigned long) current;
- timer.function = process_timeout;
+ for (i = 0; i < smp_num_cpus; i++)
+ sum += cpu_rq(i)->nr_running;
- add_timer(&timer);
- schedule();
- del_timer_sync(&timer);
+ return sum;
+}
+
+unsigned long nr_context_switches(void)
+{
+ unsigned long i, sum = 0;
+
+ for (i = 0; i < smp_num_cpus; i++)
+ sum += cpu_rq(i)->nr_switches;
+
+ return sum;
+}
- timeout = expire - jiffies;
+static inline unsigned long max_rq_len(void)
+{
+ unsigned long i, curr, max = 0;
- out:
- return timeout < 0 ? 0 : timeout;
+ for (i = 0; i < smp_num_cpus; i++) {
+ curr = cpu_rq(i)->nr_running;
+ if (curr > max)
+ max = curr;
+ }
+ return max;
}
/*
- * schedule_tail() is getting called from the fork return path. This
- * cleans up all remaining scheduler things, without impacting the
- * common case.
+ * Current runqueue is empty, or rebalance tick: if there is an
+ * imbalance (the current runqueue is too short) then pull from
+ * the busiest runqueue(s).
+ *
+ * We call this with the current runqueue locked,
+ * irqs disabled.
*/
-static inline void __schedule_tail(struct task_struct *prev)
+static void load_balance(runqueue_t *this_rq, int idle)
{
-#ifdef CONFIG_SMP
- int policy;
+ int imbalance, nr_running, load, prev_max_load,
+ max_load, idx, i, this_cpu = smp_processor_id();
+ task_t *next = this_rq->idle, *tmp;
+ runqueue_t *busiest, *rq_src;
+ prio_array_t *array;
+ list_t *head, *curr;
/*
- * prev->policy can be written from here only before `prev'
- * can be scheduled (before setting prev->cpus_runnable to ~0UL).
- * Of course it must also be read before allowing prev
- * to be rescheduled, but since the write depends on the read
- * to complete, wmb() is enough. (the spin_lock() acquired
- * before setting cpus_runnable is not enough because the spin_lock()
- * common code semantics allows code outside the critical section
- * to enter inside the critical section)
+ * We search all runqueues to find the most busy one.
+ * We do this lockless to reduce cache-bouncing overhead,
+ * we re-check the 'best' source CPU later on again, with
+ * the lock held.
+ *
+ * We fend off statistical fluctuations in runqueue lengths by
+ * saving the runqueue length during the previous load-balancing
+ * operation and using the smaller of the current and saved lengths.
+ * If a runqueue stays long for a sufficient amount of time then
+ * we recognize it and pull tasks from it.
+ *
+ * The 'current runqueue length' is a statistical maximum variable,
+ * for that one we take the longer one - to avoid fluctuations in
+ * the other direction. So for a load-balance to happen it needs a
+ * stable, long runqueue on the target CPU and a stable, short
+ * runqueue on the local CPU.
+ *
+ * We make an exception if this CPU is about to become idle - in
+ * that case we are less picky about moving a task across CPUs and
+ * take what can be taken.
*/
- policy = prev->policy;
- prev->policy = policy & ~SCHED_YIELD;
- wmb();
+ if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
+ nr_running = this_rq->nr_running;
+ else
+ nr_running = this_rq->prev_nr_running[this_cpu];
+ prev_max_load = 1000000000;
+
+ busiest = NULL;
+ max_load = 0;
+ for (i = 0; i < smp_num_cpus; i++) {
+ rq_src = cpu_rq(i);
+ if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
+ load = rq_src->nr_running;
+ else
+ load = this_rq->prev_nr_running[i];
+ this_rq->prev_nr_running[i] = rq_src->nr_running;
+
+ if ((load > max_load) && (load < prev_max_load) &&
+ (rq_src != this_rq)) {
+ busiest = rq_src;
+ max_load = load;
+ }
+ }
+
+ if (likely(!busiest))
+ return;
+
+ imbalance = (max_load - nr_running) / 2;
/*
- * fast path falls through. We have to clear cpus_runnable before
- * checking prev->state to avoid a wakeup race. Protect against
- * the task exiting early.
- */
- task_lock(prev);
- task_release_cpu(prev);
- mb();
- if (prev->state == TASK_RUNNING)
- goto needs_resched;
+ * It needs at least a ~25% imbalance to trigger balancing.
+ *
+ * prev_max_load makes sure that we do not try to balance
+ * ad infinitum - certain tasks might be impossible to pull
+ * into this runqueue.
+ */
+ if (!idle && (imbalance < (max_load + 3)/4))
+ return;
+ prev_max_load = max_load;
-out_unlock:
- task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
- return;
+ /*
+ * Ok, let's do some actual balancing:
+ */
+ if (rq_cpu(busiest) < this_cpu) {
+ spin_unlock(&this_rq->lock);
+ spin_lock(&busiest->lock);
+ spin_lock(&this_rq->lock);
+ } else
+ spin_lock(&busiest->lock);
/*
- * Slow path - we 'push' the previous process and
- * reschedule_idle() will attempt to find a new
- * processor for it. (but it might preempt the
- * current process as well.) We must take the runqueue
- * lock and re-check prev->state to be correct. It might
- * still happen that this process has a preemption
- * 'in progress' already - but this is not a problem and
- * might happen in other circumstances as well.
+ * Make sure nothing changed since we checked the
+ * runqueue length.
*/
-needs_resched:
- {
- unsigned long flags;
+ if (busiest->nr_running <= nr_running + 1)
+ goto out_unlock;
- /*
- * Avoid taking the runqueue lock in cases where
- * no preemption-check is necessery:
- */
- if ((prev == idle_task(smp_processor_id())) ||
- (policy & SCHED_YIELD))
- goto out_unlock;
+ /*
+ * We first consider expired tasks. Those will likely not run
+ * in the near future, thus switching CPUs has the least effect
+ * on them.
+ */
+ if (busiest->expired->nr_active)
+ array = busiest->expired;
+ else
+ array = busiest->active;
- spin_lock_irqsave(&runqueue_lock, flags);
- if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
- reschedule_idle(prev);
- spin_unlock_irqrestore(&runqueue_lock, flags);
+new_array:
+ /*
+ * Load-balancing does not affect RT tasks, so we start
+ * searching at priority 128.
+ */
+ idx = MAX_RT_PRIO;
+skip_bitmap:
+ idx = find_next_zero_bit(array->bitmap, MAX_PRIO, idx);
+ if (idx == MAX_PRIO) {
+ if (array == busiest->expired) {
+ array = busiest->active;
+ goto new_array;
+ }
+ spin_unlock(&busiest->lock);
goto out_unlock;
}
-#else
- prev->policy &= ~SCHED_YIELD;
-#endif /* CONFIG_SMP */
+
+ head = array->queue + idx;
+ curr = head->next;
+skip_queue:
+ tmp = list_entry(curr, task_t, run_list);
+ if ((tmp == busiest->curr) || !(tmp->cpus_allowed & (1 << this_cpu))) {
+ curr = curr->next;
+ if (curr != head)
+ goto skip_queue;
+ idx++;
+ goto skip_bitmap;
+ }
+ next = tmp;
+ /*
+ * take the task out of the other runqueue and
+ * put it into this one:
+ */
+ dequeue_task(next, array);
+ busiest->nr_running--;
+ next->cpu = this_cpu;
+ this_rq->nr_running++;
+ enqueue_task(next, this_rq->active);
+ if (next->prio < current->prio)
+ current->need_resched = 1;
+ if (!idle && --imbalance) {
+ if (array == busiest->expired) {
+ array = busiest->active;
+ goto new_array;
+ }
+ spin_unlock(&busiest->lock);
+ }
+out_unlock:
+ spin_unlock(&busiest->lock);
}
-asmlinkage void schedule_tail(struct task_struct *prev)
+/*
+ * Either the idle-tick or the busy-tick path gets called on
+ * every timer tick, on every CPU. Our balancing frequency and
+ * balancing aggressiveness depend on whether the CPU is
+ * idle or not.
+ *
+ * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
+ * systems with HZ=100, every 10 msecs.)
+ */
+#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
+#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
+
+static inline void idle_tick(void)
{
- __schedule_tail(prev);
+ if ((jiffies % IDLE_REBALANCE_TICK) ||
+ likely(this_rq()->curr == NULL))
+ return;
+ spin_lock(&this_rq()->lock);
+ load_balance(this_rq(), 1);
+ spin_unlock(&this_rq()->lock);
}
/*
- * 'schedule()' is the scheduler function. It's a very simple and nice
- * scheduler: it's not perfect, but certainly works for most things.
- *
- * The goto is "interesting".
- *
- * NOTE!! Task 0 is the 'idle' task, which gets called when no other
- * tasks can run. It can not be killed, and it cannot sleep. The 'state'
- * information in task[0] is never used.
+ * Should we treat the task as interactive or not.
+ * A task is interactive if it has not exceeded 50%
+ * of the max CPU-hog penalty yet.
*/
-asmlinkage void schedule(void)
+static int task_interactive(task_t *p, unsigned long now)
{
- struct schedule_data * sched_data;
- struct task_struct *prev, *next, *p;
- struct list_head *tmp;
- int this_cpu, c;
+ int penalty;
+ if (rt_task(p))
+ return 1;
+ penalty = MAX_USER_PRIO * get_run_avg(p, jiffies) / (3 * HHZ);
+ if (penalty <= MAX_USER_PRIO/6)
+ return 1;
+ return 0;
+}
- spin_lock_prefetch(&runqueue_lock);
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(task_t *p)
+{
+ unsigned long now = jiffies;
+ runqueue_t *rq = this_rq();
- if (!current->active_mm) BUG();
-need_resched_back:
- prev = current;
- this_cpu = prev->processor;
+ if (p == rq->idle || !rq->idle)
+ return idle_tick();
+ /* Task might have expired already, but not scheduled off yet */
+ if (p->array != rq->active) {
+ p->need_resched = 1;
+ return;
+ }
+ /*
+ * The task cannot change CPUs because it's the current task.
+ */
+ spin_lock(&rq->lock);
+ if ((p->policy != SCHED_FIFO) && !--p->time_slice) {
+ p->need_resched = 1;
+ if (rt_task(p))
+ p->time_slice = RT_PRIO_TO_TIMESLICE(p->prio);
+ else
+ p->time_slice = PRIO_TO_TIMESLICE(p->prio);
- if (unlikely(in_interrupt())) {
- printk("Scheduling in interrupt\n");
- BUG();
+ /*
+ * Timeslice used up - discard any possible
+ * priority penalty:
+ */
+ dequeue_task(p, rq->active);
+ /*
+ * Tasks that are still rated as interactive are put
+ * back into the active array. If they use up too much
+ * CPU time then they'll get a priority penalty anyway
+ * so this cannot starve other processes accidentally.
+ */
+ if (task_interactive(p, now))
+ enqueue_task(p, rq->active);
+ else
+ enqueue_task(p, rq->expired);
+ } else {
+ /*
+ * Deactivate + activate the task so that the
+ * load estimator gets updated properly:
+ */
+ if (!rt_task(p)) {
+ deactivate_task(p, rq);
+ activate_task(p, rq);
+ }
}
+ if (!(now % BUSY_REBALANCE_TICK))
+ load_balance(rq, 0);
+ spin_unlock(&rq->lock);
+}
- release_kernel_lock(prev, this_cpu);
-
- /*
- * 'sched_data' is protected by the fact that we can run
- * only one process per CPU.
- */
- sched_data = & aligned_data[this_cpu].schedule_data;
+void scheduling_functions_start_here(void) { }
- spin_lock_irq(&runqueue_lock);
+/*
+ * 'schedule()' is the main scheduler function.
+ */
+asmlinkage void schedule(void)
+{
+ task_t *prev, *next;
+ prio_array_t *array;
+ runqueue_t *rq;
+ list_t *queue;
+ int idx;
- /* move an exhausted RR process to be last.. */
- if (unlikely(prev->policy == SCHED_RR))
- if (!prev->counter) {
- prev->counter = NICE_TO_TICKS(prev->nice);
- move_last_runqueue(prev);
- }
+ if (unlikely(in_interrupt()))
+ BUG();
+need_resched_back:
+ prev = current;
+ release_kernel_lock(prev, smp_processor_id());
+ rq = this_rq();
+ spin_lock_irq(&rq->lock);
switch (prev->state) {
case TASK_INTERRUPTIBLE:
- if (signal_pending(prev)) {
+ if (unlikely(signal_pending(prev))) {
prev->state = TASK_RUNNING;
break;
}
default:
- del_from_runqueue(prev);
- case TASK_RUNNING:;
+ deactivate_task(prev, rq);
+ case TASK_RUNNING:
}
- prev->need_resched = 0;
-
- /*
- * this is the scheduler proper:
- */
-
-repeat_schedule:
- /*
- * Default process to select..
- */
- next = idle_task(this_cpu);
- c = -1000;
- list_for_each(tmp, &runqueue_head) {
- p = list_entry(tmp, struct task_struct, run_list);
- if (can_schedule(p, this_cpu)) {
- int weight = goodness(p, this_cpu, prev->active_mm);
- if (weight > c)
- c = weight, next = p;
- }
+pick_next_task:
+ if (unlikely(!rq->nr_running)) {
+ load_balance(rq, 1);
+ if (rq->nr_running)
+ goto pick_next_task;
+ next = rq->idle;
+ goto switch_tasks;
}
- /* Do we need to re-calculate counters? */
- if (unlikely(!c)) {
- struct task_struct *p;
-
- spin_unlock_irq(&runqueue_lock);
- read_lock(&tasklist_lock);
- for_each_task(p)
- p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
- read_unlock(&tasklist_lock);
- spin_lock_irq(&runqueue_lock);
- goto repeat_schedule;
- }
-
- /*
- * from this point on nothing can prevent us from
- * switching to the next task, save this fact in
- * sched_data.
- */
- sched_data->curr = next;
- task_set_cpu(next, this_cpu);
- spin_unlock_irq(&runqueue_lock);
-
- if (unlikely(prev == next)) {
- /* We won't go through the normal tail, so do this by hand */
- prev->policy &= ~SCHED_YIELD;
- goto same_process;
+ array = rq->active;
+ if (unlikely(!array->nr_active)) {
+ /*
+ * Switch the active and expired arrays.
+ */
+ rq->active = rq->expired;
+ rq->expired = array;
+ array = rq->active;
}
-#ifdef CONFIG_SMP
- /*
- * maintain the per-process 'last schedule' value.
- * (this has to be recalculated even if we reschedule to
- * the same process) Currently this is only used on SMP,
- * and it's approximate, so we do not have to maintain
- * it while holding the runqueue spinlock.
- */
- sched_data->last_schedule = get_cycles();
-
- /*
- * We drop the scheduler lock early (it's a global spinlock),
- * thus we have to lock the previous process from getting
- * rescheduled during switch_to().
- */
+ idx = sched_find_first_zero_bit(array->bitmap);
+ queue = array->queue + idx;
+ next = list_entry(queue->next, task_t, run_list);
-#endif /* CONFIG_SMP */
-
- kstat.context_swtch++;
- /*
- * there are 3 processes which are affected by a context switch:
- *
- * prev == .... ==> (last => next)
- *
- * It's the 'much more previous' 'prev' that is on next's stack,
- * but prev is set to (the just run) 'last' process by switch_to().
- * This might sound slightly confusing but makes tons of sense.
- */
- prepare_to_switch();
- {
- struct mm_struct *mm = next->mm;
- struct mm_struct *oldmm = prev->active_mm;
- if (!mm) {
- if (next->active_mm) BUG();
- next->active_mm = oldmm;
- atomic_inc(&oldmm->mm_count);
- enter_lazy_tlb(oldmm, next, this_cpu);
- } else {
- if (next->active_mm != mm) BUG();
- switch_mm(oldmm, mm, next, this_cpu);
- }
+switch_tasks:
+ prev->need_resched = 0;
- if (!prev->mm) {
- prev->active_mm = NULL;
- mmdrop(oldmm);
- }
+ if (likely(prev != next)) {
+ rq->nr_switches++;
+ rq->curr = next;
+ next->cpu = prev->cpu;
+ context_switch(prev, next);
+ /*
+ * The runqueue pointer might be from another CPU
+ * if the new task was last running on a different
+ * CPU - thus re-load it.
+ */
+ barrier();
+ rq = this_rq();
}
+ spin_unlock_irq(&rq->lock);
- /*
- * This just switches the register state and the
- * stack.
- */
- switch_to(prev, next, prev);
- __schedule_tail(prev);
-
-same_process:
reacquire_kernel_lock(current);
- if (current->need_resched)
+ if (unlikely(current->need_resched))
goto need_resched_back;
return;
}
/*
- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything
- * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the
- * non-exclusive tasks and one exclusive task.
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero
- * in this (rare) case, and we handle it by contonuing to scan the queue.
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, const int sync)
{
struct list_head *tmp;
- struct task_struct *p;
+ task_t *p;
- CHECK_MAGIC_WQHEAD(q);
- WQ_CHECK_LIST_HEAD(&q->task_list);
-
list_for_each(tmp,&q->task_list) {
unsigned int state;
- wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
- CHECK_MAGIC(curr->__magic);
p = curr->task;
state = p->state;
- if (state & mode) {
- WQ_NOTE_WAKER(curr);
- if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
- break;
- }
+ if ((state & mode) &&
+ try_to_wake_up(p, sync) &&
+ ((curr->flags & WQ_FLAG_EXCLUSIVE) &&
+ !--nr_exclusive))
+ break;
}
}
@@ -850,8 +923,95 @@
return timeout;
}
+/*
+ * Change the current task's CPU affinity. Migrate the process to a
+ * proper CPU and schedule away if the current CPU is removed from
+ * the allowed bitmask.
+ */
+void set_cpus_allowed(task_t *p, unsigned long new_mask)
+{
+ runqueue_t *this_rq = this_rq(), *target_rq;
+ unsigned long this_mask = 1UL << smp_processor_id();
+ int target_cpu;
+
+ new_mask &= cpu_online_map;
+ if (!new_mask)
+ BUG();
+ p->cpus_allowed = new_mask;
+ /*
+ * Can the task run on the current CPU? If not then
+ * migrate the process off to a proper CPU.
+ */
+ if (new_mask & this_mask)
+ return;
+ target_cpu = ffz(~new_mask);
+ target_rq = cpu_rq(target_cpu);
+ if (target_cpu < smp_processor_id()) {
+ spin_lock_irq(&target_rq->lock);
+ spin_lock(&this_rq->lock);
+ } else {
+ spin_lock_irq(&this_rq->lock);
+ spin_lock(&target_rq->lock);
+ }
+ dequeue_task(p, p->array);
+ this_rq->nr_running--;
+ target_rq->nr_running++;
+ enqueue_task(p, target_rq->active);
+ target_rq->curr->need_resched = 1;
+ spin_unlock(&target_rq->lock);
+
+ /*
+ * The easiest solution is to context switch into
+ * the idle thread - which will pick the best task
+ * afterwards:
+ */
+ this_rq->nr_switches++;
+ this_rq->curr = this_rq->idle;
+ this_rq->idle->need_resched = 1;
+ context_switch(current, this_rq->idle);
+ barrier();
+ spin_unlock_irq(&this_rq()->lock);
+}
+
void scheduling_functions_end_here(void) { }
+void set_user_nice(task_t *p, long nice)
+{
+ unsigned long flags;
+ prio_array_t *array;
+ runqueue_t *rq;
+
+ if (p->__nice == nice)
+ return;
+ /*
+ * We have to be careful: if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ lock_task_rq(rq, p, flags);
+ if (rt_task(p)) {
+ p->__nice = nice;
+ goto out_unlock;
+ }
+ array = p->array;
+ if (array) {
+ dequeue_task(p, array);
+ }
+ p->__nice = nice;
+ p->prio = NICE_TO_PRIO(nice);
+ if (array) {
+ enqueue_task(p, array);
+ /*
+ * If the task is runnable and lowered its priority,
+ * or increased its priority then reschedule its CPU:
+ */
+ if ((nice < p->__nice) ||
+ ((p->__nice < nice) && (p == rq->curr)))
+ resched_task(rq->curr);
+ }
+out_unlock:
+ unlock_task_rq(rq, p, flags);
+}
+
#ifndef __alpha__
/*
@@ -862,7 +1022,7 @@
asmlinkage long sys_nice(int increment)
{
- long newprio;
+ long nice;
/*
* Setpriority might change our priority at the same moment.
@@ -878,32 +1038,30 @@
if (increment > 40)
increment = 40;
- newprio = current->nice + increment;
- if (newprio < -20)
- newprio = -20;
- if (newprio > 19)
- newprio = 19;
- current->nice = newprio;
+ nice = current->__nice + increment;
+ if (nice < -20)
+ nice = -20;
+ if (nice > 19)
+ nice = 19;
+ set_user_nice(current, nice);
return 0;
}
#endif
-static inline struct task_struct *find_process_by_pid(pid_t pid)
+static inline task_t *find_process_by_pid(pid_t pid)
{
- struct task_struct *tsk = current;
-
- if (pid)
- tsk = find_task_by_pid(pid);
- return tsk;
+ return pid ? find_task_by_pid(pid) : current;
}
-static int setscheduler(pid_t pid, int policy,
- struct sched_param *param)
+static int setscheduler(pid_t pid, int policy, struct sched_param *param)
{
struct sched_param lp;
- struct task_struct *p;
+ prio_array_t *array;
+ unsigned long flags;
+ runqueue_t *rq;
int retval;
+ task_t *p;
retval = -EINVAL;
if (!param || pid < 0)
@@ -917,14 +1075,19 @@
* We play safe to avoid deadlocks.
*/
read_lock_irq(&tasklist_lock);
- spin_lock(&runqueue_lock);
p = find_process_by_pid(pid);
retval = -ESRCH;
if (!p)
- goto out_unlock;
-
+ goto out_unlock_tasklist;
+
+ /*
+ * To be able to change p->policy safely, the appropriate
+ * runqueue lock must be held.
+ */
+ lock_task_rq(rq,p,flags);
+
if (policy < 0)
policy = p->policy;
else {
@@ -945,30 +1108,36 @@
goto out_unlock;
retval = -EPERM;
- if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+ if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
!capable(CAP_SYS_NICE))
goto out_unlock;
if ((current->euid != p->euid) && (current->euid != p->uid) &&
!capable(CAP_SYS_NICE))
goto out_unlock;
+ array = p->array;
+ if (array)
+ deactivate_task(p, task_rq(p));
retval = 0;
p->policy = policy;
p->rt_priority = lp.sched_priority;
- if (task_on_runqueue(p))
- move_first_runqueue(p);
-
- current->need_resched = 1;
+ if (rt_task(p))
+ p->prio = 99-p->rt_priority;
+ else
+ p->prio = NICE_TO_PRIO(p->__nice);
+ if (array)
+ activate_task(p, task_rq(p));
out_unlock:
- spin_unlock(&runqueue_lock);
+ unlock_task_rq(rq,p,flags);
+out_unlock_tasklist:
read_unlock_irq(&tasklist_lock);
out_nounlock:
return retval;
}
-asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
struct sched_param *param)
{
return setscheduler(pid, policy, param);
@@ -981,7 +1150,7 @@
asmlinkage long sys_sched_getscheduler(pid_t pid)
{
- struct task_struct *p;
+ task_t *p;
int retval;
retval = -EINVAL;
@@ -992,7 +1161,7 @@
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (p)
- retval = p->policy & ~SCHED_YIELD;
+ retval = p->policy;
read_unlock(&tasklist_lock);
out_nounlock:
@@ -1001,7 +1170,7 @@
asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param)
{
- struct task_struct *p;
+ task_t *p;
struct sched_param lp;
int retval;
@@ -1032,42 +1201,28 @@
asmlinkage long sys_sched_yield(void)
{
+ runqueue_t *rq = this_rq();
+ prio_array_t *array;
+
/*
- * Trick. sched_yield() first counts the number of truly
- * 'pending' runnable processes, then returns if it's
- * only the current processes. (This test does not have
- * to be atomic.) In threaded applications this optimization
- * gets triggered quite often.
+ * Decrease the yielding task's priority by one, to avoid
+ * livelocks. This priority loss is temporary; it's recovered
+ * once the current timeslice expires.
+ *
+ * If priority is already MAX_PRIO-1 then we still
+ * round-robin the task within the run list.
*/
+ spin_lock_irq(&rq->lock);
+ array = current->array;
+ dequeue_task(current, array);
+ if (likely(!rt_task(current)))
+ if (current->prio < MAX_PRIO-1)
+ current->prio++;
+ enqueue_task(current, array);
+ spin_unlock_irq(&rq->lock);
- int nr_pending = nr_running;
-
-#if CONFIG_SMP
- int i;
-
- // Subtract non-idle processes running on other CPUs.
- for (i = 0; i < smp_num_cpus; i++) {
- int cpu = cpu_logical_map(i);
- if (aligned_data[cpu].schedule_data.curr != idle_task(cpu))
- nr_pending--;
- }
-#else
- // on UP this process is on the runqueue as well
- nr_pending--;
-#endif
- if (nr_pending) {
- /*
- * This process can only be rescheduled by us,
- * so this is safe without any locking.
- */
- if (current->policy == SCHED_OTHER)
- current->policy |= SCHED_YIELD;
- current->need_resched = 1;
+ schedule();
- spin_lock_irq(&runqueue_lock);
- move_last_runqueue(current);
- spin_unlock_irq(&runqueue_lock);
- }
return 0;
}
@@ -1105,7 +1260,7 @@
asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
{
struct timespec t;
- struct task_struct *p;
+ task_t *p;
int retval = -EINVAL;
if (pid < 0)
@@ -1115,8 +1270,8 @@
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (p)
- jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
- &t);
+ jiffies_to_timespec(p->policy & SCHED_FIFO ?
+ 0 : RT_PRIO_TO_TIMESLICE(p->prio), &t);
read_unlock(&tasklist_lock);
if (p)
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -1124,7 +1279,7 @@
return retval;
}
-static void show_task(struct task_struct * p)
+static void show_task(task_t * p)
{
unsigned long free = 0;
int state;
@@ -1172,7 +1327,7 @@
printk(" (NOTLB)\n");
{
- extern void show_trace_task(struct task_struct *tsk);
+ extern void show_trace_task(task_t *tsk);
show_trace_task(p);
}
}
@@ -1194,7 +1349,7 @@
void show_state(void)
{
- struct task_struct *p;
+ task_t *p;
#if (BITS_PER_LONG == 32)
printk("\n"
@@ -1217,121 +1372,97 @@
read_unlock(&tasklist_lock);
}
-/**
- * reparent_to_init() - Reparent the calling kernel thread to the init task.
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to init so that
- * it is correctly cleaned up on exit.
- *
- * The various task state such as scheduling policy and priority may have
- * been inherited fro a user process, so we reset them to sane values here.
- *
- * NOTE that reparent_to_init() gives the caller full capabilities.
- */
-void reparent_to_init(void)
-{
- struct task_struct *this_task = current;
-
- write_lock_irq(&tasklist_lock);
-
- /* Reparent to init */
- REMOVE_LINKS(this_task);
- this_task->p_pptr = child_reaper;
- this_task->p_opptr = child_reaper;
- SET_LINKS(this_task);
-
- /* Set the exit signal to SIGCHLD so we signal init on exit */
- this_task->exit_signal = SIGCHLD;
-
- /* We also take the runqueue_lock while altering task fields
- * which affect scheduling decisions */
- spin_lock(&runqueue_lock);
-
- this_task->ptrace = 0;
- this_task->nice = DEF_NICE;
- this_task->policy = SCHED_OTHER;
- /* cpus_allowed? */
- /* rt_priority? */
- /* signals? */
- this_task->cap_effective = CAP_INIT_EFF_SET;
- this_task->cap_inheritable = CAP_INIT_INH_SET;
- this_task->cap_permitted = CAP_FULL_SET;
- this_task->keep_capabilities = 0;
- memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim)));
- this_task->user = INIT_USER;
+extern unsigned long wait_init_idle;
- spin_unlock(&runqueue_lock);
- write_unlock_irq(&tasklist_lock);
+static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+{
+ if (rq1 == rq2)
+ spin_lock(&rq1->lock);
+ else {
+ if (rq_cpu(rq1) < rq_cpu(rq2)) {
+ spin_lock(&rq1->lock);
+ spin_lock(&rq2->lock);
+ } else {
+ spin_lock(&rq2->lock);
+ spin_lock(&rq1->lock);
+ }
+ }
}
-/*
- * Put all the gunge required to become a kernel thread without
- * attached user resources in one place where it belongs.
- */
-
-void daemonize(void)
+static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
{
- struct fs_struct *fs;
-
-
- /*
- * If we were started as result of loading a module, close all of the
- * user space pages. We don't need them, and if we didn't close them
- * they would be locked into memory.
- */
- exit_mm(current);
-
- current->session = 1;
- current->pgrp = 1;
- current->tty = NULL;
-
- /* Become as one with the init task */
-
- exit_fs(current); /* current->fs->count--; */
- fs = init_task.fs;
- current->fs = fs;
- atomic_inc(&fs->count);
- exit_files(current);
- current->files = init_task.files;
- atomic_inc(&current->files->count);
+ spin_unlock(&rq1->lock);
+ if (rq1 != rq2)
+ spin_unlock(&rq2->lock);
}
-extern unsigned long wait_init_idle;
-
void __init init_idle(void)
{
- struct schedule_data * sched_data;
- sched_data = &aligned_data[smp_processor_id()].schedule_data;
+ runqueue_t *this_rq = this_rq(), *rq = current->array->rq;
+ unsigned long flags;
- if (current != &init_task && task_on_runqueue(current)) {
- printk("UGH! (%d:%d) was on the runqueue, removing.\n",
- smp_processor_id(), current->pid);
- del_from_runqueue(current);
+ __save_flags(flags);
+ __cli();
+ double_rq_lock(this_rq, rq);
+
+ this_rq->curr = this_rq->idle = current;
+ deactivate_task(current, rq);
+ current->array = NULL;
+ current->prio = MAX_PRIO;
+ current->state = TASK_RUNNING;
+ clear_bit(smp_processor_id(), &wait_init_idle);
+ double_rq_unlock(this_rq, rq);
+ while (wait_init_idle) {
+ cpu_relax();
+ barrier();
}
- sched_data->curr = current;
- sched_data->last_schedule = get_cycles();
- clear_bit(current->processor, &wait_init_idle);
+ current->need_resched = 1;
+ __sti();
}
-extern void init_timervecs (void);
+extern void init_timervecs(void);
+extern void timer_bh(void);
+extern void tqueue_bh(void);
+extern void immediate_bh(void);
void __init sched_init(void)
{
+ runqueue_t *rq;
+ int i, j, k;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ runqueue_t *rq = cpu_rq(i);
+ prio_array_t *array;
+
+ rq->active = rq->arrays + 0;
+ rq->expired = rq->arrays + 1;
+ spin_lock_init(&rq->lock);
+
+ for (j = 0; j < 2; j++) {
+ array = rq->arrays + j;
+ array->rq = rq;
+ array->lock = &rq->lock;
+ for (k = 0; k < MAX_PRIO; k++) {
+ INIT_LIST_HEAD(array->queue + k);
+ __set_bit(k, array->bitmap);
+ }
+ // zero delimiter for bitsearch
+ __clear_bit(MAX_PRIO, array->bitmap);
+ }
+ }
/*
* We have to do a little magic to get the first
* process right in SMP mode.
*/
- int cpu = smp_processor_id();
- int nr;
+ rq = this_rq();
+ rq->curr = current;
+ rq->idle = NULL;
+ wake_up_process(current);
- init_task.processor = cpu;
-
- for(nr = 0; nr < PIDHASH_SZ; nr++)
- pidhash[nr] = NULL;
+ for (i = 0; i < PIDHASH_SZ; i++)
+ pidhash[i] = NULL;
init_timervecs();
-
init_bh(TIMER_BH, timer_bh);
init_bh(TQUEUE_BH, tqueue_bh);
init_bh(IMMEDIATE_BH, immediate_bh);
@@ -1340,5 +1471,5 @@
* The boot idle thread does lazy MMU switching as well:
*/
atomic_inc(&init_mm.mm_count);
- enter_lazy_tlb(&init_mm, current, cpu);
+ enter_lazy_tlb(&init_mm, current, smp_processor_id());
}
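
The schedule() rewrite above boils down to two per-CPU priority arrays plus a
bitmap of non-empty priority lists: picking the next task is a fixed-cost bit
search, and when the active array runs dry the two arrays are swapped. What
follows is a minimal standalone sketch of that idea, not the patch's code: the
toy_* names are invented, it keeps a single task per priority instead of a
list, and it uses the familiar set-bit-means-non-empty convention, whereas the
patch initializes all bits set and locates non-empty lists with
sched_find_first_zero_bit().

#include <stdio.h>
#include <string.h>

#define TOY_MAX_PRIO 8                   /* the patch uses MAX_PRIO priorities */

struct toy_array {
	unsigned int bitmap;             /* bit p set => queue[p] holds a task */
	int queue[TOY_MAX_PRIO];         /* one task id per priority, for brevity */
	int nr_active;
};

struct toy_rq {
	struct toy_array arrays[2];
	struct toy_array *active, *expired;
};

static void toy_enqueue(struct toy_array *a, int prio, int task)
{
	a->queue[prio] = task;
	a->bitmap |= 1u << prio;
	a->nr_active++;
}

static int toy_pick_next(struct toy_rq *rq)
{
	struct toy_array *a = rq->active;
	int prio;

	if (!a->nr_active) {
		/* active array exhausted: swap active and expired */
		rq->active = rq->expired;
		rq->expired = a;
		a = rq->active;
		if (!a->nr_active)
			return -1;       /* nothing runnable at all */
	}
	/* lowest set bit = highest-priority non-empty slot, found in O(1) */
	prio = __builtin_ctz(a->bitmap);
	a->bitmap &= ~(1u << prio);
	a->nr_active--;
	return a->queue[prio];
}

int main(void)
{
	struct toy_rq rq;

	memset(&rq, 0, sizeof(rq));
	rq.active = &rq.arrays[0];
	rq.expired = &rq.arrays[1];

	toy_enqueue(rq.active, 5, 101);  /* lower prio value runs first */
	toy_enqueue(rq.active, 2, 102);
	toy_enqueue(rq.expired, 1, 103); /* runs only after the array swap */

	printf("next: %d\n", toy_pick_next(&rq)); /* 102 */
	printf("next: %d\n", toy_pick_next(&rq)); /* 101 */
	printf("next: %d\n", toy_pick_next(&rq)); /* 103, after the swap */
	return 0;
}

Because the bit search and the array swap are both constant-time, the cost of
picking the next task no longer grows with the number of runnable processes,
which is the whole point of the rewrite.
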
diff -X dontdiff -Nur origlinux/kernel/signal.c mylinux/kernel/signal.c
--- origlinux/kernel/signal.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/signal.c Fri Jan 11 14:46:44 2002
@@ -478,12 +478,9 @@
* process of changing - but no harm is done by that
* other than doing an extra (lightweight) IPI interrupt.
*/
- spin_lock(&runqueue_lock);
- if (task_has_cpu(t) && t->processor != smp_processor_id())
- smp_send_reschedule(t->processor);
- spin_unlock(&runqueue_lock);
-#endif /* CONFIG_SMP */
-
+ if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
+ kick_if_running(t);
+#endif
if (t->state & TASK_INTERRUPTIBLE) {
wake_up_process(t);
return;
diff -X dontdiff -Nur origlinux/kernel/softirq.c mylinux/kernel/softirq.c
--- origlinux/kernel/softirq.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/softirq.c Fri Jan 11 14:46:44 2002
@@ -261,10 +261,9 @@
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
current->state = TASK_RUNNING;
- do {
- current->policy |= SCHED_YIELD;
- schedule();
- } while (test_bit(TASKLET_STATE_SCHED, &t->state));
+ do
+ sys_sched_yield();
+ while (test_bit(TASKLET_STATE_SCHED, &t->state));
}
tasklet_unlock_wait(t);
clear_bit(TASKLET_STATE_SCHED, &t->state);
@@ -365,13 +364,13 @@
int cpu = cpu_logical_map(bind_cpu);
daemonize();
- current->nice = 19;
+ set_user_nice(current, 19);
sigfillset(&current->blocked);
/* Migrate to the right CPU */
- current->cpus_allowed = 1UL << cpu;
- while (smp_processor_id() != cpu)
- schedule();
+ set_cpus_allowed(current, 1UL << cpu);
+ if (cpu() != cpu)
+ BUG();
sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
@@ -405,10 +404,8 @@
CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
printk("spawn_ksoftirqd() failed for cpu %d\n", cpu);
else {
- while (!ksoftirqd_task(cpu_logical_map(cpu))) {
- current->policy |= SCHED_YIELD;
- schedule();
- }
+ while (!ksoftirqd_task(cpu_logical_map(cpu)))
+ sys_sched_yield();
}
}
diff -X dontdiff -Nur origlinux/kernel/sys.c mylinux/kernel/sys.c
--- origlinux/kernel/sys.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/sys.c Fri Jan 11 14:46:44 2002
@@ -220,10 +220,10 @@
}
if (error == -ESRCH)
error = 0;
- if (niceval < p->nice && !capable(CAP_SYS_NICE))
+ if (niceval < p->__nice && !capable(CAP_SYS_NICE))
error = -EACCES;
else
- p->nice = niceval;
+ set_user_nice(p, niceval);
}
read_unlock(&tasklist_lock);
@@ -249,7 +249,7 @@
long niceval;
if (!proc_sel(p, which, who))
continue;
- niceval = 20 - p->nice;
+ niceval = 20 - p->__nice;
if (niceval > retval)
retval = niceval;
}
diff -X dontdiff -Nur origlinux/kernel/timer.c mylinux/kernel/timer.c
--- origlinux/kernel/timer.c Fri Jan 11 14:41:43 2002
+++ mylinux/kernel/timer.c Fri Jan 11 16:54:43 2002
@@ -25,6 +25,8 @@
#include <asm/uaccess.h>
+struct kernel_stat kstat;
+
/*
* Timekeeping variables
*/
@@ -583,17 +585,16 @@
update_one_process(p, user_tick, system, cpu);
if (p->pid) {
- if (--p->counter <= 0) {
- p->counter = 0;
- p->need_resched = 1;
- }
- if (p->nice > 0)
+ if (p->__nice > 0)
kstat.per_cpu_nice[cpu] += user_tick;
else
kstat.per_cpu_user[cpu] += user_tick;
kstat.per_cpu_system[cpu] += system;
- } else if (really_local_bh_count() || really_local_irq_count() > 1)
- kstat.per_cpu_system[cpu] += system;
+ } else {
+ if (bh_count(cpu) || irq_count(cpu) > 1)
+ kstat.per_cpu_system[cpu] += system;
+ }
+ scheduler_tick(p);
}
/*
@@ -795,6 +796,89 @@
}
#endif
+
+static void process_timeout(unsigned long __data)
+{
+ wake_up_process((task_t *)__data);
+}
+
+/**
+ * schedule_timeout - sleep until timeout
+ * @timeout: timeout value in jiffies
+ *
+ * Make the current task sleep until @timeout jiffies have
+ * elapsed. The routine will return immediately unless
+ * the current task state has been set (see set_current_state()).
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * in jiffies will be returned, or 0 if the timer expired in time
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ *
+ * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
+ * the CPU away without a bound on the timeout. In this case the return
+ * value will be %MAX_SCHEDULE_TIMEOUT.
+ *
+ * In all cases the return value is guaranteed to be non-negative.
+ */
+signed long schedule_timeout(signed long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ switch (timeout)
+ {
+ case MAX_SCHEDULE_TIMEOUT:
+ /*
+ * These two special cases are useful to be comfortable
+ * in the caller. Nothing more. We could take
+ * MAX_SCHEDULE_TIMEOUT from one of the negative value
+ * but I' d like to return a valid offset (>=0) to allow
+ * the caller to do everything it want with the retval.
+ */
+ schedule();
+ goto out;
+ default:
+ /*
+ * Another bit of PARANOID. Note that the retval will be
+ * 0 since no piece of kernel is supposed to do a check
+ * for a negative retval of schedule_timeout() (since it
+ * should never happens anyway). You just have the printk()
+ * that will tell you if something is gone wrong and where.
+ */
+ if (timeout < 0)
+ {
+ printk(KERN_ERR "schedule_timeout: wrong timeout "
+ "value %lx from %p\n", timeout,
+ __builtin_return_address(0));
+ current->state = TASK_RUNNING;
+ goto out;
+ }
+ }
+
+ expire = timeout + jiffies;
+
+ init_timer(&timer);
+ timer.expires = expire;
+ timer.data = (unsigned long) current;
+ timer.function = process_timeout;
+
+ add_timer(&timer);
+ schedule();
+ del_timer_sync(&timer);
+
+ timeout = expire - jiffies;
+
+ out:
+ return timeout < 0 ? 0 : timeout;
+}
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
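
The timer.c change above routes every timer tick into scheduler_tick(), which
charges the tick against the running task's time_slice and, once the slice is
used up, requeues the task on the expired array unless the interactivity
estimator says otherwise. Below is a rough standalone sketch of that per-tick
accounting, leaving out the SCHED_FIFO and RT special cases; the toy_* names
and the 5-tick slice are invented for illustration, the patch derives the
slice from the task's priority.

#include <stdio.h>

#define TOY_TIMESLICE 5                  /* ticks; the patch derives this from p->prio */

struct toy_task {
	int time_slice;
	int need_resched;
	int interactive;                 /* stand-in for task_interactive(p, now) */
};

/* returns 1 if the task should be requeued on the expired array */
static int toy_scheduler_tick(struct toy_task *p)
{
	if (--p->time_slice > 0)
		return 0;                /* still inside its timeslice */

	p->need_resched = 1;             /* ask for a reschedule soon */
	p->time_slice = TOY_TIMESLICE;   /* hand out a fresh slice */
	return p->interactive ? 0 : 1;   /* CPU hogs go to the expired array */
}

int main(void)
{
	struct toy_task hog = { TOY_TIMESLICE, 0, 0 };
	int tick;

	for (tick = 1; tick <= 6; tick++) {
		int expired = toy_scheduler_tick(&hog);
		printf("tick %d: expired=%d need_resched=%d\n",
		       tick, expired, hog.need_resched);
	}
	return 0;
}

Running it shows the slice expiring on the fifth tick; a task still rated
interactive would be put back on the active array instead of the expired one.
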
diff -X dontdiff -Nur origlinux/mm/highmem.c mylinux/mm/highmem.c
--- origlinux/mm/highmem.c Fri Jan 11 14:41:44 2002
+++ mylinux/mm/highmem.c Fri Jan 11 14:46:44 2002
@@ -354,9 +354,7 @@
/* we need to wait I/O completion */
run_task_queue(&tq_disk);
- current->policy |= SCHED_YIELD;
- __set_current_state(TASK_RUNNING);
- schedule();
+ yield();
goto repeat_alloc;
}
@@ -392,9 +390,7 @@
/* we need to wait I/O completion */
run_task_queue(&tq_disk);
- current->policy |= SCHED_YIELD;
- __set_current_state(TASK_RUNNING);
- schedule();
+ yield();
goto repeat_alloc;
}
diff -X dontdiff -Nur origlinux/mm/oom_kill.c mylinux/mm/oom_kill.c
--- origlinux/mm/oom_kill.c Fri Jan 11 14:41:44 2002
+++ mylinux/mm/oom_kill.c Fri Jan 11 14:46:44 2002
@@ -82,7 +82,7 @@
* Niced processes are most likely less important, so double
* their badness points.
*/
- if (p->nice > 0)
+ if (p->__nice > 0)
points *= 2;
/*
@@ -149,7 +149,7 @@
* all the memory it needs. That way it should be able to
* exit() and clear out its resources quickly...
*/
- p->counter = 5 * HZ;
+ p->time_slice = 2 * MAX_TIMESLICE;
p->flags |= PF_MEMALLOC | PF_MEMDIE;
/* This process has hardware access, be more careful. */
@@ -188,8 +188,7 @@
* killing itself before someone else gets the chance to ask
* for more memory.
*/
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
return;
}
diff -X dontdiff -Nur origlinux/mm/page_alloc.c mylinux/mm/page_alloc.c
--- origlinux/mm/page_alloc.c Fri Jan 11 14:41:44 2002
+++ mylinux/mm/page_alloc.c Fri Jan 11 14:46:44 2002
@@ -394,9 +394,7 @@
return NULL;
/* Yield for kswapd, and try again */
- current->policy |= SCHED_YIELD;
- __set_current_state(TASK_RUNNING);
- schedule();
+ yield();
goto rebalance;
}
diff -X dontdiff -Nur origlinux/net/ipv4/tcp_output.c mylinux/net/ipv4/tcp_output.c
--- origlinux/net/ipv4/tcp_output.c Fri Jan 11 14:41:47 2002
+++ mylinux/net/ipv4/tcp_output.c Fri Jan 11 14:46:44 2002
@@ -1009,8 +1009,7 @@
skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
if (skb)
break;
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
}
/* Reserve space for headers and prepare control bits. */
diff -X dontdiff -Nur origlinux/net/sched/sch_generic.c mylinux/net/sched/sch_generic.c
--- origlinux/net/sched/sch_generic.c Fri Jan 11 14:41:50 2002
+++ mylinux/net/sched/sch_generic.c Fri Jan 11 14:46:44 2002
@@ -475,10 +475,8 @@
dev_watchdog_down(dev);
- while (test_bit(__LINK_STATE_SCHED, &dev->state)) {
- current->policy |= SCHED_YIELD;
- schedule();
- }
+ while (test_bit(__LINK_STATE_SCHED, &dev->state))
+ yield();
spin_unlock_wait(&dev->xmit_lock);
}
diff -X dontdiff -Nur origlinux/net/socket.c mylinux/net/socket.c
--- origlinux/net/socket.c Fri Jan 11 14:41:50 2002
+++ mylinux/net/socket.c Fri Jan 11 14:46:44 2002
@@ -148,8 +148,7 @@
while (atomic_read(&net_family_lockct) != 0) {
spin_unlock(&net_family_lock);
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
spin_lock(&net_family_lock);
}
diff -X dontdiff -Nur origlinux/net/sunrpc/sched.c mylinux/net/sunrpc/sched.c
--- origlinux/net/sunrpc/sched.c Fri Jan 11 14:41:50 2002
+++ mylinux/net/sunrpc/sched.c Fri Jan 11 14:46:44 2002
@@ -772,8 +772,7 @@
}
if (flags & RPC_TASK_ASYNC)
return NULL;
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
} while (!signalled());
return NULL;
@@ -1114,8 +1113,7 @@
__rpc_schedule();
if (all_tasks) {
dprintk("rpciod_killall: waiting for tasks to exit\n");
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
}
}
@@ -1185,8 +1183,7 @@
* wait briefly before checking the process id.
*/
current->sigpending = 0;
- current->policy |= SCHED_YIELD;
- schedule();
+ yield();
/*
* Display a message if we're going to wait longer.
*/
diff -X dontdiff -Nur origlinux/net/unix/af_unix.c mylinux/net/unix/af_unix.c
--- origlinux/net/unix/af_unix.c Fri Jan 11 14:41:50 2002
+++ mylinux/net/unix/af_unix.c Fri Jan 11 14:46:44 2002
@@ -564,10 +564,8 @@
addr->hash)) {
write_unlock(&unix_table_lock);
/* Sanity yield. It is unusual case, but yet... */
- if (!(ordernum&0xFF)) {
- current->policy |= SCHED_YIELD;
- schedule();
- }
+ if (!(ordernum&0xFF))
+ yield();
goto retry;
}
addr->hash ^= sk->type;
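
To make the interactivity arithmetic from the sched.c changes concrete: each
task keeps a 4-entry history of ticks spent sleeping, one entry per second,
get_run_avg() turns that into a run average between 0 and HZ over a window of
three full seconds plus the partial current one, and activate_task() scales
the result into a priority penalty of at most MAX_USER_PRIO/3. Here is a small
standalone sketch of just that arithmetic; the toy_* names and constants are
invented stand-ins, while the formulas mirror get_run_avg() and the penalty
computation in activate_task().

#include <stdio.h>

#define TOY_HZ        100   /* ticks per second, stand-in for HHZ (== HZ) */
#define TOY_HIST_SIZE 4     /* stand-in for SLEEP_HIST_SIZE */
#define TOY_USER_PRIO 40    /* stand-in for MAX_USER_PRIO */

/* run average in ticks: 0 (always asleep) .. TOY_HZ (always running) */
static unsigned int toy_run_avg(const unsigned int hist[TOY_HIST_SIZE],
				unsigned long now)
{
	unsigned int slept = hist[0] + hist[1] + hist[2] + hist[3];
	unsigned int window = (TOY_HIST_SIZE - 1) * TOY_HZ + now % TOY_HZ;

	return TOY_HZ - slept * TOY_HZ / window;
}

int main(void)
{
	/* the task slept 70 ticks in each of the last three full seconds,
	   and 30 ticks so far in the half-finished current second */
	unsigned int hist[TOY_HIST_SIZE] = { 70, 70, 70, 30 };
	unsigned long now = 3 * TOY_HZ + 50;    /* 50 ticks into the 4th second */

	unsigned int avg = toy_run_avg(hist, now);
	unsigned int penalty = TOY_USER_PRIO * avg / (3 * TOY_HZ);

	printf("run average: %u/%u ticks, priority penalty: +%u\n",
	       avg, TOY_HZ, penalty);
	return 0;
}

With these numbers the task slept roughly two-thirds of the recorded window,
so it ends up with only a small penalty (+4 out of a possible 13 here).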