Re: migration thread and active_load_balance
From: Dan Upton @ 2008-04-21 19:38 UTC
To: Dmitry Adamushko; +Cc: linux-kernel
[-- Attachment #1: Type: text/plain, Size: 888 bytes --]
On Mon, Apr 21, 2008 at 7:03 AM, Dmitry Adamushko
<dmitry.adamushko@gmail.com> wrote:
> On 21/04/2008, Dan Upton <upton.dan.linux@gmail.com> wrote:
> > [ ... ]
>
> >
> > kernel BUG at kernel/sched.c:2103
>
> and what's this line in your patched sched.c?
>
> is it -- BUG_ON(!irqs_disabled()); ?
>
> anything in your unposted code (e.g. find_coolest_cpu()) that might
> re-enable the interrupts before __migrate_task() is called?
>
> If you post your modifications as a patch
> (Documentation/applying-patches.txt) that contains _all_ relevant
> modifications, it'd be easier to guess what's wrong.
Yes, that's the line. I don't recall ever re-enabling interrupts, but
maybe somebody will spot what I'm missing. I've attached a full diff;
it also includes changes in a few other places for other scheduling
work, all of which have tested fine.
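One thing that does stand out, looking at the diff again: when
sched_thermal is 3, migration_thread() calls find_coolest_cpu() after
taking spin_lock_irq(&rq->lock), and that path ends in rdmsr_on_cpu(),
which (at least for a remote core) goes through
smp_call_function_single() -- the Note 2 comment in sched_temp.class
below has the details. I haven't chased it all the way through, but an
MSR read with interrupts off and the runqueue lock held looks like the
most plausible place for the irq state to get flipped. Here's the sort
of wrapper I'd drop in to catch the offending caller; it's a
hypothetical debugging aid, not part of the attached diff
(get_temperature() is the function from sched_temp.c):

/* hypothetical debugging wrapper, not in the attached diff: complain
 * with a stack trace whenever the temperature is read with interrupts
 * disabled, since rdmsr_on_cpu() -> smp_call_function_single() is not
 * safe to call in that context */
static int get_temperature_checked(int cpu)
{
	WARN_ON(irqs_disabled());
	return get_temperature(cpu);
}

Substituting that at the find_coolest_cpu() call sites should at least
show which path is the problem.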
-dan
[-- Attachment #2: thermdiff --]
[-- Type: application/octet-stream, Size: 34719 bytes --]
diff -Nur linux-2.6.24.3/fs/proc/array.c linux-2.6.24.3-therm/fs/proc/array.c
--- linux-2.6.24.3/fs/proc/array.c 2008-02-25 19:20:20.000000000 -0500
+++ linux-2.6.24.3-therm/fs/proc/array.c 2008-03-25 15:08:34.000000000 -0400
@@ -496,7 +496,7 @@
res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld new %d\n",
task_pid_nr_ns(task, ns),
tcomm,
state,
@@ -543,7 +543,8 @@
task->policy,
(unsigned long long)delayacct_blkio_ticks(task),
cputime_to_clock_t(gtime),
- cputime_to_clock_t(cgtime));
+ cputime_to_clock_t(cgtime),
+ task->lasttemp);
if (mm)
mmput(mm);
return res;
diff -Nur linux-2.6.24.3/include/linux/sched.h linux-2.6.24.3-therm/include/linux/sched.h
--- linux-2.6.24.3/include/linux/sched.h 2008-02-25 19:20:20.000000000 -0500
+++ linux-2.6.24.3-therm/include/linux/sched.h 2008-02-26 15:51:04.000000000 -0500
@@ -1178,6 +1178,8 @@
int make_it_fail;
#endif
struct prop_local_single dirties;
+
+ int lasttemp; /* dsu9w - last temperature of this process */
};
/*
diff -Nur linux-2.6.24.3/init/main.c linux-2.6.24.3-therm/init/main.c
--- linux-2.6.24.3/init/main.c 2008-02-25 19:20:20.000000000 -0500
+++ linux-2.6.24.3-therm/init/main.c 2008-03-24 13:47:18.000000000 -0400
@@ -77,6 +77,9 @@
#warning gcc-4.1.0 is known to miscompile the kernel. A different compiler version is recommended.
#endif
+//extern int sched_thermal;
+extern void init_sysfs_temp(void);
+
static int kernel_init(void *);
extern void init_IRQ(void);
@@ -449,6 +452,9 @@
schedule();
preempt_disable();
+ //sched_thermal = 1; // turn on thermal scheduling
+ init_sysfs_temp();
+
/* Call into cpu_idle with preempt disabled */
cpu_idle();
}
@@ -646,6 +652,7 @@
/* Do the rest non-__init'ed, we're now alive */
rest_init();
+
}
static int __initdata initcall_debug;
diff -Nur linux-2.6.24.3/kernel/printk.c linux-2.6.24.3-therm/kernel/printk.c
--- linux-2.6.24.3/kernel/printk.c 2008-02-25 19:20:20.000000000 -0500
+++ linux-2.6.24.3-therm/kernel/printk.c 2008-04-07 16:40:21.000000000 -0400
@@ -978,7 +978,7 @@
console_locked = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
- if (wake_klogd)
+ if (!irqs_disabled() && wake_klogd)
wake_up_klogd();
}
EXPORT_SYMBOL(release_console_sem);
diff -Nur linux-2.6.24.3/kernel/sched.c linux-2.6.24.3-therm/kernel/sched.c
--- linux-2.6.24.3/kernel/sched.c 2008-02-25 19:20:20.000000000 -0500
+++ linux-2.6.24.3-therm/kernel/sched.c 2008-04-21 12:01:37.000000000 -0400
@@ -67,6 +67,11 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+/* thermal scheduling mode: 0 = off, 1 = put new jobs on the coolest
+   core, 2 = thermal-aware balancing when a core idles, 3 = push from
+   the migration thread (see sched_temp.c / sched_temp.class) */
+int sched_thermal = 0;
+/* where we're coming from: set around the migration thread's thermal
+   push (the per-rq copy is what move_one_task_fair() checks) */
+int from_active_balance = 0;
+
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
@@ -259,6 +264,8 @@
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
#endif
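+	/* progress markers for the thermal balance iterator in
+	   sched_fair.c: the index and temperature where the previous
+	   pass over this runqueue left off */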
+ int last_therm_balance_index;
+ int last_therm_balance_temp;
};
/* Real-Time classes' related field in a runqueue: */
@@ -360,6 +367,7 @@
unsigned int bkl_count;
#endif
struct lock_class_key rq_lock_key;
+ int from_active_balance;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -875,6 +883,7 @@
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_temp.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
@@ -1397,6 +1406,7 @@
}
while (sd) {
+ //while(0){ // 'cause i don't actually care about this
cpumask_t span;
struct sched_group *group;
int new_cpu, weight;
@@ -1432,6 +1442,9 @@
}
/* while loop will break here if sd == NULL */
}
+ if(sched_thermal == 1){
+ cpu=find_coolest_cpu(current);
+ }
return cpu;
}
@@ -2354,7 +2367,11 @@
struct task_struct *p = iterator->start(iterator->arg);
int pinned = 0;
+ /* debugg! */
+ //if(busiest->from_active_balance) return;
+
while (p) {
+
if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
pull_task(busiest, p, this_rq, this_cpu);
/*
@@ -2383,6 +2400,9 @@
struct sched_domain *sd, enum cpu_idle_type idle)
{
const struct sched_class *class;
+
+ /* debugg! */
+ //if(busiest->from_active_balance) return 0;
for (class = sched_class_highest; class; class = class->next)
if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
@@ -3019,6 +3039,9 @@
struct sched_domain *sd;
struct rq *target_rq;
+ /* debugg! */
+ //if(busiest_rq->from_active_balance) return;
+
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
return;
@@ -3053,6 +3076,8 @@
else
schedstat_inc(sd, alb_failed);
}
+
+
spin_unlock(&target_rq->lock);
}
@@ -3622,6 +3647,8 @@
long *switch_count;
struct rq *rq;
int cpu;
+ int temperature;
+ static int dbgflag=1;
need_resched:
preempt_disable();
@@ -3636,6 +3663,13 @@
schedule_debug(prev);
+ /* try reading the temperature here before interrupts
+ are disabled */
+ if(sched_thermal == 2){
+ temperature = get_temperature(cpu);
+ prev->lasttemp = temperature;
+ }
+
/*
* Do the rq-clock update outside the rq lock:
*/
@@ -3654,8 +3688,18 @@
switch_count = &prev->nvcsw;
}
- if (unlikely(!rq->nr_running))
+ if (unlikely(!rq->nr_running)){
+ /*if(sched_thermal == 2){
+ if(dbgflag == 1){
+ printk(KERN_ALERT "temp_balancing\n");
+ dbgflag=0;
+ }
+ temp_balance(cpu, rq, temperature);
+ }else{
+ idle_balance(cpu, rq);
+ }*/
idle_balance(cpu, rq);
+ }
prev->sched_class->put_prev_task(rq, prev);
next = pick_next_task(rq, prev);
@@ -5151,6 +5195,9 @@
{
int cpu = (long)data;
struct rq *rq;
+ static int dbgflag=1;
+ int save_ab = 0, save_push=0;
+ int coolest = !cpu;
rq = cpu_rq(cpu);
BUG_ON(rq->migration_thread != current);
@@ -5159,7 +5206,7 @@
while (!kthread_should_stop()) {
struct migration_req *req;
struct list_head *head;
-
+
spin_lock_irq(&rq->lock);
if (cpu_is_offline(cpu)) {
@@ -5167,11 +5214,31 @@
goto wait_to_die;
}
+ // other stuff here too, like checking the cpu temp
+ if(sched_thermal == 3){
+
+ coolest = find_coolest_cpu(NULL);
+ if(coolest != cpu){ // if there's somewhere cooler to push stuff
+ rq->from_active_balance=1;
+ save_ab = rq->active_balance;
+ save_push = rq->push_cpu;
+ rq->push_cpu = coolest;
+ //rq->active_balance = 1;
+
+ //active_load_balance(rq, cpu);
+ rq->from_active_balance=0;
+ rq->active_balance = save_ab;
+ rq->push_cpu = save_push;
+ }
+ }
+ // is it possible this could undo any work we just did? or maybe we could
+ // cause a bug if this was going to be called because it was the busiest proc,
+ // and now it isn't?
if (rq->active_balance) {
active_load_balance(rq, cpu);
rq->active_balance = 0;
}
-
+
head = &rq->migration_queue;
if (list_empty(head)) {
@@ -5180,6 +5247,7 @@
set_current_state(TASK_INTERRUPTIBLE);
continue;
}
+
req = list_entry(head->next, struct migration_req, list);
list_del_init(head->next);
diff -Nur linux-2.6.24.3/kernel/sched_debug.c linux-2.6.24.3-therm/kernel/sched_debug.c
--- linux-2.6.24.3/kernel/sched_debug.c 2008-02-25 19:20:20.000000000 -0500
+++ linux-2.6.24.3-therm/kernel/sched_debug.c 2008-03-28 12:31:55.000000000 -0400
@@ -103,7 +103,7 @@
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
+ /*s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
spread, rq0_min_vruntime, spread0;
struct rq *rq = &per_cpu(runqueues, cpu);
struct sched_entity *last;
@@ -143,12 +143,33 @@
#endif
SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
cfs_rq->nr_spread_over);
+ */
+
+ /* get the last-running task for this queue */
+ struct rb_node* lasttask = first_fair(cfs_rq);
+ /* if it's null, this core is idle */
+ if(!lasttask){
+ SEQ_printf(m, "CPU %d: 0 idle\n", cpu);
+ return;
+ }else{
+ struct task_struct* task = rb_entry(lasttask, struct task_struct, se.run_node);
+ SEQ_printf(m, "CPU %d: %d %s", cpu, task->pid, task->comm);
+ /* now see what else was on this runqueue */
+ lasttask = rb_next(lasttask);
+ while(lasttask != NULL){
+ task = rb_entry(lasttask, struct task_struct, se.run_node);
+ SEQ_printf(m, " %d %s", task->pid, task->comm);
+ lasttask = rb_next(lasttask);
+ }
+ SEQ_printf(m, "\n");
+ }
+
}
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = &per_cpu(runqueues, cpu);
-
+/*
#ifdef CONFIG_X86
{
unsigned int freq = cpu_khz ? : 1;
@@ -188,17 +209,17 @@
P(cpu_load[4]);
#undef P
#undef PN
-
+*/
print_cfs_stats(m, cpu);
- print_rq(m, rq, cpu);
+ //print_rq(m, rq, cpu);
}
static int sched_debug_show(struct seq_file *m, void *v)
{
u64 now = ktime_to_ns(ktime_get());
int cpu;
-
+/*
SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
@@ -218,11 +239,11 @@
P(sysctl_sched_features);
#undef PN
#undef P
-
+*/
for_each_online_cpu(cpu)
print_cpu(m, cpu);
- SEQ_printf(m, "\n");
+ //SEQ_printf(m, "\n");
return 0;
}
diff -Nur linux-2.6.24.3/kernel/sched_debug.old linux-2.6.24.3-therm/kernel/sched_debug.old
--- linux-2.6.24.3/kernel/sched_debug.old 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.24.3-therm/kernel/sched_debug.old 2008-02-26 14:44:53.000000000 -0500
@@ -0,0 +1,396 @@
+/*
+ * kernel/time/sched_debug.c
+ *
+ * Print the CFS rbtree
+ *
+ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+/*
+ * This allows printing both to /proc/sched_debug and
+ * to the console
+ */
+#define SEQ_printf(m, x...) \
+ do { \
+ if (m) \
+ seq_printf(m, x); \
+ else \
+ printk(x); \
+ } while (0)
+
+/*
+ * Ease the printing of nsec fields:
+ */
+static long long nsec_high(unsigned long long nsec)
+{
+ if ((long long)nsec < 0) {
+ nsec = -nsec;
+ do_div(nsec, 1000000);
+ return -nsec;
+ }
+ do_div(nsec, 1000000);
+
+ return nsec;
+}
+
+static unsigned long nsec_low(unsigned long long nsec)
+{
+ if ((long long)nsec < 0)
+ nsec = -nsec;
+
+ return do_div(nsec, 1000000);
+}
+
+#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
+
+static void
+print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
+{
+ if (rq->curr == p)
+ SEQ_printf(m, "R");
+ else
+ SEQ_printf(m, " ");
+
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ p->comm, p->pid,
+ SPLIT_NS(p->se.vruntime),
+ (long long)(p->nvcsw + p->nivcsw),
+ p->prio);
+#ifdef CONFIG_SCHEDSTATS
+ SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n",
+ SPLIT_NS(p->se.vruntime),
+ SPLIT_NS(p->se.sum_exec_runtime),
+ SPLIT_NS(p->se.sum_sleep_runtime));
+#else
+ SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n",
+ 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+#endif
+}
+
+static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+
+ SEQ_printf(m,
+ "\nrunnable tasks:\n"
+ " task PID tree-key switches prio"
+ " exec-runtime sum-exec sum-sleep\n"
+ "------------------------------------------------------"
+ "----------------------------------------------------\n");
+
+ read_lock_irqsave(&tasklist_lock, flags);
+
+ do_each_thread(g, p) {
+ if (!p->se.on_rq || task_cpu(p) != rq_cpu)
+ continue;
+
+ print_task(m, rq, p);
+ } while_each_thread(g, p);
+
+ read_unlock_irqrestore(&tasklist_lock, flags);
+}
+
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+{
+ s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
+ spread, rq0_min_vruntime, spread0;
+ struct rq *rq = &per_cpu(runqueues, cpu);
+ struct sched_entity *last;
+ unsigned long flags;
+
+ SEQ_printf(m, "\ncfs_rq\n");
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
+ SPLIT_NS(cfs_rq->exec_clock));
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (cfs_rq->rb_leftmost)
+ MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+ last = __pick_last_entity(cfs_rq);
+ if (last)
+ max_vruntime = last->vruntime;
+ min_vruntime = rq->cfs.min_vruntime;
+ rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
+ SPLIT_NS(MIN_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
+ SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
+ SPLIT_NS(max_vruntime));
+ spread = max_vruntime - MIN_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
+ SPLIT_NS(spread));
+ spread0 = min_vruntime - rq0_min_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
+ SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
+#ifdef CONFIG_SCHEDSTATS
+ SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
+ rq->bkl_count);
+#endif
+ SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
+ cfs_rq->nr_spread_over);
+}
+
+static void print_cpu(struct seq_file *m, int cpu)
+{
+ struct rq *rq = &per_cpu(runqueues, cpu);
+
+#ifdef CONFIG_X86
+ {
+ unsigned int freq = cpu_khz ? : 1;
+
+ SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ cpu, freq / 1000, (freq % 1000));
+ }
+#else
+ SEQ_printf(m, "\ncpu#%d\n", cpu);
+#endif
+
+#define P(x) \
+ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
+#define PN(x) \
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
+
+ P(nr_running);
+ SEQ_printf(m, " .%-30s: %lu\n", "load",
+ rq->load.weight);
+ P(nr_switches);
+ P(nr_load_updates);
+ P(nr_uninterruptible);
+ SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
+ PN(next_balance);
+ P(curr->pid);
+ PN(clock);
+ PN(idle_clock);
+ PN(prev_clock_raw);
+ P(clock_warps);
+ P(clock_overflows);
+ P(clock_deep_idle_events);
+ PN(clock_max_delta);
+ P(cpu_load[0]);
+ P(cpu_load[1]);
+ P(cpu_load[2]);
+ P(cpu_load[3]);
+ P(cpu_load[4]);
+#undef P
+#undef PN
+
+ print_cfs_stats(m, cpu);
+
+ print_rq(m, rq, cpu);
+}
+
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+
+#define P(x) \
+ SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(sysctl_sched_latency);
+ PN(sysctl_sched_min_granularity);
+ PN(sysctl_sched_wakeup_granularity);
+ PN(sysctl_sched_batch_wakeup_granularity);
+ PN(sysctl_sched_child_runs_first);
+ P(sysctl_sched_features);
+#undef PN
+#undef P
+
+ for_each_online_cpu(cpu)
+ print_cpu(m, cpu);
+
+ SEQ_printf(m, "\n");
+
+ return 0;
+}
+
+static void sysrq_sched_debug_show(void)
+{
+ sched_debug_show(NULL, NULL);
+}
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_debug_show, NULL);
+}
+
+static const struct file_operations sched_debug_fops = {
+ .open = sched_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init init_sched_debug_procfs(void)
+{
+ struct proc_dir_entry *pe;
+
+ pe = create_proc_entry("sched_debug", 0644, NULL);
+ if (!pe)
+ return -ENOMEM;
+
+ pe->proc_fops = &sched_debug_fops;
+
+ return 0;
+}
+
+__initcall(init_sched_debug_procfs);
+
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{
+ unsigned long nr_switches;
+ unsigned long flags;
+ int num_threads = 1;
+
+ rcu_read_lock();
+ if (lock_task_sighand(p, &flags)) {
+ num_threads = atomic_read(&p->signal->count);
+ unlock_task_sighand(p, &flags);
+ }
+ rcu_read_unlock();
+
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
+ SEQ_printf(m,
+ "---------------------------------------------------------\n");
+#define __P(F) \
+ SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+ SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+ SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+ SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+ PN(se.exec_start);
+ PN(se.vruntime);
+ PN(se.sum_exec_runtime);
+
+ nr_switches = p->nvcsw + p->nivcsw;
+
+#ifdef CONFIG_SCHEDSTATS
+ PN(se.wait_start);
+ PN(se.sleep_start);
+ PN(se.block_start);
+ PN(se.sleep_max);
+ PN(se.block_max);
+ PN(se.exec_max);
+ PN(se.slice_max);
+ PN(se.wait_max);
+ P(sched_info.bkl_count);
+ P(se.nr_migrations);
+ P(se.nr_migrations_cold);
+ P(se.nr_failed_migrations_affine);
+ P(se.nr_failed_migrations_running);
+ P(se.nr_failed_migrations_hot);
+ P(se.nr_forced_migrations);
+ P(se.nr_forced2_migrations);
+ P(se.nr_wakeups);
+ P(se.nr_wakeups_sync);
+ P(se.nr_wakeups_migrate);
+ P(se.nr_wakeups_local);
+ P(se.nr_wakeups_remote);
+ P(se.nr_wakeups_affine);
+ P(se.nr_wakeups_affine_attempts);
+ P(se.nr_wakeups_passive);
+ P(se.nr_wakeups_idle);
+
+ {
+ u64 avg_atom, avg_per_cpu;
+
+ avg_atom = p->se.sum_exec_runtime;
+ if (nr_switches)
+ do_div(avg_atom, nr_switches);
+ else
+ avg_atom = -1LL;
+
+ avg_per_cpu = p->se.sum_exec_runtime;
+ if (p->se.nr_migrations) {
+ avg_per_cpu = div64_64(avg_per_cpu,
+ p->se.nr_migrations);
+ } else {
+ avg_per_cpu = -1LL;
+ }
+
+ __PN(avg_atom);
+ __PN(avg_per_cpu);
+ }
+#endif
+ __P(nr_switches);
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "nr_voluntary_switches", (long long)p->nvcsw);
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "nr_involuntary_switches", (long long)p->nivcsw);
+
+ P(se.load.weight);
+ P(policy);
+ P(prio);
+#undef PN
+#undef __PN
+#undef P
+#undef __P
+
+ {
+ u64 t0, t1;
+
+ t0 = sched_clock();
+ t1 = sched_clock();
+ SEQ_printf(m, "%-35s:%21Ld\n",
+ "clock-delta", (long long)(t1-t0));
+ }
+}
+
+void proc_sched_set_task(struct task_struct *p)
+{
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_max = 0;
+ p->se.sleep_max = 0;
+ p->se.sum_sleep_runtime = 0;
+ p->se.block_max = 0;
+ p->se.exec_max = 0;
+ p->se.slice_max = 0;
+ p->se.nr_migrations = 0;
+ p->se.nr_migrations_cold = 0;
+ p->se.nr_failed_migrations_affine = 0;
+ p->se.nr_failed_migrations_running = 0;
+ p->se.nr_failed_migrations_hot = 0;
+ p->se.nr_forced_migrations = 0;
+ p->se.nr_forced2_migrations = 0;
+ p->se.nr_wakeups = 0;
+ p->se.nr_wakeups_sync = 0;
+ p->se.nr_wakeups_migrate = 0;
+ p->se.nr_wakeups_local = 0;
+ p->se.nr_wakeups_remote = 0;
+ p->se.nr_wakeups_affine = 0;
+ p->se.nr_wakeups_affine_attempts = 0;
+ p->se.nr_wakeups_passive = 0;
+ p->se.nr_wakeups_idle = 0;
+ p->sched_info.bkl_count = 0;
+#endif
+ p->se.sum_exec_runtime = 0;
+ p->se.prev_sum_exec_runtime = 0;
+ p->nvcsw = 0;
+ p->nivcsw = 0;
+}
diff -Nur linux-2.6.24.3/kernel/sched_fair.c linux-2.6.24.3-therm/kernel/sched_fair.c
--- linux-2.6.24.3/kernel/sched_fair.c 2008-02-25 19:20:20.000000000 -0500
+++ linux-2.6.24.3-therm/kernel/sched_fair.c 2008-04-21 12:00:11.000000000 -0400
@@ -948,6 +948,101 @@
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
+
+static struct task_struct *__load_balance_therm_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr){
+ // local info
+ int indexctr = 0, retindex=0;
+	struct task_struct *p_tmp, *p_ret = NULL;	/* stays NULL if nothing cooler is found */
+ struct rb_node *iter=curr;
+ int lowest_temp = cfs_rq->last_therm_balance_temp;
+ int last_index = cfs_rq->last_therm_balance_index;
+
+ if(!curr)
+ return NULL;
+
+ // if last_therm_balance_index is -1, then this is being called from
+ // load_balance_start_therm, so we can just look through the whole
+ // runqueue to find something cooler without worrying about whether
+ // we've already tried it
+ if(last_index == -1){
+ while(iter){
+ p_tmp = rb_entry(iter, struct task_struct, se.run_node);
+ if(p_tmp->lasttemp < lowest_temp){
+ p_ret = p_tmp;
+ lowest_temp = p_tmp->lasttemp;
+ retindex = indexctr;
+ }
+ iter = rb_next(iter);
+ indexctr++;
+ }
+ }
+ // otherwise, we want to look for
+ // - a process of equal temperature further down the queue
+ // - the next-lowest temperature
+ else{
+		int second_lowest_temp = 100;	// sentinel: hotter than anything get_temperature() can return
+
+ // so first we look through for the next entry with the same temperature
+ // the comments on __load_balance_iterator suggest dequeues can happen despite
+ // the lock being held, but i'm assuming queueing can't happen, so we don't have
+ // to worry about new, lower-temperatured processes magically appearing. this
+ // assumption simplifies the search for next-coolest tasks.
+ while(iter){
+ p_tmp = rb_entry(iter, struct task_struct, se.run_node);
+ if( (p_tmp->lasttemp <= lowest_temp) && indexctr > last_index){
+ // we're just looking for the next one down the line,
+ // and it looks like we've found it, so we update cf_rq stats
+ // and return from here
+ cfs_rq->last_therm_balance_temp = p_tmp->lasttemp;
+ cfs_rq->last_therm_balance_index = indexctr;
+ return p_tmp;
+			}else if(p_tmp->lasttemp > lowest_temp && p_tmp->lasttemp < second_lowest_temp){
+ second_lowest_temp = p_tmp->lasttemp;
+ }
+ indexctr++;
+ iter = rb_next(iter);
+ }
+
+ // if we get here, it means we wandered off the end of the runqueue without finding
+ // anything else with the same lowest temperature. however, we know now what the
+ // second lowest temperature of the runqueue is (second_lowest_temp as calculated above),
+ // so we can just look for the first task with that temp.
+
+ // this makes use of the above assumption that tasks can only be dequeued but not enqueued
+ iter = curr; // reset the iterator
+ indexctr=0;
+ while(iter){
+ p_tmp = rb_entry(iter, struct task_struct, se.run_node);
+ if(p_tmp->lasttemp == second_lowest_temp){
+ // we found something, so let's update the stats and return it
+ cfs_rq->last_therm_balance_temp = p_tmp->lasttemp;
+ cfs_rq->last_therm_balance_index = indexctr;
+ return p_tmp;
+ }
+ indexctr++;
+ iter = rb_next(iter);
+ }
+ }
+
+	// update stats in case we come back here; p_ret is still NULL if
+	// the scan found nothing cooler, which ends the iteration
+	cfs_rq->last_therm_balance_temp = lowest_temp;
+	cfs_rq->last_therm_balance_index = retindex;
+	return p_ret;
+
+}
+
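+/* rq_iterator hooks: .start resets the scan state and returns the first
+   candidate, .next resumes from the recorded index; returning NULL ends
+   the walk (mirroring load_balance_start_fair/load_balance_next_fair) */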
+static struct task_struct *load_balance_start_therm(void *arg){
+ struct cfs_rq *cfs_rq = arg;
+	cfs_rq->last_therm_balance_index = -1;	/* -1 marks a fresh scan */
+	cfs_rq->last_therm_balance_temp = 100;	/* sentinel above any real reading */
+ return __load_balance_therm_iterator(cfs_rq, first_fair(cfs_rq));
+}
+
+static struct task_struct *load_balance_next_therm(void *arg){
+ struct cfs_rq *cfs_rq = arg;
+ return __load_balance_therm_iterator(cfs_rq, first_fair(cfs_rq));
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
@@ -967,6 +1062,7 @@
}
#endif
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -976,10 +1072,17 @@
struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator;
+
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
+ if(sched_thermal == 2){
+ // use our new iterators
+ cfs_rq_iterator.start = load_balance_start_therm;
+ cfs_rq_iterator.next = load_balance_next_therm;
+ }
+
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq *this_cfs_rq;
@@ -1024,9 +1127,24 @@
{
struct cfs_rq *busy_cfs_rq;
struct rq_iterator cfs_rq_iterator;
+ static int dbgflag = 1;
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
+
+ /* i think you can only get here from migration_thread
+ which sets from_active balance, and it only does that
+ when sched_thermal is 3, but just in case i'm missing
+ a call site i'll double-check sched_thermal's value
+ here as well. */
+ if(busiest->from_active_balance == 1 && sched_thermal == 3){
+ if(dbgflag==1){
+ //printk(KERN_ALERT "move_one_task_fair+therm\n");
+ dbgflag=0;
+ }
+ cfs_rq_iterator.start = load_balance_start_therm;
+ cfs_rq_iterator.next = load_balance_next_therm;
+ }
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
/*
@@ -1136,7 +1254,7 @@
#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
#endif
- for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
- print_cfs_rq(m, cpu, cfs_rq);
+ /*for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ print_cfs_rq(m, cpu, cfs_rq);*/
}
#endif
diff -Nur linux-2.6.24.3/kernel/sched_temp.c linux-2.6.24.3-therm/kernel/sched_temp.c
--- linux-2.6.24.3/kernel/sched_temp.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.24.3-therm/kernel/sched_temp.c 2008-04-21 11:57:53.000000000 -0400
@@ -0,0 +1,136 @@
+/*
+ * sched_temp.c
+ */
+
+#include <linux/kobject.h>
+
+#define TEMPBASE 85000
+
+
+/* prototypes and such that we need here that are defined but
+ not prototyped in sched.c or sched_fair.c */
+
+#define for_each_leaf_cfs_rq(rq, cfs) \
+ list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+static void pull_task(struct rq *src_rq, struct task_struct *p,
+ struct rq *this_rq, int this_cpu);
+
+static int can_migrate_task(struct task_struct *p, struct rq *rq,
+ int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned);
+
+/* this stuff at the top of the file is necessary for adding
+ the file under /sys/ that lets us turn on/off thermal
+ scheduling methods. Specifically, it sets the value of
+ int sched_thermal, which is defined and used variously in
+ sched.c. */
+
+struct kobject tempsched;
+
+static ssize_t sched_temp_show(struct sys_device *dev, char *page){
+ return sprintf(page, "%u\n", sched_thermal);
+}
+
+/* update this any time a new valid value for sched_thermal is added */
+static ssize_t sched_temp_store(struct sys_device *dev, const char *buf,
+ size_t count){
+ switch(buf[0]){
+ case '0':
+ sched_thermal=0;
+ break;
+ case '1':
+ sched_thermal=1;
+ break;
+ case '2':
+ sched_thermal=2;
+ break;
+ case '3':
+ sched_thermal=3;
+ break;
+ default:
+ return -EINVAL;
+ };
+ return count;
+}
+
+/* SYSDEV_ATTR appears to be a macro which, among other things, creates
+ attr_sched_thermal */
+static SYSDEV_ATTR(sched_thermal, 0644, sched_temp_show, sched_temp_store);
+
+void init_sysfs_temp(void){
+ kobject_set_name(&tempsched, "sched_temp");
+ kobject_register(&tempsched);
+ sysfs_create_file(&tempsched, &attr_sched_thermal.attr);
+}
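+
+/* with this in place the mode can presumably be flipped at runtime with
+   something like "echo 2 > /sys/sched_temp/sched_thermal" (the path is
+   my assumption from registering a parentless kobject, which should land
+   at the sysfs root) */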
+
+/* this method is called in various places to get the temperature
+ of a given core (or, practically, for an application).
+
+ this is not general purpose and is pretty much just lifted from
+ the coretemp driver.
+*/
+int get_temperature(int cpu){
+
+ u32 eax, edx;
+ int temperature;
+ /* the Core2 MSR_THERM_STATUS is reported as a delta
+ from a base temperature; in our case, 85000 mC */
+
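+	/* note: rdmsr_on_cpu() can end up in smp_call_function_single(),
+	   which is not safe to call with interrupts disabled; see the
+	   longer note in sched_temp.class */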
+ rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &eax, &edx);
+
+ if(eax & 0x80000000){
+ temperature = TEMPBASE - (((eax >> 16) & 0x7f) * 1000);
+ temperature = temperature / 1000;
+ return temperature;
+ }
+
+ // error code
+ return -1;
+}
+
+/* find_coolest_cpu looks for the current coolest cpu in the system
+ * this is usually called in the context of a given process, so we
+ * need to only poll 'legal' cores. if we want to know generally
+ * instead of in the context of a given process, set p=NULL.
+ *
+ * again, not terribly general purpose code (building our own cpumask)
+ */
+int find_coolest_cpu(struct task_struct *p){
+ int numcpus = 2; // hard-coded :x
+ cpumask_t tmp;
+ int coolest_temp = 100;
+ int coolest_cpu = 0;
+ int i, temp;
+ static int next_cpu = 0;
+	// build our own cpumask, because just using setall
+	// gives us way too much; clear it first since it's stack garbage
+	cpus_clear(tmp);
+	for(i=0; i<numcpus; i++){
+		cpu_set(i, tmp);
+	}
+
+ /* now create a mask that is the legal processors
+ for this task */
+ if(p){
+ cpus_and(tmp, tmp, p->cpus_allowed);
+ }
+
+ /* look for the coolest cpu of those allowed */
+ for_each_cpu_mask(i, tmp){
+ temp = get_temperature(i);
+ if(temp < coolest_temp){
+ coolest_temp = temp;
+ coolest_cpu = i;
+ }
+ }
+
+ /* if coolest_temp is -1, we got invalid data
+ somewhere, so we're just going to default
+ to round-robin placement. */
+ if(coolest_temp == -1){
+ coolest_cpu = next_cpu;
+ next_cpu++;
+ next_cpu = next_cpu % numcpus;
+ }
+ return coolest_cpu;
+}
diff -Nur linux-2.6.24.3/kernel/sched_temp.class linux-2.6.24.3-therm/kernel/sched_temp.class
--- linux-2.6.24.3/kernel/sched_temp.class 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.24.3-therm/kernel/sched_temp.class 2008-04-09 13:45:45.000000000 -0400
@@ -0,0 +1,180 @@
+/*
+ * sched_temp.c
+ *
+ * temperature-based scheduling implementation
+ * (trying to rework it based on a CFS scheduling class)
+ *
+ * the use of the RB tree seems to be specific to sched_fair
+ * so I may end up having to come up with a complete method for
+ * managing tasks... or maybe i really should just monkey around
+ * with stuff inside sched_fair.c.
+ *
+ *
+ * Values of sched_thermal determine when and what sort of
+ * scheduling decisions we might make. The current list:
+ * 1 - Put a new job (sched_fork() and sched_exec()) on
+ * the coolest core.
+ * 2 - When a core becomes idle, look for the coolest
+ * (or hottest) task we can find and steal it
+ */
+
+#include <linux/kobject.h>
+
+#define TEMPBASE 85000
+
+/* prototypes and such that we need here that are defined but
+ not prototyped in sched.c or sched_fair.c */
+
+#define for_each_leaf_cfs_rq(rq, cfs) \
+ list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+static void pull_task(struct rq *src_rq, struct task_struct *p,
+ struct rq *this_rq, int this_cpu);
+
+static int can_migrate_task(struct task_struct *p, struct rq *rq,
+ int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned);
+
+/* this stuff at the top of the file is necessary for adding
+ the file under /sys/ that lets us turn on/off thermal
+ scheduling methods. Specifically, it sets the value of
+ int sched_thermal, which is defined and used variously in
+ sched.c. */
+
+struct kobject tempsched;
+
+static ssize_t sched_temp_show(struct sys_device *dev, char *page){
+ return sprintf(page, "%u\n", sched_thermal);
+}
+
+static ssize_t sched_temp_store(struct sys_device *dev, const char *buf,
+ size_t count){
+ switch(buf[0]){
+ case '0':
+ sched_thermal=0;
+ break;
+ case '1':
+ sched_thermal=1;
+ break;
+ case '2':
+ sched_thermal=2;
+ break;
+ default:
+ return -EINVAL;
+ };
+ return count;
+}
+
+/* SYSDEV_ATTR appears to be a macro which, among other things, creates
+ attr_sched_thermal */
+static SYSDEV_ATTR(sched_thermal, 0644, sched_temp_show, sched_temp_store);
+
+void init_sysfs_temp(void){
+ kobject_set_name(&tempsched, "sched_temp");
+ kobject_register(&tempsched);
+ sysfs_create_file(&tempsched, &attr_sched_thermal.attr);
+}
+
+/* this method is called in various places to get the temperature
+ of a given core (or, practically, for an application).
+
+ Note 1: This is currently not at all general-purpose, and is in
+ fact pretty much just what you need to do read the
+ temperature on Core2-series processors. As such, the
+ code is largely lifted from the coretemp driver.
+
+ Note 2: This calls rdmsr_on_cpu, which eventually calls
+ smp_call_function_single, which can deadlock if
+ called with interrupts disabled. Thus, this
+ function, too, should only be called with interrupts
+ enabled. (I've added a WARN_ON here for that case.) */
+int get_temperature(int cpu){
+
+ u32 eax, edx;
+ int temperature;
+ /* the Core2 MSR_THERM_STATUS is reported as a delta
+ from a base temperature; in our case, 85000 mC */
+
+	WARN_ON(irqs_disabled());	/* the WARN_ON promised in Note 2 above */
+
+	rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &eax, &edx);
+
+ if(eax & 0x80000000){
+ temperature = TEMPBASE - (((eax >> 16) & 0x7f) * 1000);
+ temperature = temperature / 1000;
+ return temperature;
+ }
+
+ // error code
+ return -1;
+}
+
+/* find_coolest_cpu is used if sched_thermal = 1
+ *
+ * we have to pass in p to make sure we don't put the
+ * process on an 'illegal' core
+ */
+int find_coolest_cpu(struct task_struct *p){
+ int numcpus = 2; // hard-coded :x
+ cpumask_t tmp;
+ int coolest_temp = 100;
+ int coolest_cpu = 0;
+ int i, temp;
+ static int next_cpu = 0;
+
+	// build our own cpumask, because just using setall
+	// gives us way too much; clear it first since it's stack garbage
+	cpus_clear(tmp);
+	for(i=0; i<numcpus; i++){
+		cpu_set(i, tmp);
+	}
+
+ /* now create a mask that is the legal processors
+ for this task */
+ cpus_and(tmp, tmp, p->cpus_allowed);
+
+ /* look for the coolest cpu of those allowed */
+ for_each_cpu_mask(i, tmp){
+ temp = get_temperature(i);
+ if(temp < coolest_temp){
+ coolest_temp = temp;
+ coolest_cpu = i;
+ }
+ }
+
+ /* if coolest_temp is -1, we got invalid data
+ somewhere, so we're just going to default
+ to round-robin placement. */
+ if(coolest_temp == -1){
+ coolest_cpu = next_cpu;
+ next_cpu++;
+ next_cpu = next_cpu % numcpus;
+ }
+
+ return coolest_cpu;
+}
+
+/*
+ * struct representing a scheduling class
+ *
+ * I kind of wish there was better documentation
+ * on this.
+ */
+
+static const struct sched_class thermal_sched_class = {
+	/* .next chains the classes in priority order: sched.c walks
+	   "for (class = sched_class_highest; class; class = class->next)",
+	   so pointing it at fair_sched_class slots this one in above CFS */
+	.next = &fair_sched_class,
+	/* the hook implementations are still TODO; NULL placeholders for
+	   now so this skeleton at least parses */
+	.enqueue_task = NULL,
+	.dequeue_task = NULL,
+	.yield_task = NULL,
+	.check_preempt_curr = NULL,
+	.pick_next_task = NULL,
+	.put_prev_task = NULL,
+#ifdef CONFIG_SMP
+	.load_balance = NULL,
+	.move_one_task = NULL,
+#endif
+	.set_curr_task = NULL,
+	.task_tick = NULL,
+	.task_new = NULL,
+};
+