* [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-26 13:05 ` Ingo Molnar
@ 2005-11-29 2:48 ` Steven Rostedt
2005-11-29 3:02 ` Andrew Morton
2005-11-29 13:08 ` Pavel Machek
0 siblings, 2 replies; 26+ messages in thread
From: Steven Rostedt @ 2005-11-29 2:48 UTC (permalink / raw)
To: Ingo Molnar
Cc: acpi-devel, len.brown, Andrew Morton, Fernando Lopez-Lezcano,
Lee Revell, linux-kernel, Paul E. McKenney, K.R. Foley,
Thomas Gleixner, pluto, john cooper, Benedikt Spranger,
Daniel Walker, Tom Rini, George Anzinger
Here's an update on the switching of the idle function.
As Ingo has suggested, I removed this from being specific to the
poll_idle function.
Description:
This patch creates a directory in /sys/kernel called idle. This
directory contains two files: idle_ctrl and idle_methods. Reading
idle_ctrl will show the function that is currently being used for idle,
and idle_methods shows the available methods for the user to send write
into idle_ctrl to change which function to use for idle.
If the freeze attribute is set for an idle function (defined in the
idle_info struct explained below), then the user cannot add or remove
that function. This is used by the acpi since I wasn't sure how it
would handle having that function added or removed dynamically.
Functions that are frozen are shown in the idle_methods (and idle_ctrl
when used) with an asterisk (*) in front of the name.
I moved the code from arch/x86_64 to outside the arch directories into
kernel. The file is called idle.c. This implements functions to
register idle and unregister idle. It also has the functions to set
which idle to use. This file also creates the entries into the sysfs
directory. Currently this is only compiled for i386, x86_64, and
ia64.
Since I only have i386 and x86_64, I was only able to test the changes
in those two archs. I modified ia64, but haven't even tried to compile
it. If someone with that arch would like to do me the favor, please
do ;-)
I've created an idle_info structure that is used to register the idle
functions. This is now how acpi adds its functions.
struct idle_info {
struct list_head list; /* used to link in with all other registered */
const char *name; /* name to be used to add as well as to show */
idlefunc_t func; /* the function to be called for idle */
int freeze; /* set to disallow the user from adding or removing it */
int inuse; /* set when being used as the idle function */
};
This is a much more robust way of handling changes of the idle function
and can easily be adapted to other archs that would like to also
implement dynamic changes of the idle function. This would be nice to
add to sparc (hint hint).
Here's the patch:
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Index: linux-2.6.15-rc2-git5/arch/i386/kernel/process.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/i386/kernel/process.c 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/i386/kernel/process.c 2005-11-28 20:30:51.000000000 -0500
@@ -39,6 +39,7 @@
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/kprobes.h>
+#include <linux/idle.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -72,11 +73,6 @@
return ((unsigned long *)tsk->thread.esp)[3];
}
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
@@ -185,7 +181,7 @@
__get_cpu_var(cpu_idle_state) = 0;
rmb();
- idle = pm_idle;
+ idle = idle_func;
if (!idle)
idle = default_idle;
@@ -230,6 +226,8 @@
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
+static struct idle_info idle_mwait;
+
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
@@ -258,25 +256,62 @@
* Skip, if setup has overridden idle.
* One CPU supports mwait => All CPUs supports mwait
*/
- if (!pm_idle) {
+ memset(&idle_mwait, 0, sizeof(idle_mwait));
+ idle_mwait.name = "mwait";
+ idle_mwait.func = mwait_idle;
+ register_idle(&idle_mwait);
+
+ if (!idle_func) {
printk("using mwait in idle threads.\n");
- pm_idle = mwait_idle;
+ set_idle("mwait");
}
}
}
+static struct idle_info idle_default;
+static struct idle_info idle_poll;
+
+static int __init add_idle(void)
+{
+ static int set;
+
+ if (set)
+ return 0;
+ set = 1;
+
+ memset(&idle_poll, 0, sizeof(idle_poll));
+ idle_poll.name = "poll";
+ idle_poll.func = poll_idle;
+ register_idle(&idle_poll);
+
+ /*
+ * Allow the user to switch out of poll_idle even
+ * if it was a boot option.
+ */
+ memset(&idle_default, 0, sizeof(idle_default));
+ idle_default.name = "default";
+ idle_default.func = default_idle;
+ register_idle(&idle_default);
+
+ return 0;
+}
+
+arch_initcall(add_idle);
+
static int __init idle_setup (char *str)
{
+ add_idle();
if (!strncmp(str, "poll", 4)) {
printk("using polling idle threads.\n");
- pm_idle = poll_idle;
+ set_idle("poll");
+
#ifdef CONFIG_X86_SMP
if (smp_num_siblings > 1)
printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
#endif
} else if (!strncmp(str, "halt", 4)) {
printk("using halt in idle threads.\n");
- pm_idle = default_idle;
+ set_idle("default");
}
boot_option_idle_override = 1;
Index: linux-2.6.15-rc2-git5/arch/x86_64/kernel/process.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/x86_64/kernel/process.c 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/x86_64/kernel/process.c 2005-11-28 20:30:21.000000000 -0500
@@ -36,6 +36,8 @@
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/idle.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -60,10 +62,6 @@
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
@@ -195,7 +193,7 @@
__get_cpu_var(cpu_idle_state) = 0;
rmb();
- idle = pm_idle;
+ idle = idle_func;
if (!idle)
idle = default_idle;
if (cpu_is_offline(smp_processor_id()))
@@ -209,6 +207,8 @@
}
}
+struct idle_info idle_mwait;
+
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
@@ -233,25 +233,61 @@
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
+ memset(&idle_mwait, 0, sizeof(idle_mwait));
+ idle_mwait.name = "mwait";
+ idle_mwait.func = mwait_idle;
+ register_idle(&idle_mwait);
+
/*
* Skip, if setup has overridden idle.
* One CPU supports mwait => All CPUs supports mwait
*/
- if (!pm_idle) {
+ if (!idle_func) {
if (!printed) {
printk("using mwait in idle threads.\n");
printed = 1;
}
- pm_idle = mwait_idle;
+ set_idle("mwait");
}
}
}
+static struct idle_info idle_default;
+static struct idle_info idle_poll;
+
+static int __init add_idle(void)
+{
+ static int set;
+
+ if (set)
+ return 0;
+ set = 1;
+
+ memset(&idle_poll, 0, sizeof(idle_poll));
+ idle_poll.name = "poll";
+ idle_poll.func = poll_idle;
+ register_idle(&idle_poll);
+
+ /*
+ * Allow the user to switch out of poll_idle even
+ * if it was a boot option.
+ */
+ memset(&idle_default, 0, sizeof(idle_default));
+ idle_default.name = "default";
+ idle_default.func = default_idle;
+ register_idle(&idle_default);
+
+ return 0;
+}
+arch_initcall(add_idle);
+
static int __init idle_setup (char *str)
{
+ add_idle();
+
if (!strncmp(str, "poll", 4)) {
printk("using polling idle threads.\n");
- pm_idle = poll_idle;
+ set_idle("poll");
}
boot_option_idle_override = 1;
Index: linux-2.6.15-rc2-git5/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/drivers/acpi/processor_idle.c 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/drivers/acpi/processor_idle.c 2005-11-28 19:59:42.000000000 -0500
@@ -38,6 +38,8 @@
#include <linux/dmi.h>
#include <linux/moduleparam.h>
#include <linux/sched.h> /* need_resched() */
+#include <linux/spinlock.h>
+#include <linux/idle.h>
#include <asm/io.h>
#include <asm/uaccess.h>
@@ -56,6 +58,7 @@
#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
static void (*pm_idle_save) (void);
module_param(max_cstate, uint, 0644);
+#define PM_IDLE_NAME "pm_idle"
static unsigned int nocst = 0;
module_param(nocst, uint, 0000);
@@ -891,13 +894,13 @@
return_VALUE(-ENODEV);
/* Fall back to the default idle loop */
- pm_idle = pm_idle_save;
+ set_idle(NULL);
synchronize_sched(); /* Relies on interrupts forcing exit from idle. */
pr->flags.power = 0;
result = acpi_processor_get_power_info(pr);
if ((pr->flags.power == 1) && (pr->flags.power_setup_done))
- pm_idle = acpi_processor_idle;
+ set_idle(PM_IDLE_NAME);
return_VALUE(result);
}
@@ -983,6 +986,8 @@
.release = single_release,
};
+static struct idle_info pm_idle_info;
+
int acpi_processor_power_init(struct acpi_processor *pr,
struct acpi_device *device)
{
@@ -1032,8 +1037,17 @@
printk(")\n");
if (pr->id == 0) {
- pm_idle_save = pm_idle;
- pm_idle = acpi_processor_idle;
+ memset(&pm_idle_info, 0, sizeof(pm_idle_info));
+ pm_idle_info.name = PM_IDLE_NAME;
+ pm_idle_info.func = acpi_processor_idle;
+ pm_idle_info.freeze = 1;
+
+ register_idle(&pm_idle_info);
+ /*
+ * Just use the default idle
+ */
+ pm_idle_save = get_idle(NULL);
+ set_idle(PM_IDLE_NAME);
}
}
@@ -1068,7 +1082,29 @@
/* Unregister the idle handler when processor #0 is removed. */
if (pr->id == 0) {
- pm_idle = pm_idle_save;
+ int tries = 0;
+ int ret;
+ set_idle(NULL);
+ do {
+ if ((ret = unregister_idle(PM_IDLE_NAME)) == 0)
+ break;
+ /*
+ * for some reason the idle function is being used.
+ * Wait a little and then try again.
+ */
+ if (ret == -EINVAL) {
+ printk(KERN_WARNING
+ "ACPI idle function never registered?\n");
+ break;
+ }
+ yield();
+ } while (tries++ < 10);
+ if (tries > 10) {
+ printk(KERN_WARNING
+ "Unable to unresgister ACPI idle function\n");
+ /* don't unregister */
+ return_VALUE(ret);
+ }
/*
* We are about to unload the current idle thread pm callback
Index: linux-2.6.15-rc2-git5/include/linux/pm.h
===================================================================
--- linux-2.6.15-rc2-git5.orig/include/linux/pm.h 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/include/linux/pm.h 2005-11-28 19:59:42.000000000 -0500
@@ -25,6 +25,7 @@
#include <linux/config.h>
#include <linux/list.h>
+#include <linux/spinlock.h>
#include <asm/atomic.h>
/*
@@ -102,6 +103,8 @@
*/
extern void (*pm_idle)(void);
extern void (*pm_power_off)(void);
+extern spinlock_t pm_idle_switch_lock;
+extern int pm_idle_locked;
typedef int __bitwise suspend_state_t;
Index: linux-2.6.15-rc2-git5/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/x86_64/Kconfig 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/x86_64/Kconfig 2005-11-28 19:59:42.000000000 -0500
@@ -69,6 +69,10 @@
bool
default y
+config DYNAMIC_IDLE
+ bool
+ default y
+
source "init/Kconfig"
Index: linux-2.6.15-rc2-git5/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/x86_64/kernel/x8664_ksyms.c 2005-11-28 19:59:42.000000000 -0500
@@ -58,7 +58,6 @@
EXPORT_SYMBOL(disable_irq_nosync);
EXPORT_SYMBOL(probe_irq_mask);
EXPORT_SYMBOL(kernel_thread);
-EXPORT_SYMBOL(pm_idle);
EXPORT_SYMBOL(pm_power_off);
EXPORT_SYMBOL(get_cmos_time);
Index: linux-2.6.15-rc2-git5/include/linux/idle.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.15-rc2-git5/include/linux/idle.h 2005-11-28 21:36:00.000000000 -0500
@@ -0,0 +1,67 @@
+/*
+ * idle.h - Registering of the idle function (for supported archs)
+ *
+ * Copyright (C) 2005 Steven Rostedt <rostedt@goodmis.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_IDLE_H
+#define _LINUX_IDLE_H
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+typedef void (*idlefunc_t)(void);
+
+struct idle_info {
+ struct list_head list;
+ const char *name; /* Name visible to users */
+ idlefunc_t func; /* idle function to run */
+ int freeze; /* Only allow kernel to add or remove */
+ int inuse; /* set when being used */
+};
+
+/*
+ * Registering and unregistering functions that may be used
+ * instead of the default idle function. This only adds
+ * them to the list of functions to be used, it does not
+ * set the
+ */
+extern int register_idle(struct idle_info *info);
+extern int unregister_idle(const char *name);
+
+/*
+ * This sets the idle function to the registered function
+ * by name. Use NULL to set the idle function back to
+ * the default.
+ */
+extern int set_idle(const char *name);
+
+/*
+ * Return the function that is registered by name.
+ * Use NULL to get the default function.
+ * NULL may be returned (as that may be what the current
+ * idle function is set to, to use a default). NULL will
+ * also be returned if name is not registered.
+ */
+extern idlefunc_t get_idle(const char *name);
+
+extern idlefunc_t idle_func;
+
+#endif /* _LINUX_IDLE_H */
Index: linux-2.6.15-rc2-git5/kernel/Makefile
===================================================================
--- linux-2.6.15-rc2-git5.orig/kernel/Makefile 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/kernel/Makefile 2005-11-28 19:59:42.000000000 -0500
@@ -32,6 +32,7 @@
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_DYNAMIC_IDLE) += idle.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6.15-rc2-git5/kernel/idle.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.15-rc2-git5/kernel/idle.c 2005-11-28 20:29:57.000000000 -0500
@@ -0,0 +1,308 @@
+/*
+ * kernel/idle.c
+ *
+ * Setting up of the idle function to be dynamic.
+ *
+ * Copyright (C) 2005 Steven Rostedt
+ */
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/spinlock.h>
+#include <linux/idle.h>
+
+idlefunc_t idle_func;
+
+static void (*idle_default)(void);
+static LIST_HEAD(idle_elements);
+static DECLARE_MUTEX(idle_sem);
+static struct idle_info *curr_idle;
+
+#ifdef CONFIG_SYSFS
+int idle_sysfs_init;
+#endif
+
+extern void poll_idle (void);
+
+static struct idle_info *__find_idle_info(const char *name)
+{
+ struct list_head *curr;
+ struct idle_info *p;
+ /*
+ * A little inefficient, but this isn't called often.
+ */
+ list_for_each(curr, &idle_elements) {
+ p = list_entry(curr, struct idle_info, list);
+ if (!strcmp(name, p->name))
+ break;
+ }
+ if (curr == &idle_elements)
+ p = NULL;
+
+ return p;
+}
+
+int register_idle(struct idle_info *info)
+{
+ struct idle_info *p;
+ int ret = -EEXIST;
+
+ BUG_ON(!info->name);
+
+ down(&idle_sem);
+
+ p = __find_idle_info(info->name);
+ if (p)
+ goto out;
+ ret = 0;
+
+ list_add(&info->list, &idle_elements);
+
+out:
+ up(&idle_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_idle);
+
+int unregister_idle(const char *name)
+{
+ struct idle_info *p;
+ int ret = -EINVAL;
+
+ BUG_ON(!name);
+
+ down(&idle_sem);
+
+ p = __find_idle_info(name);
+ if (!p)
+ goto out;
+ if (p->inuse) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = 0;
+
+ list_del_init(&p->list);
+
+out:
+ up(&idle_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_idle);
+
+static int __set_idle(struct idle_info *info)
+{
+ if (curr_idle)
+ curr_idle->inuse--;
+ info->inuse++;
+ curr_idle = info;
+ return 0;
+}
+
+int set_idle(const char *name)
+{
+ struct idle_info *p;
+ int ret = 0;
+
+ down(&idle_sem);
+
+ if (!name) {
+ /* Set to the default function */
+ if (curr_idle) {
+ curr_idle->inuse--;
+ curr_idle = NULL;
+ }
+ idle_func = idle_default;
+ goto out;
+ }
+
+ ret = -EINVAL;
+ p = __find_idle_info(name);
+ if (!p)
+ goto out;
+
+ __set_idle(p);
+out:
+ up(&idle_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_idle);
+
+idlefunc_t get_idle(const char *name)
+{
+ struct idle_info *p;
+ idlefunc_t ret = idle_default;
+
+ down(&idle_sem);
+
+ if (!name)
+ goto out;
+
+ p = __find_idle_info(name);
+ if (!p)
+ goto out;
+
+ ret = p->func;
+out:
+ up(&idle_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_idle);
+
+#ifdef CONFIG_SYSFS
+#define KERNEL_ATTR_RW(_name) \
+static struct subsys_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static struct idlep_kobject
+{
+ struct kobject kobj;
+} idle_kobj;
+
+static ssize_t idle_ctrl_show(struct subsystem *subsys, char *page)
+{
+ ssize_t ret;
+ char *star = "";
+ const char *name = "default";
+
+ down(&idle_sem);
+ if (curr_idle) {
+ name = curr_idle->name;
+ if (curr_idle->freeze)
+ star = "*";
+ }
+ ret = sprintf(page, "%s%s\n", star, name);
+ up(&idle_sem);
+
+ return ret;
+}
+
+static ssize_t idle_ctrl_store(struct subsystem *subsys,
+ const char *buf, size_t len)
+{
+ struct list_head *curr;
+ struct idle_info *p;
+ ssize_t ret = -EBUSY;
+
+ down(&idle_sem);
+
+ if (curr_idle && curr_idle->freeze)
+ goto out;
+
+ list_for_each(curr, &idle_elements) {
+ int size;
+ p = list_entry(curr, struct idle_info, list);
+
+ size = strlen(p->name);
+ if (len <= size)
+ continue;
+ if (!strncmp(p->name, buf, size))
+ break;
+ }
+ if (curr == &idle_elements) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * This idle routine may have been registered to
+ * not allow users to add or remove this.
+ */
+ if (p->freeze)
+ goto out;
+
+ __set_idle(p);
+
+ ret = len;
+out:
+ up(&idle_sem);
+
+ return ret;
+}
+
+KERNEL_ATTR_RW(idle_ctrl);
+
+static ssize_t idle_methods_show(struct subsystem *subsys, char *page)
+{
+ struct list_head *curr;
+ struct idle_info *p;
+ ssize_t len = 0;
+
+ down(&idle_sem);
+ list_for_each(curr, &idle_elements) {
+ p = list_entry(curr, struct idle_info, list);
+ if (len + 3 + strlen(p->name) >= PAGE_SIZE) {
+ printk("idle functions overflowed sysfs??\n");
+ break;
+ }
+ len += sprintf(page+len, "%s%s%s",
+ len ? " " : "",
+ p->freeze ? "*" : "",
+ p->name);
+ }
+ if (len + 2 < PAGE_SIZE)
+ len += sprintf(page+len, "\n");
+
+ up(&idle_sem);
+ return len;
+}
+
+static ssize_t idle_methods_store(struct subsystem *subsys,
+ const char *buf, size_t len)
+{
+ /* do nothing */
+ return len;
+}
+
+KERNEL_ATTR_RW(idle_methods);
+
+static struct attribute * idle_attrs[] = {
+ &idle_ctrl_attr.attr,
+ &idle_methods_attr.attr,
+ NULL
+};
+
+static struct attribute_group idle_attr_group = {
+ .attrs = idle_attrs,
+};
+
+static int __init idle_setup_sysfs(void)
+{
+ int err;
+
+ memset(&idle_kobj, 0, sizeof(idle_kobj));
+ err = kobject_set_name(&idle_kobj.kobj, "%s", "idle");
+ if (err)
+ goto out;
+
+ kobj_set_kset_s(&idle_kobj, kernel_subsys);
+
+ idle_kobj.kobj.parent = &kernel_subsys.kset.kobj;
+ err = kobject_register(&idle_kobj.kobj);
+ if (err)
+ goto out;
+
+ err = sysfs_create_group(&idle_kobj.kobj,
+ &idle_attr_group);
+ if (err)
+ goto out;
+
+ return 0;
+out:
+ printk(KERN_INFO "Problem setting up sysfs idle_ctrl\n");
+ return 0;
+}
+#endif /* CONFIG_SYSFS */
+
+static int __init idle_setup(void)
+{
+ idle_default = idle_func;
+
+#ifdef CONFIG_SYSFS
+ idle_setup_sysfs();
+#endif
+ return 0;
+}
+
+late_initcall(idle_setup);
Index: linux-2.6.15-rc2-git5/arch/i386/Kconfig
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/i386/Kconfig 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/i386/Kconfig 2005-11-28 19:59:42.000000000 -0500
@@ -45,6 +45,10 @@
bool
default y
+config DYNAMIC_IDLE
+ bool
+ default y
+
source "init/Kconfig"
menu "Processor type and features"
Index: linux-2.6.15-rc2-git5/arch/i386/kernel/apm.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/i386/kernel/apm.c 2005-11-28 19:59:34.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/i386/kernel/apm.c 2005-11-28 19:59:42.000000000 -0500
@@ -225,6 +225,7 @@
#include <linux/smp_lock.h>
#include <linux/dmi.h>
#include <linux/suspend.h>
+#include <linux/idle.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -2220,6 +2221,9 @@
{ }
};
+static struct idle_info apm_idle;
+#define APM_IDLE_NAME "apm"
+
/*
* Just start the APM thread. We do NOT want to do APM BIOS
* calls from anything but the APM thread, if for no other reason
@@ -2373,8 +2377,14 @@
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
if (idle_threshold < 100) {
- original_pm_idle = pm_idle;
- pm_idle = apm_cpu_idle;
+ memset(&apm_idle, 0, sizeof(apm_idle));
+ apm_idle.name = APM_IDLE_NAME;
+ apm_idle.func = apm_cpu_idle;
+ apm_idle.freeze = 1;
+ register_idle(&apm_idle);
+
+ original_pm_idle = get_idle(NULL);
+ set_idle(APM_IDLE_NAME);
set_pm_idle = 1;
}
@@ -2386,7 +2396,26 @@
int error;
if (set_pm_idle) {
- pm_idle = original_pm_idle;
+ int tries = 0;
+ int ret;
+ set_idle(NULL);
+ do {
+ if ((ret = unregister_idle(APM_IDLE_NAME)) == 0)
+ break;
+ /*
+ * for some reason the idle function is being used.
+ * Wait a little and then try again.
+ */
+ if (ret == -EINVAL) {
+ printk(KERN_WARNING
+ "APM idle function never registered?\n");
+ break;
+ }
+ yield();
+ } while (tries++ < 10);
+ if (tries > 10)
+ printk(KERN_WARNING
+ "Unable to unresgister APM idle function\n");
/*
* We are about to unload the current idle thread pm callback
* (pm_idle), Wait for all processors to update cached/local
Index: linux-2.6.15-rc2-git5/arch/ia64/Kconfig
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/ia64/Kconfig 2005-11-22 12:13:22.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/ia64/Kconfig 2005-11-28 20:17:30.000000000 -0500
@@ -62,6 +62,10 @@
bool
default y
+config DYNAMIC_IDLE
+ bool
+ default y
+
choice
prompt "System type"
default IA64_GENERIC
Index: linux-2.6.15-rc2-git5/arch/ia64/kernel/acpi.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/ia64/kernel/acpi.c 2005-11-22 12:13:22.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/ia64/kernel/acpi.c 2005-11-28 20:23:41.000000000 -0500
@@ -60,8 +60,6 @@
#define PREFIX "ACPI: "
-void (*pm_idle) (void);
-EXPORT_SYMBOL(pm_idle);
void (*pm_power_off) (void);
EXPORT_SYMBOL(pm_power_off);
Index: linux-2.6.15-rc2-git5/arch/ia64/kernel/process.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/ia64/kernel/process.c 2005-11-25 10:58:53.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/ia64/kernel/process.c 2005-11-28 20:29:33.000000000 -0500
@@ -31,6 +31,7 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/kprobes.h>
+#include <linux/idle.h>
#include <asm/cpu.h>
#include <asm/delay.h>
@@ -289,7 +290,7 @@
if (mark_idle)
(*mark_idle)(1);
- idle = pm_idle;
+ idle = idle_func;
if (!idle)
idle = default_idle;
(*idle)();
Index: linux-2.6.15-rc2-git5/arch/ia64/kernel/setup.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/ia64/kernel/setup.c 2005-11-22 12:13:22.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/ia64/kernel/setup.c 2005-11-28 20:23:09.000000000 -0500
@@ -43,6 +43,7 @@
#include <linux/initrd.h>
#include <linux/platform.h>
#include <linux/pm.h>
+#include <linux/idle.h>
#include <asm/ia32.h>
#include <asm/machvec.h>
@@ -738,6 +739,8 @@
ia64_max_cacheline_size = max;
}
+struct idle_info idle_default;
+
/*
* cpu_init() initializes state that is per-CPU. This function acts
* as a 'CPU state barrier', nothing should get across.
@@ -861,7 +864,13 @@
/* size of physical stacked register partition plus 8 bytes: */
__get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
platform_cpu_init();
- pm_idle = default_idle;
+
+ memset(&idle_default, 0, sizeof(idle_default));
+ idle_default.name = "default";
+ idle_default.func = default_idle;
+ register_idle(&idle_default);
+
+ set_idle("default");
}
void
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 2:48 ` [RFC][PATCH] Runtime switching of the idle function [take 2] Steven Rostedt
@ 2005-11-29 3:02 ` Andrew Morton
2005-11-29 3:42 ` Steven Rostedt
2005-11-29 13:08 ` Pavel Machek
1 sibling, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2005-11-29 3:02 UTC (permalink / raw)
To: Steven Rostedt
Cc: mingo, acpi-devel, len.brown, nando, rlrevell, linux-kernel,
paulmck, kr, tglx, pluto, john.cooper, bene, dwalker, trini,
george
Steven Rostedt <rostedt@goodmis.org> wrote:
>
> This patch creates a directory in /sys/kernel called idle.
>
At no point do you appear to explain _why_ the kernel needs this feature?
> ...
> - pm_idle = pm_idle_save;
> + int tries = 0;
> + int ret;
> + set_idle(NULL);
> + do {
> + if ((ret = unregister_idle(PM_IDLE_NAME)) == 0)
> + break;
> + /*
> + * for some reason the idle function is being used.
> + * Wait a little and then try again.
> + */
> + if (ret == -EINVAL) {
> + printk(KERN_WARNING
> + "ACPI idle function never registered?\n");
> + break;
> + }
> + yield();
> + } while (tries++ < 10);
The use of yield() could be problematic - its semantics are rather
ill-defined. Maybe msleep(1) or something?
What's this loop here for anyway? Looks kludgy.
> + if (tries > 10) {
> + printk(KERN_WARNING
> + "Unable to unresgister ACPI idle function\n");
tpyo
> + memset(&idle_kobj, 0, sizeof(idle_kobj));
There are several memsets of statically allocated structures which are
already all-zero.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 3:02 ` Andrew Morton
@ 2005-11-29 3:42 ` Steven Rostedt
2005-11-29 4:01 ` Andrew Morton
2005-11-29 4:22 ` john stultz
0 siblings, 2 replies; 26+ messages in thread
From: Steven Rostedt @ 2005-11-29 3:42 UTC (permalink / raw)
To: Andrew Morton
Cc: mingo, acpi-devel, len.brown, nando, rlrevell, linux-kernel,
paulmck, kr, tglx, pluto, john.cooper, bene, dwalker, trini,
george
On Mon, 2005-11-28 at 19:02 -0800, Andrew Morton wrote:
> Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > This patch creates a directory in /sys/kernel called idle.
> >
>
> At no point do you appear to explain _why_ the kernel needs this feature?
Sorry about that. This originally came up when we had problems with the
AMD64 x2 in the -rt patch. It was noted that the TSCs would get very
far out of sync and cause problems. The way to solve this was to set
idle=poll. The original patch I sent was to allow the user to change to
idle=poll dynamically. This way they could switch to the poll_idle and
run there tests (requiring tsc not to drift) and then switch back to the
default idle to save on electricity.
Note: It's been stated that the tsc drift can cause problems with the
vanilla kernel too.
Ingo asked if I could make this more robust and not dependent on
idle_poll.
Maybe Ingo can give a better explanation?
>
> > ...
> > - pm_idle = pm_idle_save;
> > + int tries = 0;
> > + int ret;
> > + set_idle(NULL);
> > + do {
> > + if ((ret = unregister_idle(PM_IDLE_NAME)) == 0)
> > + break;
> > + /*
> > + * for some reason the idle function is being used.
> > + * Wait a little and then try again.
> > + */
> > + if (ret == -EINVAL) {
> > + printk(KERN_WARNING
> > + "ACPI idle function never registered?\n");
> > + break;
> > + }
> > + yield();
> > + } while (tries++ < 10);
>
> The use of yield() could be problematic - its semantics are rather
> ill-defined. Maybe msleep(1) or something?
>
> What's this loop here for anyway? Looks kludgy.
Oops! That was required by some other garbage that I had earlier. I
cleaned up the patch some more, and this is no longer required. (will
remove).
>
> > + if (tries > 10) {
> > + printk(KERN_WARNING
> > + "Unable to unresgister ACPI idle function\n");
>
> tpyo
Will fix.
>
> > + memset(&idle_kobj, 0, sizeof(idle_kobj));
>
> There are several memsets of statically allocated structures which are
> already all-zero.
>
:) I'm really paranoid! OK, I always like to do a memset even when it's
not needed. I'll purge them too.
Thanks for having a look.
-- Steve
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 3:42 ` Steven Rostedt
@ 2005-11-29 4:01 ` Andrew Morton
2005-11-29 6:44 ` Ingo Molnar
2005-11-29 4:22 ` john stultz
1 sibling, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2005-11-29 4:01 UTC (permalink / raw)
To: Steven Rostedt
Cc: mingo, acpi-devel, len.brown, nando, rlrevell, linux-kernel,
paulmck, kr, tglx, pluto, john.cooper, bene, dwalker, trini,
george
Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 2005-11-28 at 19:02 -0800, Andrew Morton wrote:
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > This patch creates a directory in /sys/kernel called idle.
> > >
> >
> > At no point do you appear to explain _why_ the kernel needs this feature?
>
> Sorry about that. This originally came up when we had problems with the
> AMD64 x2 in the -rt patch. It was noted that the TSCs would get very
> far out of sync and cause problems.
Unsynced TSCs are rare, but they happen. I guess even if we were to resync
them, these measurements would screw up.
> The way to solve this was to set
> idle=poll. The original patch I sent was to allow the user to change to
> idle=poll dynamically. This way they could switch to the poll_idle and
> run there tests (requiring tsc not to drift) and then switch back to the
> default idle to save on electricity.
Use gettimeofday()?
If it's just for some sort of instrumentation, run NR_CPUS instances of a
niced-down busyloop, pin each one to a different CPU? That way the idle
function doesn't get called at all..
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 3:42 ` Steven Rostedt
2005-11-29 4:01 ` Andrew Morton
@ 2005-11-29 4:22 ` john stultz
2005-11-29 14:22 ` Steven Rostedt
1 sibling, 1 reply; 26+ messages in thread
From: john stultz @ 2005-11-29 4:22 UTC (permalink / raw)
To: Steven Rostedt
Cc: Andrew Morton, mingo, acpi-devel, len.brown, nando, rlrevell,
linux-kernel, paulmck, kr, tglx, pluto, john.cooper, bene,
dwalker, trini, george
On Mon, 2005-11-28 at 22:42 -0500, Steven Rostedt wrote:
> On Mon, 2005-11-28 at 19:02 -0800, Andrew Morton wrote:
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > This patch creates a directory in /sys/kernel called idle.
> > >
> >
> > At no point do you appear to explain _why_ the kernel needs this feature?
>
> Sorry about that. This originally came up when we had problems with the
> AMD64 x2 in the -rt patch. It was noted that the TSCs would get very
> far out of sync and cause problems. The way to solve this was to set
> idle=poll. The original patch I sent was to allow the user to change to
> idle=poll dynamically. This way they could switch to the poll_idle and
> run there tests (requiring tsc not to drift) and then switch back to the
> default idle to save on electricity.
The problem with this is that this must be a one way transition. That
is, once the TSCs have become unsynchronized, there is no use going back
to using the polling idle unless you add some code to re-sync the TSCs
which would be ugly to do after the system has booted.
Using idle=poll (for anything other then debugging) is really a worst
case workaround for systems that do not have alternative clocksources
like ACPI PM or HPET.
Its an interesting bit of code, but I'm not really sure I understand its
usefulness.
thanks
-john
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 4:01 ` Andrew Morton
@ 2005-11-29 6:44 ` Ingo Molnar
2005-11-29 6:55 ` Nick Piggin
2005-11-29 18:05 ` Andi Kleen
0 siblings, 2 replies; 26+ messages in thread
From: Ingo Molnar @ 2005-11-29 6:44 UTC (permalink / raw)
To: Andrew Morton
Cc: Steven Rostedt, acpi-devel, len.brown, nando, rlrevell,
linux-kernel, paulmck, kr, tglx, pluto, john.cooper, bene,
dwalker, trini, george
* Andrew Morton <akpm@osdl.org> wrote:
> > The way to solve this was to set
> > idle=poll. The original patch I sent was to allow the user to change to
> > idle=poll dynamically. This way they could switch to the poll_idle and
> > run there tests (requiring tsc not to drift) and then switch back to the
> > default idle to save on electricity.
>
> Use gettimeofday()?
>
> If it's just for some sort of instrumentation, run NR_CPUS instances
> of a niced-down busyloop, pin each one to a different CPU? That way
> the idle function doesn't get called at all..
idle=poll is also frequently done for performance reasons [it reduces
idle wakeup latency by 10 usecs] - while it could be turned off if the
system has been idle for some time. E.g. cpufreqd could sample idle time
and turn on/off idle=poll. High-performance setups could enable it all
the time.
as long as it can be done with zero-cost, i dont see why Steven's patch
wouldnt be a plus for us. It's a performance thing, and having runtime
switches for seemless performance features cannot be bad.
Ingo
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 6:44 ` Ingo Molnar
@ 2005-11-29 6:55 ` Nick Piggin
2005-11-29 18:05 ` Andi Kleen
1 sibling, 0 replies; 26+ messages in thread
From: Nick Piggin @ 2005-11-29 6:55 UTC (permalink / raw)
To: Ingo Molnar
Cc: Andrew Morton, Steven Rostedt, acpi-devel, len.brown, nando,
rlrevell, linux-kernel, paulmck, kr, tglx, pluto, john.cooper,
bene, dwalker, trini, george
Ingo Molnar wrote:
> * Andrew Morton <akpm@osdl.org> wrote:
>
>
>>>The way to solve this was to set
>>> idle=poll. The original patch I sent was to allow the user to change to
>>> idle=poll dynamically. This way they could switch to the poll_idle and
>>> run there tests (requiring tsc not to drift) and then switch back to the
>>> default idle to save on electricity.
>>
>>Use gettimeofday()?
>>
>>If it's just for some sort of instrumentation, run NR_CPUS instances
>>of a niced-down busyloop, pin each one to a different CPU? That way
>>the idle function doesn't get called at all..
>
>
> idle=poll is also frequently done for performance reasons [it reduces
> idle wakeup latency by 10 usecs] - while it could be turned off if the
> system has been idle for some time. E.g. cpufreqd could sample idle time
> and turn on/off idle=poll. High-performance setups could enable it all
> the time.
>
> as long as it can be done with zero-cost, i dont see why Steven's patch
> wouldnt be a plus for us. It's a performance thing, and having runtime
> switches for seemless performance features cannot be bad.
>
Why not just slightly cleanup and extend (eg. to ACPI) the
hlt_counter thingy that many architectures already have?
Nick
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 2:48 ` [RFC][PATCH] Runtime switching of the idle function [take 2] Steven Rostedt
2005-11-29 3:02 ` Andrew Morton
@ 2005-11-29 13:08 ` Pavel Machek
2005-12-18 15:26 ` Steven Rostedt
1 sibling, 1 reply; 26+ messages in thread
From: Pavel Machek @ 2005-11-29 13:08 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, acpi-devel, len.brown, Andrew Morton,
Fernando Lopez-Lezcano, Lee Revell, linux-kernel,
Paul E. McKenney, K.R. Foley, Thomas Gleixner, pluto, john cooper,
Benedikt Spranger, Daniel Walker, Tom Rini, George Anzinger
Hi!
> Description:
>
> This patch creates a directory in /sys/kernel called idle. This
> directory contains two files: idle_ctrl and idle_methods. Reading
> idle_ctrl will show the function that is currently being used for idle,
> and idle_methods shows the available methods for the user to send write
> into idle_ctrl to change which function to use for idle.
Pretty ugly interface, I'd say... is listing function really neccessary?
Pavel
--
64 bytes from 195.113.31.123: icmp_seq=28 ttl=51 time=448769.1 ms
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 18:05 ` Andi Kleen
@ 2005-11-29 14:19 ` Steven Rostedt
2005-11-29 14:50 ` Andi Kleen
2005-12-02 1:27 ` Max Krasnyansky
1 sibling, 1 reply; 26+ messages in thread
From: Steven Rostedt @ 2005-11-29 14:19 UTC (permalink / raw)
To: Andi Kleen
Cc: Ingo Molnar, acpi-devel, len.brown, nando, rlrevell, linux-kernel,
paulmck, kr, tglx, pluto, john.cooper, bene, dwalker, trini,
george, akpm
On Tue, 2005-11-29 at 11:05 -0700, Andi Kleen so nicely wrote:
> > idle=poll is also frequently done for performance reasons [it reduces
> > idle wakeup latency by 10 usecs]
>
> And it's obsolete on CPUs with monitor/mwait.
And I wish my system supported it.
> And in practice the CPU will run so hot that only benchmarkers like it.
Why would it run hot? What's the difference between polling and doing
other things. How many transistors does it take to poll?
>
> I think switching idle is the wrong way to do. We should rather
> fix the various problems.
>
> For fixing the TSC issue it is 100% the wrong approach Imho.
I would only say 80% the wrong approach, but that's me ;-)
> Basically software has to live with TSCs being unsynchronized
> and gettimeofday should do the right thing (and if not it should be fixed)
I guess the biggest complaint most have is that the rdtsc _is_ the
fastest way to read a clock. If it isn't reliable, then what good is
it? It's unfortunate that Intel didn't solidify the clock usage. Yes,
use HPET, or something else, but those are slower, and may not be on all
systems. Every system that I owned had a tsc but for critical systems
it isn't up to par (what a shame).
>
> - while it could be turned off if the
> > system has been idle for some time. E.g. cpufreqd could sample idle time
> > and turn on/off idle=poll. High-performance setups could enable it all
> > the time.
>
> And upgrade their server air condition or issue additional ear protection
> to the desktop user? Most likely you will just drive the CPUs into
> thermal throttle at some point with that, not get more performance anyways.
Again, what would make it so hot? It is a waste of CPU cycles, and does
waste energy that way, but does it really heat up the CPU that much?
It's just a loop. I've run much more complex algorithms for days
without any problems. I only once over heated a CPU and that was doing
some brute force calculations of prime numbers.
>
> > as long as it can be done with zero-cost, i dont see why Steven's patch
> > wouldnt be a plus for us. It's a performance thing, and having runtime
> > switches for seemless performance features cannot be bad.
>
> The interface is ugly and I suspect fixing the various obscure race this
> obscure feature would undoubtedly add will be a long term maintenance
> issue. And it's the wrong thing to do anyways because it just papers
> over other problems that should be fixed in the right way.
Oh come now, it's not that ugly. And it would not produce any more
obscure race conditions than the current method of changing idle with
the acpi processor_idle module has.
But I'll agree that this is more of a paper over than a solution. Too
bad I wasted a day writing and testing it (mostly just to learn about
kobjects and sysfs which I still feel is very clumsy).
But since I did clean up the patch, and it is still useful for those
debugging problems with timers. I'm supplying this cleaned up version
(Thank you Andrew for the comments).
-- Steve
Ingo, would you like this for -rt? Even if it will never be accepted
into mainline.
[take 3]:
Index: linux-2.6.15-rc2-git5/arch/i386/kernel/process.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/i386/kernel/process.c 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/i386/kernel/process.c 2005-11-29 07:43:52.000000000 -0500
@@ -39,6 +39,7 @@
#include <linux/ptrace.h>
#include <linux/random.h>
#include <linux/kprobes.h>
+#include <linux/idle.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -72,11 +73,6 @@
return ((unsigned long *)tsk->thread.esp)[3];
}
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
@@ -185,7 +181,7 @@
__get_cpu_var(cpu_idle_state) = 0;
rmb();
- idle = pm_idle;
+ idle = idle_func;
if (!idle)
idle = default_idle;
@@ -250,6 +246,11 @@
}
}
+static struct idle_info idle_mwait = {
+ .name = "mwait",
+ .func = mwait_idle
+};
+
void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_MWAIT)) {
@@ -258,25 +259,60 @@
* Skip, if setup has overridden idle.
* One CPU supports mwait => All CPUs supports mwait
*/
- if (!pm_idle) {
+ register_idle(&idle_mwait);
+
+ if (!idle_func) {
printk("using mwait in idle threads.\n");
- pm_idle = mwait_idle;
+ set_idle("mwait");
}
}
}
+static struct idle_info idle_default = {
+ .name = "default",
+ .func = default_idle
+};
+
+static struct idle_info idle_poll = {
+ .name = "poll",
+ .func = poll_idle
+};
+
+static int __init add_idle(void)
+{
+ static int set;
+
+ if (set)
+ return 0;
+ set = 1;
+
+ register_idle(&idle_poll);
+
+ /*
+ * Allow the user to switch out of poll_idle even
+ * if it was a boot option.
+ */
+ register_idle(&idle_default);
+
+ return 0;
+}
+
+arch_initcall(add_idle);
+
static int __init idle_setup (char *str)
{
+ add_idle();
if (!strncmp(str, "poll", 4)) {
printk("using polling idle threads.\n");
- pm_idle = poll_idle;
+ set_idle("poll");
+
#ifdef CONFIG_X86_SMP
if (smp_num_siblings > 1)
printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
#endif
} else if (!strncmp(str, "halt", 4)) {
printk("using halt in idle threads.\n");
- pm_idle = default_idle;
+ set_idle("default");
}
boot_option_idle_override = 1;
Index: linux-2.6.15-rc2-git5/arch/x86_64/kernel/process.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/x86_64/kernel/process.c 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/x86_64/kernel/process.c 2005-11-29 07:45:44.000000000 -0500
@@ -36,6 +36,8 @@
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/kprobes.h>
+#include <linux/spinlock.h>
+#include <linux/idle.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -60,10 +62,6 @@
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
@@ -195,7 +193,7 @@
__get_cpu_var(cpu_idle_state) = 0;
rmb();
- idle = pm_idle;
+ idle = idle_func;
if (!idle)
idle = default_idle;
if (cpu_is_offline(smp_processor_id()))
@@ -229,29 +227,68 @@
}
}
+static struct idle_info idle_mwait = {
+ .name = "mwait",
+ .func = mwait_idle
+};
+
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
+ register_idle(&idle_mwait);
+
/*
* Skip, if setup has overridden idle.
* One CPU supports mwait => All CPUs supports mwait
*/
- if (!pm_idle) {
+ if (!idle_func) {
if (!printed) {
printk("using mwait in idle threads.\n");
printed = 1;
}
- pm_idle = mwait_idle;
+ set_idle("mwait");
}
}
}
+static struct idle_info idle_default = {
+ .name = "default",
+ .func = default_idle
+};
+
+static struct idle_info idle_poll = {
+ .name = "poll",
+ .func = poll_idle
+};
+
+static int __init add_idle(void)
+{
+ static int set;
+
+ if (set)
+ return 0;
+ set = 1;
+
+ register_idle(&idle_poll);
+
+ /*
+ * Allow the user to switch out of poll_idle even
+ * if it was a boot option.
+ */
+ register_idle(&idle_default);
+
+ return 0;
+}
+arch_initcall(add_idle);
+
static int __init idle_setup (char *str)
{
+ add_idle();
+
if (!strncmp(str, "poll", 4)) {
printk("using polling idle threads.\n");
- pm_idle = poll_idle;
+ set_idle("poll");
}
boot_option_idle_override = 1;
Index: linux-2.6.15-rc2-git5/drivers/acpi/processor_idle.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/drivers/acpi/processor_idle.c 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/drivers/acpi/processor_idle.c 2005-11-29 07:47:52.000000000 -0500
@@ -38,6 +38,8 @@
#include <linux/dmi.h>
#include <linux/moduleparam.h>
#include <linux/sched.h> /* need_resched() */
+#include <linux/spinlock.h>
+#include <linux/idle.h>
#include <asm/io.h>
#include <asm/uaccess.h>
@@ -56,6 +58,7 @@
#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
static void (*pm_idle_save) (void);
module_param(max_cstate, uint, 0644);
+#define PM_IDLE_NAME "pm_idle"
static unsigned int nocst = 0;
module_param(nocst, uint, 0000);
@@ -891,13 +894,13 @@
return_VALUE(-ENODEV);
/* Fall back to the default idle loop */
- pm_idle = pm_idle_save;
+ set_idle(NULL);
synchronize_sched(); /* Relies on interrupts forcing exit from idle. */
pr->flags.power = 0;
result = acpi_processor_get_power_info(pr);
if ((pr->flags.power == 1) && (pr->flags.power_setup_done))
- pm_idle = acpi_processor_idle;
+ set_idle(PM_IDLE_NAME);
return_VALUE(result);
}
@@ -983,6 +986,12 @@
.release = single_release,
};
+static struct idle_info pm_idle_info = {
+ .name = PM_IDLE_NAME,
+ .func = acpi_processor_idle,
+ .freeze = 1
+};
+
int acpi_processor_power_init(struct acpi_processor *pr,
struct acpi_device *device)
{
@@ -1032,8 +1041,12 @@
printk(")\n");
if (pr->id == 0) {
- pm_idle_save = pm_idle;
- pm_idle = acpi_processor_idle;
+ register_idle(&pm_idle_info);
+ /*
+ * Just use the default idle
+ */
+ pm_idle_save = get_idle(NULL);
+ set_idle(PM_IDLE_NAME);
}
}
@@ -1068,8 +1081,8 @@
/* Unregister the idle handler when processor #0 is removed. */
if (pr->id == 0) {
- pm_idle = pm_idle_save;
-
+ set_idle(NULL);
+ unregister_idle(PM_IDLE_NAME);
/*
* We are about to unload the current idle thread pm callback
* (pm_idle), Wait for all processors to update cached/local
Index: linux-2.6.15-rc2-git5/include/linux/pm.h
===================================================================
--- linux-2.6.15-rc2-git5.orig/include/linux/pm.h 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/include/linux/pm.h 2005-11-28 20:31:47.000000000 -0500
@@ -25,6 +25,7 @@
#include <linux/config.h>
#include <linux/list.h>
+#include <linux/spinlock.h>
#include <asm/atomic.h>
/*
@@ -102,6 +103,8 @@
*/
extern void (*pm_idle)(void);
extern void (*pm_power_off)(void);
+extern spinlock_t pm_idle_switch_lock;
+extern int pm_idle_locked;
typedef int __bitwise suspend_state_t;
Index: linux-2.6.15-rc2-git5/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/x86_64/Kconfig 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/x86_64/Kconfig 2005-11-28 20:31:47.000000000 -0500
@@ -69,6 +69,10 @@
bool
default y
+config DYNAMIC_IDLE
+ bool
+ default y
+
source "init/Kconfig"
Index: linux-2.6.15-rc2-git5/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/x86_64/kernel/x8664_ksyms.c 2005-11-28 20:31:47.000000000 -0500
@@ -58,7 +58,6 @@
EXPORT_SYMBOL(disable_irq_nosync);
EXPORT_SYMBOL(probe_irq_mask);
EXPORT_SYMBOL(kernel_thread);
-EXPORT_SYMBOL(pm_idle);
EXPORT_SYMBOL(pm_power_off);
EXPORT_SYMBOL(get_cmos_time);
Index: linux-2.6.15-rc2-git5/include/linux/idle.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.15-rc2-git5/include/linux/idle.h 2005-11-28 20:31:47.000000000 -0500
@@ -0,0 +1,71 @@
+/*
+ * idle.h - Registering of the idle function (for supported archs)
+ *
+ * Copyright (C) 2005 Steven Rostedt <rostedt@goodmis.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_IDLE_H
+#define _LINUX_IDLE_H
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/kobject.h>
+#include <asm/atomic.h>
+
+typedef void (*idlefunc_t)(void);
+
+struct idle_info {
+ struct list_head list;
+ const char *name; /* Name visible to users */
+ idlefunc_t func; /* idle function to run */
+ int freeze; /* Only allow kernel to add or remove */
+ int inuse; /* set when being used */
+#ifdef CONFIG_SYSFS
+ struct kobject kobj;
+#endif
+};
+
+/*
+ * Registering and unregistering functions that may be used
+ * instead of the default idle function. This only adds
+ * them to the list of functions to be used, it does not
+ * set the
+ */
+extern int register_idle(struct idle_info *info);
+extern int unregister_idle(const char *name);
+
+/*
+ * This sets the idle function to the registered function
+ * by name. Use NULL to set the idle function back to
+ * the default.
+ */
+extern int set_idle(const char *name);
+
+/*
+ * Return the function that is registered by name.
+ * Use NULL to get the default function.
+ * NULL may be returned (as that may be what the current
+ * idle function is set to, to use a default). NULL will
+ * also be returned if name is not registered.
+ */
+extern idlefunc_t get_idle(const char *name);
+
+extern idlefunc_t idle_func;
+
+#endif /* _LINUX_IDLE_H */
Index: linux-2.6.15-rc2-git5/kernel/Makefile
===================================================================
--- linux-2.6.15-rc2-git5.orig/kernel/Makefile 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/kernel/Makefile 2005-11-28 20:31:47.000000000 -0500
@@ -32,6 +32,7 @@
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_DYNAMIC_IDLE) += idle.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux-2.6.15-rc2-git5/kernel/idle.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.15-rc2-git5/kernel/idle.c 2005-11-28 20:31:47.000000000 -0500
@@ -0,0 +1,308 @@
+/*
+ * kernel/idle.c
+ *
+ * Setting up of the idle function to be dynamic.
+ *
+ * Copyright (C) 2005 Steven Rostedt
+ */
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/spinlock.h>
+#include <linux/idle.h>
+
+idlefunc_t idle_func;
+
+static void (*idle_default)(void);
+static LIST_HEAD(idle_elements);
+static DECLARE_MUTEX(idle_sem);
+static struct idle_info *curr_idle;
+
+#ifdef CONFIG_SYSFS
+int idle_sysfs_init;
+#endif
+
+extern void poll_idle (void);
+
+static struct idle_info *__find_idle_info(const char *name)
+{
+ struct list_head *curr;
+ struct idle_info *p;
+ /*
+ * A little inefficient, but this isn't called often.
+ */
+ list_for_each(curr, &idle_elements) {
+ p = list_entry(curr, struct idle_info, list);
+ if (!strcmp(name, p->name))
+ break;
+ }
+ if (curr == &idle_elements)
+ p = NULL;
+
+ return p;
+}
+
+int register_idle(struct idle_info *info)
+{
+ struct idle_info *p;
+ int ret = -EEXIST;
+
+ BUG_ON(!info->name);
+
+ down(&idle_sem);
+
+ p = __find_idle_info(info->name);
+ if (p)
+ goto out;
+ ret = 0;
+
+ list_add(&info->list, &idle_elements);
+
+out:
+ up(&idle_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_idle);
+
+int unregister_idle(const char *name)
+{
+ struct idle_info *p;
+ int ret = -EINVAL;
+
+ BUG_ON(!name);
+
+ down(&idle_sem);
+
+ p = __find_idle_info(name);
+ if (!p)
+ goto out;
+ if (p->inuse) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = 0;
+
+ list_del_init(&p->list);
+
+out:
+ up(&idle_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_idle);
+
+static int __set_idle(struct idle_info *info)
+{
+ if (curr_idle)
+ curr_idle->inuse--;
+ info->inuse++;
+ curr_idle = info;
+ return 0;
+}
+
+int set_idle(const char *name)
+{
+ struct idle_info *p;
+ int ret = 0;
+
+ down(&idle_sem);
+
+ if (!name) {
+ /* Set to the default function */
+ if (curr_idle) {
+ curr_idle->inuse--;
+ curr_idle = NULL;
+ }
+ idle_func = idle_default;
+ goto out;
+ }
+
+ ret = -EINVAL;
+ p = __find_idle_info(name);
+ if (!p)
+ goto out;
+
+ __set_idle(p);
+out:
+ up(&idle_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_idle);
+
+idlefunc_t get_idle(const char *name)
+{
+ struct idle_info *p;
+ idlefunc_t ret = idle_default;
+
+ down(&idle_sem);
+
+ if (!name)
+ goto out;
+
+ p = __find_idle_info(name);
+ if (!p)
+ goto out;
+
+ ret = p->func;
+out:
+ up(&idle_sem);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_idle);
+
+#ifdef CONFIG_SYSFS
+#define KERNEL_ATTR_RW(_name) \
+static struct subsys_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static struct idlep_kobject
+{
+ struct kobject kobj;
+} idle_kobj;
+
+static ssize_t idle_ctrl_show(struct subsystem *subsys, char *page)
+{
+ ssize_t ret;
+ char *star = "";
+ const char *name = "default";
+
+ down(&idle_sem);
+ if (curr_idle) {
+ name = curr_idle->name;
+ if (curr_idle->freeze)
+ star = "*";
+ }
+ ret = sprintf(page, "%s%s\n", star, name);
+ up(&idle_sem);
+
+ return ret;
+}
+
+static ssize_t idle_ctrl_store(struct subsystem *subsys,
+ const char *buf, size_t len)
+{
+ struct list_head *curr;
+ struct idle_info *p;
+ ssize_t ret = -EBUSY;
+
+ down(&idle_sem);
+
+ if (curr_idle && curr_idle->freeze)
+ goto out;
+
+ list_for_each(curr, &idle_elements) {
+ int size;
+ p = list_entry(curr, struct idle_info, list);
+
+ size = strlen(p->name);
+ if (len <= size)
+ continue;
+ if (!strncmp(p->name, buf, size))
+ break;
+ }
+ if (curr == &idle_elements) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * This idle routine may have been registered to
+ * not allow users to add or remove this.
+ */
+ if (p->freeze)
+ goto out;
+
+ __set_idle(p);
+
+ ret = len;
+out:
+ up(&idle_sem);
+
+ return ret;
+}
+
+KERNEL_ATTR_RW(idle_ctrl);
+
+static ssize_t idle_methods_show(struct subsystem *subsys, char *page)
+{
+ struct list_head *curr;
+ struct idle_info *p;
+ ssize_t len = 0;
+
+ down(&idle_sem);
+ list_for_each(curr, &idle_elements) {
+ p = list_entry(curr, struct idle_info, list);
+ if (len + 3 + strlen(p->name) >= PAGE_SIZE) {
+ printk("idle functions overflowed sysfs??\n");
+ break;
+ }
+ len += sprintf(page+len, "%s%s%s",
+ len ? " " : "",
+ p->freeze ? "*" : "",
+ p->name);
+ }
+ if (len + 2 < PAGE_SIZE)
+ len += sprintf(page+len, "\n");
+
+ up(&idle_sem);
+ return len;
+}
+
+static ssize_t idle_methods_store(struct subsystem *subsys,
+ const char *buf, size_t len)
+{
+ /* do nothing */
+ return len;
+}
+
+KERNEL_ATTR_RW(idle_methods);
+
+static struct attribute * idle_attrs[] = {
+ &idle_ctrl_attr.attr,
+ &idle_methods_attr.attr,
+ NULL
+};
+
+static struct attribute_group idle_attr_group = {
+ .attrs = idle_attrs,
+};
+
+static int __init idle_setup_sysfs(void)
+{
+ int err;
+
+ memset(&idle_kobj, 0, sizeof(idle_kobj));
+ err = kobject_set_name(&idle_kobj.kobj, "%s", "idle");
+ if (err)
+ goto out;
+
+ kobj_set_kset_s(&idle_kobj, kernel_subsys);
+
+ idle_kobj.kobj.parent = &kernel_subsys.kset.kobj;
+ err = kobject_register(&idle_kobj.kobj);
+ if (err)
+ goto out;
+
+ err = sysfs_create_group(&idle_kobj.kobj,
+ &idle_attr_group);
+ if (err)
+ goto out;
+
+ return 0;
+out:
+ printk(KERN_INFO "Problem setting up sysfs idle_ctrl\n");
+ return 0;
+}
+#endif /* CONFIG_SYSFS */
+
+static int __init idle_setup(void)
+{
+ idle_default = idle_func;
+
+#ifdef CONFIG_SYSFS
+ idle_setup_sysfs();
+#endif
+ return 0;
+}
+
+late_initcall(idle_setup);
Index: linux-2.6.15-rc2-git5/arch/i386/Kconfig
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/i386/Kconfig 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/i386/Kconfig 2005-11-28 20:31:47.000000000 -0500
@@ -45,6 +45,10 @@
bool
default y
+config DYNAMIC_IDLE
+ bool
+ default y
+
source "init/Kconfig"
menu "Processor type and features"
Index: linux-2.6.15-rc2-git5/arch/i386/kernel/apm.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/i386/kernel/apm.c 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/i386/kernel/apm.c 2005-11-28 20:31:47.000000000 -0500
@@ -225,6 +225,7 @@
#include <linux/smp_lock.h>
#include <linux/dmi.h>
#include <linux/suspend.h>
+#include <linux/idle.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -2220,6 +2221,9 @@
{ }
};
+static struct idle_info apm_idle;
+#define APM_IDLE_NAME "apm"
+
/*
* Just start the APM thread. We do NOT want to do APM BIOS
* calls from anything but the APM thread, if for no other reason
@@ -2373,8 +2377,14 @@
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
if (idle_threshold < 100) {
- original_pm_idle = pm_idle;
- pm_idle = apm_cpu_idle;
+ memset(&apm_idle, 0, sizeof(apm_idle));
+ apm_idle.name = APM_IDLE_NAME;
+ apm_idle.func = apm_cpu_idle;
+ apm_idle.freeze = 1;
+ register_idle(&apm_idle);
+
+ original_pm_idle = get_idle(NULL);
+ set_idle(APM_IDLE_NAME);
set_pm_idle = 1;
}
@@ -2386,7 +2396,26 @@
int error;
if (set_pm_idle) {
- pm_idle = original_pm_idle;
+ int tries = 0;
+ int ret;
+ set_idle(NULL);
+ do {
+ if ((ret = unregister_idle(APM_IDLE_NAME)) == 0)
+ break;
+ /*
+ * for some reason the idle function is being used.
+ * Wait a little and then try again.
+ */
+ if (ret == -EINVAL) {
+ printk(KERN_WARNING
+ "APM idle function never registered?\n");
+ break;
+ }
+ yield();
+ } while (tries++ < 10);
+ if (tries > 10)
+ printk(KERN_WARNING
+ "Unable to unresgister APM idle function\n");
/*
* We are about to unload the current idle thread pm callback
* (pm_idle), Wait for all processors to update cached/local
Index: linux-2.6.15-rc2-git5/arch/ia64/Kconfig
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/ia64/Kconfig 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/ia64/Kconfig 2005-11-28 20:31:47.000000000 -0500
@@ -62,6 +62,10 @@
bool
default y
+config DYNAMIC_IDLE
+ bool
+ default y
+
choice
prompt "System type"
default IA64_GENERIC
Index: linux-2.6.15-rc2-git5/arch/ia64/kernel/acpi.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/ia64/kernel/acpi.c 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/ia64/kernel/acpi.c 2005-11-28 20:31:47.000000000 -0500
@@ -60,8 +60,6 @@
#define PREFIX "ACPI: "
-void (*pm_idle) (void);
-EXPORT_SYMBOL(pm_idle);
void (*pm_power_off) (void);
EXPORT_SYMBOL(pm_power_off);
Index: linux-2.6.15-rc2-git5/arch/ia64/kernel/process.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/ia64/kernel/process.c 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/ia64/kernel/process.c 2005-11-28 20:31:47.000000000 -0500
@@ -31,6 +31,7 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/kprobes.h>
+#include <linux/idle.h>
#include <asm/cpu.h>
#include <asm/delay.h>
@@ -289,7 +290,7 @@
if (mark_idle)
(*mark_idle)(1);
- idle = pm_idle;
+ idle = idle_func;
if (!idle)
idle = default_idle;
(*idle)();
Index: linux-2.6.15-rc2-git5/arch/ia64/kernel/setup.c
===================================================================
--- linux-2.6.15-rc2-git5.orig/arch/ia64/kernel/setup.c 2005-11-28 20:31:24.000000000 -0500
+++ linux-2.6.15-rc2-git5/arch/ia64/kernel/setup.c 2005-11-29 07:46:59.000000000 -0500
@@ -43,6 +43,7 @@
#include <linux/initrd.h>
#include <linux/platform.h>
#include <linux/pm.h>
+#include <linux/idle.h>
#include <asm/ia32.h>
#include <asm/machvec.h>
@@ -738,6 +739,11 @@
ia64_max_cacheline_size = max;
}
+struct idle_info idle_default = {
+ .name = "default",
+ .func = default_idle
+};
+
/*
* cpu_init() initializes state that is per-CPU. This function acts
* as a 'CPU state barrier', nothing should get across.
@@ -861,7 +867,10 @@
/* size of physical stacked register partition plus 8 bytes: */
__get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
platform_cpu_init();
- pm_idle = default_idle;
+
+ register_idle(&idle_default);
+
+ set_idle("default");
}
void
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 4:22 ` john stultz
@ 2005-11-29 14:22 ` Steven Rostedt
0 siblings, 0 replies; 26+ messages in thread
From: Steven Rostedt @ 2005-11-29 14:22 UTC (permalink / raw)
To: john stultz
Cc: Andrew Morton, mingo, acpi-devel, len.brown, nando, rlrevell,
linux-kernel, paulmck, kr, tglx, pluto, john.cooper, bene,
dwalker, trini, george
On Mon, 2005-11-28 at 20:22 -0800, john stultz wrote:
> On Mon, 2005-11-28 at 22:42 -0500, Steven Rostedt wrote:
> > On Mon, 2005-11-28 at 19:02 -0800, Andrew Morton wrote:
> > > Steven Rostedt <rostedt@goodmis.org> wrote:
> > > >
> > > > This patch creates a directory in /sys/kernel called idle.
> > > >
> > >
> > > At no point do you appear to explain _why_ the kernel needs this feature?
> >
> > Sorry about that. This originally came up when we had problems with the
> > AMD64 x2 in the -rt patch. It was noted that the TSCs would get very
> > far out of sync and cause problems. The way to solve this was to set
> > idle=poll. The original patch I sent was to allow the user to change to
> > idle=poll dynamically. This way they could switch to the poll_idle and
> > run there tests (requiring tsc not to drift) and then switch back to the
> > default idle to save on electricity.
>
> The problem with this is that this must be a one way transition. That
> is, once the TSCs have become unsynchronized, there is no use going back
> to using the polling idle unless you add some code to re-sync the TSCs
> which would be ugly to do after the system has booted.
>
I've thought about that too. But this patch does allow you to start
with idle=poll and then switch back. Also, if you do lock to a cpu,
you don't need to worry about the tsc from slipping if you switch to
idle=poll.
-- Steve
> Using idle=poll (for anything other then debugging) is really a worst
> case workaround for systems that do not have alternative clocksources
> like ACPI PM or HPET.
>
> Its an interesting bit of code, but I'm not really sure I understand its
> usefulness.
>
> thanks
> -john
>
>
>
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 14:19 ` Steven Rostedt
@ 2005-11-29 14:50 ` Andi Kleen
2005-11-29 15:42 ` Steven Rostedt
0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2005-11-29 14:50 UTC (permalink / raw)
To: Steven Rostedt
Cc: Andi Kleen, Ingo Molnar, acpi-devel, len.brown, nando, rlrevell,
linux-kernel, paulmck, kr, tglx, pluto, john.cooper, bene,
dwalker, trini, george, akpm
On Tue, Nov 29, 2005 at 09:19:31AM -0500, Steven Rostedt wrote:
> > And in practice the CPU will run so hot that only benchmarkers like it.
>
> Why would it run hot? What's the difference between polling and doing
> other things. How many transistors does it take to poll?
It will prevent the CPU from going into sleep states and essentially
keep most of it enabled.
>
> >
> > I think switching idle is the wrong way to do. We should rather
> > fix the various problems.
> >
> > For fixing the TSC issue it is 100% the wrong approach Imho.
>
> I would only say 80% the wrong approach, but that's me ;-)
>
> > Basically software has to live with TSCs being unsynchronized
> > and gettimeofday should do the right thing (and if not it should be fixed)
>
> I guess the biggest complaint most have is that the rdtsc _is_ the
> fastest way to read a clock. If it isn't reliable, then what good is
It's the fastest way to read something which needs quite complex
knowledge to turn into a reliable clock value. In general only
the kernel has this knowledge.
And gettimeofday is optimized to give you the fatest reliable
clock.
> it? It's unfortunate that Intel didn't solidify the clock usage. Yes,
> use HPET, or something else, but those are slower, and may not be on all
> systems. Every system that I owned had a tsc but for critical systems
> it isn't up to par (what a shame).
Just use gettimeofday. It shields you from all that and when
the hardware supports it is quite fast too.
> > > system has been idle for some time. E.g. cpufreqd could sample idle time
> > > and turn on/off idle=poll. High-performance setups could enable it all
> > > the time.
> >
> > And upgrade their server air condition or issue additional ear protection
> > to the desktop user? Most likely you will just drive the CPUs into
> > thermal throttle at some point with that, not get more performance anyways.
>
> Again, what would make it so hot? It is a waste of CPU cycles, and does
> waste energy that way, but does it really heat up the CPU that much?
Yes it does.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 14:50 ` Andi Kleen
@ 2005-11-29 15:42 ` Steven Rostedt
0 siblings, 0 replies; 26+ messages in thread
From: Steven Rostedt @ 2005-11-29 15:42 UTC (permalink / raw)
To: Andi Kleen
Cc: Ingo Molnar, acpi-devel, len.brown, nando, rlrevell, linux-kernel,
paulmck, kr, tglx, pluto, john.cooper, bene, dwalker, trini,
george, akpm
On Tue, 2005-11-29 at 15:50 +0100, Andi Kleen wrote:
> On Tue, Nov 29, 2005 at 09:19:31AM -0500, Steven Rostedt wrote:
> > > And in practice the CPU will run so hot that only benchmarkers like it.
> >
> > Why would it run hot? What's the difference between polling and doing
> > other things. How many transistors does it take to poll?
>
> It will prevent the CPU from going into sleep states and essentially
> keep most of it enabled.
Well, there's one thing that my patch _does_ help with. (And it has
just helped me now). If you boot up with idle=poll and forget about it,
you can check what idle routine is being used and switch out of poll
without rebooting. (like I'm doing right now :-)
-- Steve
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 6:44 ` Ingo Molnar
2005-11-29 6:55 ` Nick Piggin
@ 2005-11-29 18:05 ` Andi Kleen
2005-11-29 14:19 ` Steven Rostedt
2005-12-02 1:27 ` Max Krasnyansky
1 sibling, 2 replies; 26+ messages in thread
From: Andi Kleen @ 2005-11-29 18:05 UTC (permalink / raw)
To: Ingo Molnar
Cc: Steven Rostedt, acpi-devel, len.brown, nando, rlrevell,
linux-kernel, paulmck, kr, tglx, pluto, john.cooper, bene,
dwalker, trini, george, akpm
Ingo Molnar <mingo@elte.hu> writes:
> * Andrew Morton <akpm@osdl.org> wrote:
>
> > > The way to solve this was to set
> > > idle=poll. The original patch I sent was to allow the user to change to
> > > idle=poll dynamically. This way they could switch to the poll_idle and
> > > run there tests (requiring tsc not to drift) and then switch back to the
> > > default idle to save on electricity.
> >
> > Use gettimeofday()?
> >
> > If it's just for some sort of instrumentation, run NR_CPUS instances
> > of a niced-down busyloop, pin each one to a different CPU? That way
> > the idle function doesn't get called at all..
>
> idle=poll is also frequently done for performance reasons [it reduces
> idle wakeup latency by 10 usecs]
And it's obsolete on CPUs with monitor/mwait.
And in practice the CPU will run so hot that only benchmarkers like it.
I think switching idle is the wrong way to do. We should rather
fix the various problems.
For fixing the TSC issue it is 100% the wrong approach Imho.
Basically software has to live with TSCs being unsynchronized
and gettimeofday should do the right thing (and if not it should be fixed)
- while it could be turned off if the
> system has been idle for some time. E.g. cpufreqd could sample idle time
> and turn on/off idle=poll. High-performance setups could enable it all
> the time.
And upgrade their server air condition or issue additional ear protection
to the desktop user? Most likely you will just drive the CPUs into
thermal throttle at some point with that, not get more performance anyways.
> as long as it can be done with zero-cost, i dont see why Steven's patch
> wouldnt be a plus for us. It's a performance thing, and having runtime
> switches for seemless performance features cannot be bad.
The interface is ugly and I suspect fixing the various obscure race this
obscure feature would undoubtedly add will be a long term maintenance
issue. And it's the wrong thing to do anyways because it just papers
over other problems that should be fixed in the right way.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* RE: [RFC][PATCH] Runtime switching of the idle function [take 2]
@ 2005-11-29 19:37 Brown, Len
2005-11-29 19:53 ` Andi Kleen
0 siblings, 1 reply; 26+ messages in thread
From: Brown, Len @ 2005-11-29 19:37 UTC (permalink / raw)
To: Nick Piggin, Ingo Molnar, Steven Rostedt, Andi Kleen
Cc: Andrew Morton, acpi-devel, nando, rlrevell, linux-kernel, paulmck,
kr, tglx, pluto, john.cooper, bene, dwalker, trini, george
idle=poll is a really bad way to go from a power perspective.
While it is diminishing returns to get into deeper C-states,
getting into at least C1 (HALT or MONITOR/MWAIT) is very important
on many processors.
Note that if the issue at hand is the TSC stopping in deep
ACPI C-states, that there is a flag already available to limit
how deep the C-states go. eg.
processor.max_cstate=2 will disable C3, C4 etc
You can do this at run-time by writing to
/sys/module/processor/parameters/max_cstate
I agree with Andi that we have some work to do to address
the issue directly, which is that the TSC is not reliable
under all conditions on all processors. I think we need
some modes for TSC to detect and handle the cases where it either
stops in C3 or changes speeds, vs the systems where it actually
works the way we want it to -- constant rate that never stops.
>Why not just slightly cleanup and extend (eg. to ACPI) the
>hlt_counter thingy that many architectures already have?
Hmmm, I see the floppy driver invoking hlt_counter,
but it isn't clear what the general semantics and general
users are supposd to be. Can you clue me in?
thanks,
-Len
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 19:37 [RFC][PATCH] Runtime switching of the idle function [take 2] Brown, Len
@ 2005-11-29 19:53 ` Andi Kleen
2005-11-29 20:35 ` Lee Revell
0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2005-11-29 19:53 UTC (permalink / raw)
To: Brown, Len
Cc: Nick Piggin, Ingo Molnar, Steven Rostedt, Andi Kleen,
Andrew Morton, acpi-devel, nando, rlrevell, linux-kernel, paulmck,
kr, tglx, pluto, john.cooper, bene, dwalker, trini, george
On Tue, Nov 29, 2005 at 02:37:53PM -0500, Brown, Len wrote:
> idle=poll is a really bad way to go from a power perspective.
> While it is diminishing returns to get into deeper C-states,
> getting into at least C1 (HALT or MONITOR/MWAIT) is very important
> on many processors.
>
> Note that if the issue at hand is the TSC stopping in deep
> ACPI C-states, that there is a flag already available to limit
> how deep the C-states go. eg.
No i think they tried to work around the fact that
it's not synchronized on AMD systems - in particular
it drifts slightly even on single socket dual core
A64 X2s and disabling C1 works around that.
But idle=poll is too big an hammer for this. Vojtech
is working on a solution anyways that should address this
better.
> processor.max_cstate=2 will disable C3, C4 etc
> You can do this at run-time by writing to
> /sys/module/processor/parameters/max_cstate
In this case it's already C1 that's the problem,
so that won't help them.
> I agree with Andi that we have some work to do to address
> the issue directly, which is that the TSC is not reliable
> under all conditions on all processors. I think we need
We're mostly addressing it - there are problems left, but
overall it's looking good. The remaining problem is
an education issue of users to not use RDTSC directly,
but use gettimeofday/clock_gettime
One remaining use is measurements, but for that it is
already dubious (e.g. due to ticking at a possible
different frequency than the CPU). For that I want
to establish the RDPMC 0 convention.
Probably need better documentation for all of this though...
> some modes for TSC to detect and handle the cases where it either
> stops in C3 or changes speeds, vs the systems where it actually
> works the way we want it to -- constant rate that never stops.
>
> >Why not just slightly cleanup and extend (eg. to ACPI) the
> >hlt_counter thingy that many architectures already have?
>
> Hmmm, I see the floppy driver invoking hlt_counter,
> but it isn't clear what the general semantics and general
> users are supposd to be. Can you clue me in?
It's an ancient hack for an ancient machine chipset bug, but AFAIK
not used/needed on anything modern.
Should probably remove it from x86-64 too.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 19:53 ` Andi Kleen
@ 2005-11-29 20:35 ` Lee Revell
2005-11-29 20:51 ` Andi Kleen
0 siblings, 1 reply; 26+ messages in thread
From: Lee Revell @ 2005-11-29 20:35 UTC (permalink / raw)
To: Andi Kleen
Cc: Brown, Len, Nick Piggin, Ingo Molnar, Steven Rostedt,
Andrew Morton, acpi-devel, nando, linux-kernel, paulmck, kr, tglx,
pluto, john.cooper, bene, dwalker, trini, george
On Tue, 2005-11-29 at 20:53 +0100, Andi Kleen wrote:
> We're mostly addressing it - there are problems left, but
> overall it's looking good. The remaining problem is
> an education issue of users to not use RDTSC directly,
> but use gettimeofday/clock_gettime
No the issue is to make gettimeofday fast enough that the people who
currently have to use the TSC can use it. Right now it's 1500-3000 nsec
or so, Vojtech mentioned that he has a patch that could reduce that to
150-300 nsec.
Lee
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 20:35 ` Lee Revell
@ 2005-11-29 20:51 ` Andi Kleen
2005-11-29 23:55 ` Lee Revell
0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2005-11-29 20:51 UTC (permalink / raw)
To: Lee Revell
Cc: Andi Kleen, Brown, Len, Nick Piggin, Ingo Molnar, Steven Rostedt,
Andrew Morton, acpi-devel, nando, linux-kernel, paulmck, kr, tglx,
pluto, john.cooper, bene, dwalker, trini, george
On Tue, Nov 29, 2005 at 03:35:39PM -0500, Lee Revell wrote:
> On Tue, 2005-11-29 at 20:53 +0100, Andi Kleen wrote:
> > We're mostly addressing it - there are problems left, but
> > overall it's looking good. The remaining problem is
> > an education issue of users to not use RDTSC directly,
> > but use gettimeofday/clock_gettime
>
> No the issue is to make gettimeofday fast enough that the people who
> currently have to use the TSC can use it. Right now it's 1500-3000 nsec
> or so, Vojtech mentioned that he has a patch that could reduce that to
It's only that slow if the hardware can't do better.
And the kernel makes it only slow when using RDTSC directly
is unsafe - so if you use it directly thinking the kernel cheats
you for your cycles you're just shoting yourself in the own foot.
> 150-300 nsec.
If you have capable hardware it can already do much better.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 20:51 ` Andi Kleen
@ 2005-11-29 23:55 ` Lee Revell
2005-11-30 1:06 ` Andi Kleen
0 siblings, 1 reply; 26+ messages in thread
From: Lee Revell @ 2005-11-29 23:55 UTC (permalink / raw)
To: Andi Kleen
Cc: Brown, Len, Nick Piggin, Ingo Molnar, Steven Rostedt,
Andrew Morton, acpi-devel, nando, linux-kernel, paulmck, kr, tglx,
pluto, john.cooper, bene, dwalker, trini, george
On Tue, 2005-11-29 at 21:51 +0100, Andi Kleen wrote:
> On Tue, Nov 29, 2005 at 03:35:39PM -0500, Lee Revell wrote:
> > On Tue, 2005-11-29 at 20:53 +0100, Andi Kleen wrote:
> > > We're mostly addressing it - there are problems left, but
> > > overall it's looking good. The remaining problem is
> > > an education issue of users to not use RDTSC directly,
> > > but use gettimeofday/clock_gettime
> >
> > No the issue is to make gettimeofday fast enough that the people who
> > currently have to use the TSC can use it. Right now it's 1500-3000 nsec
> > or so, Vojtech mentioned that he has a patch that could reduce that to
>
> It's only that slow if the hardware can't do better.
>
> And the kernel makes it only slow when using RDTSC directly
> is unsafe - so if you use it directly thinking the kernel cheats
> you for your cycles you're just shoting yourself in the own foot.
>
> > 150-300 nsec.
>
> If you have capable hardware it can already do much better.
>
But on my system gettimeofday uses the TSC and it's still ~35x slower
than RDTSC:
rlrevell@mindpipe:~$ ./timetest
rdtsc: 10000 calls in 1079 usecs
gettimeofday: 10000 calls in 36628 usecs
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
typedef unsigned long long cycles_t;
#define rdtscll(val) \
__asm__ __volatile__("rdtsc" : "=A" (val))
static inline cycles_t get_cycles_tsc (void)
{
unsigned long long ret;
rdtscll(ret);
return ret;
}
static inline cycles_t get_cycles_gtod (void)
{
struct timeval tv;
gettimeofday (&tv, NULL);
return tv.tv_usec;
}
int main (void) {
int i;
cycles_t start_time;
start_time= get_cycles_gtod();
for (i = 0; i < 10000; i++) {
get_cycles_tsc();
}
printf("rdtsc: %i calls in %llu usecs\n", i, get_cycles_gtod() - start_time);
start_time = get_cycles_gtod();
for (i = 0; i < 10000; i++) {
get_cycles_gtod();
}
printf("gettimeofday: %i calls in %llu usecs\n", i, get_cycles_gtod() - start_time);
return 0;
}
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 23:55 ` Lee Revell
@ 2005-11-30 1:06 ` Andi Kleen
2005-11-30 1:22 ` Lee Revell
0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2005-11-30 1:06 UTC (permalink / raw)
To: Lee Revell
Cc: Andi Kleen, Brown, Len, Nick Piggin, Ingo Molnar, Steven Rostedt,
Andrew Morton, acpi-devel, nando, linux-kernel, paulmck, kr, tglx,
pluto, john.cooper, bene, dwalker, trini, george
> But on my system gettimeofday uses the TSC and it's still ~35x slower
> than RDTSC:
>
> rlrevell@mindpipe:~$ ./timetest
> rdtsc: 10000 calls in 1079 usecs
> gettimeofday: 10000 calls in 36628 usecs
First if you run this on an Athlon 64 the measurement is likely
wrong because RDTSC can be speculated around. To get accurate
data you need to add synchronizing instructions.
Then you're likely running 32bit. It doesn't use vsyscall gettimeofday
yet, which makes it slower. 64bit would.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-30 1:06 ` Andi Kleen
@ 2005-11-30 1:22 ` Lee Revell
2005-11-30 1:58 ` Andi Kleen
0 siblings, 1 reply; 26+ messages in thread
From: Lee Revell @ 2005-11-30 1:22 UTC (permalink / raw)
To: Andi Kleen
Cc: Brown, Len, Nick Piggin, Ingo Molnar, Steven Rostedt,
Andrew Morton, acpi-devel, nando, linux-kernel, paulmck, kr, tglx,
pluto, john.cooper, bene, dwalker, trini, george, Vojtech Pavlik
On Wed, 2005-11-30 at 02:06 +0100, Andi Kleen wrote:
> > But on my system gettimeofday uses the TSC and it's still ~35x slower
> > than RDTSC:
> >
> > rlrevell@mindpipe:~$ ./timetest
> > rdtsc: 10000 calls in 1079 usecs
> > gettimeofday: 10000 calls in 36628 usecs
>
> First if you run this on an Athlon 64 the measurement is likely
> wrong because RDTSC can be speculated around. To get accurate
> data you need to add synchronizing instructions.
>
OK. Just for reference here's what people on the JACK list reported:
2.6.14-rt13, PREEMPT_RT, Athlon X2 4400+ (dual core)
rdtsc: 10000 calls in 68 usecs
gettimeofday: 10000 calls in 5170 usecs
P4@3.3Ghz/HT (OpenSUSE 10.0 2.6.13-15-smp):
rdtsc: 10000 calls in 253 usecs
gettimeofday: 10000 calls in 26547 usecs
> Then you're likely running 32bit. It doesn't use vsyscall gettimeofday
> yet, which makes it slower. 64bit would.
Yes, I am. So it sounds like vsyscall gettimeofday for i386 is in the
works?
Lee
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-30 1:22 ` Lee Revell
@ 2005-11-30 1:58 ` Andi Kleen
2005-11-30 2:19 ` john stultz
0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2005-11-30 1:58 UTC (permalink / raw)
To: Lee Revell
Cc: Andi Kleen, Brown, Len, Nick Piggin, Ingo Molnar, Steven Rostedt,
Andrew Morton, acpi-devel, nando, linux-kernel, paulmck, kr, tglx,
pluto, john.cooper, bene, dwalker, trini, george, Vojtech Pavlik,
johnstul
> > Then you're likely running 32bit. It doesn't use vsyscall gettimeofday
> > yet, which makes it slower. 64bit would.
>
> Yes, I am. So it sounds like vsyscall gettimeofday for i386 is in the
> works?
John Stultz used to have patches for it, but for some reason he never
pushed them into mainline. On i386 it unfortunately needs adding
a test and branch to the syscall path to be 100% ABI compatible, but I
doubt that was the reason he dropped it.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-30 1:58 ` Andi Kleen
@ 2005-11-30 2:19 ` john stultz
0 siblings, 0 replies; 26+ messages in thread
From: john stultz @ 2005-11-30 2:19 UTC (permalink / raw)
To: Andi Kleen
Cc: Lee Revell, Brown, Len, Nick Piggin, Ingo Molnar, Steven Rostedt,
Andrew Morton, acpi-devel, nando, linux-kernel, paulmck, kr, tglx,
pluto, john.cooper, bene, dwalker, trini, george, Vojtech Pavlik
On Wed, 2005-11-30 at 02:58 +0100, Andi Kleen wrote:
> > > Then you're likely running 32bit. It doesn't use vsyscall gettimeofday
> > > yet, which makes it slower. 64bit would.
> >
> > Yes, I am. So it sounds like vsyscall gettimeofday for i386 is in the
> > works?
>
> John Stultz used to have patches for it, but for some reason he never
> pushed them into mainline.
Unfortunately it was a pretty ugly patch. Correctness issues with the
existing code have kept focused on my timekeeping rework, however I have
kept it in mind, and I do have a i386 vsyscall gtod patch that applies
ontop of my tod work. I've been maintaining it on the side while I focus
on the core code, but it is much cleaner now. For fun I'll try to
remember to send it out with the next release.
> On i386 it unfortunately needs adding
> a test and branch to the syscall path to be 100% ABI compatible, but I
> doubt that was the reason he dropped it.
Yea, I didn't know enough about the VDSO/unwind bits to get it to do the
right thing w/ glibc, so that bit was pretty hackish. I'll still need
some help on this bit to make it really something that could be
included.
thanks
-john
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 18:05 ` Andi Kleen
2005-11-29 14:19 ` Steven Rostedt
@ 2005-12-02 1:27 ` Max Krasnyansky
2005-12-02 1:45 ` Andi Kleen
1 sibling, 1 reply; 26+ messages in thread
From: Max Krasnyansky @ 2005-12-02 1:27 UTC (permalink / raw)
To: Andi Kleen
Cc: Ingo Molnar, Steven Rostedt, acpi-devel, len.brown, nando,
rlrevell, linux-kernel, paulmck, kr, tglx, pluto, john.cooper,
bene, dwalker, trini, george, akpm
Andi Kleen wrote:
> Ingo Molnar <mingo@elte.hu> writes:
>>> If it's just for some sort of instrumentation, run NR_CPUS instances
>>> of a niced-down busyloop, pin each one to a different CPU? That way
>>> the idle function doesn't get called at all..
>> idle=poll is also frequently done for performance reasons [it reduces
>> idle wakeup latency by 10 usecs]
>
> And it's obsolete on CPUs with monitor/mwait.
There are some platforms for example IBM ZPro Xeon based machines where
monitor/mwait seems to trigger some kind of SMM and introduce horrible latencies.
With idle=poll ZPros show pretty good worst case latencies, in the order of 10usec
(tested with RTAI/Fusion). With default idle (ie mwait) even average latency is in
hundreds of milliseconds.
You might argue that it's a bug in the their HW design or something but as it stands
today I wouldn't say that monitor/mwait obsoletes idle=poll.
Also IMO saying that CPU will run too hot with idle=poll is basically saying that those
CPUs cannot be used for simulations and stuff which run flat out for days (months actually).
Which is obviously not true (again speaking from experience :)).
Max
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-12-02 1:27 ` Max Krasnyansky
@ 2005-12-02 1:45 ` Andi Kleen
2005-12-03 2:17 ` Max Krasnyansky
0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2005-12-02 1:45 UTC (permalink / raw)
To: Max Krasnyansky
Cc: Andi Kleen, Ingo Molnar, Steven Rostedt, acpi-devel, len.brown,
nando, rlrevell, linux-kernel, paulmck, kr, tglx, pluto,
john.cooper, bene, dwalker, trini, george, akpm
> Also IMO saying that CPU will run too hot with idle=poll is basically
> saying that those
> CPUs cannot be used for simulations and stuff which run flat out for days
> (months actually).
> Which is obviously not true (again speaking from experience :)).
The CPUs can be used, but many cooling setups
(both AirCon in complete data centers, cooling in Blade Racks, laptops)
the cooling is now often designed to not cool
the maximum thermal output of all systems in parallel, but instead
throttle the systems when things get too hot. This usually
works because in most workloads systems are more often idle
than busy, so no throttling is needed.
On desktops it probably won't throttle, but just become noisy
when all the fans spin up.
All things you don't really want.
Super computing is different of course, but even there maximum
capacity of the air condition often limits how many CPUs you can buy.
And you need all the help you can get.
That said you're right that there is still a small niche
where idle=poll makes sense, but it's definitely nothing
that should be encouraged to be used regularly like that
original patch would.
-Andi
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-12-02 1:45 ` Andi Kleen
@ 2005-12-03 2:17 ` Max Krasnyansky
0 siblings, 0 replies; 26+ messages in thread
From: Max Krasnyansky @ 2005-12-03 2:17 UTC (permalink / raw)
To: Andi Kleen
Cc: Ingo Molnar, Steven Rostedt, acpi-devel, len.brown, nando,
rlrevell, linux-kernel, paulmck, kr, tglx, pluto, john.cooper,
bene, dwalker, trini, george, akpm
Andi Kleen wrote:
>> Also IMO saying that CPU will run too hot with idle=poll is basically
>> saying that those
>> CPUs cannot be used for simulations and stuff which run flat out for days
>> (months actually).
>> Which is obviously not true (again speaking from experience :)).
>
> The CPUs can be used, but many cooling setups
> (both AirCon in complete data centers, cooling in Blade Racks, laptops)
> the cooling is now often designed to not cool
> the maximum thermal output of all systems in parallel, but instead
> throttle the systems when things get too hot. This usually
> works because in most workloads systems are more often idle
> than busy, so no throttling is needed.
>
> On desktops it probably won't throttle, but just become noisy
> when all the fans spin up.
>
> All things you don't really want.
We do it (simulations that is) on normal 1U and desktop machines. No special
cooling and stuff. And it does not cause any problems. Granted we don't use
cheap/crappy machines but still it's unmodified off-the-shelf HW.
btw That ZPro machine that I mentioned used to run with idle=poll for weeks
and fans would never spin up unless you put real load on it.
> Super computing is different of course, but even there maximum
> capacity of the air condition often limits how many CPUs you can buy.
> And you need all the help you can get.
>
> That said you're right that there is still a small niche
> where idle=poll makes sense, but it's definitely nothing
> that should be encouraged to be used regularly like that
> original patch would.
Agreed.
Max
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [RFC][PATCH] Runtime switching of the idle function [take 2]
2005-11-29 13:08 ` Pavel Machek
@ 2005-12-18 15:26 ` Steven Rostedt
0 siblings, 0 replies; 26+ messages in thread
From: Steven Rostedt @ 2005-12-18 15:26 UTC (permalink / raw)
To: Pavel Machek
Cc: Ingo Molnar, acpi-devel, len.brown, Andrew Morton,
Fernando Lopez-Lezcano, Lee Revell, linux-kernel,
Paul E. McKenney, K.R. Foley, Thomas Gleixner, pluto, john cooper,
Benedikt Spranger, Daniel Walker, Tom Rini, George Anzinger
On Tue, 29 Nov 2005, Pavel Machek wrote:
> Hi!
>
> > Description:
> >
> > This patch creates a directory in /sys/kernel called idle. This
> > directory contains two files: idle_ctrl and idle_methods. Reading
> > idle_ctrl will show the function that is currently being used for idle,
> > and idle_methods shows the available methods for the user to send write
> > into idle_ctrl to change which function to use for idle.
>
> Pretty ugly interface, I'd say... is listing function really neccessary?
>
What interface would you prefer? And the listing was a feature request
made by Ingo.
But this is pretty much moot, since the patch is not going any further
than the RT patch. And even then, it probably is only temporary, if it is
even still in there (I haven't checked).
--Steve
^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2005-12-18 15:26 UTC | newest]
Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-11-29 19:37 [RFC][PATCH] Runtime switching of the idle function [take 2] Brown, Len
2005-11-29 19:53 ` Andi Kleen
2005-11-29 20:35 ` Lee Revell
2005-11-29 20:51 ` Andi Kleen
2005-11-29 23:55 ` Lee Revell
2005-11-30 1:06 ` Andi Kleen
2005-11-30 1:22 ` Lee Revell
2005-11-30 1:58 ` Andi Kleen
2005-11-30 2:19 ` john stultz
-- strict thread matches above, loose matches on Subject: below --
2005-11-15 9:08 2.6.14-rt13 Ingo Molnar
2005-11-18 18:02 ` 2.6.14-rt13 Fernando Lopez-Lezcano
2005-11-18 21:54 ` 2.6.14-rt13 Lee Revell
2005-11-18 22:05 ` 2.6.14-rt13 Fernando Lopez-Lezcano
2005-11-18 22:07 ` 2.6.14-rt13 Ingo Molnar
2005-11-18 22:41 ` 2.6.14-rt13 Fernando Lopez-Lezcano
2005-11-19 2:39 ` 2.6.14-rt13 Steven Rostedt
2005-11-24 15:07 ` 2.6.14-rt13 Ingo Molnar
2005-11-25 20:56 ` [RFC][PATCH] Runtime switching to idle_poll (was: Re: 2.6.14-rt13) Steven Rostedt
2005-11-26 13:05 ` Ingo Molnar
2005-11-29 2:48 ` [RFC][PATCH] Runtime switching of the idle function [take 2] Steven Rostedt
2005-11-29 3:02 ` Andrew Morton
2005-11-29 3:42 ` Steven Rostedt
2005-11-29 4:01 ` Andrew Morton
2005-11-29 6:44 ` Ingo Molnar
2005-11-29 6:55 ` Nick Piggin
2005-11-29 18:05 ` Andi Kleen
2005-11-29 14:19 ` Steven Rostedt
2005-11-29 14:50 ` Andi Kleen
2005-11-29 15:42 ` Steven Rostedt
2005-12-02 1:27 ` Max Krasnyansky
2005-12-02 1:45 ` Andi Kleen
2005-12-03 2:17 ` Max Krasnyansky
2005-11-29 4:22 ` john stultz
2005-11-29 14:22 ` Steven Rostedt
2005-11-29 13:08 ` Pavel Machek
2005-12-18 15:26 ` Steven Rostedt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox