Linux Power Management development
 help / color / mirror / Atom feed
* [PATCH 01/10] cgroup: add cgroup_root_mutex
From: Tejun Heo @ 2011-11-01 23:46 UTC (permalink / raw)
  To: paul, rjw, lizf
  Cc: fweisbec, containers, linux-kernel, oleg, Tejun Heo, linux-pm,
	akpm, kamezawa.hiroyu
In-Reply-To: <1320191193-8110-1-git-send-email-tj@kernel.org>

cgroup wants to make threadgroup stable while modifying cgroup
hierarchies which will introduce locking dependency on
cred_guard_mutex from cgroup_mutex.  This unfortunately completes
circular dependency.

 A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
 B. namespace_sem -> cgroup_mutex

B is from cgroup_show_options() and this patch breaks it by
introducing another mutex cgroup_root_mutex which nests inside
cgroup_mutex and protects cgroupfs_root.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
---
 kernel/cgroup.c |   64 ++++++++++++++++++++++++++++++++++++-------------------
 1 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 453100a..efa5886 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
 
 #include <linux/atomic.h>
 
+/*
+ * cgroup_mutex is the master lock.  Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
+ * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
+ * release_agent_path and so on.  Modifying requires both cgroup_mutex and
+ * cgroup_root_mutex.  Readers can acquire either of the two.  This is to
+ * break the following locking order cycle.
+ *
+ *  A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
+ *  B. namespace_sem -> cgroup_mutex
+ *
+ * B happens only through cgroup_show_options() and using cgroup_root_mutex
+ * breaks it.
+ */
 static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
 	removed_bits = root->actual_subsys_bits & ~final_bits;
 	added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1043,7 +1061,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
 	struct cgroup_subsys *ss;
 
-	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
-	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgroup_root_mutex);
 	return 0;
 }
 
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 
 	/* See what subsystems are wanted */
 	ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
  out_unlock:
 	kfree(opts.release_agent);
 	kfree(opts.name);
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 	return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
+	struct inode *inode;
 
 	/* First find the desired set of subsystems */
 	mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		/* We used the new root structure, so this is a new hierarchy */
 		struct list_head tmp_cg_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
-		struct inode *inode;
 		struct cgroupfs_root *existing_root;
 		const struct cred *cred;
 		int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
+		mutex_lock(&cgroup_root_mutex);
 
-		if (strlen(root->name)) {
-			/* Check for name clashes with existing mounts */
-			for_each_active_root(existing_root) {
-				if (!strcmp(existing_root->name, root->name)) {
-					ret = -EBUSY;
-					mutex_unlock(&cgroup_mutex);
-					mutex_unlock(&inode->i_mutex);
-					goto drop_new_super;
-				}
-			}
-		}
+		/* Check for name clashes with existing mounts */
+		ret = -EBUSY;
+		if (strlen(root->name))
+			for_each_active_root(existing_root)
+				if (!strcmp(existing_root->name, root->name))
+					goto unlock_drop;
 
 		/*
 		 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * have some link structures left over
 		 */
 		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
-		if (ret) {
-			mutex_unlock(&cgroup_mutex);
-			mutex_unlock(&inode->i_mutex);
-			goto drop_new_super;
-		}
+		if (ret)
+			goto unlock_drop;
 
 		ret = rebind_subsystems(root, root->subsys_bits);
 		if (ret == -EBUSY) {
-			mutex_unlock(&cgroup_mutex);
-			mutex_unlock(&inode->i_mutex);
 			free_cg_links(&tmp_cg_links);
-			goto drop_new_super;
+			goto unlock_drop;
 		}
 		/*
 		 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		cred = override_creds(&init_cred);
 		cgroup_populate_dir(root_cgrp);
 		revert_creds(cred);
+		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 	} else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	kfree(opts.name);
 	return dget(sb->s_root);
 
+ unlock_drop:
+	mutex_unlock(&cgroup_root_mutex);
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&inode->i_mutex);
  drop_new_super:
 	deactivate_locked_super(sb);
  drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	BUG_ON(!list_empty(&cgrp->sibling));
 
 	mutex_lock(&cgroup_mutex);
+	mutex_lock(&cgroup_root_mutex);
 
 	/* Rebind all subsystems back to the default hierarchy */
 	ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 		root_count--;
 	}
 
+	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 
 	kill_litter_super(sb);
@@ -2308,7 +2326,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 		return -EINVAL;
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
+	mutex_lock(&cgroup_root_mutex);
 	strcpy(cgrp->root->release_agent_path, buffer);
+	mutex_unlock(&cgroup_root_mutex);
 	cgroup_unlock();
 	return 0;
 }
-- 
1.7.3.1

^ permalink raw reply related

* [PATCHSET] cgroup: stable threadgroup during attach & subsys methods consolidation
From: Tejun Heo @ 2011-11-01 23:46 UTC (permalink / raw)
  To: paul, rjw, lizf
  Cc: fweisbec, containers, linux-kernel, oleg, linux-pm, akpm,
	kamezawa.hiroyu

Hello,

NOT FOR THIS MERGE WINDOW.

This patchset is combination of the following two patchsets.

 [1] cgroup: extend threadgroup locking
 [2] cgroup: introduce cgroup_taskset and consolidate subsys methods, take#2

Changes from the last postings are

* 0001-cgroup-add-cgroup_root_mutex.patch replaces mutex ordering
  reversal patch, which Oleg found out to be broken.  Instead, a new
  sub-mutex cgroup_root_mutex is introduced to break circular
  dependency.

* Rebased on top of the current linus/master.

* Other minor changes to reflect comments from reviews.

* Reviewed/Acked-by's added.

This patchset addresses the following two issues.

1. cgroup currently only blocks new threads from joining the target
   threadgroup during migration, and on-going migration could race
   against exec and exit leading to interesting problems - the
   symmetry between various attach methods, task exiting during method
   execution, ->exit() racing against attach methods, migrating task
   switching basic properties during exec and so on.

   This is resolved by extending threadgroup locking such that it
   covers all operations which can alter the threadgroup - fork, exit
   and exec, and update cgroup to take advantage of it.  rwsem read
   ops are added to exit path but exec is excluded by grabbing the
   existing cred_guard_mutex from threadgroup locking helper.

   This makes threadgroup locking complete and resolves cgroup issues
   stemming from the target taskset being unstable.

2. cgroup has grown quite some number of subsys methods.  Some of them
   are overlapping, inconsistent with each other and called under
   different conditions depending on whether they're called for a
   single task or whole process.  Unfortunately, these callbacks are
   complicated and incomplete at the same time.

   * ->attach_task() is called after migration for task attach but
     before for process.

   * Ditto for ->pre_attach().

   * ->can_attach_task() is called for every task in the thread group
     ->but attach_task() skips the ones which don't actually change
     ->cgroups.

   * Task attach becomes noop if the task isn't actually moving.
     Process attach is always performed.

   * ->attach_task() doesn't (or at least aren't supposed to) have
     access to the old cgroup.

   * During cancel, there's no way to access the affected tasks.

   This patchset introduces cgroup_taskset along with some accessors
   and iterator, updates methods to use it, consolidates usages and
   drops superflous methods.

 0001-cgroup-add-cgroup_root_mutex.patch
 0002-threadgroup-rename-signal-threadgroup_fork_lock-to-g.patch
 0003-threadgroup-extend-threadgroup_lock-to-cover-exit-an.patch
 0004-cgroup-always-lock-threadgroup-during-migration.patch
 0005-cgroup-subsys-attach_task-should-be-called-after-mig.patch
 0006-cgroup-improve-old-cgroup-handling-in-cgroup_attach_.patch
 0007-cgroup-introduce-cgroup_taskset-and-use-it-in-subsys.patch
 0008-cgroup-don-t-use-subsys-can_attach_task-or-attach_ta.patch
 0009-cgroup-cpuset-don-t-use-ss-pre_attach.patch
 0010-cgroup-kill-subsys-can_attach_task-pre_attach-and-at.patch

0001-0004 implement stable thread group.

0005-0010 introduce taskset and consolidate subsys callbacks.

This patchset is on top of the current linus/master 839d881074 "Merge
branch 'i2c-for-linus' of ..." and also available in the following git
branch.

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git cgroup-cleanup

If this looks okay, I think it would be best to route this through pm
tree as there are and will be intersecting changes (mostly around
cgroup_freezer).

diffstat follows.

 Documentation/cgroups/cgroups.txt |   51 ++----
 block/blk-cgroup.c                |   45 +++--
 include/linux/cgroup.h            |   31 ++-
 include/linux/init_task.h         |    9 -
 include/linux/sched.h             |   62 +++++--
 kernel/cgroup.c                   |  320 +++++++++++++++++++++++---------------
 kernel/cgroup_freezer.c           |   27 +--
 kernel/cpuset.c                   |  105 +++++-------
 kernel/events/core.c              |   13 -
 kernel/exit.c                     |   17 +-
 kernel/fork.c                     |    8 
 kernel/sched.c                    |   31 ++-
 mm/memcontrol.c                   |   16 -
 security/device_cgroup.c          |    7 
 14 files changed, 430 insertions(+), 312 deletions(-)

Thanks.

--
tejun

[1] http://thread.gmane.org/gmane.linux.kernel/1187853
[2] http://thread.gmane.org/gmane.linux.kernel/1184375

^ permalink raw reply

* Re: [REGRESSION]: hibernate/sleep regression w/ bisection
From: Tejun Heo @ 2011-11-01 20:19 UTC (permalink / raw)
  To: Andrew Watts; +Cc: linux-pm, dmitry.torokhov, linux-kernel
In-Reply-To: <20111101191459.GA2748@zeus>

On Tue, Nov 01, 2011 at 02:15:16PM -0500, Andrew Watts wrote:
> On Tue, Nov 01, 2011 at 09:42:35AM -0700, Tejun Heo wrote:
> > 
> > Does the following patch fix the problem?
> > 
> > Thanks.
> 
> Unfortunately, it does not fix the problem.

Odd... how about the following one?  Thanks.

diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index ba70058..14d560a 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -299,9 +299,8 @@ static int serio_queue_event(void *object, struct module *owner,
 	event->owner = owner;
 
 	list_add_tail(&event->node, &serio_event_list);
-	queue_work(system_long_wq, &serio_event_work);
-
 out:
+	queue_work(system_long_wq, &serio_event_work);
 	spin_unlock_irqrestore(&serio_event_lock, flags);
 	return retval;
 }

^ permalink raw reply related

* Re: [REGRESSION]: hibernate/sleep regression w/ bisection
From: Andrew Watts @ 2011-11-01 19:15 UTC (permalink / raw)
  To: Tejun Heo; +Cc: linux-pm, dmitry.torokhov, linux-kernel
In-Reply-To: <20111101164235.GS18855@google.com>

On Tue, Nov 01, 2011 at 09:42:35AM -0700, Tejun Heo wrote:
> 
> Does the following patch fix the problem?
> 
> Thanks.

Unfortunately, it does not fix the problem.

~ Andy

^ permalink raw reply

* Re: [REGRESSION]: hibernate/sleep regression w/ bisection
From: Tejun Heo @ 2011-11-01 16:42 UTC (permalink / raw)
  To: Andrew Watts; +Cc: linux-pm, dmitry.torokhov, linux-kernel
In-Reply-To: <20111101124759.GA1326@zeus>

On Tue, Nov 01, 2011 at 07:48:19AM -0500, Andrew Watts wrote:
> Hi.
> 
> Hibernate/sleep (echo disk/mem > /sys/power/state) has presented problems
> for me starting with 2.6.39. Kernel 2.6.37.6 was the last completely bug-free
> version I used (I skipped the 2.6.38 branch entirely).
> 
> The symptoms are that upon resume (from sleep/hibernate) there is no video
> nor any keyboard input with the exception of sysrq.
> 
> It has been a frustrating bug to hunt down because it is not easily
> reproduced; sometimes the bug doesn't pop up until after a long sequence of
> hibernate/sleep cycles.
> 
> I successfully bisected the problem to: 8ee294cd9def000.
> 
> =======
> Commit: 8ee294cd9def0004887da7f44b80563493b0a097
> Author: Dmitry Torokhov <dmitry.torokhov@gmail.com>
> Date:   Mon Nov 15 01:39:57 2010 -0800
> Input: serio - convert to common workqueue instead of a thread
> =======
> 
> Backing out 8ee294cd9def000 (which requires reversing part of
> 1d64b655dc083df also) fixes this particular problem on 2.6.39.4,
> 3.0.8, and 3.1.
> 
> Unfortunately, in 3.0.8 and 3.1 I have other suspend/hibernate problems
> that I will investigate next and detail in a different regression report.

Does the following patch fix the problem?

Thanks.

diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index ba70058..95aeebd 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -949,6 +949,7 @@ static int serio_resume(struct device *dev)
 	 * deal with it.
 	 */
 	serio_queue_event(serio, NULL, SERIO_RECONNECT_PORT);
+	queue_work(system_long_wq, &serio_event_work);
 
 	return 0;
 }

^ permalink raw reply related

* [REGRESSION]: hibernate/sleep regression w/ bisection
From: Andrew Watts @ 2011-11-01 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tj, linux-pm, dmitry.torokhov

Hi.

Hibernate/sleep (echo disk/mem > /sys/power/state) has presented problems
for me starting with 2.6.39. Kernel 2.6.37.6 was the last completely bug-free
version I used (I skipped the 2.6.38 branch entirely).

The symptoms are that upon resume (from sleep/hibernate) there is no video
nor any keyboard input with the exception of sysrq.

It has been a frustrating bug to hunt down because it is not easily
reproduced; sometimes the bug doesn't pop up until after a long sequence of
hibernate/sleep cycles.

I successfully bisected the problem to: 8ee294cd9def000.

=======
Commit: 8ee294cd9def0004887da7f44b80563493b0a097
Author: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date:   Mon Nov 15 01:39:57 2010 -0800
Input: serio - convert to common workqueue instead of a thread
=======

Backing out 8ee294cd9def000 (which requires reversing part of
1d64b655dc083df also) fixes this particular problem on 2.6.39.4,
3.0.8, and 3.1.

Unfortunately, in 3.0.8 and 3.1 I have other suspend/hibernate problems
that I will investigate next and detail in a different regression report.

~ Andy

System specs: P4/Radeon GPU

^ permalink raw reply

* Resume problem, reboot instead of resume after suspend on Zepto Nox A14
From: Damien Thébault @ 2011-11-01 12:27 UTC (permalink / raw)
  To: linux-pm

Hello,

I have a problem with suspend and resume on my laptop, I tried to
solve the problem myself but nothing works.

I was using kernel 2.6.36 in the past and suspend was working just out
of the box (however, I had problem with sound and needed a more recent
kernel for it).
With the next kernel, suspend was not working anymore, but I thought
it was just a temporary bug.
Some month later, I retried with a more recent kernel, and it still
failed. So I bisected the problem and found that commit
bd25f4dd6972755579d0ea50d1a5ace2e9b00d1a was the first "bad" commit
("HID: hiddev: use usb_find_interface, get rid of BKL"). This commit
made other laptops to fail suspend/resume too, and it was reverted
later (commit 9c9e54a8df0be48aa359744f412377cc55c3b7d2 "HID: hiddev:
fix memory corruption due to invalid intfdata").
However, other changes must have been made elsewhere, and even after
the revert, suspend didn't work anymore, and it's still not working
now (3.0.6 currently).

A lot of things changed in the kernel, and I don't expect to be able
to solve my problem by looking at the git history, so I'm trying to
solve the problem by using standard power-management debugging.

First I'll describe the problem:
 - I launch suspend to ram with any tool (I used to run acpitool -s,
but echo mem > /sys/power/state or s2ram works too)
 - The computer goes to sleep, the power led blinks slowly
 - I push a keyboard key (usually Ctrl), but any key or the power key
are working too
 - Instead of resuming successfully, the computer reboots and the BIOS
runs, like a cold power-up would do.

I looked at Documentation/power/basic-pm-debugging.txt and tested
modes of hibernations:
 - echo freezer > /sys/power/pm_test
 - echo mem > /sys/power/state
This worked just fine, I then tested with "devices", "platform",
"processors" and "core", and all worked.
If I use "none" to use the normal suspend, it just reboots to the BIOS.

I then followed the instructions at Documentation/power/s2ram.txt and
used PM_TRACE:
 - echo 1 > /sys/power/pm_trace
 - echo mem > /sys/power/state
On reboot, I can see the "Magic number" line, but no hash is matching.

I then tried suspend to disk, everything works just fine there, it
works perfectly.

I searched on forums and mailing lists to see which quirks I could try:
 - "i8042.reset=1": same behaviour
 - "acpi_sleep=s3_beep": same behaviour, no beep at all
 - "acpi_sleep=nonvs": same behaviour
 - "no_console_suspend": same behaviour

The DSDT does not contain any suspect windows-specific switches,
there's some "windows 2006" checks, but nothing for other versions,
and as I wrote previously linux was suspending well with the 2.6.36
kernel.
I'm not even sure that pressing a key does resume the kernel at all
(since s3_beep doesn't do any sound and PM_TRACE doesn't show
anything).

What could I do to debug this further? Is there a list of parameters to try?
What is the extra step done between "core" and "none" in /sys/power/pm_test?

This is a Zepto Nox A14 laptop, Core 2 Duo, ICH9 chipset, iwlagn wifi,
... I can provide dmidecode output, ACPI dumps and so on.

Regards,
-- 
Damien Thebault

^ permalink raw reply

* Re: [PATCH v9 2/4] cpuidle: Remove CPUIDLE_FLAG_IGNORE and dev->prepare()
From: Deepthi Dharwar @ 2011-10-31  7:32 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: khilman, venki, ak, len.brown, peterz, linux-pm, linux-kernel,
	linux-acpi, linux-sh, linux-pm, linux-omap, linux-arm-kernel
In-Reply-To: <4EAABB07.3000305@linux.intel.com>

On Friday 28 October 2011 07:54 PM, Arjan van de Ven wrote:
> 
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> On 10/28/2011 3:50 AM, Deepthi Dharwar wrote:
>> The cpuidle_device->prepare() mechanism causes updates to the
>> cpuidle_state[].flags, setting and clearing CPUIDLE_FLAG_IGNORE
>> to tell the governor not to chose a state on a per-cpu basis at
>> run-time. State demotion is now handled by the driver and it returns
>> the actual state entered. Hence, this mechanism is not required.
>> Also this removes per-cpu flags from cpuidle_state enabling
>> it to be made global.
>>
> 
> having worked on some newer platforms....
> this one is really still needed. doing this inside the actual states
> does not work,
> because if the deepest 3 states are invalid, the same (somewhat
> expensive) test would have to be done 3 times,
> and each of the states would have to fail before the 4th one gets chosen.
> that's just not going to work
> 
> (in the state handler you can't know what other state to fall back to,
> and especially not how to enter such a fallback state)
> 
> -----BEGIN PGP SIGNATURE-----
> Version: GnuPG v2.0.17 (MingW32)
> 
> iQEcBAEBAgAGBQJOqrsGAAoJEEHdSxh4DVnEu7EH/i5lEJctBAIubJOcZz/tvBFp
> XYmAe/HqNtSXeHOVsJkTf8y4ppE8487exF7xxMik4GRN0CZNCtkyMezqDVu+eDim
> O/UUbScsAc5cSY6mkjOFXLFup+mi1nkRUnAbxXEyTMhWwcbfr2OvcuO7l7TmATML
> hu87P3PVEafEop3q2+uWMc57fFxnNFfEDqRx6N9V+OJKV5dHrRYL4G4E01fYGFLo
> xTR0IW7nB15L0C29zk9uk/Dqow8SoJZA83c7p7AieP5zdntb6p7noIf03qmdp19f
> fulwMwembCHivo+pLO+jAMhKD1T6VYoCyiYW0LHrQ2E07fayBhFJCxlazgKFcl0=
> =FL6o
> -----END PGP SIGNATURE-----
> 
Hi Arjan,

Thanks for the review. 

We retain the dev->prepare() routine and CPUIDLE_FLAG_IGNORE 
but still allow the dev->enter() to return index ? 
So by retaining it, transition to the idle states
would be much quicker in case one more states are invalid.

Also to note is that in the newer design, we have split the 
cpuidle_state structure. One global struct, cpuidle_states[] that
contains all the state related information including flags, and 
the other cpuidle_device that contain statistics and other data 
that are per-cpu basis. 
So the flags are not per cpu per state basis but 
maintained globally as per state basis. 

So if we have to enable CPUIDLE_FLAG_IGNORE flag in this 
current new design, then I am thinking if we needed to have this 
on a per-cpu basis. If so, then flags have to be moved into cpuidle_device 
struct rather than cpuidle_state struct. 

Is it a good idea to retain these flags as global (part of cpuidle_states) 
or make it per-cpu basis ? 

-Thanks
Deepthi   

^ permalink raw reply

* Re: hiberante hangs TCP Re: [EXAMPLE CODE] Parasite thread injection and TCP connection hijacking
From: David Fries @ 2011-10-30 20:43 UTC (permalink / raw)
  To: Tejun Heo; +Cc: netdev, linux-pm, linux-kernel
In-Reply-To: <20111030201618.GA7696@google.com>

On Sun, Oct 30, 2011 at 01:16:18PM -0700, Tejun Heo wrote:
> (cc'ing Rafael and linux-pm)
> 
> On Sat, Oct 29, 2011 at 11:48:21PM -0500, David Fries wrote:
> > I saw the write up on this on lwn.net, pretty creative by the way, and
> > it got me thinking about a different checkpoint/restart problem I've
> > been running into.  Specifically in hibernating to disk.  In the
> > hibernate case active TCP connections hang after resuming, while an
> > idle TCP connection will continue after the system is back up.  My
> > observation is the kernel checkpoints itself to memory, enables
> > devices, writes out that checkpoint image to storage, then powers off.
> > The problem is if TCP packets are received while writing to storage,
> > the kernel will continue to queue and ack those TCP packets, but the
> > running kernel and it's network state is shortly lost.  When the
> > computer resumes, those TCP byte sequences hang the TCP connection for
> > an extended period of time while the resumed computer refuses to
> > acknowledge the data that was received after checkpointing and the now
> > running kernel knew nothing about, and the other computer tries in
> > vain to resend any data that hadn't yet been acknowledged, which is
> > always after the data that was lost, until one of them eventually
> > gives up.
> > 
> > I've been wondering if it was safe or possible to leave any network
> > interfaces down after the checkpoint, or what the right solution would
> > be.  I didn't think marking every TCP connection with a ZOMBIE_KERNEL
> > bit just after the kernel checkpoint (for the kernel is walking dead
> > and won't remember anything that happens), and then prevent any TCP
> > acks from being sent for those connections would be the right
> > solution.  I've taken to unplugging the physical lan cable,
> > hibernating to disk, and plugging it back in after the system is down,
> > to avoid the problem.  Any ideas?
> 
> Hmmm... sounds like taking down network interfaces before starting
> hibernation sequence should be enough, which shouldn't be too
> difficult to implement from userland.  Rafael, what do you think?

What I observe is the kernel prints out "Preallocating image memory",
then when the screen goes blank the network link light also goes out,
then the screen comes back on with "Compressing and saving" along with
the link light comes on, until it has been saved and the system shuts
down.  So the kernel is already brining the network down, it just
needs to keep it there until the original check pointed kernel is back
up.

Userspace bringing the network interfaces down is problematic.  As an
example one of my systems is running hostapd as an access point and
bridging that to the wired ethernet, that's not a trivial task to
setup and take down (the Debian ifup can set it up, but I've not
figured out yet how to get ifdown to take everything down cleanly, and
I sometimes manually run hostapd if I'm troubleshooting).  Any
manually added routes would go away, good luck in setting everything
back up the way it was before for all the different configurations out
there in userspace.  Add to those issues programs would now have a
time when networking is down that they wouldn't have otherwise seen.

-- 
David Fries <david@fries.net>    PGP pub CB1EE8F0
http://fries.net/~david/

^ permalink raw reply

* Re: hiberante hangs TCP Re: [EXAMPLE CODE] Parasite thread injection and TCP connection hijacking
From: Tejun Heo @ 2011-10-30 20:16 UTC (permalink / raw)
  To: David Fries; +Cc: netdev, linux-pm, linux-kernel
In-Reply-To: <20111030044821.GA23741@spacedout.fries.net>

(cc'ing Rafael and linux-pm)

On Sat, Oct 29, 2011 at 11:48:21PM -0500, David Fries wrote:
> I saw the write up on this on lwn.net, pretty creative by the way, and
> it got me thinking about a different checkpoint/restart problem I've
> been running into.  Specifically in hibernating to disk.  In the
> hibernate case active TCP connections hang after resuming, while an
> idle TCP connection will continue after the system is back up.  My
> observation is the kernel checkpoints itself to memory, enables
> devices, writes out that checkpoint image to storage, then powers off.
> The problem is if TCP packets are received while writing to storage,
> the kernel will continue to queue and ack those TCP packets, but the
> running kernel and it's network state is shortly lost.  When the
> computer resumes, those TCP byte sequences hang the TCP connection for
> an extended period of time while the resumed computer refuses to
> acknowledge the data that was received after checkpointing and the now
> running kernel knew nothing about, and the other computer tries in
> vain to resend any data that hadn't yet been acknowledged, which is
> always after the data that was lost, until one of them eventually
> gives up.
> 
> I've been wondering if it was safe or possible to leave any network
> interfaces down after the checkpoint, or what the right solution would
> be.  I didn't think marking every TCP connection with a ZOMBIE_KERNEL
> bit just after the kernel checkpoint (for the kernel is walking dead
> and won't remember anything that happens), and then prevent any TCP
> acks from being sent for those connections would be the right
> solution.  I've taken to unplugging the physical lan cable,
> hibernating to disk, and plugging it back in after the system is down,
> to avoid the problem.  Any ideas?

Hmmm... sounds like taking down network interfaces before starting
hibernation sequence should be enough, which shouldn't be too
difficult to implement from userland.  Rafael, what do you think?

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH 4/6 v2] PM: Limit race conditions between runtime PM and system sleep (v2)
From: Linus Walleij @ 2011-10-28 20:00 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Kevin Hilman, linux-scsi, Greg KH, LKML, Jesse Barnes,
	Ulf Hansson, Tejun Heo, Linux PM mailing list, stable
In-Reply-To: <201110272206.35180.rjw@sisk.pl>

On Thu, Oct 27, 2011 at 10:06 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:

>> We are trying to produce
>> a runtime PM system of product quality based on 3.0.y and we've
>> already had to backport this patch ourselves to get things stable.
>>
>> We have also backported:
>> PM: Introduce generic "noirq" callback routines for subsystems (v2)
>> PM / Runtime: Update documentation of interactions with system sleep
>> PM / Runtime: Add new helper function: pm_runtime_status_suspended()
>>
>> And now it seems to be sufficient to get this thing going.
>
> Well, it isn't a simple fix and it changes the code's behavior quite
> significantly, so I thought it might not be a good idea to risk problems
> with -stable because of it.  Perhaps let's see how it works out in 3.1
> and backport it later if there are no problem reports related to it?

OK no hurries, let's give it a spin in 3.1 for some weeks or so.

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH v9 2/4] cpuidle: Remove CPUIDLE_FLAG_IGNORE and dev->prepare()
From: Arjan van de Ven @ 2011-10-28 14:24 UTC (permalink / raw)
  To: Deepthi Dharwar
  Cc: khilman, venki, ak, len.brown, peterz, linux-pm, linux-kernel,
	linux-acpi, linux-sh, linux-pm, linux-omap, linux-arm-kernel
In-Reply-To: <20111028105020.7520.68014.stgit@localhost6.localdomain6>


-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
 
On 10/28/2011 3:50 AM, Deepthi Dharwar wrote:
> The cpuidle_device->prepare() mechanism causes updates to the
> cpuidle_state[].flags, setting and clearing CPUIDLE_FLAG_IGNORE
> to tell the governor not to chose a state on a per-cpu basis at
> run-time. State demotion is now handled by the driver and it returns
> the actual state entered. Hence, this mechanism is not required.
> Also this removes per-cpu flags from cpuidle_state enabling
> it to be made global.
>
 
having worked on some newer platforms....
this one is really still needed. doing this inside the actual states
does not work,
because if the deepest 3 states are invalid, the same (somewhat
expensive) test would have to be done 3 times,
and each of the states would have to fail before the 4th one gets chosen.
that's just not going to work
 
(in the state handler you can't know what other state to fall back to,
and especially not how to enter such a fallback state)
 
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.17 (MingW32)
 
iQEcBAEBAgAGBQJOqrsGAAoJEEHdSxh4DVnEu7EH/i5lEJctBAIubJOcZz/tvBFp
XYmAe/HqNtSXeHOVsJkTf8y4ppE8487exF7xxMik4GRN0CZNCtkyMezqDVu+eDim
O/UUbScsAc5cSY6mkjOFXLFup+mi1nkRUnAbxXEyTMhWwcbfr2OvcuO7l7TmATML
hu87P3PVEafEop3q2+uWMc57fFxnNFfEDqRx6N9V+OJKV5dHrRYL4G4E01fYGFLo
xTR0IW7nB15L0C29zk9uk/Dqow8SoJZA83c7p7AieP5zdntb6p7noIf03qmdp19f
fulwMwembCHivo+pLO+jAMhKD1T6VYoCyiYW0LHrQ2E07fayBhFJCxlazgKFcl0=
=FL6o
-----END PGP SIGNATURE-----

^ permalink raw reply

* [PATCH] CPU hotplug, PM: Remove unused symbol 'suspend_cpu_hotplug'
From: Srivatsa S. Bhat @ 2011-10-28 13:47 UTC (permalink / raw)
  To: rjw
  Cc: mingo, amwang, linux-pm, gregkh, linux-kernel, akpm, linux-pm,
	a.p.zijlstra



Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/cpu.h |    4 ----
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b1a635a..6cb60fd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -196,13 +196,9 @@ static inline void cpu_hotplug_driver_unlock(void)
 #endif		/* CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_PM_SLEEP_SMP
-extern int suspend_cpu_hotplug;
-
 extern int disable_nonboot_cpus(void);
 extern void enable_nonboot_cpus(void);
 #else /* !CONFIG_PM_SLEEP_SMP */
-#define suspend_cpu_hotplug	0
-
 static inline int disable_nonboot_cpus(void) { return 0; }
 static inline void enable_nonboot_cpus(void) {}
 #endif /* !CONFIG_PM_SLEEP_SMP */

^ permalink raw reply related

* [PATCH v9 4/4] cpuidle: Single/Global registration of idle states
From: Deepthi Dharwar @ 2011-10-28 10:50 UTC (permalink / raw)
  To: khilman, venki, ak, len.brown, peterz, rjw, santosh.shilimkar,
	arjan, lenb
  Cc: linux-sh, linux-pm, linux-kernel, linux-acpi, linux-pm,
	linux-omap, linux-arm-kernel
In-Reply-To: <20111028104945.7520.83828.stgit@localhost6.localdomain6>

This patch makes the cpuidle_states structure global (single copy)
instead of per-cpu. The statistics needed on per-cpu basis
by the governor are kept per-cpu. This simplifies the cpuidle
subsystem as state registration is done by single cpu only.
Having single copy of cpuidle_states saves memory. Rare case
of asymmetric C-states can be handled within the cpuidle driver
and architectures such as POWER do not have asymmetric C-states.

Having single/global registration of all the idle states,
dynamic C-state transitions on x86 are handled by
the boot cpu. Here, the boot cpu  would disable all the devices,
re-populate the states and later enable all the devices,
irrespective of the cpu that would receive the notification first.

Reference:
https://lkml.org/lkml/2011/4/25/83

Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Trinabh Gupta <g.trinabh@gmail.com>
Tested-by: Jean Pihet <j-pihet@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-at91/cpuidle.c          |   31 +++--
 arch/arm/mach-davinci/cpuidle.c       |   39 ++++---
 arch/arm/mach-exynos4/cpuidle.c       |   23 ++--
 arch/arm/mach-kirkwood/cpuidle.c      |   30 +++--
 arch/arm/mach-omap2/cpuidle34xx.c     |   73 ++++++++-----
 arch/sh/kernel/cpu/shmobile/cpuidle.c |   18 ++-
 drivers/acpi/processor_driver.c       |   20 +--
 drivers/acpi/processor_idle.c         |  191 +++++++++++++++++++++++++++++----
 drivers/cpuidle/cpuidle.c             |   45 ++------
 drivers/cpuidle/driver.c              |   25 ++++
 drivers/cpuidle/governors/ladder.c    |   28 +++--
 drivers/cpuidle/governors/menu.c      |   20 ++-
 drivers/cpuidle/sysfs.c               |    3 -
 drivers/idle/intel_idle.c             |   80 +++++++++++---
 include/acpi/processor.h              |    1 
 include/linux/cpuidle.h               |   19 ++-
 16 files changed, 439 insertions(+), 207 deletions(-)

diff --git a/arch/arm/mach-at91/cpuidle.c b/arch/arm/mach-at91/cpuidle.c
index 4696a0d..93178f6 100644
--- a/arch/arm/mach-at91/cpuidle.c
+++ b/arch/arm/mach-at91/cpuidle.c
@@ -33,6 +33,7 @@ static struct cpuidle_driver at91_idle_driver = {
 
 /* Actual code that puts the SoC in different idle states */
 static int at91_enter_idle(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
 			       int index)
 {
 	struct timeval before, after;
@@ -64,27 +65,29 @@ static int at91_enter_idle(struct cpuidle_device *dev,
 static int at91_init_cpuidle(void)
 {
 	struct cpuidle_device *device;
-
-	cpuidle_register_driver(&at91_idle_driver);
+	struct cpuidle_driver *driver = &at91_idle_driver;
 
 	device = &per_cpu(at91_cpuidle_device, smp_processor_id());
 	device->state_count = AT91_MAX_STATES;
+	driver->state_count = AT91_MAX_STATES;
 
 	/* Wait for interrupt state */
-	device->states[0].enter = at91_enter_idle;
-	device->states[0].exit_latency = 1;
-	device->states[0].target_residency = 10000;
-	device->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
-	strcpy(device->states[0].name, "WFI");
-	strcpy(device->states[0].desc, "Wait for interrupt");
+	driver->states[0].enter = at91_enter_idle;
+	driver->states[0].exit_latency = 1;
+	driver->states[0].target_residency = 10000;
+	driver->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
+	strcpy(driver->states[0].name, "WFI");
+	strcpy(driver->states[0].desc, "Wait for interrupt");
 
 	/* Wait for interrupt and RAM self refresh state */
-	device->states[1].enter = at91_enter_idle;
-	device->states[1].exit_latency = 10;
-	device->states[1].target_residency = 10000;
-	device->states[1].flags = CPUIDLE_FLAG_TIME_VALID;
-	strcpy(device->states[1].name, "RAM_SR");
-	strcpy(device->states[1].desc, "WFI and RAM Self Refresh");
+	driver->states[1].enter = at91_enter_idle;
+	driver->states[1].exit_latency = 10;
+	driver->states[1].target_residency = 10000;
+	driver->states[1].flags = CPUIDLE_FLAG_TIME_VALID;
+	strcpy(driver->states[1].name, "RAM_SR");
+	strcpy(driver->states[1].desc, "WFI and RAM Self Refresh");
+
+	cpuidle_register_driver(&at91_idle_driver);
 
 	if (cpuidle_register_device(device)) {
 		printk(KERN_ERR "at91_init_cpuidle: Failed registering\n");
diff --git a/arch/arm/mach-davinci/cpuidle.c b/arch/arm/mach-davinci/cpuidle.c
index f2d2f34..dbeeccd 100644
--- a/arch/arm/mach-davinci/cpuidle.c
+++ b/arch/arm/mach-davinci/cpuidle.c
@@ -78,6 +78,7 @@ static struct davinci_ops davinci_states[DAVINCI_CPUIDLE_MAX_STATES] = {
 
 /* Actual code that puts the SoC in different idle states */
 static int davinci_enter_idle(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
 						int index)
 {
 	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
@@ -109,6 +110,7 @@ static int __init davinci_cpuidle_probe(struct platform_device *pdev)
 {
 	int ret;
 	struct cpuidle_device *device;
+	struct cpuidle_driver *driver = &davinci_idle_driver;
 	struct davinci_cpuidle_config *pdata = pdev->dev.platform_data;
 
 	device = &per_cpu(davinci_cpuidle_device, smp_processor_id());
@@ -120,32 +122,33 @@ static int __init davinci_cpuidle_probe(struct platform_device *pdev)
 
 	ddr2_reg_base = pdata->ddr2_ctlr_base;
 
-	ret = cpuidle_register_driver(&davinci_idle_driver);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to register driver\n");
-		return ret;
-	}
-
 	/* Wait for interrupt state */
-	device->states[0].enter = davinci_enter_idle;
-	device->states[0].exit_latency = 1;
-	device->states[0].target_residency = 10000;
-	device->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
-	strcpy(device->states[0].name, "WFI");
-	strcpy(device->states[0].desc, "Wait for interrupt");
+	driver->states[0].enter = davinci_enter_idle;
+	driver->states[0].exit_latency = 1;
+	driver->states[0].target_residency = 10000;
+	driver->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
+	strcpy(driver->states[0].name, "WFI");
+	strcpy(driver->states[0].desc, "Wait for interrupt");
 
 	/* Wait for interrupt and DDR self refresh state */
-	device->states[1].enter = davinci_enter_idle;
-	device->states[1].exit_latency = 10;
-	device->states[1].target_residency = 10000;
-	device->states[1].flags = CPUIDLE_FLAG_TIME_VALID;
-	strcpy(device->states[1].name, "DDR SR");
-	strcpy(device->states[1].desc, "WFI and DDR Self Refresh");
+	driver->states[1].enter = davinci_enter_idle;
+	driver->states[1].exit_latency = 10;
+	driver->states[1].target_residency = 10000;
+	driver->states[1].flags = CPUIDLE_FLAG_TIME_VALID;
+	strcpy(driver->states[1].name, "DDR SR");
+	strcpy(driver->states[1].desc, "WFI and DDR Self Refresh");
 	if (pdata->ddr2_pdown)
 		davinci_states[1].flags |= DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN;
 	cpuidle_set_statedata(&device->states_usage[1], &davinci_states[1]);
 
 	device->state_count = DAVINCI_CPUIDLE_MAX_STATES;
+	driver->state_count = DAVINCI_CPUIDLE_MAX_STATES;
+
+	ret = cpuidle_register_driver(&davinci_idle_driver);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register driver\n");
+		return ret;
+	}
 
 	ret = cpuidle_register_device(device);
 	if (ret) {
diff --git a/arch/arm/mach-exynos4/cpuidle.c b/arch/arm/mach-exynos4/cpuidle.c
index ea026e7..35f6502 100644
--- a/arch/arm/mach-exynos4/cpuidle.c
+++ b/arch/arm/mach-exynos4/cpuidle.c
@@ -16,6 +16,7 @@
 #include <asm/proc-fns.h>
 
 static int exynos4_enter_idle(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
 			      int index);
 
 static struct cpuidle_state exynos4_cpuidle_set[] = {
@@ -37,6 +38,7 @@ static struct cpuidle_driver exynos4_idle_driver = {
 };
 
 static int exynos4_enter_idle(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
 			      int index)
 {
 	struct timeval before, after;
@@ -60,22 +62,23 @@ static int __init exynos4_init_cpuidle(void)
 {
 	int i, max_cpuidle_state, cpu_id;
 	struct cpuidle_device *device;
-
+	struct cpuidle_driver *drv = &exynos4_idle_driver;
+
+	/* Setup cpuidle driver */
+	drv->state_count = (sizeof(exynos4_cpuidle_set) /
+				       sizeof(struct cpuidle_state));
+	max_cpuidle_state = drv->state_count;
+	for (i = 0; i < max_cpuidle_state; i++) {
+		memcpy(&drv->states[i], &exynos4_cpuidle_set[i],
+				sizeof(struct cpuidle_state));
+	}
 	cpuidle_register_driver(&exynos4_idle_driver);
 
 	for_each_cpu(cpu_id, cpu_online_mask) {
 		device = &per_cpu(exynos4_cpuidle_device, cpu_id);
 		device->cpu = cpu_id;
 
-		device->state_count = (sizeof(exynos4_cpuidle_set) /
-					       sizeof(struct cpuidle_state));
-
-		max_cpuidle_state = device->state_count;
-
-		for (i = 0; i < max_cpuidle_state; i++) {
-			memcpy(&device->states[i], &exynos4_cpuidle_set[i],
-					sizeof(struct cpuidle_state));
-		}
+		device->state_count = drv->state_count;
 
 		if (cpuidle_register_device(device)) {
 			printk(KERN_ERR "CPUidle register device failed\n,");
diff --git a/arch/arm/mach-kirkwood/cpuidle.c b/arch/arm/mach-kirkwood/cpuidle.c
index 358dd80..ffd690d 100644
--- a/arch/arm/mach-kirkwood/cpuidle.c
+++ b/arch/arm/mach-kirkwood/cpuidle.c
@@ -32,6 +32,7 @@ static DEFINE_PER_CPU(struct cpuidle_device, kirkwood_cpuidle_device);
 
 /* Actual code that puts the SoC in different idle states */
 static int kirkwood_enter_idle(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
 			       int index)
 {
 	struct timeval before, after;
@@ -68,28 +69,29 @@ static int kirkwood_enter_idle(struct cpuidle_device *dev,
 static int kirkwood_init_cpuidle(void)
 {
 	struct cpuidle_device *device;
-
-	cpuidle_register_driver(&kirkwood_idle_driver);
+	struct cpuidle_driver *driver = &kirkwood_idle_driver;
 
 	device = &per_cpu(kirkwood_cpuidle_device, smp_processor_id());
 	device->state_count = KIRKWOOD_MAX_STATES;
+	driver->state_count = KIRKWOOD_MAX_STATES;
 
 	/* Wait for interrupt state */
-	device->states[0].enter = kirkwood_enter_idle;
-	device->states[0].exit_latency = 1;
-	device->states[0].target_residency = 10000;
-	device->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
-	strcpy(device->states[0].name, "WFI");
-	strcpy(device->states[0].desc, "Wait for interrupt");
+	driver->states[0].enter = kirkwood_enter_idle;
+	driver->states[0].exit_latency = 1;
+	driver->states[0].target_residency = 10000;
+	driver->states[0].flags = CPUIDLE_FLAG_TIME_VALID;
+	strcpy(driver->states[0].name, "WFI");
+	strcpy(driver->states[0].desc, "Wait for interrupt");
 
 	/* Wait for interrupt and DDR self refresh state */
-	device->states[1].enter = kirkwood_enter_idle;
-	device->states[1].exit_latency = 10;
-	device->states[1].target_residency = 10000;
-	device->states[1].flags = CPUIDLE_FLAG_TIME_VALID;
-	strcpy(device->states[1].name, "DDR SR");
-	strcpy(device->states[1].desc, "WFI and DDR Self Refresh");
+	driver->states[1].enter = kirkwood_enter_idle;
+	driver->states[1].exit_latency = 10;
+	driver->states[1].target_residency = 10000;
+	driver->states[1].flags = CPUIDLE_FLAG_TIME_VALID;
+	strcpy(driver->states[1].name, "DDR SR");
+	strcpy(driver->states[1].desc, "WFI and DDR Self Refresh");
 
+	cpuidle_register_driver(&kirkwood_idle_driver);
 	if (cpuidle_register_device(device)) {
 		printk(KERN_ERR "kirkwood_init_cpuidle: Failed registering\n");
 		return -EIO;
diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c
index d3fce7b..1fe35c2 100644
--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -88,12 +88,14 @@ static int _cpuidle_deny_idle(struct powerdomain *pwrdm,
 /**
  * omap3_enter_idle - Programs OMAP3 to enter the specified state
  * @dev: cpuidle device
+ * @drv: cpuidle driver
  * @index: the index of state to be entered
  *
  * Called from the CPUidle framework to program the device to the
  * specified target state selected by the governor.
  */
 static int omap3_enter_idle(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
 				int index)
 {
 	struct omap3_idle_statedata *cx =
@@ -148,6 +150,7 @@ return_sleep_time:
 /**
  * next_valid_state - Find next valid C-state
  * @dev: cpuidle device
+ * @drv: cpuidle driver
  * @index: Index of currently selected c-state
  *
  * If the state corresponding to index is valid, index is returned back
@@ -158,10 +161,11 @@ return_sleep_time:
  * if it satisfies the enable_off_mode condition.
  */
 static int next_valid_state(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
 				int index)
 {
 	struct cpuidle_state_usage *curr_usage = &dev->states_usage[index];
-	struct cpuidle_state *curr = &dev->states[index];
+	struct cpuidle_state *curr = &drv->states[index];
 	struct omap3_idle_statedata *cx = cpuidle_get_statedata(curr_usage);
 	u32 mpu_deepest_state = PWRDM_POWER_RET;
 	u32 core_deepest_state = PWRDM_POWER_RET;
@@ -188,7 +192,7 @@ static int next_valid_state(struct cpuidle_device *dev,
 
 		/* Reach the current state starting at highest C-state */
 		for (; idx >= 0; idx--) {
-			if (&dev->states[idx] == curr) {
+			if (&drv->states[idx] == curr) {
 				next_index = idx;
 				break;
 			}
@@ -224,12 +228,14 @@ static int next_valid_state(struct cpuidle_device *dev,
 /**
  * omap3_enter_idle_bm - Checks for any bus activity
  * @dev: cpuidle device
+ * @drv: cpuidle driver
  * @index: array index of target state to be programmed
  *
  * This function checks for any pending activity and then programs
  * the device to the specified or a safer state.
  */
 static int omap3_enter_idle_bm(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
 			       int index)
 {
 	int new_state_idx;
@@ -238,7 +244,7 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	int ret;
 
 	if (!omap3_can_sleep()) {
-		new_state_idx = dev->safe_state_index;
+		new_state_idx = drv->safe_state_index;
 		goto select_state;
 	}
 
@@ -248,7 +254,7 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	 */
 	cam_state = pwrdm_read_pwrst(cam_pd);
 	if (cam_state == PWRDM_POWER_ON) {
-		new_state_idx = dev->safe_state_index;
+		new_state_idx = drv->safe_state_index;
 		goto select_state;
 	}
 
@@ -275,10 +281,10 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	if (per_next_state != per_saved_state)
 		pwrdm_set_next_pwrst(per_pd, per_next_state);
 
-	new_state_idx = next_valid_state(dev, index);
+	new_state_idx = next_valid_state(dev, drv, index);
 
 select_state:
-	ret = omap3_enter_idle(dev, new_state_idx);
+	ret = omap3_enter_idle(dev, drv, new_state_idx);
 
 	/* Restore original PER state if it was modified */
 	if (per_next_state != per_saved_state)
@@ -311,22 +317,30 @@ struct cpuidle_driver omap3_idle_driver = {
 	.owner = 	THIS_MODULE,
 };
 
-/* Helper to fill the C-state common data and register the driver_data */
-static inline struct omap3_idle_statedata *_fill_cstate(
-					struct cpuidle_device *dev,
+/* Helper to fill the C-state common data*/
+static inline void _fill_cstate(struct cpuidle_driver *drv,
 					int idx, const char *descr)
 {
-	struct omap3_idle_statedata *cx = &omap3_idle_data[idx];
-	struct cpuidle_state *state = &dev->states[idx];
-	struct cpuidle_state_usage *state_usage = &dev->states_usage[idx];
+	struct cpuidle_state *state = &drv->states[idx];
 
 	state->exit_latency	= cpuidle_params_table[idx].exit_latency;
 	state->target_residency	= cpuidle_params_table[idx].target_residency;
 	state->flags		= CPUIDLE_FLAG_TIME_VALID;
 	state->enter		= omap3_enter_idle_bm;
-	cx->valid		= cpuidle_params_table[idx].valid;
 	sprintf(state->name, "C%d", idx + 1);
 	strncpy(state->desc, descr, CPUIDLE_DESC_LEN);
+
+}
+
+/* Helper to register the driver_data */
+static inline struct omap3_idle_statedata *_fill_cstate_usage(
+					struct cpuidle_device *dev,
+					int idx)
+{
+	struct omap3_idle_statedata *cx = &omap3_idle_data[idx];
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[idx];
+
+	cx->valid		= cpuidle_params_table[idx].valid;
 	cpuidle_set_statedata(state_usage, cx);
 
 	return cx;
@@ -341,6 +355,7 @@ static inline struct omap3_idle_statedata *_fill_cstate(
 int __init omap3_idle_init(void)
 {
 	struct cpuidle_device *dev;
+	struct cpuidle_driver *drv = &omap3_idle_driver;
 	struct omap3_idle_statedata *cx;
 
 	mpu_pd = pwrdm_lookup("mpu_pwrdm");
@@ -348,45 +363,52 @@ int __init omap3_idle_init(void)
 	per_pd = pwrdm_lookup("per_pwrdm");
 	cam_pd = pwrdm_lookup("cam_pwrdm");
 
-	cpuidle_register_driver(&omap3_idle_driver);
+
+	drv->safe_state_index = -1;
 	dev = &per_cpu(omap3_idle_dev, smp_processor_id());
-	dev->safe_state_index = -1;
 
 	/* C1 . MPU WFI + Core active */
-	cx = _fill_cstate(dev, 0, "MPU ON + CORE ON");
-	(&dev->states[0])->enter = omap3_enter_idle;
-	dev->safe_state_index = 0;
+	_fill_cstate(drv, 0, "MPU ON + CORE ON");
+	(&drv->states[0])->enter = omap3_enter_idle;
+	drv->safe_state_index = 0;
+	cx = _fill_cstate_usage(dev, 0);
 	cx->valid = 1;	/* C1 is always valid */
 	cx->mpu_state = PWRDM_POWER_ON;
 	cx->core_state = PWRDM_POWER_ON;
 
 	/* C2 . MPU WFI + Core inactive */
-	cx = _fill_cstate(dev, 1, "MPU ON + CORE ON");
+	_fill_cstate(drv, 1, "MPU ON + CORE ON");
+	cx = _fill_cstate_usage(dev, 1);
 	cx->mpu_state = PWRDM_POWER_ON;
 	cx->core_state = PWRDM_POWER_ON;
 
 	/* C3 . MPU CSWR + Core inactive */
-	cx = _fill_cstate(dev, 2, "MPU RET + CORE ON");
+	_fill_cstate(drv, 2, "MPU RET + CORE ON");
+	cx = _fill_cstate_usage(dev, 2);
 	cx->mpu_state = PWRDM_POWER_RET;
 	cx->core_state = PWRDM_POWER_ON;
 
 	/* C4 . MPU OFF + Core inactive */
-	cx = _fill_cstate(dev, 3, "MPU OFF + CORE ON");
+	_fill_cstate(drv, 3, "MPU OFF + CORE ON");
+	cx = _fill_cstate_usage(dev, 3);
 	cx->mpu_state = PWRDM_POWER_OFF;
 	cx->core_state = PWRDM_POWER_ON;
 
 	/* C5 . MPU RET + Core RET */
-	cx = _fill_cstate(dev, 4, "MPU RET + CORE RET");
+	_fill_cstate(drv, 4, "MPU RET + CORE RET");
+	cx = _fill_cstate_usage(dev, 4);
 	cx->mpu_state = PWRDM_POWER_RET;
 	cx->core_state = PWRDM_POWER_RET;
 
 	/* C6 . MPU OFF + Core RET */
-	cx = _fill_cstate(dev, 5, "MPU OFF + CORE RET");
+	_fill_cstate(drv, 5, "MPU OFF + CORE RET");
+	cx = _fill_cstate_usage(dev, 5);
 	cx->mpu_state = PWRDM_POWER_OFF;
 	cx->core_state = PWRDM_POWER_RET;
 
 	/* C7 . MPU OFF + Core OFF */
-	cx = _fill_cstate(dev, 6, "MPU OFF + CORE OFF");
+	_fill_cstate(drv, 6, "MPU OFF + CORE OFF");
+	cx = _fill_cstate_usage(dev, 6);
 	/*
 	 * Erratum i583: implementation for ES rev < Es1.2 on 3630. We cannot
 	 * enable OFF mode in a stable form for previous revisions.
@@ -400,6 +422,9 @@ int __init omap3_idle_init(void)
 	cx->mpu_state = PWRDM_POWER_OFF;
 	cx->core_state = PWRDM_POWER_OFF;
 
+	drv->state_count = OMAP3_NUM_STATES;
+	cpuidle_register_driver(&omap3_idle_driver);
+
 	dev->state_count = OMAP3_NUM_STATES;
 	if (cpuidle_register_device(dev)) {
 		printk(KERN_ERR "%s: CPUidle register device failed\n",
diff --git a/arch/sh/kernel/cpu/shmobile/cpuidle.c b/arch/sh/kernel/cpu/shmobile/cpuidle.c
index 7be50d4c..ad1012a 100644
--- a/arch/sh/kernel/cpu/shmobile/cpuidle.c
+++ b/arch/sh/kernel/cpu/shmobile/cpuidle.c
@@ -25,6 +25,7 @@ static unsigned long cpuidle_mode[] = {
 };
 
 static int cpuidle_sleep_enter(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
 				int index)
 {
 	unsigned long allowed_mode = arch_hwblk_sleep_mode();
@@ -64,19 +65,19 @@ static struct cpuidle_driver cpuidle_driver = {
 void sh_mobile_setup_cpuidle(void)
 {
 	struct cpuidle_device *dev = &cpuidle_dev;
+	struct cpuidle_driver *drv = &cpuidle_driver;
 	struct cpuidle_state *state;
 	int i;
 
-	cpuidle_register_driver(&cpuidle_driver);
 
 	for (i = 0; i < CPUIDLE_STATE_MAX; i++) {
-		dev->states[i].name[0] = '\0';
-		dev->states[i].desc[0] = '\0';
+		drv->states[i].name[0] = '\0';
+		drv->states[i].desc[0] = '\0';
 	}
 
 	i = CPUIDLE_DRIVER_STATE_START;
 
-	state = &dev->states[i++];
+	state = &drv->states[i++];
 	snprintf(state->name, CPUIDLE_NAME_LEN, "C1");
 	strncpy(state->desc, "SuperH Sleep Mode", CPUIDLE_DESC_LEN);
 	state->exit_latency = 1;
@@ -86,10 +87,10 @@ void sh_mobile_setup_cpuidle(void)
 	state->flags |= CPUIDLE_FLAG_TIME_VALID;
 	state->enter = cpuidle_sleep_enter;
 
-	dev->safe_state_index = i-1;
+	drv->safe_state_index = i-1;
 
 	if (sh_mobile_sleep_supported & SUSP_SH_SF) {
-		state = &dev->states[i++];
+		state = &drv->states[i++];
 		snprintf(state->name, CPUIDLE_NAME_LEN, "C2");
 		strncpy(state->desc, "SuperH Sleep Mode [SF]",
 			CPUIDLE_DESC_LEN);
@@ -102,7 +103,7 @@ void sh_mobile_setup_cpuidle(void)
 	}
 
 	if (sh_mobile_sleep_supported & SUSP_SH_STANDBY) {
-		state = &dev->states[i++];
+		state = &drv->states[i++];
 		snprintf(state->name, CPUIDLE_NAME_LEN, "C3");
 		strncpy(state->desc, "SuperH Mobile Standby Mode [SF]",
 			CPUIDLE_DESC_LEN);
@@ -114,7 +115,10 @@ void sh_mobile_setup_cpuidle(void)
 		state->enter = cpuidle_sleep_enter;
 	}
 
+	drv->state_count = i;
 	dev->state_count = i;
 
+	cpuidle_register_driver(&cpuidle_driver);
+
 	cpuidle_register_device(dev);
 }
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index a4e0f1b..9d7bc9f 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -426,7 +426,7 @@ static int acpi_cpu_soft_notify(struct notifier_block *nfb,
 
 	if (action == CPU_ONLINE && pr) {
 		acpi_processor_ppc_has_changed(pr, 0);
-		acpi_processor_cst_has_changed(pr);
+		acpi_processor_hotplug(pr);
 		acpi_processor_reevaluate_tstate(pr, action);
 		acpi_processor_tstate_has_changed(pr);
 	}
@@ -503,8 +503,7 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
 	acpi_processor_get_throttling_info(pr);
 	acpi_processor_get_limit_info(pr);
 
-
-	if (cpuidle_get_driver() == &acpi_idle_driver)
+	if (!cpuidle_get_driver() || cpuidle_get_driver() == &acpi_idle_driver)
 		acpi_processor_power_init(pr, device);
 
 	pr->cdev = thermal_cooling_device_register("Processor", device,
@@ -800,17 +799,9 @@ static int __init acpi_processor_init(void)
 
 	memset(&errata, 0, sizeof(errata));
 
-	if (!cpuidle_register_driver(&acpi_idle_driver)) {
-		printk(KERN_DEBUG "ACPI: %s registered with cpuidle\n",
-			acpi_idle_driver.name);
-	} else {
-		printk(KERN_DEBUG "ACPI: acpi_idle yielding to %s\n",
-			cpuidle_get_driver()->name);
-	}
-
 	result = acpi_bus_register_driver(&acpi_processor_driver);
 	if (result < 0)
-		goto out_cpuidle;
+		return result;
 
 	acpi_processor_install_hotplug_notify();
 
@@ -821,11 +812,6 @@ static int __init acpi_processor_init(void)
 	acpi_processor_throttling_init();
 
 	return 0;
-
-out_cpuidle:
-	cpuidle_unregister_driver(&acpi_idle_driver);
-
-	return result;
 }
 
 static void __exit acpi_processor_exit(void)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index e2264d5..73b2909 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -741,11 +741,13 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 /**
  * acpi_idle_enter_c1 - enters an ACPI C1 state-type
  * @dev: the target CPU
+ * @drv: cpuidle driver containing cpuidle state info
  * @index: index of target state
  *
  * This is equivalent to the HALT instruction.
  */
-static int acpi_idle_enter_c1(struct cpuidle_device *dev, int index)
+static int acpi_idle_enter_c1(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int index)
 {
 	ktime_t  kt1, kt2;
 	s64 idle_time;
@@ -787,9 +789,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, int index)
 /**
  * acpi_idle_enter_simple - enters an ACPI state without BM handling
  * @dev: the target CPU
+ * @drv: cpuidle driver with cpuidle state information
  * @index: the index of suggested state
  */
-static int acpi_idle_enter_simple(struct cpuidle_device *dev, int index)
+static int acpi_idle_enter_simple(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int index)
 {
 	struct acpi_processor *pr;
 	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
@@ -869,11 +873,13 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
 /**
  * acpi_idle_enter_bm - enters C3 with proper BM handling
  * @dev: the target CPU
+ * @drv: cpuidle driver containing state data
  * @index: the index of suggested state
  *
  * If BM is detected, the deepest non-C3 idle state is entered instead.
  */
-static int acpi_idle_enter_bm(struct cpuidle_device *dev, int index)
+static int acpi_idle_enter_bm(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int index)
 {
 	struct acpi_processor *pr;
 	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
@@ -896,9 +902,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, int index)
 	}
 
 	if (!cx->bm_sts_skip && acpi_idle_bm_check()) {
-		if (dev->safe_state_index >= 0) {
-			return dev->states[dev->safe_state_index].enter(dev,
-						dev->safe_state_index);
+		if (drv->safe_state_index >= 0) {
+			return drv->states[drv->safe_state_index].enter(dev,
+						drv, drv->safe_state_index);
 		} else {
 			local_irq_disable();
 			acpi_safe_halt();
@@ -993,14 +999,15 @@ struct cpuidle_driver acpi_idle_driver = {
 };
 
 /**
- * acpi_processor_setup_cpuidle - prepares and configures CPUIDLE
+ * acpi_processor_setup_cpuidle_cx - prepares and configures CPUIDLE
+ * device i.e. per-cpu data
+ *
  * @pr: the ACPI processor
  */
-static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
+static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr)
 {
 	int i, count = CPUIDLE_DRIVER_STATE_START;
 	struct acpi_processor_cx *cx;
-	struct cpuidle_state *state;
 	struct cpuidle_state_usage *state_usage;
 	struct cpuidle_device *dev = &pr->power.dev;
 
@@ -1012,18 +1019,12 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 	}
 
 	dev->cpu = pr->id;
-	dev->safe_state_index = -1;
-	for (i = 0; i < CPUIDLE_STATE_MAX; i++) {
-		dev->states[i].name[0] = '\0';
-		dev->states[i].desc[0] = '\0';
-	}
 
 	if (max_cstate == 0)
 		max_cstate = 1;
 
 	for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
 		cx = &pr->power.states[i];
-		state = &dev->states[count];
 		state_usage = &dev->states_usage[count];
 
 		if (!cx->valid)
@@ -1035,8 +1036,64 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 		    !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
 			continue;
 #endif
+
 		cpuidle_set_statedata(state_usage, cx);
 
+		count++;
+		if (count == CPUIDLE_STATE_MAX)
+			break;
+	}
+
+	dev->state_count = count;
+
+	if (!count)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * acpi_processor_setup_cpuidle states- prepares and configures cpuidle
+ * global state data i.e. idle routines
+ *
+ * @pr: the ACPI processor
+ */
+static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr)
+{
+	int i, count = CPUIDLE_DRIVER_STATE_START;
+	struct acpi_processor_cx *cx;
+	struct cpuidle_state *state;
+	struct cpuidle_driver *drv = &acpi_idle_driver;
+
+	if (!pr->flags.power_setup_done)
+		return -EINVAL;
+
+	if (pr->flags.power == 0)
+		return -EINVAL;
+
+	drv->safe_state_index = -1;
+	for (i = 0; i < CPUIDLE_STATE_MAX; i++) {
+		drv->states[i].name[0] = '\0';
+		drv->states[i].desc[0] = '\0';
+	}
+
+	if (max_cstate == 0)
+		max_cstate = 1;
+
+	for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
+		cx = &pr->power.states[i];
+
+		if (!cx->valid)
+			continue;
+
+#ifdef CONFIG_HOTPLUG_CPU
+		if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
+		    !pr->flags.has_cst &&
+		    !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
+			continue;
+#endif
+
+		state = &drv->states[count];
 		snprintf(state->name, CPUIDLE_NAME_LEN, "C%d", i);
 		strncpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
 		state->exit_latency = cx->latency;
@@ -1049,13 +1106,13 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 				state->flags |= CPUIDLE_FLAG_TIME_VALID;
 
 			state->enter = acpi_idle_enter_c1;
-			dev->safe_state_index = count;
+			drv->safe_state_index = count;
 			break;
 
 			case ACPI_STATE_C2:
 			state->flags |= CPUIDLE_FLAG_TIME_VALID;
 			state->enter = acpi_idle_enter_simple;
-			dev->safe_state_index = count;
+			drv->safe_state_index = count;
 			break;
 
 			case ACPI_STATE_C3:
@@ -1071,7 +1128,7 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 			break;
 	}
 
-	dev->state_count = count;
+	drv->state_count = count;
 
 	if (!count)
 		return -EINVAL;
@@ -1079,7 +1136,7 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 	return 0;
 }
 
-int acpi_processor_cst_has_changed(struct acpi_processor *pr)
+int acpi_processor_hotplug(struct acpi_processor *pr)
 {
 	int ret = 0;
 
@@ -1100,7 +1157,7 @@ int acpi_processor_cst_has_changed(struct acpi_processor *pr)
 	cpuidle_disable_device(&pr->power.dev);
 	acpi_processor_get_power_info(pr);
 	if (pr->flags.power) {
-		acpi_processor_setup_cpuidle(pr);
+		acpi_processor_setup_cpuidle_cx(pr);
 		ret = cpuidle_enable_device(&pr->power.dev);
 	}
 	cpuidle_resume_and_unlock();
@@ -1108,10 +1165,72 @@ int acpi_processor_cst_has_changed(struct acpi_processor *pr)
 	return ret;
 }
 
+int acpi_processor_cst_has_changed(struct acpi_processor *pr)
+{
+	int cpu;
+	struct acpi_processor *_pr;
+
+	if (disabled_by_idle_boot_param())
+		return 0;
+
+	if (!pr)
+		return -EINVAL;
+
+	if (nocst)
+		return -ENODEV;
+
+	if (!pr->flags.power_setup_done)
+		return -ENODEV;
+
+	/*
+	 * FIXME:  Design the ACPI notification to make it once per
+	 * system instead of once per-cpu.  This condition is a hack
+	 * to make the code that updates C-States be called once.
+	 */
+
+	if (smp_processor_id() == 0 &&
+			cpuidle_get_driver() == &acpi_idle_driver) {
+
+		cpuidle_pause_and_lock();
+		/* Protect against cpu-hotplug */
+		get_online_cpus();
+
+		/* Disable all cpuidle devices */
+		for_each_online_cpu(cpu) {
+			_pr = per_cpu(processors, cpu);
+			if (!_pr || !_pr->flags.power_setup_done)
+				continue;
+			cpuidle_disable_device(&_pr->power.dev);
+		}
+
+		/* Populate Updated C-state information */
+		acpi_processor_setup_cpuidle_states(pr);
+
+		/* Enable all cpuidle devices */
+		for_each_online_cpu(cpu) {
+			_pr = per_cpu(processors, cpu);
+			if (!_pr || !_pr->flags.power_setup_done)
+				continue;
+			acpi_processor_get_power_info(_pr);
+			if (_pr->flags.power) {
+				acpi_processor_setup_cpuidle_cx(_pr);
+				cpuidle_enable_device(&_pr->power.dev);
+			}
+		}
+		put_online_cpus();
+		cpuidle_resume_and_unlock();
+	}
+
+	return 0;
+}
+
+static int acpi_processor_registered;
+
 int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
 			      struct acpi_device *device)
 {
 	acpi_status status = 0;
+	int retval;
 	static int first_run;
 
 	if (disabled_by_idle_boot_param())
@@ -1148,9 +1267,26 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
 	 * platforms that only support C1.
 	 */
 	if (pr->flags.power) {
-		acpi_processor_setup_cpuidle(pr);
-		if (cpuidle_register_device(&pr->power.dev))
-			return -EIO;
+		/* Register acpi_idle_driver if not already registered */
+		if (!acpi_processor_registered) {
+			acpi_processor_setup_cpuidle_states(pr);
+			retval = cpuidle_register_driver(&acpi_idle_driver);
+			if (retval)
+				return retval;
+			printk(KERN_DEBUG "ACPI: %s registered with cpuidle\n",
+					acpi_idle_driver.name);
+		}
+		/* Register per-cpu cpuidle_device. Cpuidle driver
+		 * must already be registered before registering device
+		 */
+		acpi_processor_setup_cpuidle_cx(pr);
+		retval = cpuidle_register_device(&pr->power.dev);
+		if (retval) {
+			if (acpi_processor_registered == 0)
+				cpuidle_unregister_driver(&acpi_idle_driver);
+			return retval;
+		}
+		acpi_processor_registered++;
 	}
 	return 0;
 }
@@ -1161,8 +1297,13 @@ int acpi_processor_power_exit(struct acpi_processor *pr,
 	if (disabled_by_idle_boot_param())
 		return 0;
 
-	cpuidle_unregister_device(&pr->power.dev);
-	pr->flags.power_setup_done = 0;
+	if (pr->flags.power) {
+		cpuidle_unregister_device(&pr->power.dev);
+		acpi_processor_registered--;
+		if (acpi_processor_registered == 0)
+			cpuidle_unregister_driver(&acpi_idle_driver);
+	}
 
+	pr->flags.power_setup_done = 0;
 	return 0;
 }
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index b1de11e..1040bc9 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -61,6 +61,7 @@ static int __cpuidle_register_device(struct cpuidle_device *dev);
 int cpuidle_idle_call(void)
 {
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_driver();
 	struct cpuidle_state *target_state;
 	int next_state, entered_state;
 
@@ -84,18 +85,18 @@ int cpuidle_idle_call(void)
 #endif
 
 	/* ask the governor for the next state */
-	next_state = cpuidle_curr_governor->select(dev);
+	next_state = cpuidle_curr_governor->select(drv, dev);
 	if (need_resched()) {
 		local_irq_enable();
 		return 0;
 	}
 
-	target_state = &dev->states[next_state];
+	target_state = &drv->states[next_state];
 
 	trace_power_start(POWER_CSTATE, next_state, dev->cpu);
 	trace_cpu_idle(next_state, dev->cpu);
 
-	entered_state = target_state->enter(dev, next_state);
+	entered_state = target_state->enter(dev, drv, next_state);
 
 	trace_power_end(dev->cpu);
 	trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
@@ -163,7 +164,8 @@ void cpuidle_resume_and_unlock(void)
 EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock);
 
 #ifdef CONFIG_ARCH_HAS_CPU_RELAX
-static int poll_idle(struct cpuidle_device *dev, int index)
+static int poll_idle(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int index)
 {
 	ktime_t	t1, t2;
 	s64 diff;
@@ -183,12 +185,9 @@ static int poll_idle(struct cpuidle_device *dev, int index)
 	return index;
 }
 
-static void poll_idle_init(struct cpuidle_device *dev)
+static void poll_idle_init(struct cpuidle_driver *drv)
 {
-	struct cpuidle_state *state = &dev->states[0];
-	struct cpuidle_state_usage *state_usage = &dev->states_usage[0];
-
-	cpuidle_set_statedata(state_usage, NULL);
+	struct cpuidle_state *state = &drv->states[0];
 
 	snprintf(state->name, CPUIDLE_NAME_LEN, "POLL");
 	snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE");
@@ -199,7 +198,7 @@ static void poll_idle_init(struct cpuidle_device *dev)
 	state->enter = poll_idle;
 }
 #else
-static void poll_idle_init(struct cpuidle_device *dev) {}
+static void poll_idle_init(struct cpuidle_driver *drv) {}
 #endif /* CONFIG_ARCH_HAS_CPU_RELAX */
 
 /**
@@ -226,13 +225,13 @@ int cpuidle_enable_device(struct cpuidle_device *dev)
 			return ret;
 	}
 
-	poll_idle_init(dev);
+	poll_idle_init(cpuidle_get_driver());
 
 	if ((ret = cpuidle_add_state_sysfs(dev)))
 		return ret;
 
 	if (cpuidle_curr_governor->enable &&
-	    (ret = cpuidle_curr_governor->enable(dev)))
+	    (ret = cpuidle_curr_governor->enable(cpuidle_get_driver(), dev)))
 		goto fail_sysfs;
 
 	for (i = 0; i < dev->state_count; i++) {
@@ -273,7 +272,7 @@ void cpuidle_disable_device(struct cpuidle_device *dev)
 	dev->enabled = 0;
 
 	if (cpuidle_curr_governor->disable)
-		cpuidle_curr_governor->disable(dev);
+		cpuidle_curr_governor->disable(cpuidle_get_driver(), dev);
 
 	cpuidle_remove_state_sysfs(dev);
 	enabled_devices--;
@@ -301,26 +300,6 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
 
 	init_completion(&dev->kobj_unregister);
 
-	/*
-	 * cpuidle driver should set the dev->power_specified bit
-	 * before registering the device if the driver provides
-	 * power_usage numbers.
-	 *
-	 * For those devices whose ->power_specified is not set,
-	 * we fill in power_usage with decreasing values as the
-	 * cpuidle code has an implicit assumption that state Cn
-	 * uses less power than C(n-1).
-	 *
-	 * With CONFIG_ARCH_HAS_CPU_RELAX, C0 is already assigned
-	 * an power value of -1.  So we use -2, -3, etc, for other
-	 * c-states.
-	 */
-	if (!dev->power_specified) {
-		int i;
-		for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++)
-			dev->states[i].power_usage = -1 - i;
-	}
-
 	per_cpu(cpuidle_devices, dev->cpu) = dev;
 	list_add(&dev->device_list, &cpuidle_detected_devices);
 	if ((ret = cpuidle_add_sysfs(sys_dev))) {
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 3f7e3ce..284d7af 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -17,6 +17,30 @@
 static struct cpuidle_driver *cpuidle_curr_driver;
 DEFINE_SPINLOCK(cpuidle_driver_lock);
 
+static void __cpuidle_register_driver(struct cpuidle_driver *drv)
+{
+	int i;
+	/*
+	 * cpuidle driver should set the drv->power_specified bit
+	 * before registering if the driver provides
+	 * power_usage numbers.
+	 *
+	 * If power_specified is not set,
+	 * we fill in power_usage with decreasing values as the
+	 * cpuidle code has an implicit assumption that state Cn
+	 * uses less power than C(n-1).
+	 *
+	 * With CONFIG_ARCH_HAS_CPU_RELAX, C0 is already assigned
+	 * an power value of -1.  So we use -2, -3, etc, for other
+	 * c-states.
+	 */
+	if (!drv->power_specified) {
+		for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++)
+			drv->states[i].power_usage = -1 - i;
+	}
+}
+
+
 /**
  * cpuidle_register_driver - registers a driver
  * @drv: the driver
@@ -34,6 +58,7 @@ int cpuidle_register_driver(struct cpuidle_driver *drv)
 		spin_unlock(&cpuidle_driver_lock);
 		return -EBUSY;
 	}
+	__cpuidle_register_driver(drv);
 	cpuidle_curr_driver = drv;
 	spin_unlock(&cpuidle_driver_lock);
 
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 78b06d2..261e57a 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -60,9 +60,11 @@ static inline void ladder_do_selection(struct ladder_device *ldev,
 
 /**
  * ladder_select_state - selects the next state to enter
+ * @drv: cpuidle driver
  * @dev: the CPU
  */
-static int ladder_select_state(struct cpuidle_device *dev)
+static int ladder_select_state(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev)
 {
 	struct ladder_device *ldev = &__get_cpu_var(ladder_devices);
 	struct ladder_device_state *last_state;
@@ -77,15 +79,17 @@ static int ladder_select_state(struct cpuidle_device *dev)
 
 	last_state = &ldev->states[last_idx];
 
-	if (dev->states[last_idx].flags & CPUIDLE_FLAG_TIME_VALID)
-		last_residency = cpuidle_get_last_residency(dev) - dev->states[last_idx].exit_latency;
+	if (drv->states[last_idx].flags & CPUIDLE_FLAG_TIME_VALID) {
+		last_residency = cpuidle_get_last_residency(dev) - \
+					 drv->states[last_idx].exit_latency;
+	}
 	else
 		last_residency = last_state->threshold.promotion_time + 1;
 
 	/* consider promotion */
-	if (last_idx < dev->state_count - 1 &&
+	if (last_idx < drv->state_count - 1 &&
 	    last_residency > last_state->threshold.promotion_time &&
-	    dev->states[last_idx + 1].exit_latency <= latency_req) {
+	    drv->states[last_idx + 1].exit_latency <= latency_req) {
 		last_state->stats.promotion_count++;
 		last_state->stats.demotion_count = 0;
 		if (last_state->stats.promotion_count >= last_state->threshold.promotion_count) {
@@ -96,11 +100,11 @@ static int ladder_select_state(struct cpuidle_device *dev)
 
 	/* consider demotion */
 	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
-	    dev->states[last_idx].exit_latency > latency_req) {
+	    drv->states[last_idx].exit_latency > latency_req) {
 		int i;
 
 		for (i = last_idx - 1; i > CPUIDLE_DRIVER_STATE_START; i--) {
-			if (dev->states[i].exit_latency <= latency_req)
+			if (drv->states[i].exit_latency <= latency_req)
 				break;
 		}
 		ladder_do_selection(ldev, last_idx, i);
@@ -123,9 +127,11 @@ static int ladder_select_state(struct cpuidle_device *dev)
 
 /**
  * ladder_enable_device - setup for the governor
+ * @drv: cpuidle driver
  * @dev: the CPU
  */
-static int ladder_enable_device(struct cpuidle_device *dev)
+static int ladder_enable_device(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev)
 {
 	int i;
 	struct ladder_device *ldev = &per_cpu(ladder_devices, dev->cpu);
@@ -134,8 +140,8 @@ static int ladder_enable_device(struct cpuidle_device *dev)
 
 	ldev->last_state_idx = CPUIDLE_DRIVER_STATE_START;
 
-	for (i = 0; i < dev->state_count; i++) {
-		state = &dev->states[i];
+	for (i = 0; i < drv->state_count; i++) {
+		state = &drv->states[i];
 		lstate = &ldev->states[i];
 
 		lstate->stats.promotion_count = 0;
@@ -144,7 +150,7 @@ static int ladder_enable_device(struct cpuidle_device *dev)
 		lstate->threshold.promotion_count = PROMOTION_COUNT;
 		lstate->threshold.demotion_count = DEMOTION_COUNT;
 
-		if (i < dev->state_count - 1)
+		if (i < drv->state_count - 1)
 			lstate->threshold.promotion_time = state->exit_latency;
 		if (i > 0)
 			lstate->threshold.demotion_time = state->exit_latency;
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 31dd287..b2b0c32 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -182,7 +182,7 @@ static inline int performance_multiplier(void)
 
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
 
-static void menu_update(struct cpuidle_device *dev);
+static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
 
 /* This implements DIV_ROUND_CLOSEST but avoids 64 bit division */
 static u64 div_round64(u64 dividend, u32 divisor)
@@ -228,9 +228,10 @@ static void detect_repeating_patterns(struct menu_device *data)
 
 /**
  * menu_select - selects the next idle state to enter
+ * @drv: cpuidle driver containing state data
  * @dev: the CPU
  */
-static int menu_select(struct cpuidle_device *dev)
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 {
 	struct menu_device *data = &__get_cpu_var(menu_devices);
 	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
@@ -240,7 +241,7 @@ static int menu_select(struct cpuidle_device *dev)
 	struct timespec t;
 
 	if (data->needs_update) {
-		menu_update(dev);
+		menu_update(drv, dev);
 		data->needs_update = 0;
 	}
 
@@ -285,8 +286,8 @@ static int menu_select(struct cpuidle_device *dev)
 	 * Find the idle state with the lowest power while satisfying
 	 * our constraints.
 	 */
-	for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) {
-		struct cpuidle_state *s = &dev->states[i];
+	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+		struct cpuidle_state *s = &drv->states[i];
 
 		if (s->target_residency > data->predicted_us)
 			continue;
@@ -323,14 +324,15 @@ static void menu_reflect(struct cpuidle_device *dev, int index)
 
 /**
  * menu_update - attempts to guess what happened after entry
+ * @drv: cpuidle driver containing state data
  * @dev: the CPU
  */
-static void menu_update(struct cpuidle_device *dev)
+static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 {
 	struct menu_device *data = &__get_cpu_var(menu_devices);
 	int last_idx = data->last_state_idx;
 	unsigned int last_idle_us = cpuidle_get_last_residency(dev);
-	struct cpuidle_state *target = &dev->states[last_idx];
+	struct cpuidle_state *target = &drv->states[last_idx];
 	unsigned int measured_us;
 	u64 new_factor;
 
@@ -384,9 +386,11 @@ static void menu_update(struct cpuidle_device *dev)
 
 /**
  * menu_enable_device - scans a CPU's states and does setup
+ * @drv: cpuidle driver
  * @dev: the CPU
  */
-static int menu_enable_device(struct cpuidle_device *dev)
+static int menu_enable_device(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev)
 {
 	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
 
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 8a1ace1..1e756e1 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -322,13 +322,14 @@ int cpuidle_add_state_sysfs(struct cpuidle_device *device)
 {
 	int i, ret = -ENOMEM;
 	struct cpuidle_state_kobj *kobj;
+	struct cpuidle_driver *drv = cpuidle_get_driver();
 
 	/* state statistics */
 	for (i = 0; i < device->state_count; i++) {
 		kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);
 		if (!kobj)
 			goto error_state;
-		kobj->state = &device->states[i];
+		kobj->state = &drv->states[i];
 		kobj->state_usage = &device->states_usage[i];
 		init_completion(&kobj->kobj_unregister);
 
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3aa8d4c..5be9d59 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -81,7 +81,8 @@ static unsigned int mwait_substates;
 static unsigned int lapic_timer_reliable_states = (1 << 1);	 /* Default to only C1 */
 
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
-static int intel_idle(struct cpuidle_device *dev, int index);
+static int intel_idle(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv, int index);
 
 static struct cpuidle_state *cpuidle_state_table;
 
@@ -227,13 +228,15 @@ static int get_driver_data(int cstate)
 /**
  * intel_idle
  * @dev: cpuidle_device
+ * @drv: cpuidle driver
  * @index: index of cpuidle state
  *
  */
-static int intel_idle(struct cpuidle_device *dev, int index)
+static int intel_idle(struct cpuidle_device *dev,
+		struct cpuidle_driver *drv, int index)
 {
 	unsigned long ecx = 1; /* break on interrupt flag */
-	struct cpuidle_state *state = &dev->states[index];
+	struct cpuidle_state *state = &drv->states[index];
 	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
 	unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage);
 	unsigned int cstate;
@@ -420,6 +423,60 @@ static void intel_idle_cpuidle_devices_uninit(void)
 	return;
 }
 /*
+ * intel_idle_cpuidle_driver_init()
+ * allocate, initialize cpuidle_states
+ */
+static int intel_idle_cpuidle_driver_init(void)
+{
+	int cstate;
+	struct cpuidle_driver *drv = &intel_idle_driver;
+
+	drv->state_count = 1;
+
+	for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
+		int num_substates;
+
+		if (cstate > max_cstate) {
+			printk(PREFIX "max_cstate %d reached\n",
+				max_cstate);
+			break;
+		}
+
+		/* does the state exist in CPUID.MWAIT? */
+		num_substates = (mwait_substates >> ((cstate) * 4))
+					& MWAIT_SUBSTATE_MASK;
+		if (num_substates == 0)
+			continue;
+		/* is the state not enabled? */
+		if (cpuidle_state_table[cstate].enter == NULL) {
+			/* does the driver not know about the state? */
+			if (*cpuidle_state_table[cstate].name == '\0')
+				pr_debug(PREFIX "unaware of model 0x%x"
+					" MWAIT %d please"
+					" contact lenb@kernel.org",
+				boot_cpu_data.x86_model, cstate);
+			continue;
+		}
+
+		if ((cstate > 2) &&
+			!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+			mark_tsc_unstable("TSC halts in idle"
+					" states deeper than C2");
+
+		drv->states[drv->state_count] =	/* structure copy */
+			cpuidle_state_table[cstate];
+
+		drv->state_count += 1;
+	}
+
+	if (auto_demotion_disable_flags)
+		smp_call_function(auto_demotion_disable, NULL, 1);
+
+	return 0;
+}
+
+
+/*
  * intel_idle_cpuidle_devices_init()
  * allocate, initialize, register cpuidle_devices
  */
@@ -453,23 +510,9 @@ static int intel_idle_cpuidle_devices_init(void)
 				continue;
 			/* is the state not enabled? */
 			if (cpuidle_state_table[cstate].enter == NULL) {
-				/* does the driver not know about the state? */
-				if (*cpuidle_state_table[cstate].name == '\0')
-					pr_debug(PREFIX "unaware of model 0x%x"
-						" MWAIT %d please"
-						" contact lenb@kernel.org",
-					boot_cpu_data.x86_model, cstate);
 				continue;
 			}
 
-			if ((cstate > 2) &&
-				!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
-				mark_tsc_unstable("TSC halts in idle"
-					" states deeper than C2");
-
-			dev->states[dev->state_count] =	/* structure copy */
-				cpuidle_state_table[cstate];
-
 			dev->states_usage[dev->state_count].driver_data =
 				(void *)get_driver_data(cstate);
 
@@ -484,8 +527,6 @@ static int intel_idle_cpuidle_devices_init(void)
 			return -EIO;
 		}
 	}
-	if (auto_demotion_disable_flags)
-		smp_call_function(auto_demotion_disable, NULL, 1);
 
 	return 0;
 }
@@ -503,6 +544,7 @@ static int __init intel_idle_init(void)
 	if (retval)
 		return retval;
 
+	intel_idle_cpuidle_driver_init();
 	retval = cpuidle_register_driver(&intel_idle_driver);
 	if (retval) {
 		printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 67055f1..610f6fb 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -329,6 +329,7 @@ extern void acpi_processor_throttling_init(void);
 int acpi_processor_power_init(struct acpi_processor *pr,
 			      struct acpi_device *device);
 int acpi_processor_cst_has_changed(struct acpi_processor *pr);
+int acpi_processor_hotplug(struct acpi_processor *pr);
 int acpi_processor_power_exit(struct acpi_processor *pr,
 			      struct acpi_device *device);
 int acpi_processor_suspend(struct acpi_device * device, pm_message_t state);
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 0156540..c904188 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -22,6 +22,7 @@
 #define CPUIDLE_DESC_LEN	32
 
 struct cpuidle_device;
+struct cpuidle_driver;
 
 
 /****************************
@@ -45,6 +46,7 @@ struct cpuidle_state {
 	unsigned int	target_residency; /* in US */
 
 	int (*enter)	(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
 			int index);
 };
 
@@ -83,12 +85,10 @@ struct cpuidle_state_kobj {
 struct cpuidle_device {
 	unsigned int		registered:1;
 	unsigned int		enabled:1;
-	unsigned int		power_specified:1;
 	unsigned int		cpu;
 
 	int			last_residency;
 	int			state_count;
-	struct cpuidle_state	states[CPUIDLE_STATE_MAX];
 	struct cpuidle_state_usage	states_usage[CPUIDLE_STATE_MAX];
 	struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
 
@@ -96,7 +96,6 @@ struct cpuidle_device {
 	struct kobject		kobj;
 	struct completion	kobj_unregister;
 	void			*governor_data;
-	int			safe_state_index;
 };
 
 DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
@@ -120,6 +119,11 @@ static inline int cpuidle_get_last_residency(struct cpuidle_device *dev)
 struct cpuidle_driver {
 	char			name[CPUIDLE_NAME_LEN];
 	struct module 		*owner;
+
+	unsigned int		power_specified:1;
+	struct cpuidle_state	states[CPUIDLE_STATE_MAX];
+	int			state_count;
+	int			safe_state_index;
 };
 
 #ifdef CONFIG_CPU_IDLE
@@ -166,10 +170,13 @@ struct cpuidle_governor {
 	struct list_head 	governor_list;
 	unsigned int		rating;
 
-	int  (*enable)		(struct cpuidle_device *dev);
-	void (*disable)		(struct cpuidle_device *dev);
+	int  (*enable)		(struct cpuidle_driver *drv,
+					struct cpuidle_device *dev);
+	void (*disable)		(struct cpuidle_driver *drv,
+					struct cpuidle_device *dev);
 
-	int  (*select)		(struct cpuidle_device *dev);
+	int  (*select)		(struct cpuidle_driver *drv,
+					struct cpuidle_device *dev);
 	void (*reflect)		(struct cpuidle_device *dev, int index);
 
 	struct module 		*owner;

^ permalink raw reply related

* [PATCH v9 3/4] cpuidle: Split cpuidle_state structure and move per-cpu statistics fields
From: Deepthi Dharwar @ 2011-10-28 10:50 UTC (permalink / raw)
  To: khilman, venki, ak, len.brown, peterz, rjw, santosh.shilimkar,
	arjan, lenb
  Cc: linux-sh, linux-pm, linux-kernel, linux-acpi, linux-pm,
	linux-omap, linux-arm-kernel
In-Reply-To: <20111028104945.7520.83828.stgit@localhost6.localdomain6>

This is the first step towards global registration of cpuidle
states. The statistics used primarily by the governor are per-cpu
and have to be split from rest of the fields inside cpuidle_state,
which would be made global i.e. single copy. The driver_data field
is also per-cpu and moved.

Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Trinabh Gupta <g.trinabh@gmail.com>
Tested-by: Jean Pihet <j-pihet@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-davinci/cpuidle.c   |    5 ++--
 arch/arm/mach-omap2/cpuidle34xx.c |   13 ++++++----
 drivers/acpi/processor_idle.c     |   25 ++++++++++----------
 drivers/cpuidle/cpuidle.c         |   11 +++++----
 drivers/cpuidle/sysfs.c           |   19 ++++++++++-----
 drivers/idle/intel_idle.c         |   46 +++++++++++++++++++++++++++----------
 include/linux/cpuidle.h           |   25 ++++++++++++--------
 7 files changed, 90 insertions(+), 54 deletions(-)

diff --git a/arch/arm/mach-davinci/cpuidle.c b/arch/arm/mach-davinci/cpuidle.c
index ca8582a..f2d2f34 100644
--- a/arch/arm/mach-davinci/cpuidle.c
+++ b/arch/arm/mach-davinci/cpuidle.c
@@ -80,7 +80,8 @@ static struct davinci_ops davinci_states[DAVINCI_CPUIDLE_MAX_STATES] = {
 static int davinci_enter_idle(struct cpuidle_device *dev,
 						int index)
 {
-	struct davinci_ops *ops = cpuidle_get_statedata(&dev->states[index]);
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct davinci_ops *ops = cpuidle_get_statedata(state_usage);
 	struct timeval before, after;
 	int idle_time;
 
@@ -142,7 +143,7 @@ static int __init davinci_cpuidle_probe(struct platform_device *pdev)
 	strcpy(device->states[1].desc, "WFI and DDR Self Refresh");
 	if (pdata->ddr2_pdown)
 		davinci_states[1].flags |= DAVINCI_CPUIDLE_FLAGS_DDR2_PWDN;
-	cpuidle_set_statedata(&device->states[1], &davinci_states[1]);
+	cpuidle_set_statedata(&device->states_usage[1], &davinci_states[1]);
 
 	device->state_count = DAVINCI_CPUIDLE_MAX_STATES;
 
diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c
index 58425c7..d3fce7b 100644
--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -97,7 +97,7 @@ static int omap3_enter_idle(struct cpuidle_device *dev,
 				int index)
 {
 	struct omap3_idle_statedata *cx =
-			cpuidle_get_statedata(&dev->states[index]);
+			cpuidle_get_statedata(&dev->states_usage[index]);
 	struct timespec ts_preidle, ts_postidle, ts_idle;
 	u32 mpu_state = cx->mpu_state, core_state = cx->core_state;
 	int idle_time;
@@ -160,8 +160,9 @@ return_sleep_time:
 static int next_valid_state(struct cpuidle_device *dev,
 				int index)
 {
+	struct cpuidle_state_usage *curr_usage = &dev->states_usage[index];
 	struct cpuidle_state *curr = &dev->states[index];
-	struct omap3_idle_statedata *cx = cpuidle_get_statedata(curr);
+	struct omap3_idle_statedata *cx = cpuidle_get_statedata(curr_usage);
 	u32 mpu_deepest_state = PWRDM_POWER_RET;
 	u32 core_deepest_state = PWRDM_POWER_RET;
 	int next_index = -1;
@@ -202,7 +203,7 @@ static int next_valid_state(struct cpuidle_device *dev,
 		 */
 		idx--;
 		for (; idx >= 0; idx--) {
-			cx = cpuidle_get_statedata(&dev->states[idx]);
+			cx = cpuidle_get_statedata(&dev->states_usage[idx]);
 			if ((cx->valid) &&
 			    (cx->mpu_state >= mpu_deepest_state) &&
 			    (cx->core_state >= core_deepest_state)) {
@@ -231,7 +232,6 @@ static int next_valid_state(struct cpuidle_device *dev,
 static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 			       int index)
 {
-	struct cpuidle_state *state = &dev->states[index];
 	int new_state_idx;
 	u32 core_next_state, per_next_state = 0, per_saved_state = 0, cam_state;
 	struct omap3_idle_statedata *cx;
@@ -264,7 +264,7 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	 * Prevent PER off if CORE is not in retention or off as this
 	 * would disable PER wakeups completely.
 	 */
-	cx = cpuidle_get_statedata(state);
+	cx = cpuidle_get_statedata(&dev->states_usage[index]);
 	core_next_state = cx->core_state;
 	per_next_state = per_saved_state = pwrdm_read_next_pwrst(per_pd);
 	if ((per_next_state == PWRDM_POWER_OFF) &&
@@ -318,6 +318,7 @@ static inline struct omap3_idle_statedata *_fill_cstate(
 {
 	struct omap3_idle_statedata *cx = &omap3_idle_data[idx];
 	struct cpuidle_state *state = &dev->states[idx];
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[idx];
 
 	state->exit_latency	= cpuidle_params_table[idx].exit_latency;
 	state->target_residency	= cpuidle_params_table[idx].target_residency;
@@ -326,7 +327,7 @@ static inline struct omap3_idle_statedata *_fill_cstate(
 	cx->valid		= cpuidle_params_table[idx].valid;
 	sprintf(state->name, "C%d", idx + 1);
 	strncpy(state->desc, descr, CPUIDLE_DESC_LEN);
-	cpuidle_set_statedata(state, cx);
+	cpuidle_set_statedata(state_usage, cx);
 
 	return cx;
 }
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 463cc09..e2264d5 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -745,14 +745,13 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
  *
  * This is equivalent to the HALT instruction.
  */
-static int acpi_idle_enter_c1(struct cpuidle_device *dev,
-				int index)
+static int acpi_idle_enter_c1(struct cpuidle_device *dev, int index)
 {
 	ktime_t  kt1, kt2;
 	s64 idle_time;
 	struct acpi_processor *pr;
-	struct cpuidle_state *state = &dev->states[index];
-	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage);
 
 	pr = __this_cpu_read(processors);
 	dev->last_residency = 0;
@@ -790,12 +789,11 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
  * @dev: the target CPU
  * @index: the index of suggested state
  */
-static int acpi_idle_enter_simple(struct cpuidle_device *dev,
-				int index)
+static int acpi_idle_enter_simple(struct cpuidle_device *dev, int index)
 {
 	struct acpi_processor *pr;
-	struct cpuidle_state *state = &dev->states[index];
-	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage);
 	ktime_t  kt1, kt2;
 	s64 idle_time_ns;
 	s64 idle_time;
@@ -875,12 +873,11 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
  *
  * If BM is detected, the deepest non-C3 idle state is entered instead.
  */
-static int acpi_idle_enter_bm(struct cpuidle_device *dev,
-				int index)
+static int acpi_idle_enter_bm(struct cpuidle_device *dev, int index)
 {
 	struct acpi_processor *pr;
-	struct cpuidle_state *state = &dev->states[index];
-	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	struct acpi_processor_cx *cx = cpuidle_get_statedata(state_usage);
 	ktime_t  kt1, kt2;
 	s64 idle_time_ns;
 	s64 idle_time;
@@ -1004,6 +1001,7 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 	int i, count = CPUIDLE_DRIVER_STATE_START;
 	struct acpi_processor_cx *cx;
 	struct cpuidle_state *state;
+	struct cpuidle_state_usage *state_usage;
 	struct cpuidle_device *dev = &pr->power.dev;
 
 	if (!pr->flags.power_setup_done)
@@ -1026,6 +1024,7 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 	for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
 		cx = &pr->power.states[i];
 		state = &dev->states[count];
+		state_usage = &dev->states_usage[count];
 
 		if (!cx->valid)
 			continue;
@@ -1036,7 +1035,7 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 		    !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
 			continue;
 #endif
-		cpuidle_set_statedata(state, cx);
+		cpuidle_set_statedata(state_usage, cx);
 
 		snprintf(state->name, CPUIDLE_NAME_LEN, "C%d", i);
 		strncpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 984d178..b1de11e 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -105,9 +105,9 @@ int cpuidle_idle_call(void)
 		/* This can be moved to within driver enter routine
 		 * but that results in multiple copies of same code.
 		 */
-		dev->states[entered_state].time +=
+		dev->states_usage[entered_state].time +=
 				(unsigned long long)dev->last_residency;
-		dev->states[entered_state].usage++;
+		dev->states_usage[entered_state].usage++;
 	}
 
 	/* give the governor an opportunity to reflect on the outcome */
@@ -186,8 +186,9 @@ static int poll_idle(struct cpuidle_device *dev, int index)
 static void poll_idle_init(struct cpuidle_device *dev)
 {
 	struct cpuidle_state *state = &dev->states[0];
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[0];
 
-	cpuidle_set_statedata(state, NULL);
+	cpuidle_set_statedata(state_usage, NULL);
 
 	snprintf(state->name, CPUIDLE_NAME_LEN, "POLL");
 	snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE");
@@ -235,8 +236,8 @@ int cpuidle_enable_device(struct cpuidle_device *dev)
 		goto fail_sysfs;
 
 	for (i = 0; i < dev->state_count; i++) {
-		dev->states[i].usage = 0;
-		dev->states[i].time = 0;
+		dev->states_usage[i].usage = 0;
+		dev->states_usage[i].time = 0;
 	}
 	dev->last_residency = 0;
 
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index be7917ec..8a1ace1 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -216,7 +216,8 @@ static struct kobj_type ktype_cpuidle = {
 
 struct cpuidle_state_attr {
 	struct attribute attr;
-	ssize_t (*show)(struct cpuidle_state *, char *);
+	ssize_t (*show)(struct cpuidle_state *, \
+					struct cpuidle_state_usage *, char *);
 	ssize_t (*store)(struct cpuidle_state *, const char *, size_t);
 };
 
@@ -224,19 +225,22 @@ struct cpuidle_state_attr {
 static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0444, show, NULL)
 
 #define define_show_state_function(_name) \
-static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, \
+			 struct cpuidle_state_usage *state_usage, char *buf) \
 { \
 	return sprintf(buf, "%u\n", state->_name);\
 }
 
 #define define_show_state_ull_function(_name) \
-static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, \
+			struct cpuidle_state_usage *state_usage, char *buf) \
 { \
-	return sprintf(buf, "%llu\n", state->_name);\
+	return sprintf(buf, "%llu\n", state_usage->_name);\
 }
 
 #define define_show_state_str_function(_name) \
-static ssize_t show_state_##_name(struct cpuidle_state *state, char *buf) \
+static ssize_t show_state_##_name(struct cpuidle_state *state, \
+			struct cpuidle_state_usage *state_usage, char *buf) \
 { \
 	if (state->_name[0] == '\0')\
 		return sprintf(buf, "<null>\n");\
@@ -269,16 +273,18 @@ static struct attribute *cpuidle_state_default_attrs[] = {
 
 #define kobj_to_state_obj(k) container_of(k, struct cpuidle_state_kobj, kobj)
 #define kobj_to_state(k) (kobj_to_state_obj(k)->state)
+#define kobj_to_state_usage(k) (kobj_to_state_obj(k)->state_usage)
 #define attr_to_stateattr(a) container_of(a, struct cpuidle_state_attr, attr)
 static ssize_t cpuidle_state_show(struct kobject * kobj,
 	struct attribute * attr ,char * buf)
 {
 	int ret = -EIO;
 	struct cpuidle_state *state = kobj_to_state(kobj);
+	struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
 	struct cpuidle_state_attr * cattr = attr_to_stateattr(attr);
 
 	if (cattr->show)
-		ret = cattr->show(state, buf);
+		ret = cattr->show(state, state_usage, buf);
 
 	return ret;
 }
@@ -323,6 +329,7 @@ int cpuidle_add_state_sysfs(struct cpuidle_device *device)
 		if (!kobj)
 			goto error_state;
 		kobj->state = &device->states[i];
+		kobj->state_usage = &device->states_usage[i];
 		init_completion(&kobj->kobj_unregister);
 
 		ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle, &device->kobj,
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index a1c888d..3aa8d4c 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -109,7 +109,6 @@ static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C1 */
 		.name = "C1-NHM",
 		.desc = "MWAIT 0x00",
-		.driver_data = (void *) 0x00,
 		.flags = CPUIDLE_FLAG_TIME_VALID,
 		.exit_latency = 3,
 		.target_residency = 6,
@@ -117,7 +116,6 @@ static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C2 */
 		.name = "C3-NHM",
 		.desc = "MWAIT 0x10",
-		.driver_data = (void *) 0x10,
 		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 20,
 		.target_residency = 80,
@@ -125,7 +123,6 @@ static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C3 */
 		.name = "C6-NHM",
 		.desc = "MWAIT 0x20",
-		.driver_data = (void *) 0x20,
 		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 200,
 		.target_residency = 800,
@@ -137,7 +134,6 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C1 */
 		.name = "C1-SNB",
 		.desc = "MWAIT 0x00",
-		.driver_data = (void *) 0x00,
 		.flags = CPUIDLE_FLAG_TIME_VALID,
 		.exit_latency = 1,
 		.target_residency = 1,
@@ -145,7 +141,6 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C2 */
 		.name = "C3-SNB",
 		.desc = "MWAIT 0x10",
-		.driver_data = (void *) 0x10,
 		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 80,
 		.target_residency = 211,
@@ -153,7 +148,6 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C3 */
 		.name = "C6-SNB",
 		.desc = "MWAIT 0x20",
-		.driver_data = (void *) 0x20,
 		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 104,
 		.target_residency = 345,
@@ -161,7 +155,6 @@ static struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C4 */
 		.name = "C7-SNB",
 		.desc = "MWAIT 0x30",
-		.driver_data = (void *) 0x30,
 		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 109,
 		.target_residency = 345,
@@ -173,7 +166,6 @@ static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C1 */
 		.name = "C1-ATM",
 		.desc = "MWAIT 0x00",
-		.driver_data = (void *) 0x00,
 		.flags = CPUIDLE_FLAG_TIME_VALID,
 		.exit_latency = 1,
 		.target_residency = 4,
@@ -181,7 +173,6 @@ static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C2 */
 		.name = "C2-ATM",
 		.desc = "MWAIT 0x10",
-		.driver_data = (void *) 0x10,
 		.flags = CPUIDLE_FLAG_TIME_VALID,
 		.exit_latency = 20,
 		.target_residency = 80,
@@ -190,7 +181,6 @@ static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C4 */
 		.name = "C4-ATM",
 		.desc = "MWAIT 0x30",
-		.driver_data = (void *) 0x30,
 		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 100,
 		.target_residency = 400,
@@ -199,13 +189,41 @@ static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
 	{ /* MWAIT C6 */
 		.name = "C6-ATM",
 		.desc = "MWAIT 0x52",
-		.driver_data = (void *) 0x52,
 		.flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED,
 		.exit_latency = 140,
 		.target_residency = 560,
 		.enter = &intel_idle },
 };
 
+static int get_driver_data(int cstate)
+{
+	int driver_data;
+	switch (cstate) {
+
+	case 1:	/* MWAIT C1 */
+		driver_data = 0x00;
+		break;
+	case 2:	/* MWAIT C2 */
+		driver_data = 0x10;
+		break;
+	case 3:	/* MWAIT C3 */
+		driver_data = 0x20;
+		break;
+	case 4:	/* MWAIT C4 */
+		driver_data = 0x30;
+		break;
+	case 5:	/* MWAIT C5 */
+		driver_data = 0x40;
+		break;
+	case 6:	/* MWAIT C6 */
+		driver_data = 0x52;
+		break;
+	default:
+		driver_data = 0x00;
+	}
+	return driver_data;
+}
+
 /**
  * intel_idle
  * @dev: cpuidle_device
@@ -216,7 +234,8 @@ static int intel_idle(struct cpuidle_device *dev, int index)
 {
 	unsigned long ecx = 1; /* break on interrupt flag */
 	struct cpuidle_state *state = &dev->states[index];
-	unsigned long eax = (unsigned long)cpuidle_get_statedata(state);
+	struct cpuidle_state_usage *state_usage = &dev->states_usage[index];
+	unsigned long eax = (unsigned long)cpuidle_get_statedata(state_usage);
 	unsigned int cstate;
 	ktime_t kt_before, kt_after;
 	s64 usec_delta;
@@ -451,6 +470,9 @@ static int intel_idle_cpuidle_devices_init(void)
 			dev->states[dev->state_count] =	/* structure copy */
 				cpuidle_state_table[cstate];
 
+			dev->states_usage[dev->state_count].driver_data =
+				(void *)get_driver_data(cstate);
+
 			dev->state_count += 1;
 		}
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index c6d85cf..0156540 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -28,19 +28,22 @@ struct cpuidle_device;
  * CPUIDLE DEVICE INTERFACE *
  ****************************/
 
+struct cpuidle_state_usage {
+	void		*driver_data;
+
+	unsigned long long	usage;
+	unsigned long long	time; /* in US */
+};
+
 struct cpuidle_state {
 	char		name[CPUIDLE_NAME_LEN];
 	char		desc[CPUIDLE_DESC_LEN];
-	void		*driver_data;
 
 	unsigned int	flags;
 	unsigned int	exit_latency; /* in US */
 	unsigned int	power_usage; /* in mW */
 	unsigned int	target_residency; /* in US */
 
-	unsigned long long	usage;
-	unsigned long long	time; /* in US */
-
 	int (*enter)	(struct cpuidle_device *dev,
 			int index);
 };
@@ -52,26 +55,27 @@ struct cpuidle_state {
 
 /**
  * cpuidle_get_statedata - retrieves private driver state data
- * @state: the state
+ * @st_usage: the state usage statistics
  */
-static inline void * cpuidle_get_statedata(struct cpuidle_state *state)
+static inline void *cpuidle_get_statedata(struct cpuidle_state_usage *st_usage)
 {
-	return state->driver_data;
+	return st_usage->driver_data;
 }
 
 /**
  * cpuidle_set_statedata - stores private driver state data
- * @state: the state
+ * @st_usage: the state usage statistics
  * @data: the private data
  */
 static inline void
-cpuidle_set_statedata(struct cpuidle_state *state, void *data)
+cpuidle_set_statedata(struct cpuidle_state_usage *st_usage, void *data)
 {
-	state->driver_data = data;
+	st_usage->driver_data = data;
 }
 
 struct cpuidle_state_kobj {
 	struct cpuidle_state *state;
+	struct cpuidle_state_usage *state_usage;
 	struct completion kobj_unregister;
 	struct kobject kobj;
 };
@@ -85,6 +89,7 @@ struct cpuidle_device {
 	int			last_residency;
 	int			state_count;
 	struct cpuidle_state	states[CPUIDLE_STATE_MAX];
+	struct cpuidle_state_usage	states_usage[CPUIDLE_STATE_MAX];
 	struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
 
 	struct list_head 	device_list;

^ permalink raw reply related

* [PATCH v9 2/4] cpuidle: Remove CPUIDLE_FLAG_IGNORE and dev->prepare()
From: Deepthi Dharwar @ 2011-10-28 10:50 UTC (permalink / raw)
  To: khilman, venki, ak, len.brown, peterz, rjw, santosh.shilimkar,
	arjan, lenb
  Cc: linux-sh, linux-pm, linux-kernel, linux-acpi, linux-pm,
	linux-omap, linux-arm-kernel
In-Reply-To: <20111028104945.7520.83828.stgit@localhost6.localdomain6>

The cpuidle_device->prepare() mechanism causes updates to the
cpuidle_state[].flags, setting and clearing CPUIDLE_FLAG_IGNORE
to tell the governor not to chose a state on a per-cpu basis at
run-time. State demotion is now handled by the driver and it returns
the actual state entered. Hence, this mechanism is not required.
Also this removes per-cpu flags from cpuidle_state enabling
it to be made global.

Reference:
https://lkml.org/lkml/2011/3/25/52

Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm>
Signed-off-by: Trinabh Gupta <g.trinabh@gmail.com>
Tested-by: Jean Pihet <j-pihet@ti.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
---
 drivers/cpuidle/cpuidle.c        |   10 ----------
 drivers/cpuidle/governors/menu.c |    2 --
 include/linux/cpuidle.h          |    3 ---
 3 files changed, 0 insertions(+), 15 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 8faf3a6..984d178 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -83,16 +83,6 @@ int cpuidle_idle_call(void)
 	hrtimer_peek_ahead_timers();
 #endif
 
-	/*
-	 * Call the device's prepare function before calling the
-	 * governor's select function.  ->prepare gives the device's
-	 * cpuidle driver a chance to update any dynamic information
-	 * of its cpuidle states for the current idle period, e.g.
-	 * state availability, latencies, residencies, etc.
-	 */
-	if (dev->prepare)
-		dev->prepare(dev);
-
 	/* ask the governor for the next state */
 	next_state = cpuidle_curr_governor->select(dev);
 	if (need_resched()) {
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 3c44c53..31dd287 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -288,8 +288,6 @@ static int menu_select(struct cpuidle_device *dev)
 	for (i = CPUIDLE_DRIVER_STATE_START; i < dev->state_count; i++) {
 		struct cpuidle_state *s = &dev->states[i];
 
-		if (s->flags & CPUIDLE_FLAG_IGNORE)
-			continue;
 		if (s->target_residency > data->predicted_us)
 			continue;
 		if (s->exit_latency > latency_req)
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8da811b..c6d85cf 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -47,7 +47,6 @@ struct cpuidle_state {
 
 /* Idle State Flags */
 #define CPUIDLE_FLAG_TIME_VALID	(0x01) /* is residency time measurable? */
-#define CPUIDLE_FLAG_IGNORE	(0x100) /* ignore during this idle period */
 
 #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000)
 
@@ -93,8 +92,6 @@ struct cpuidle_device {
 	struct completion	kobj_unregister;
 	void			*governor_data;
 	int			safe_state_index;
-
-	int (*prepare)		(struct cpuidle_device *dev);
 };
 
 DECLARE_PER_CPU(struct cpuidle_device *, cpuidle_devices);

^ permalink raw reply related

* [PATCH v9 1/4] cpuidle: Move dev->last_residency update to driver enter routine; remove dev->last_state
From: Deepthi Dharwar @ 2011-10-28 10:50 UTC (permalink / raw)
  To: khilman, venki, ak, len.brown, peterz, rjw, santosh.shilimkar,
	arjan, lenb
  Cc: linux-sh, linux-pm, linux-kernel, linux-acpi, linux-pm,
	linux-omap, linux-arm-kernel
In-Reply-To: <20111028104945.7520.83828.stgit@localhost6.localdomain6>

Cpuidle governor only suggests the state to enter using the
governor->select() interface, but allows the low level driver to
override the recommended state. The actual entered state
may be different because of software or hardware demotion. Software
demotion is done by the back-end cpuidle driver and can be accounted
correctly. Current cpuidle code uses last_state field to capture the
actual state entered and based on that updates the statistics for the
state entered.

Ideally the driver enter routine should update the counters,
and it should return the state actually entered rather than the time
spent there. The generic cpuidle code should simply handle where
the counters live in the sysfs namespace, not updating the counters.

Reference:
https://lkml.org/lkml/2011/3/25/52

Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Trinabh Gupta <g.trinabh@gmail.com>
Tested-by: Jean Pihet <j-pihet@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Kevin Hilman <khilman@ti.com>
---
 arch/arm/mach-at91/cpuidle.c          |   10 +++-
 arch/arm/mach-davinci/cpuidle.c       |    9 +++-
 arch/arm/mach-exynos4/cpuidle.c       |    7 ++-
 arch/arm/mach-kirkwood/cpuidle.c      |   12 ++++-
 arch/arm/mach-omap2/cpuidle34xx.c     |   67 +++++++++++++++++------------
 arch/sh/kernel/cpu/shmobile/cpuidle.c |   12 +++--
 drivers/acpi/processor_idle.c         |   75 ++++++++++++++++++++++-----------
 drivers/cpuidle/cpuidle.c             |   32 +++++++-------
 drivers/cpuidle/governors/ladder.c    |   13 ++++++
 drivers/cpuidle/governors/menu.c      |    7 ++-
 drivers/idle/intel_idle.c             |   12 ++++-
 include/linux/cpuidle.h               |    7 +--
 12 files changed, 164 insertions(+), 99 deletions(-)

diff --git a/arch/arm/mach-at91/cpuidle.c b/arch/arm/mach-at91/cpuidle.c
index 1cfeac1..4696a0d 100644
--- a/arch/arm/mach-at91/cpuidle.c
+++ b/arch/arm/mach-at91/cpuidle.c
@@ -33,7 +33,7 @@ static struct cpuidle_driver at91_idle_driver = {
 
 /* Actual code that puts the SoC in different idle states */
 static int at91_enter_idle(struct cpuidle_device *dev,
-			       struct cpuidle_state *state)
+			       int index)
 {
 	struct timeval before, after;
 	int idle_time;
@@ -41,10 +41,10 @@ static int at91_enter_idle(struct cpuidle_device *dev,
 
 	local_irq_disable();
 	do_gettimeofday(&before);
-	if (state == &dev->states[0])
+	if (index == 0)
 		/* Wait for interrupt state */
 		cpu_do_idle();
-	else if (state == &dev->states[1]) {
+	else if (index == 1) {
 		asm("b 1f; .align 5; 1:");
 		asm("mcr p15, 0, r0, c7, c10, 4");	/* drain write buffer */
 		saved_lpr = sdram_selfrefresh_enable();
@@ -55,7 +55,9 @@ static int at91_enter_idle(struct cpuidle_device *dev,
 	local_irq_enable();
 	idle_time = (after.tv_sec - before.tv_sec) * USEC_PER_SEC +
 			(after.tv_usec - before.tv_usec);
-	return idle_time;
+
+	dev->last_residency = idle_time;
+	return index;
 }
 
 /* Initialize CPU idle by registering the idle states */
diff --git a/arch/arm/mach-davinci/cpuidle.c b/arch/arm/mach-davinci/cpuidle.c
index bd59f31..ca8582a 100644
--- a/arch/arm/mach-davinci/cpuidle.c
+++ b/arch/arm/mach-davinci/cpuidle.c
@@ -78,9 +78,9 @@ static struct davinci_ops davinci_states[DAVINCI_CPUIDLE_MAX_STATES] = {
 
 /* Actual code that puts the SoC in different idle states */
 static int davinci_enter_idle(struct cpuidle_device *dev,
-						struct cpuidle_state *state)
+						int index)
 {
-	struct davinci_ops *ops = cpuidle_get_statedata(state);
+	struct davinci_ops *ops = cpuidle_get_statedata(&dev->states[index]);
 	struct timeval before, after;
 	int idle_time;
 
@@ -98,7 +98,10 @@ static int davinci_enter_idle(struct cpuidle_device *dev,
 	local_irq_enable();
 	idle_time = (after.tv_sec - before.tv_sec) * USEC_PER_SEC +
 			(after.tv_usec - before.tv_usec);
-	return idle_time;
+
+	dev->last_residency = idle_time;
+
+	return index;
 }
 
 static int __init davinci_cpuidle_probe(struct platform_device *pdev)
diff --git a/arch/arm/mach-exynos4/cpuidle.c b/arch/arm/mach-exynos4/cpuidle.c
index bf7e96f..ea026e7 100644
--- a/arch/arm/mach-exynos4/cpuidle.c
+++ b/arch/arm/mach-exynos4/cpuidle.c
@@ -16,7 +16,7 @@
 #include <asm/proc-fns.h>
 
 static int exynos4_enter_idle(struct cpuidle_device *dev,
-			      struct cpuidle_state *state);
+			      int index);
 
 static struct cpuidle_state exynos4_cpuidle_set[] = {
 	[0] = {
@@ -37,7 +37,7 @@ static struct cpuidle_driver exynos4_idle_driver = {
 };
 
 static int exynos4_enter_idle(struct cpuidle_device *dev,
-			      struct cpuidle_state *state)
+			      int index)
 {
 	struct timeval before, after;
 	int idle_time;
@@ -52,7 +52,8 @@ static int exynos4_enter_idle(struct cpuidle_device *dev,
 	idle_time = (after.tv_sec - before.tv_sec) * USEC_PER_SEC +
 		    (after.tv_usec - before.tv_usec);
 
-	return idle_time;
+	dev->last_residency = idle_time;
+	return index;
 }
 
 static int __init exynos4_init_cpuidle(void)
diff --git a/arch/arm/mach-kirkwood/cpuidle.c b/arch/arm/mach-kirkwood/cpuidle.c
index f68d33f..358dd80 100644
--- a/arch/arm/mach-kirkwood/cpuidle.c
+++ b/arch/arm/mach-kirkwood/cpuidle.c
@@ -32,17 +32,17 @@ static DEFINE_PER_CPU(struct cpuidle_device, kirkwood_cpuidle_device);
 
 /* Actual code that puts the SoC in different idle states */
 static int kirkwood_enter_idle(struct cpuidle_device *dev,
-			       struct cpuidle_state *state)
+			       int index)
 {
 	struct timeval before, after;
 	int idle_time;
 
 	local_irq_disable();
 	do_gettimeofday(&before);
-	if (state == &dev->states[0])
+	if (index == 0)
 		/* Wait for interrupt state */
 		cpu_do_idle();
-	else if (state == &dev->states[1]) {
+	else if (index == 1) {
 		/*
 		 * Following write will put DDR in self refresh.
 		 * Note that we have 256 cycles before DDR puts it
@@ -57,7 +57,11 @@ static int kirkwood_enter_idle(struct cpuidle_device *dev,
 	local_irq_enable();
 	idle_time = (after.tv_sec - before.tv_sec) * USEC_PER_SEC +
 			(after.tv_usec - before.tv_usec);
-	return idle_time;
+
+	/* Update last residency */
+	dev->last_residency = idle_time;
+
+	return index;
 }
 
 /* Initialize CPU idle by registering the idle states */
diff --git a/arch/arm/mach-omap2/cpuidle34xx.c b/arch/arm/mach-omap2/cpuidle34xx.c
index 4bf6e6e..58425c7 100644
--- a/arch/arm/mach-omap2/cpuidle34xx.c
+++ b/arch/arm/mach-omap2/cpuidle34xx.c
@@ -88,17 +88,19 @@ static int _cpuidle_deny_idle(struct powerdomain *pwrdm,
 /**
  * omap3_enter_idle - Programs OMAP3 to enter the specified state
  * @dev: cpuidle device
- * @state: The target state to be programmed
+ * @index: the index of state to be entered
  *
  * Called from the CPUidle framework to program the device to the
  * specified target state selected by the governor.
  */
 static int omap3_enter_idle(struct cpuidle_device *dev,
-			struct cpuidle_state *state)
+				int index)
 {
-	struct omap3_idle_statedata *cx = cpuidle_get_statedata(state);
+	struct omap3_idle_statedata *cx =
+			cpuidle_get_statedata(&dev->states[index]);
 	struct timespec ts_preidle, ts_postidle, ts_idle;
 	u32 mpu_state = cx->mpu_state, core_state = cx->core_state;
+	int idle_time;
 
 	/* Used to keep track of the total time in idle */
 	getnstimeofday(&ts_preidle);
@@ -113,7 +115,7 @@ static int omap3_enter_idle(struct cpuidle_device *dev,
 		goto return_sleep_time;
 
 	/* Deny idle for C1 */
-	if (state == &dev->states[0]) {
+	if (index == 0) {
 		pwrdm_for_each_clkdm(mpu_pd, _cpuidle_deny_idle);
 		pwrdm_for_each_clkdm(core_pd, _cpuidle_deny_idle);
 	}
@@ -122,7 +124,7 @@ static int omap3_enter_idle(struct cpuidle_device *dev,
 	omap_sram_idle();
 
 	/* Re-allow idle for C1 */
-	if (state == &dev->states[0]) {
+	if (index == 0) {
 		pwrdm_for_each_clkdm(mpu_pd, _cpuidle_allow_idle);
 		pwrdm_for_each_clkdm(core_pd, _cpuidle_allow_idle);
 	}
@@ -134,28 +136,35 @@ return_sleep_time:
 	local_irq_enable();
 	local_fiq_enable();
 
-	return ts_idle.tv_nsec / NSEC_PER_USEC + ts_idle.tv_sec * USEC_PER_SEC;
+	idle_time = ts_idle.tv_nsec / NSEC_PER_USEC + ts_idle.tv_sec * \
+								USEC_PER_SEC;
+
+	/* Update cpuidle counters */
+	dev->last_residency = idle_time;
+
+	return index;
 }
 
 /**
  * next_valid_state - Find next valid C-state
  * @dev: cpuidle device
- * @state: Currently selected C-state
+ * @index: Index of currently selected c-state
  *
- * If the current state is valid, it is returned back to the caller.
- * Else, this function searches for a lower c-state which is still
- * valid.
+ * If the state corresponding to index is valid, index is returned back
+ * to the caller. Else, this function searches for a lower c-state which is
+ * still valid (as defined in omap3_power_states[]) and returns its index.
  *
  * A state is valid if the 'valid' field is enabled and
  * if it satisfies the enable_off_mode condition.
  */
-static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
-					      struct cpuidle_state *curr)
+static int next_valid_state(struct cpuidle_device *dev,
+				int index)
 {
-	struct cpuidle_state *next = NULL;
+	struct cpuidle_state *curr = &dev->states[index];
 	struct omap3_idle_statedata *cx = cpuidle_get_statedata(curr);
 	u32 mpu_deepest_state = PWRDM_POWER_RET;
 	u32 core_deepest_state = PWRDM_POWER_RET;
+	int next_index = -1;
 
 	if (enable_off_mode) {
 		mpu_deepest_state = PWRDM_POWER_OFF;
@@ -172,20 +181,20 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 	if ((cx->valid) &&
 	    (cx->mpu_state >= mpu_deepest_state) &&
 	    (cx->core_state >= core_deepest_state)) {
-		return curr;
+		return index;
 	} else {
 		int idx = OMAP3_NUM_STATES - 1;
 
 		/* Reach the current state starting at highest C-state */
 		for (; idx >= 0; idx--) {
 			if (&dev->states[idx] == curr) {
-				next = &dev->states[idx];
+				next_index = idx;
 				break;
 			}
 		}
 
 		/* Should never hit this condition */
-		WARN_ON(next == NULL);
+		WARN_ON(next_index == -1);
 
 		/*
 		 * Drop to next valid state.
@@ -197,37 +206,39 @@ static struct cpuidle_state *next_valid_state(struct cpuidle_device *dev,
 			if ((cx->valid) &&
 			    (cx->mpu_state >= mpu_deepest_state) &&
 			    (cx->core_state >= core_deepest_state)) {
-				next = &dev->states[idx];
+				next_index = idx;
 				break;
 			}
 		}
 		/*
 		 * C1 is always valid.
-		 * So, no need to check for 'next==NULL' outside this loop.
+		 * So, no need to check for 'next_index == -1' outside
+		 * this loop.
 		 */
 	}
 
-	return next;
+	return next_index;
 }
 
 /**
  * omap3_enter_idle_bm - Checks for any bus activity
  * @dev: cpuidle device
- * @state: The target state to be programmed
+ * @index: array index of target state to be programmed
  *
  * This function checks for any pending activity and then programs
  * the device to the specified or a safer state.
  */
 static int omap3_enter_idle_bm(struct cpuidle_device *dev,
-			       struct cpuidle_state *state)
+			       int index)
 {
-	struct cpuidle_state *new_state;
+	struct cpuidle_state *state = &dev->states[index];
+	int new_state_idx;
 	u32 core_next_state, per_next_state = 0, per_saved_state = 0, cam_state;
 	struct omap3_idle_statedata *cx;
 	int ret;
 
 	if (!omap3_can_sleep()) {
-		new_state = dev->safe_state;
+		new_state_idx = dev->safe_state_index;
 		goto select_state;
 	}
 
@@ -237,7 +248,7 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	 */
 	cam_state = pwrdm_read_pwrst(cam_pd);
 	if (cam_state == PWRDM_POWER_ON) {
-		new_state = dev->safe_state;
+		new_state_idx = dev->safe_state_index;
 		goto select_state;
 	}
 
@@ -264,11 +275,10 @@ static int omap3_enter_idle_bm(struct cpuidle_device *dev,
 	if (per_next_state != per_saved_state)
 		pwrdm_set_next_pwrst(per_pd, per_next_state);
 
-	new_state = next_valid_state(dev, state);
+	new_state_idx = next_valid_state(dev, index);
 
 select_state:
-	dev->last_state = new_state;
-	ret = omap3_enter_idle(dev, new_state);
+	ret = omap3_enter_idle(dev, new_state_idx);
 
 	/* Restore original PER state if it was modified */
 	if (per_next_state != per_saved_state)
@@ -339,11 +349,12 @@ int __init omap3_idle_init(void)
 
 	cpuidle_register_driver(&omap3_idle_driver);
 	dev = &per_cpu(omap3_idle_dev, smp_processor_id());
+	dev->safe_state_index = -1;
 
 	/* C1 . MPU WFI + Core active */
 	cx = _fill_cstate(dev, 0, "MPU ON + CORE ON");
 	(&dev->states[0])->enter = omap3_enter_idle;
-	dev->safe_state = &dev->states[0];
+	dev->safe_state_index = 0;
 	cx->valid = 1;	/* C1 is always valid */
 	cx->mpu_state = PWRDM_POWER_ON;
 	cx->core_state = PWRDM_POWER_ON;
diff --git a/arch/sh/kernel/cpu/shmobile/cpuidle.c b/arch/sh/kernel/cpu/shmobile/cpuidle.c
index e4469e72..7be50d4c 100644
--- a/arch/sh/kernel/cpu/shmobile/cpuidle.c
+++ b/arch/sh/kernel/cpu/shmobile/cpuidle.c
@@ -25,11 +25,11 @@ static unsigned long cpuidle_mode[] = {
 };
 
 static int cpuidle_sleep_enter(struct cpuidle_device *dev,
-			       struct cpuidle_state *state)
+				int index)
 {
 	unsigned long allowed_mode = arch_hwblk_sleep_mode();
 	ktime_t before, after;
-	int requested_state = state - &dev->states[0];
+	int requested_state = index;
 	int allowed_state;
 	int k;
 
@@ -46,11 +46,13 @@ static int cpuidle_sleep_enter(struct cpuidle_device *dev,
 	 */
 	k = min_t(int, allowed_state, requested_state);
 
-	dev->last_state = &dev->states[k];
 	before = ktime_get();
 	sh_mobile_call_standby(cpuidle_mode[k]);
 	after = ktime_get();
-	return ktime_to_ns(ktime_sub(after, before)) >> 10;
+
+	dev->last_residency = (int)ktime_to_ns(ktime_sub(after, before)) >> 10;
+
+	return k;
 }
 
 static struct cpuidle_device cpuidle_dev;
@@ -84,7 +86,7 @@ void sh_mobile_setup_cpuidle(void)
 	state->flags |= CPUIDLE_FLAG_TIME_VALID;
 	state->enter = cpuidle_sleep_enter;
 
-	dev->safe_state = state;
+	dev->safe_state_index = i-1;
 
 	if (sh_mobile_sleep_supported & SUSP_SH_SF) {
 		state = &dev->states[i++];
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 9b88f98..463cc09 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -741,22 +741,24 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 /**
  * acpi_idle_enter_c1 - enters an ACPI C1 state-type
  * @dev: the target CPU
- * @state: the state data
+ * @index: index of target state
  *
  * This is equivalent to the HALT instruction.
  */
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
-			      struct cpuidle_state *state)
+				int index)
 {
 	ktime_t  kt1, kt2;
 	s64 idle_time;
 	struct acpi_processor *pr;
+	struct cpuidle_state *state = &dev->states[index];
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
 	pr = __this_cpu_read(processors);
+	dev->last_residency = 0;
 
 	if (unlikely(!pr))
-		return 0;
+		return -EINVAL;
 
 	local_irq_disable();
 
@@ -764,7 +766,7 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 	if (acpi_idle_suspend) {
 		local_irq_enable();
 		cpu_relax();
-		return 0;
+		return -EINVAL;
 	}
 
 	lapic_timer_state_broadcast(pr, cx, 1);
@@ -773,37 +775,46 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
 	kt2 = ktime_get_real();
 	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
+	/* Update device last_residency*/
+	dev->last_residency = (int)idle_time;
+
 	local_irq_enable();
 	cx->usage++;
 	lapic_timer_state_broadcast(pr, cx, 0);
 
-	return idle_time;
+	return index;
 }
 
 /**
  * acpi_idle_enter_simple - enters an ACPI state without BM handling
  * @dev: the target CPU
- * @state: the state data
+ * @index: the index of suggested state
  */
 static int acpi_idle_enter_simple(struct cpuidle_device *dev,
-				  struct cpuidle_state *state)
+				int index)
 {
 	struct acpi_processor *pr;
+	struct cpuidle_state *state = &dev->states[index];
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 	ktime_t  kt1, kt2;
 	s64 idle_time_ns;
 	s64 idle_time;
 
 	pr = __this_cpu_read(processors);
+	dev->last_residency = 0;
 
 	if (unlikely(!pr))
-		return 0;
-
-	if (acpi_idle_suspend)
-		return(acpi_idle_enter_c1(dev, state));
+		return -EINVAL;
 
 	local_irq_disable();
 
+	if (acpi_idle_suspend) {
+		local_irq_enable();
+		cpu_relax();
+		return -EINVAL;
+	}
+
+
 	if (cx->entry_method != ACPI_CSTATE_FFH) {
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
@@ -815,7 +826,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 		if (unlikely(need_resched())) {
 			current_thread_info()->status |= TS_POLLING;
 			local_irq_enable();
-			return 0;
+			return -EINVAL;
 		}
 	}
 
@@ -837,6 +848,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 	idle_time = idle_time_ns;
 	do_div(idle_time, NSEC_PER_USEC);
 
+	/* Update device last_residency*/
+	dev->last_residency = (int)idle_time;
+
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(idle_time_ns);
 
@@ -848,7 +862,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 
 	lapic_timer_state_broadcast(pr, cx, 0);
 	cx->time += idle_time;
-	return idle_time;
+	return index;
 }
 
 static int c3_cpu_count;
@@ -857,14 +871,15 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
 /**
  * acpi_idle_enter_bm - enters C3 with proper BM handling
  * @dev: the target CPU
- * @state: the state data
+ * @index: the index of suggested state
  *
  * If BM is detected, the deepest non-C3 idle state is entered instead.
  */
 static int acpi_idle_enter_bm(struct cpuidle_device *dev,
-			      struct cpuidle_state *state)
+				int index)
 {
 	struct acpi_processor *pr;
+	struct cpuidle_state *state = &dev->states[index];
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 	ktime_t  kt1, kt2;
 	s64 idle_time_ns;
@@ -872,22 +887,26 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 
 
 	pr = __this_cpu_read(processors);
+	dev->last_residency = 0;
 
 	if (unlikely(!pr))
-		return 0;
+		return -EINVAL;
 
-	if (acpi_idle_suspend)
-		return(acpi_idle_enter_c1(dev, state));
+
+	if (acpi_idle_suspend) {
+		cpu_relax();
+		return -EINVAL;
+	}
 
 	if (!cx->bm_sts_skip && acpi_idle_bm_check()) {
-		if (dev->safe_state) {
-			dev->last_state = dev->safe_state;
-			return dev->safe_state->enter(dev, dev->safe_state);
+		if (dev->safe_state_index >= 0) {
+			return dev->states[dev->safe_state_index].enter(dev,
+						dev->safe_state_index);
 		} else {
 			local_irq_disable();
 			acpi_safe_halt();
 			local_irq_enable();
-			return 0;
+			return -EINVAL;
 		}
 	}
 
@@ -904,7 +923,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 		if (unlikely(need_resched())) {
 			current_thread_info()->status |= TS_POLLING;
 			local_irq_enable();
-			return 0;
+			return -EINVAL;
 		}
 	}
 
@@ -954,6 +973,9 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 	idle_time = idle_time_ns;
 	do_div(idle_time, NSEC_PER_USEC);
 
+	/* Update device last_residency*/
+	dev->last_residency = (int)idle_time;
+
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(idle_time_ns);
 
@@ -965,7 +987,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 
 	lapic_timer_state_broadcast(pr, cx, 0);
 	cx->time += idle_time;
-	return idle_time;
+	return index;
 }
 
 struct cpuidle_driver acpi_idle_driver = {
@@ -992,6 +1014,7 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 	}
 
 	dev->cpu = pr->id;
+	dev->safe_state_index = -1;
 	for (i = 0; i < CPUIDLE_STATE_MAX; i++) {
 		dev->states[i].name[0] = '\0';
 		dev->states[i].desc[0] = '\0';
@@ -1027,13 +1050,13 @@ static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
 				state->flags |= CPUIDLE_FLAG_TIME_VALID;
 
 			state->enter = acpi_idle_enter_c1;
-			dev->safe_state = state;
+			dev->safe_state_index = count;
 			break;
 
 			case ACPI_STATE_C2:
 			state->flags |= CPUIDLE_FLAG_TIME_VALID;
 			state->enter = acpi_idle_enter_simple;
-			dev->safe_state = state;
+			dev->safe_state_index = count;
 			break;
 
 			case ACPI_STATE_C3:
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 0df0141..8faf3a6 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -62,7 +62,7 @@ int cpuidle_idle_call(void)
 {
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
 	struct cpuidle_state *target_state;
-	int next_state;
+	int next_state, entered_state;
 
 	if (off)
 		return -ENODEV;
@@ -102,26 +102,27 @@ int cpuidle_idle_call(void)
 
 	target_state = &dev->states[next_state];
 
-	/* enter the state and update stats */
-	dev->last_state = target_state;
-
 	trace_power_start(POWER_CSTATE, next_state, dev->cpu);
 	trace_cpu_idle(next_state, dev->cpu);
 
-	dev->last_residency = target_state->enter(dev, target_state);
+	entered_state = target_state->enter(dev, next_state);
 
 	trace_power_end(dev->cpu);
 	trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
 
-	if (dev->last_state)
-		target_state = dev->last_state;
-
-	target_state->time += (unsigned long long)dev->last_residency;
-	target_state->usage++;
+	if (entered_state >= 0) {
+		/* Update cpuidle counters */
+		/* This can be moved to within driver enter routine
+		 * but that results in multiple copies of same code.
+		 */
+		dev->states[entered_state].time +=
+				(unsigned long long)dev->last_residency;
+		dev->states[entered_state].usage++;
+	}
 
 	/* give the governor an opportunity to reflect on the outcome */
 	if (cpuidle_curr_governor->reflect)
-		cpuidle_curr_governor->reflect(dev);
+		cpuidle_curr_governor->reflect(dev, entered_state);
 
 	return 0;
 }
@@ -172,11 +173,10 @@ void cpuidle_resume_and_unlock(void)
 EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock);
 
 #ifdef CONFIG_ARCH_HAS_CPU_RELAX
-static int poll_idle(struct cpuidle_device *dev, struct cpuidle_state *st)
+static int poll_idle(struct cpuidle_device *dev, int index)
 {
 	ktime_t	t1, t2;
 	s64 diff;
-	int ret;
 
 	t1 = ktime_get();
 	local_irq_enable();
@@ -188,8 +188,9 @@ static int poll_idle(struct cpuidle_device *dev, struct cpuidle_state *st)
 	if (diff > INT_MAX)
 		diff = INT_MAX;
 
-	ret = (int) diff;
-	return ret;
+	dev->last_residency = (int) diff;
+
+	return index;
 }
 
 static void poll_idle_init(struct cpuidle_device *dev)
@@ -248,7 +249,6 @@ int cpuidle_enable_device(struct cpuidle_device *dev)
 		dev->states[i].time = 0;
 	}
 	dev->last_residency = 0;
-	dev->last_state = NULL;
 
 	smp_wmb();
 
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index f62fde2..78b06d2 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -153,11 +153,24 @@ static int ladder_enable_device(struct cpuidle_device *dev)
 	return 0;
 }
 
+/**
+ * ladder_reflect - update the correct last_state_idx
+ * @dev: the CPU
+ * @index: the index of actual state entered
+ */
+static void ladder_reflect(struct cpuidle_device *dev, int index)
+{
+	struct ladder_device *ldev = &__get_cpu_var(ladder_devices);
+	if (index > 0)
+		ldev->last_state_idx = index;
+}
+
 static struct cpuidle_governor ladder_governor = {
 	.name =		"ladder",
 	.rating =	10,
 	.enable =	ladder_enable_device,
 	.select =	ladder_select_state,
+	.reflect =	ladder_reflect,
 	.owner =	THIS_MODULE,
 };
 
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 3600f19..3c44c53 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -310,14 +310,17 @@ static int menu_select(struct cpuidle_device *dev)
 /**
  * menu_reflect - records that data structures need update
  * @dev: the CPU
+ * @index: the index of actual entered state
  *
  * NOTE: it's important to be fast here because this operation will add to
  *       the overall exit latency.
  */
-static void menu_reflect(struct cpuidle_device *dev)
+static void menu_reflect(struct cpuidle_device *dev, int index)
 {
 	struct menu_device *data = &__get_cpu_var(menu_devices);
-	data->needs_update = 1;
+	data->last_state_idx = index;
+	if (index >= 0)
+		data->needs_update = 1;
 }
 
 /**
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index a46dddf..a1c888d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -81,7 +81,7 @@ static unsigned int mwait_substates;
 static unsigned int lapic_timer_reliable_states = (1 << 1);	 /* Default to only C1 */
 
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
-static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state);
+static int intel_idle(struct cpuidle_device *dev, int index);
 
 static struct cpuidle_state *cpuidle_state_table;
 
@@ -209,12 +209,13 @@ static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
 /**
  * intel_idle
  * @dev: cpuidle_device
- * @state: cpuidle state
+ * @index: index of cpuidle state
  *
  */
-static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
+static int intel_idle(struct cpuidle_device *dev, int index)
 {
 	unsigned long ecx = 1; /* break on interrupt flag */
+	struct cpuidle_state *state = &dev->states[index];
 	unsigned long eax = (unsigned long)cpuidle_get_statedata(state);
 	unsigned int cstate;
 	ktime_t kt_before, kt_after;
@@ -256,7 +257,10 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
 
-	return usec_delta;
+	/* Update cpuidle counters */
+	dev->last_residency = (int)usec_delta;
+
+	return index;
 }
 
 static void __setup_broadcast_timer(void *arg)
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index b51629e..8da811b 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -42,7 +42,7 @@ struct cpuidle_state {
 	unsigned long long	time; /* in US */
 
 	int (*enter)	(struct cpuidle_device *dev,
-			 struct cpuidle_state *state);
+			int index);
 };
 
 /* Idle State Flags */
@@ -87,13 +87,12 @@ struct cpuidle_device {
 	int			state_count;
 	struct cpuidle_state	states[CPUIDLE_STATE_MAX];
 	struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
-	struct cpuidle_state	*last_state;
 
 	struct list_head 	device_list;
 	struct kobject		kobj;
 	struct completion	kobj_unregister;
 	void			*governor_data;
-	struct cpuidle_state	*safe_state;
+	int			safe_state_index;
 
 	int (*prepare)		(struct cpuidle_device *dev);
 };
@@ -169,7 +168,7 @@ struct cpuidle_governor {
 	void (*disable)		(struct cpuidle_device *dev);
 
 	int  (*select)		(struct cpuidle_device *dev);
-	void (*reflect)		(struct cpuidle_device *dev);
+	void (*reflect)		(struct cpuidle_device *dev, int index);
 
 	struct module 		*owner;
 };

^ permalink raw reply related

* [PATCH v9 0/4] cpuidle: Global registration of idle states with per-cpu statistics
From: Deepthi Dharwar @ 2011-10-28 10:49 UTC (permalink / raw)
  To: khilman, venki, ak, len.brown, peterz, rjw, santosh.shilimkar,
	arjan, lenb
  Cc: linux-sh, linux-pm, linux-kernel, linux-acpi, linux-pm,
	linux-omap, linux-arm-kernel

Version 6 of this patch series dated 22nd Sept 2011 was 
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Kevin Hilman <khilman@ti.com> for OMAP specific parts.
Reviewed-by: Kevin Hilman <khilman@ti.com>
Tested-by: Jean Pihet <j-pihet@ti.com>

Hi Len, 
Can you please queue this series for the next mainline merge window by merging
it into your development tree and also linux-next for further testing.

I have also tested it by applying these patches on your ACPI tree hosted @github.

Thanks
-Deepthi

--

The following patch series implements global registration of cpuidle
states, and also has the necessary data structure changes to
accommodate the per-cpu writable members of the cpuidle_states
structure.

This patch series had been in discussion earlier and
following are the links to the previous discussions.

v8 --> https://lkml.org/lkml/2011/10/3/52
v7 --> https://lkml.org/lkml/2011/9/27/74 
v6 --> https://lkml.org/lkml/2011/9/22/58
v5 --> https://lkml.org/lkml/2011/6/6/259
v4 --> https://lkml.org/lkml/2011/4/28/312
v3 --> https://lkml.org/lkml/2011/2/8/73
v2 --> https://lkml.org/lkml/2011/1/13/98
v1 --> https://lkml.org/lkml/2011/3/22/161

Changes from previous version (v8):

   1. Rebased and tested on 3.1

Tests done:

    1. Compile tested for ARM using the following configs: da8xx_omapl_defconfig,
    kirkwood_defconfig, omap2plus_defconfig, at91rm9200_defconfig
      
    2. Boot tested on x86 nehalem with multiple C-states for both intel_idle
    and acpi_idle drivers.

    3. Boot tested on T60p thinkpad that has T2600 cpu with multiple C-states. 
    Also tested the case when there is dynamic changes in C-states 
    AC <-> Battery Power switch.

Brief description of the patches:

Core change in this series is to split the cpuidle_device structure 
into two parts, i.e  global and per-cpu basis. 

The per-cpu pieces are mostly generic statistics that can be independent 
of current running driver. As a result of these changes, there is single 
copy of cpuidle_states structure and single registration done by one 
cpu. The low level driver is free to set per-cpu driver data on
each cpu if needed using the cpuidle_set_statedata() as the case
today. Only in very rare cases asymmetric C-states exist which can be 
handled within the cpuidle driver. Most architectures do not have 
asymmetric C-states.

First two patches in the series facilitate splitting of cpuidle_states
and cpuidle_device structure and next two patches do the actual split,
change the API's and make existing code follow the changed API.

[1/4] - Move the idle residency accounting part from cpuidle.c to
the respective low level drivers, so that the accounting can
be accurately maintained if the driver decides to demote the
chosen (suggested) by the governor.

[2/4] - removes the cpuidle_device()->prepare API since is is not
widely used and the only use case was to allow software
demotion using CPUIDLE_FLAG_IGNORE flag.  Both these
functions can be absorbed within the cpuidle back-end
driver ad hence deprecating the prepare routine and the
CPUIDLE_FLAG_IGNORE flag.

    - Ref: https://lkml.org/lkml/2011/3/25/52

[3/4] - Splits the usage statistics (read/write) part out of
cpuidle_state structure, so that the states can become read
only and hence made global.

[4/4] - Most APIs will now need to pass pointer to both global
cpuidle_driver and per-cpu cpuidle_device structure.

 arch/arm/mach-at91/cpuidle.c          |   41 +++--
 arch/arm/mach-davinci/cpuidle.c       |   51 ++++---
 arch/arm/mach-exynos4/cpuidle.c       |   30 ++--
 arch/arm/mach-kirkwood/cpuidle.c      |   42 +++---
 arch/arm/mach-omap2/cpuidle34xx.c     |  133 +++++++++++------
 arch/sh/kernel/cpu/shmobile/cpuidle.c |   28 ++--
 drivers/acpi/processor_driver.c       |   20 ---
 drivers/acpi/processor_idle.c         |  251 +++++++++++++++++++++++++++------
 drivers/cpuidle/cpuidle.c             |   86 ++++-------
 drivers/cpuidle/driver.c              |   25 +++
 drivers/cpuidle/governors/ladder.c    |   41 ++++-
 drivers/cpuidle/governors/menu.c      |   29 ++--
 drivers/cpuidle/sysfs.c               |   22 ++-
 drivers/idle/intel_idle.c             |  130 +++++++++++++----
 include/acpi/processor.h              |    1 
 include/linux/cpuidle.h               |   52 ++++---
 16 files changed, 650 insertions(+), 332 deletions(-)


-- 

-Deepthi

^ permalink raw reply

* [PATCH 2/2] nfs: make TASK_KILLABLE sleeps attempt to freeze
From: Jeff Layton @ 2011-10-28  6:08 UTC (permalink / raw)
  To: rjw; +Cc: linux-pm, john, trond.myklebust, linux-nfs, linux-kernel
In-Reply-To: <1319782112-15094-1-git-send-email-jlayton@redhat.com>

Allow the TASK_KILLABLE sleeps in NFS layer to respect the freezer. This
should allow suspend and hibernate events to occur, even when the client
is looping on an EJUKEBOX or NFS4ERR_DELAY sort of error.

Tested-by: John Hughes <john@Calva.COM>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfs/inode.c    |    4 +++-
 fs/nfs/nfs3proc.c |    4 +++-
 fs/nfs/nfs4proc.c |   13 +++++++++----
 fs/nfs/proc.c     |    4 +++-
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4dc6d07..4b91e24 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -38,6 +38,7 @@
 #include <linux/nfs_xdr.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/freezer.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -77,7 +78,8 @@ int nfs_wait_bit_killable(void *word)
 {
 	if (fatal_signal_pending(current))
 		return -ERESTARTSYS;
-	schedule();
+	if (!try_to_freeze())
+		schedule();
 	return 0;
 }
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 85f1690..5354219 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -17,6 +17,7 @@
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
 #include <linux/nfs_mount.h>
+#include <linux/freezer.h>
 
 #include "iostat.h"
 #include "internal.h"
@@ -32,7 +33,8 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EJUKEBOX && res != -EKEYEXPIRED)
 			break;
-		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+		if (!try_to_freeze())
+			schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
 	} while (!fatal_signal_pending(current));
 	return res;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d2ae413..085eb8a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -53,6 +53,7 @@
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
+#include <linux/freezer.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -241,10 +242,12 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 		*timeout = NFS4_POLL_RETRY_MIN;
 	if (*timeout > NFS4_POLL_RETRY_MAX)
 		*timeout = NFS4_POLL_RETRY_MAX;
-	schedule_timeout_killable(*timeout);
+	if (!try_to_freeze()) {
+		schedule_timeout_killable(*timeout);
+		*timeout <<= 1;
+	}
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
-	*timeout <<= 1;
 	return res;
 }
 
@@ -3951,8 +3954,10 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-	schedule_timeout_killable(timeout);
-	timeout <<= 1;
+	if (!try_to_freeze()) {
+		schedule_timeout_killable(timeout);
+		timeout <<= 1;
+	}
 	if (timeout > NFS4_LOCK_MAXTIMEOUT)
 		return NFS4_LOCK_MAXTIMEOUT;
 	return timeout;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ac40b85..0b70e85 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -41,6 +41,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
+#include <linux/freezer.h>
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
@@ -59,7 +60,8 @@ nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EKEYEXPIRED)
 			break;
-		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+		if (!try_to_freeze())
+			schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
 	} while (!fatal_signal_pending(current));
 	return res;
-- 
1.7.6.4

^ permalink raw reply related

* [PATCH 1/2] sunrpc: make rpc_wait_bit_killable handle freeze events
From: Jeff Layton @ 2011-10-28  6:08 UTC (permalink / raw)
  To: rjw; +Cc: linux-pm, john, trond.myklebust, linux-nfs, linux-kernel

Allow the wait_on_bit_killable sleeps in SUNRPC layer to respect the
freezer. This should allow suspend and hibernate events to occur, even
when there are RPC's pending on the wire.

Tested-by: John Hughes <john@Calva.COM>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 net/sunrpc/sched.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index d12ffa5..09bb64e 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -18,6 +18,7 @@
 #include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
+#include <linux/freezer.h>
 
 #include <linux/sunrpc/clnt.h>
 
@@ -231,7 +232,8 @@ static int rpc_wait_bit_killable(void *word)
 {
 	if (fatal_signal_pending(current))
 		return -ERESTARTSYS;
-	schedule();
+	if (!try_to_freeze())
+		schedule();
 	return 0;
 }
 
-- 
1.7.6.4

^ permalink raw reply related

* Re: [PATCH 1/4] freezer: make fake_signal_wake_up wake TASK_KILLABLE tasks too
From: Steve French @ 2011-10-27 20:26 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: linux-cifs, linux-nfs, Jeff Layton, trond.myklebust, linux-kernel,
	john, linux-pm
In-Reply-To: <201110272222.39962.rjw@sisk.pl>

FYI - I included the cifs one "cifs, freezer: add
wait_event_freezekillable and have cifs use it" (and the corequisite
fix for the build break when freezer not enabled) in my recent cifs
merge request.

On Thu, Oct 27, 2011 at 3:22 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> On Thursday, October 27, 2011, Rafael J. Wysocki wrote:
>> On Wednesday, October 26, 2011, Jeff Layton wrote:
>> > On Tue, 11 Oct 2011 21:14:28 +0200
>> > "Rafael J. Wysocki" <rjw@sisk.pl> wrote:
>> >
>> > > On Tuesday, October 11, 2011, Jeff Layton wrote:
>> > > > On Tue, 11 Oct 2011 08:18:48 +0200
>> > > > Pavel Machek <pavel@ucw.cz> wrote:
>> > > >
>> > > > >
>> > > > > Hi!
>> > > > >
>> > > > > > TASK_KILLABLE is often used to put tasks to sleep for quite some time.
>> > > > > > One of the most common uses is to put tasks to sleep while waiting for
>> > > > > > replies from a server on a networked filesystem (such as CIFS or NFS).
>> > > > > >
>> > > > > > Unfortunately, fake_signal_wake_up does not currently wake up tasks
>> > > > > > that are sleeping in TASK_KILLABLE state. This means that even if the
>> > > > > > code were in place to allow them to freeze while in this sleep, it
>> > > > > > wouldn't work anyway.
>> > > > > >
>> > > > > > This patch changes this function to wake tasks in this state as well.
>> > > > > > This should be harmless -- if the code doing the sleeping doesn't have
>> > > > > > handling to deal with freezer events, it should just go back to sleep.
>> > > > >
>> > > > > I'm pretty sure this will break something; but that does not mean it
>> > > > > is bad idea, just that it should be merged early and tested a lot.
>> > > > >
>> > > >
>> > > > FWIW, I looked at most of the places in the kernel that do
>> > > > TASK_KILLABLE sleeps and they look like they'll handle this correctly.
>> > > > The main one I wasn't sure about was mem_cgroup_handle_oom(), but I
>> > > > think it'll do the right thing too. I certainly could have missed
>> > > > something though...
>> > > >
>> > > > In any case, would you mind merging this via the linux-pm tree for 3.2?
>> > >
>> > > I will push it for 3.2.
>> > >
>> >
>> > Hi Rafael,
>> >
>> > Trond asked if you would also be willing to push patches 3 and 4 in
>> > this series for 3.2 as well [1]? Note that patch #4 got another revision so
>> > we'll want to make sure that you get that one. I can resend the
>> > nfs/sunrpc patches if that will help...
>> >
>> > [1]: I think Steve F is going to push patch #2, so that one shouldn't
>> > be an issue.
>>
>> Well, I've already sent my pull request.  I can keep these patches in my
>> tree for the next pull request, though (I'm sure there will be fixes against
>> 3.2, so they will go along with those).
>
> BTW, do you have current versions handy?  Or hasn't they changed?
>
> Rafael
>



-- 
Thanks,

Steve

^ permalink raw reply

* Re: [PATCH 1/4] freezer: make fake_signal_wake_up wake TASK_KILLABLE tasks too
From: Rafael J. Wysocki @ 2011-10-27 20:22 UTC (permalink / raw)
  To: linux-pm
  Cc: linux-cifs, linux-nfs, Jeff Layton, trond.myklebust, linux-kernel,
	smfrench, john
In-Reply-To: <201110272221.06782.rjw@sisk.pl>

On Thursday, October 27, 2011, Rafael J. Wysocki wrote:
> On Wednesday, October 26, 2011, Jeff Layton wrote:
> > On Tue, 11 Oct 2011 21:14:28 +0200
> > "Rafael J. Wysocki" <rjw@sisk.pl> wrote:
> > 
> > > On Tuesday, October 11, 2011, Jeff Layton wrote:
> > > > On Tue, 11 Oct 2011 08:18:48 +0200
> > > > Pavel Machek <pavel@ucw.cz> wrote:
> > > > 
> > > > > 
> > > > > Hi!
> > > > > 
> > > > > > TASK_KILLABLE is often used to put tasks to sleep for quite some time.
> > > > > > One of the most common uses is to put tasks to sleep while waiting for
> > > > > > replies from a server on a networked filesystem (such as CIFS or NFS).
> > > > > > 
> > > > > > Unfortunately, fake_signal_wake_up does not currently wake up tasks
> > > > > > that are sleeping in TASK_KILLABLE state. This means that even if the
> > > > > > code were in place to allow them to freeze while in this sleep, it
> > > > > > wouldn't work anyway.
> > > > > > 
> > > > > > This patch changes this function to wake tasks in this state as well.
> > > > > > This should be harmless -- if the code doing the sleeping doesn't have
> > > > > > handling to deal with freezer events, it should just go back to sleep.
> > > > > 
> > > > > I'm pretty sure this will break something; but that does not mean it
> > > > > is bad idea, just that it should be merged early and tested a lot.
> > > > > 
> > > > 
> > > > FWIW, I looked at most of the places in the kernel that do
> > > > TASK_KILLABLE sleeps and they look like they'll handle this correctly.
> > > > The main one I wasn't sure about was mem_cgroup_handle_oom(), but I
> > > > think it'll do the right thing too. I certainly could have missed
> > > > something though...
> > > > 
> > > > In any case, would you mind merging this via the linux-pm tree for 3.2?
> > > 
> > > I will push it for 3.2.
> > > 
> > 
> > Hi Rafael,
> > 
> > Trond asked if you would also be willing to push patches 3 and 4 in
> > this series for 3.2 as well [1]? Note that patch #4 got another revision so
> > we'll want to make sure that you get that one. I can resend the
> > nfs/sunrpc patches if that will help...
> > 
> > [1]: I think Steve F is going to push patch #2, so that one shouldn't
> > be an issue. 
> 
> Well, I've already sent my pull request.  I can keep these patches in my
> tree for the next pull request, though (I'm sure there will be fixes against
> 3.2, so they will go along with those).

BTW, do you have current versions handy?  Or hasn't they changed?

Rafael

^ permalink raw reply

* Re: [PATCH 1/4] freezer: make fake_signal_wake_up wake TASK_KILLABLE tasks too
From: Rafael J. Wysocki @ 2011-10-27 20:21 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-cifs, linux-nfs, trond.myklebust, linux-kernel, smfrench,
	john, linux-pm
In-Reply-To: <20111026155548.4a7c3dd8@corrin.poochiereds.net>

On Wednesday, October 26, 2011, Jeff Layton wrote:
> On Tue, 11 Oct 2011 21:14:28 +0200
> "Rafael J. Wysocki" <rjw@sisk.pl> wrote:
> 
> > On Tuesday, October 11, 2011, Jeff Layton wrote:
> > > On Tue, 11 Oct 2011 08:18:48 +0200
> > > Pavel Machek <pavel@ucw.cz> wrote:
> > > 
> > > > 
> > > > Hi!
> > > > 
> > > > > TASK_KILLABLE is often used to put tasks to sleep for quite some time.
> > > > > One of the most common uses is to put tasks to sleep while waiting for
> > > > > replies from a server on a networked filesystem (such as CIFS or NFS).
> > > > > 
> > > > > Unfortunately, fake_signal_wake_up does not currently wake up tasks
> > > > > that are sleeping in TASK_KILLABLE state. This means that even if the
> > > > > code were in place to allow them to freeze while in this sleep, it
> > > > > wouldn't work anyway.
> > > > > 
> > > > > This patch changes this function to wake tasks in this state as well.
> > > > > This should be harmless -- if the code doing the sleeping doesn't have
> > > > > handling to deal with freezer events, it should just go back to sleep.
> > > > 
> > > > I'm pretty sure this will break something; but that does not mean it
> > > > is bad idea, just that it should be merged early and tested a lot.
> > > > 
> > > 
> > > FWIW, I looked at most of the places in the kernel that do
> > > TASK_KILLABLE sleeps and they look like they'll handle this correctly.
> > > The main one I wasn't sure about was mem_cgroup_handle_oom(), but I
> > > think it'll do the right thing too. I certainly could have missed
> > > something though...
> > > 
> > > In any case, would you mind merging this via the linux-pm tree for 3.2?
> > 
> > I will push it for 3.2.
> > 
> 
> Hi Rafael,
> 
> Trond asked if you would also be willing to push patches 3 and 4 in
> this series for 3.2 as well [1]? Note that patch #4 got another revision so
> we'll want to make sure that you get that one. I can resend the
> nfs/sunrpc patches if that will help...
> 
> [1]: I think Steve F is going to push patch #2, so that one shouldn't
> be an issue. 

Well, I've already sent my pull request.  I can keep these patches in my
tree for the next pull request, though (I'm sure there will be fixes against
3.2, so they will go along with those).

Thanks,
Rafael

^ permalink raw reply

* Re: [PATCH 4/6 v2] PM: Limit race conditions between runtime PM and system sleep (v2)
From: Rafael J. Wysocki @ 2011-10-27 20:06 UTC (permalink / raw)
  To: Linus Walleij
  Cc: Kevin Hilman, linux-scsi, Greg KH, LKML, Jesse Barnes, Tejun Heo,
	Linux PM mailing list, stable
In-Reply-To: <CAKnu2Mru4c-2LBCZ_k3ZA+v47u0+um8igYU7EWt8=qOTVgcUjA@mail.gmail.com>

On Thursday, October 27, 2011, Linus Walleij wrote:
> 2011/6/29 Rafael J. Wysocki <rjw@sisk.pl>:
> 
> > One of the roles of the PM core is to prevent different PM callbacks
> > executed for the same device object from racing with each other.
> > Unfortunately, after commit e8665002477f0278f84f898145b1f141ba26ee26
> > (PM: Allow pm_runtime_suspend() to succeed during system suspend)
> > runtime PM callbacks may be executed concurrently with system
> > suspend/resume callbacks for the same device.
> (...)
> > Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
> 
> A quick question: is there some specific reason why this patch should
> not go into the 3.0.y stable releases?
>
> We are trying to produce
> a runtime PM system of product quality based on 3.0.y and we've
> already had to backport this patch ourselves to get things stable.
> 
> We have also backported:
> PM: Introduce generic "noirq" callback routines for subsystems (v2)
> PM / Runtime: Update documentation of interactions with system sleep
> PM / Runtime: Add new helper function: pm_runtime_status_suspended()
> 
> And now it seems to be sufficient to get this thing going.

Well, it isn't a simple fix and it changes the code's behavior quite
significantly, so I thought it might not be a good idea to risk problems
with -stable because of it.  Perhaps let's see how it works out in 3.1
and backport it later if there are no problem reports related to it?

Rafael

^ permalink raw reply

* Re: [PATCH 4/6 v2] PM: Limit race conditions between runtime PM and system sleep (v2)
From: Linus Walleij @ 2011-10-27 13:54 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Kevin Hilman, linux-scsi, Greg KH, LKML, Jesse Barnes, Tejun Heo,
	Linux PM mailing list, stable
In-Reply-To: <201106292334.24518.rjw@sisk.pl>

2011/6/29 Rafael J. Wysocki <rjw@sisk.pl>:

> One of the roles of the PM core is to prevent different PM callbacks
> executed for the same device object from racing with each other.
> Unfortunately, after commit e8665002477f0278f84f898145b1f141ba26ee26
> (PM: Allow pm_runtime_suspend() to succeed during system suspend)
> runtime PM callbacks may be executed concurrently with system
> suspend/resume callbacks for the same device.
(...)
> Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>

A quick question: is there some specific reason why this patch should
not go into the 3.0.y stable releases? We are trying to produce
a runtime PM system of product quality based on 3.0.y and we've
already had to backport this patch ourselves to get things stable.

We have also backported:
PM: Introduce generic "noirq" callback routines for subsystems (v2)
PM / Runtime: Update documentation of interactions with system sleep
PM / Runtime: Add new helper function: pm_runtime_status_suspended()

And now it seems to be sufficient to get this thing going.

Yours,
Linus Walleij

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox