Linux Power Management development
 help / color / mirror / Atom feed
* Re: [PATCH 5/5] cpuidle: stop depending on pm_idle
From: Trinabh Gupta @ 2011-08-04 17:21 UTC (permalink / raw)
  To: Len Brown; +Cc: Kevin Hilman, Len Brown, linux-pm, x86, linux-kernel
In-Reply-To: <619b3f9e65307529dd4bbc98efe9d2f3b632646c.1312400543.git.len.brown@intel.com>


[-- Attachment #1.1: Type: text/plain, Size: 8684 bytes --]

On Wed, Aug 3, 2011 at 12:44 PM, Len Brown <lenb@kernel.org> wrote:

> From: Len Brown <len.brown@intel.com>
>
> cpuidle users should call cpuidle_call_idle() directly
> rather than via (pm_idle)() function pointer.
>
> Architecture may choose to continue using (pm_idle)(),
> but cpuidle need not depend on it:
>
>  my_arch_cpu_idle()
>        ...
>        if(cpuidle_call_idle())
>                pm_idle();
>
> cc: x86@kernel.org
> cc: Kevin Hilman <khilman@deeprootsystems.com>
> cc: Paul Mundt <lethal@linux-sh.org>
> Signed-off-by: Len Brown <len.brown@intel.com>
> ---
>  arch/arm/kernel/process.c    |    4 +++-
>  arch/sh/kernel/idle.c        |    6 ++++--
>  arch/x86/kernel/process_32.c |    4 +++-
>  arch/x86/kernel/process_64.c |    4 +++-
>  drivers/cpuidle/cpuidle.c    |   38 ++++++++++++++++++--------------------
>  include/linux/cpuidle.h      |    2 ++
>  6 files changed, 33 insertions(+), 25 deletions(-)
>
> diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
> index 5e1e541..d7ee0d4 100644
> --- a/arch/arm/kernel/process.c
> +++ b/arch/arm/kernel/process.c
> @@ -30,6 +30,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/random.h>
>  #include <linux/hw_breakpoint.h>
> +#include <linux/cpuidle.h>
>
>  #include <asm/cacheflush.h>
>  #include <asm/leds.h>
> @@ -196,7 +197,8 @@ void cpu_idle(void)
>                                cpu_relax();
>                        } else {
>                                stop_critical_timings();
> -                               pm_idle();
> +                               if (cpuidle_call_idle())
>

Hi Len,

This should be cpuidle_idle_call()


> +                                       pm_idle();
>                                start_critical_timings();
>                                /*
>                                 * This will eventually be removed - pm_idle
> diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
> index 425d604..9c7099e 100644
> --- a/arch/sh/kernel/idle.c
> +++ b/arch/sh/kernel/idle.c
> @@ -16,12 +16,13 @@
>  #include <linux/thread_info.h>
>  #include <linux/irqflags.h>
>  #include <linux/smp.h>
> +#include <linux/cpuidle.h>
>  #include <asm/pgalloc.h>
>  #include <asm/system.h>
>  #include <asm/atomic.h>
>  #include <asm/smp.h>
>
> -void (*pm_idle)(void) = NULL;
> +static void (*pm_idle)(void);
>
>  static int hlt_counter;
>
> @@ -100,7 +101,8 @@ void cpu_idle(void)
>                        local_irq_disable();
>                        /* Don't trace irqs off for idle */
>                        stop_critical_timings();
> -                       pm_idle();
> +                       if (cpuidle_call_idle())
>

Again this should be cpuidle_idle_call()

> +                               pm_idle();
>                        /*
>                         * Sanity check to ensure that pm_idle() returns
>                         * with IRQs enabled
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index a3d0dc5..7a3b651 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -38,6 +38,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/io.h>
>  #include <linux/kdebug.h>
> +#include <linux/cpuidle.h>
>
>  #include <asm/pgtable.h>
>  #include <asm/system.h>
> @@ -109,7 +110,8 @@ void cpu_idle(void)
>                        local_irq_disable();
>                        /* Don't trace irqs off for idle */
>                        stop_critical_timings();
> -                       pm_idle();
> +                       if (cpuidle_idle_call())
> +                               pm_idle();
>                        start_critical_timings();
>                }
>                tick_nohz_restart_sched_tick();
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index ca6f7ab..f693e44 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -37,6 +37,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/io.h>
>  #include <linux/ftrace.h>
> +#include <linux/cpuidle.h>
>
>  #include <asm/pgtable.h>
>  #include <asm/system.h>
> @@ -136,7 +137,8 @@ void cpu_idle(void)
>                        enter_idle();
>                        /* Don't trace irqs off for idle */
>                        stop_critical_timings();
> -                       pm_idle();
> +                       if (cpuidle_idle_call())
> +                               pm_idle();
>                        start_critical_timings();
>
>                        /* In many cases the interrupt that ended idle
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index 041df0b..d4c5423 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -25,10 +25,10 @@ DEFINE_PER_CPU(struct cpuidle_device *,
> cpuidle_devices);
>
>  DEFINE_MUTEX(cpuidle_lock);
>  LIST_HEAD(cpuidle_detected_devices);
> -static void (*pm_idle_old)(void);
>
>  static int enabled_devices;
>  static int off __read_mostly;
> +static int initialized __read_mostly;
>
>  int cpuidle_disabled(void)
>  {
> @@ -56,25 +56,23 @@ static int __cpuidle_register_device(struct
> cpuidle_device *dev);
>  * cpuidle_idle_call - the main idle loop
>  *
>  * NOTE: no locks or semaphores should be used here
> + * return non-zero on failure
>  */
> -static void cpuidle_idle_call(void)
> +int cpuidle_idle_call(void)
>  {
>        struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
>        struct cpuidle_state *target_state;
>        int next_state;
>
> +       if (off)
> +               return -ENODEV;
> +
> +       if (!initialized)
> +               return -ENODEV;
> +
>        /* check if the device is ready */
> -       if (!dev || !dev->enabled) {
> -               if (pm_idle_old)
> -                       pm_idle_old();
> -               else
> -#if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
> -                       default_idle();
> -#else
> -                       local_irq_enable();
> -#endif
> -               return;
> -       }
> +       if (!dev || !dev->enabled)
> +               return -EBUSY;
>
>  #if 0
>        /* shows regressions, re-enable for 2.6.29 */
> @@ -99,7 +97,7 @@ static void cpuidle_idle_call(void)
>        next_state = cpuidle_curr_governor->select(dev);
>        if (need_resched()) {
>                local_irq_enable();
> -               return;
> +               return 0;
>        }
>
>        target_state = &dev->states[next_state];
> @@ -124,6 +122,8 @@ static void cpuidle_idle_call(void)
>        /* give the governor an opportunity to reflect on the outcome */
>        if (cpuidle_curr_governor->reflect)
>                cpuidle_curr_governor->reflect(dev);
> +
> +       return 0;
>  }
>
>  /**
> @@ -131,10 +131,10 @@ static void cpuidle_idle_call(void)
>  */
>  void cpuidle_install_idle_handler(void)
>  {
> -       if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
> +       if (enabled_devices) {
>                /* Make sure all changes finished before we switch to new
> idle */
>                smp_wmb();
> -               pm_idle = cpuidle_idle_call;
> +               initialized = 1;
>        }
>  }
>
> @@ -143,8 +143,8 @@ void cpuidle_install_idle_handler(void)
>  */
>  void cpuidle_uninstall_idle_handler(void)
>  {
> -       if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
> -               pm_idle = pm_idle_old;
> +       if (enabled_devices) {
> +               initialized = 0;
>                cpuidle_kick_cpus();
>        }
>  }
> @@ -440,8 +440,6 @@ static int __init cpuidle_init(void)
>        if (cpuidle_disabled())
>                return -ENODEV;
>
> -       pm_idle_old = pm_idle;
> -
>        ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
>        if (ret)
>                return ret;
> diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
> index b89f67d..b51629e 100644
> --- a/include/linux/cpuidle.h
> +++ b/include/linux/cpuidle.h
> @@ -123,6 +123,7 @@ struct cpuidle_driver {
>
>  #ifdef CONFIG_CPU_IDLE
>  extern void disable_cpuidle(void);
> +extern int cpuidle_idle_call(void);
>
>  extern int cpuidle_register_driver(struct cpuidle_driver *drv);
>  struct cpuidle_driver *cpuidle_get_driver(void);
> @@ -137,6 +138,7 @@ extern void cpuidle_disable_device(struct
> cpuidle_device *dev);
>
>  #else
>  static inline void disable_cpuidle(void) { }
> +static inline int cpuidle_idle_call(void) { return -ENODEV; }
>
>  static inline int cpuidle_register_driver(struct cpuidle_driver *drv)
>  {return -ENODEV; }
> --
> 1.7.6.396.ge0613
>
> _______________________________________________
> linux-pm mailing list
> linux-pm@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/linux-pm
>

[-- Attachment #1.2: Type: text/html, Size: 11515 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply

* Re: [PATCH 02/11] PM: extend PM QoS with per-device wake-up constraints
From: Mark Brown @ 2011-08-04 13:24 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Linux PM mailing list, linux-omap, Jean Pihet, markgross
In-Reply-To: <201108030016.18174.rjw@sisk.pl>

On Wed, Aug 03, 2011 at 12:16:17AM +0200, Rafael J. Wysocki wrote:
> On Tuesday, August 02, 2011, Kevin Hilman wrote:

> > I disagree and think that both are quite realistic (mainly because they
> > exist today, albeit mostly out of tree because no generic QoS framework
> > exists.  e.g. on OMAP, we have OMAP-specific *kernel* APIs for requesting
> > per-device wakeup latencies, and drivers and frameworks are using them.)
> 
> I'm sure there are frameworks using such things.  I'm also sure there
> are frameworks that don't.  BTW, the "we have it out of the tree" argument is
> not very useful, so I'd appreciate it if you didn't use it.

It's useful to know if people have tried things; it doesn't mean it's
going to be OK for mainline but it is a data point.

> > In this case, the video framework (V4L2) might not want any knobs
> > exposed to userspace because userspace simply doesn't have the knowledge
> > to set appropriate constraints.  I'm less familiar with audio, but I
> > believe audio would be similar (sample rate, number of channels, mixing
> > with other concurrent audio streams, etc. etc. are all known by the
> > kernel-side code.)

Yeah, that sort of stuff and also data like wakeup latencies required to
service interrupts.

> I still don't understand what's wrong with allowing user space to _add_
> requirements.  They will only override the drivers' or frameworks' requirements
> if they are stronger, so the functionality shouldn't be hurt.  They may cause
> some more energy to be used, but if user space wants that, it's pretty much
> fine by me.

On the one hand that's true.  On the other hand that just seems like
going down a bad road where we have drivers that only work when run with
a magic userspace that may or may not be published which is just going
to make people miserable.  I'm not sure there are many people who would
choose to use more power without wanting some functional change so
presumably any users would be seeking to work around some kernel problem
and adding the user interface seems to be saying that this is OK,
expected and a natural part of power optimization.

^ permalink raw reply

* Re: [PATCH] PM: add statistics sysfs file for suspend to ram
From: Rafael J. Wysocki @ 2011-08-04  9:32 UTC (permalink / raw)
  To: Greg KH, Liu, ShuoX; +Cc: Brown, Len, linux-pm@lists.linux-foundation.org
In-Reply-To: <20110804052729.GA12707@suse.de>

On Thursday, August 04, 2011, Greg KH wrote:
> On Thu, Aug 04, 2011 at 01:13:38PM +0800, Liu, ShuoX wrote:
> > >From a906b0b5b4ff3414ceb9fc7a69a3d7c9d66e46b1 Mon Sep 17 00:00:00 2001
> > From: ShuoX Liu <shuox.liu@intel.com>
> > Date: Thu, 28 Jul 2011 10:54:22 +0800
> > Subject: [PATCH] PM: add statistics sysfs file for suspend to ram.
> 
> What's this stuff here for?  That's not needed (hint, I would have to
> edit it out by hand to be able to apply this patch.)
> 
> Thanks for resending a version that passes checkpatch.pl and could be
> applied, but all of my previous comments still stand.  This patch, as
> is, is totally unacceptable.

Agreed, plus I'd like to know the motivation behind it.  That is, we have
quite a few debug facilities in that code, so why are they insufficient?

Thanks,
Rafael

^ permalink raw reply

* Re: [RFC][PATCH] PM / Freezer: Freeze filesystems along with freezing processes (was: Re: PM / hibernate xfs lock up / xfs_reclaim_inodes_ag)
From: Rafael J. Wysocki @ 2011-08-04  9:27 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Christoph, Dave Chinner, LKML, xfs, Christoph Hellwig,
	Linux PM mailing list
In-Reply-To: <20110803172922.GA2126@ucw.cz>

On Wednesday, August 03, 2011, Pavel Machek wrote:
> Hi!
> 
> > Freeze all filesystems during the freezing of tasks by calling
> > freeze_bdev() for each of them and thaw them during the thawing
> > of tasks with the help of thaw_bdev().
> > 
> > This is needed by hibernation, because some filesystems (e.g. XFS)
> > deadlock with the preallocation of memory used by it if the memory
> > pressure caused by it is too heavy.
> > 
> > The additional benefit of this change is that, if something goes
> > wrong after filesystems have been frozen, they will stay in a
> > consistent state and journal replays won't be necessary (e.g. after
> > a failing suspend or resume).  In particular, this should help to
> > solve a long-standing issue that in some cases during resume from
> > hibernation the boot loader causes the journal to be replayed for the
> > filesystem containing the kernel image and initrd causing it to
> > become inconsistent with the information stored in the hibernation
> > image.
> 
> > +/**
> > + * freeze_filesystems - Force all filesystems into a consistent state.
> > + */
> > +void freeze_filesystems(void)
> > +{
> > +	struct super_block *sb;
> > +
> > +	lockdep_off();
> 
> Ouch. So... why do we need to silence this?

So that it doesn't complain? :-)

I'll need some time to get the exact details here.

> > +	/*
> > +	 * Freeze in reverse order so filesystems dependant upon others are
> > +	 * frozen in the right order (eg. loopback on ext3).
> > +	 */
> > +	list_for_each_entry_reverse(sb, &super_blocks, s_list) {
> > +		if (!sb->s_root || !sb->s_bdev ||
> > +		    (sb->s_frozen == SB_FREEZE_TRANS) ||
> > +		    (sb->s_flags & MS_RDONLY) ||
> > +		    (sb->s_flags & MS_FROZEN))
> > +			continue;
> 
> Should we stop NFS from modifying remote server, too?

What do you mean exactly?

> Plus... ext3 writes to read-only filesystems on mount; not sure if it
> does it later. But RDONLY means 'user cant write to it' not 'bdev will
> not be modified'. Should we freeze all?
> 
> How can 'already frozen' happen?
> 
> > +	list_for_each_entry(sb, &super_blocks, s_list)
> > +		if (sb->s_flags & MS_FROZEN) {
> > +			sb->s_flags &= ~MS_FROZEN;
> > +			thaw_bdev(sb->s_bdev, sb);
> > +		}
> 
> ...because we'll unfreeze it even if we did not freeze it...

So we need not check MS_FROZEN in freeze_filesystems().  OK

Thanks,
Rafael

^ permalink raw reply

* Re: [PATCH v4 0/3] DEVFREQ, DVFS framework for non-CPU devices
From: MyungJoo Ham @ 2011-08-04  8:15 UTC (permalink / raw)
  To: Kevin Hilman
  Cc: Len Brown, Greg Kroah-Hartman, Kyungmin Park, Thomas Gleixner,
	linux-pm
In-Reply-To: <87aabq5oqo.fsf@ti.com>

On Thu, Aug 4, 2011 at 3:33 AM, Kevin Hilman <khilman@ti.com> wrote:
> MyungJoo Ham <myungjoo.ham@samsung.com> writes:
>
>> On Wed, Aug 3, 2011 at 7:02 AM, Kevin Hilman <khilman@ti.com> wrote:
>
> [...]
>
>>> Maybe I'm not understanding the usage of it fully, but that seems like
>>> hard-coding policy into the framework that might not be appropriate.
>>> For example, what if there are other devices with constraints such that
>>> they cannot currently scale frequency/voltage?
>>>
>>> Maybe MyungJoo can explain in more detail the usecases for tickle?
>>
>> Tickle is not for QoS between devices. It is for faster reaction to
>> (human) user inputs at DVFS side where waiting for DVFS's reaction
>> takes too much time and reducing polling interval costs too much.
>
> This is exactly what quality of service (QoS) is about.
>
> The user (whether it's a human user input or another device) has low
> quality and expects higher quality.  It wants to request better quality,
> so it needs a way to request it.
>
> The "tickle" approach proposed here is simply a "request max
> frequency for duration X" QoS request.
>
> Kevin
>

Ok.. I see.

Now, I can agree with you that tickle is a subset of QoS requests.

As long as we have QoS request feature on devices with either OPP or
DEVFREQ, tickling is not needed.

I'll remove tickle in the next revision (along with some bugfixes for
bugs found recently).


Anyway, it appears that clock-rate-wise QoS request may be dealt at
OPP so that the OPPs meeting the QoS requests w/ frequency or voltage
specifications are enabled and returned with opp_find_* functions.
Maybe we will need to separate enable/disable by
opp_enable()/opp_disable() from enable/disable by QoS requests so that
the two may have different semantics. Then, QoS requests cannot
override opp_disable and opp_enable cannot override QoS requests. This
way, any clock-setting code properly based on OPP (including any
customized devfreq governors) cannot violate QoS requests.

How about this concept of getting QoS requests associated with clock rates?



Cheers!
MyungJoo.
-- 
MyungJoo Ham, Ph.D.
Mobile Software Platform Lab,
Digital Media and Communications (DMC) Business
Samsung Electronics
cell: 82-10-6714-2858

^ permalink raw reply

* Re: [PATCH] PM: add statistics sysfs file for suspend to ram
From: Greg KH @ 2011-08-04  5:27 UTC (permalink / raw)
  To: Liu, ShuoX; +Cc: Brown, Len, linux-pm@lists.linux-foundation.org
In-Reply-To: <6E3BC7F7C9A4BF4286DD4C043110F30B5B790E5741@shsmsx502.ccr.corp.intel.com>

On Thu, Aug 04, 2011 at 01:13:38PM +0800, Liu, ShuoX wrote:
> >From a906b0b5b4ff3414ceb9fc7a69a3d7c9d66e46b1 Mon Sep 17 00:00:00 2001
> From: ShuoX Liu <shuox.liu@intel.com>
> Date: Thu, 28 Jul 2011 10:54:22 +0800
> Subject: [PATCH] PM: add statistics sysfs file for suspend to ram.

What's this stuff here for?  That's not needed (hint, I would have to
edit it out by hand to be able to apply this patch.)

Thanks for resending a version that passes checkpatch.pl and could be
applied, but all of my previous comments still stand.  This patch, as
is, is totally unacceptable.

greg k-h

^ permalink raw reply

* Re: [PATCH] PM: add statistics sysfs file for suspend to ram
From: Greg KH @ 2011-08-04  5:17 UTC (permalink / raw)
  To: Liu, ShuoX; +Cc: Brown, Len, linux-pm@lists.linux-foundation.org
In-Reply-To: <6E3BC7F7C9A4BF4286DD4C043110F30B5B790E573B@shsmsx502.ccr.corp.intel.com>

On Thu, Aug 04, 2011 at 01:09:51PM +0800, Liu, ShuoX wrote:
> +static ssize_t suspend_stats_show(struct kobject *kobj,
> +               struct kobj_attribute *attr, char *buf)
> +{
> +   int i, index, last_index;
> +   char *s = buf;
> +
> +   last_index = suspend_stats.last_failed + REC_FAILED_DEV_NUM - 1;
> +   last_index %= REC_FAILED_DEV_NUM;
> +   s += sprintf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
> +           "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
> +           "success", suspend_stats.success,
> +           "fail", suspend_stats.fail,
> +           "failed_freeze", suspend_stats.failed_freeze,
> +           "failed_prepare", suspend_stats.failed_prepare,
> +           "failed_suspend", suspend_stats.failed_suspend,
> +           "failed_suspend_noirq",
> +               suspend_stats.failed_suspend_noirq,
> +           "failed_resume", suspend_stats.failed_resume,
> +           "failed_resume_noirq",
> +               suspend_stats.failed_resume_noirq);
> +   s += sprintf(s, "failed_devs:\n  last_failed:\t%s\n",
> +           suspend_stats.failed_devs[last_index]);
> +   for (i = 1; i < REC_FAILED_DEV_NUM; i++) {
> +       index = last_index + REC_FAILED_DEV_NUM - i;
> +       index %= REC_FAILED_DEV_NUM;
> +       s += sprintf(s, "\t\t%s\n",
> +           suspend_stats.failed_devs[index]);
> +   }

And, to top it all off, this is NOT allowed in sysfs at all.

Remember, sysfs is one text field per file.  Not something huge like
this.

Perhaps you should use debugfs instead?

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH] PM: add statistics sysfs file for suspend to ram
From: Greg KH @ 2011-08-04  5:16 UTC (permalink / raw)
  To: Liu, ShuoX; +Cc: Brown, Len, linux-pm@lists.linux-foundation.org
In-Reply-To: <6E3BC7F7C9A4BF4286DD4C043110F30B5B790E573B@shsmsx502.ccr.corp.intel.com>

On Thu, Aug 04, 2011 at 01:09:51PM +0800, Liu, ShuoX wrote:
> From a906b0b5b4ff3414ceb9fc7a69a3d7c9d66e46b1 Mon Sep 17 00:00:00 2001
> From: ShuoX Liu <shuox.liu@intel.com>
> Date: Thu, 28 Jul 2011 10:54:22 +0800
> Subject: [PATCH] PM: add statistics sysfs file for suspend to ram.
> 
> Record S3 failure time about each reason and the latest two failed
> devices' name in S3 progress.
> We can check it through /sys/power/suspend_stats.
> 
> Change-Id: Ieed7fd74e27d3b482675a20cb0bb26b9054a1624

What is that line for?

Also, your patch is corrupted and can not be applied.

Oh, and, as you are adding a sysfs file, you MUST add a
Documentation/ABI file that describes the file and what it is for and
what it does.

Also, why is this file needed?  Who needs this file?  What's wrong with
the kernel log instead?

thanks,

greg k-h

^ permalink raw reply

* [PATCH] PM: add statistics sysfs file for suspend to ram
From: Liu, ShuoX @ 2011-08-04  5:13 UTC (permalink / raw)
  To: Brown, Len, pavel@ucw.cz, rjw@sisk.pl, gregkh@suse.de
  Cc: linux-pm@lists.linux-foundation.org

>From a906b0b5b4ff3414ceb9fc7a69a3d7c9d66e46b1 Mon Sep 17 00:00:00 2001
From: ShuoX Liu <shuox.liu@intel.com>
Date: Thu, 28 Jul 2011 10:54:22 +0800
Subject: [PATCH] PM: add statistics sysfs file for suspend to ram.

Record S3 failure time about each reason and the latest two failed
devices' name in S3 progress.
We can check it through /sys/power/suspend_stats.

Change-Id: Ieed7fd74e27d3b482675a20cb0bb26b9054a1624
Signed-off-by: ShuoX Liu <shuox.liu@intel.com>
---
 drivers/base/power/main.c |   31 +++++++++++++++++++++++++--
 include/linux/suspend.h   |   16 ++++++++++++++
 kernel/power/main.c       |   49 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/suspend.c    |   13 ++++++++++-
 4 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index a854591..da1c561 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -46,6 +46,7 @@ LIST_HEAD(dpm_prepared_list);
 LIST_HEAD(dpm_suspended_list);
 LIST_HEAD(dpm_noirq_list);
 
+struct suspend_stats suspend_stats;
 static DEFINE_MUTEX(dpm_list_mtx);
 static pm_message_t pm_transition;
 
@@ -180,6 +181,15 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime,
 	}
 }
 
+static void dpm_save_dev_name(const char *name)
+{
+	strlcpy(suspend_stats.failed_devs[suspend_stats.last_failed],
+		name,
+		sizeof(suspend_stats.failed_devs[0]));
+	suspend_stats.last_failed++;
+	suspend_stats.last_failed %= REC_FAILED_DEV_NUM;
+}
+
 /**
  * dpm_wait - Wait for a PM operation to complete.
  * @dev: Device to wait for.
@@ -464,8 +474,11 @@ void dpm_resume_noirq(pm_message_t state)
 		mutex_unlock(&dpm_list_mtx);
 
 		error = device_resume_noirq(dev, state);
-		if (error)
+		if (error) {
+			suspend_stats.failed_resume_noirq++;
+			dpm_save_dev_name(dev_name(dev));
 			pm_dev_err(dev, state, " early", error);
+		}
 
 		mutex_lock(&dpm_list_mtx);
 		put_device(dev);
@@ -626,8 +639,11 @@ void dpm_resume(pm_message_t state)
 			mutex_unlock(&dpm_list_mtx);
 
 			error = device_resume(dev, state, false);
-			if (error)
+			if (error) {
+				suspend_stats.failed_resume++;
+				dpm_save_dev_name(dev_name(dev));
 				pm_dev_err(dev, state, "", error);
+			}
 
 			mutex_lock(&dpm_list_mtx);
 		}
@@ -802,6 +818,8 @@ int dpm_suspend_noirq(pm_message_t state)
 		mutex_lock(&dpm_list_mtx);
 		if (error) {
 			pm_dev_err(dev, state, " late", error);
+			suspend_stats.failed_suspend_noirq++;
+			dpm_save_dev_name(dev_name(dev));
 			put_device(dev);
 			break;
 		}
@@ -923,8 +941,10 @@ static void async_suspend(void *data, async_cookie_t cookie)
 	int error;
 
 	error = __device_suspend(dev, pm_transition, true);
-	if (error)
+	if (error) {
+		dpm_save_dev_name(dev_name(dev));
 		pm_dev_err(dev, pm_transition, " async", error);
+	}
 
 	put_device(dev);
 }
@@ -967,6 +987,7 @@ int dpm_suspend(pm_message_t state)
 		mutex_lock(&dpm_list_mtx);
 		if (error) {
 			pm_dev_err(dev, state, "", error);
+			dpm_save_dev_name(dev_name(dev));
 			put_device(dev);
 			break;
 		}
@@ -982,6 +1003,8 @@ int dpm_suspend(pm_message_t state)
 		error = async_error;
 	if (!error)
 		dpm_show_time(starttime, state, NULL);
+	else
+		suspend_stats.failed_suspend++;
 	return error;
 }
 
@@ -1090,6 +1113,8 @@ int dpm_suspend_start(pm_message_t state)
 	error = dpm_prepare(state);
 	if (!error)
 		error = dpm_suspend(state);
+	else
+		suspend_stats.failed_prepare++;
 	return error;
 }
 EXPORT_SYMBOL_GPL(dpm_suspend_start);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 6bbcef2..6a8ff23 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -34,6 +34,22 @@ typedef int __bitwise suspend_state_t;
 #define PM_SUSPEND_MEM		((__force suspend_state_t) 3)
 #define PM_SUSPEND_MAX		((__force suspend_state_t) 4)
 
+struct suspend_stats {
+	int	success;
+	int	fail;
+	int	failed_freeze;
+	int	failed_prepare;
+	int	failed_suspend;
+	int	failed_suspend_noirq;
+	int	failed_resume;
+	int	failed_resume_noirq;
+#define	REC_FAILED_DEV_NUM	2
+	char	failed_devs[REC_FAILED_DEV_NUM][40];
+	int	last_failed;
+};
+
+extern struct suspend_stats suspend_stats;
+
 /**
  * struct platform_suspend_ops - Callbacks for managing platform dependent
  *	system sleep states.
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f8..32eb67b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -133,6 +133,50 @@ power_attr(pm_test);
 
 #endif /* CONFIG_PM_SLEEP */
 
+static ssize_t suspend_stats_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	int i, index, last_index;
+	char *s = buf;
+
+	last_index = suspend_stats.last_failed + REC_FAILED_DEV_NUM - 1;
+	last_index %= REC_FAILED_DEV_NUM;
+	s += sprintf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
+			"%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
+			"success", suspend_stats.success,
+			"fail", suspend_stats.fail,
+			"failed_freeze", suspend_stats.failed_freeze,
+			"failed_prepare", suspend_stats.failed_prepare,
+			"failed_suspend", suspend_stats.failed_suspend,
+			"failed_suspend_noirq",
+				suspend_stats.failed_suspend_noirq,
+			"failed_resume", suspend_stats.failed_resume,
+			"failed_resume_noirq",
+				suspend_stats.failed_resume_noirq);
+	s += sprintf(s,	"failed_devs:\n  last_failed:\t%s\n",
+			suspend_stats.failed_devs[last_index]);
+	for (i = 1; i < REC_FAILED_DEV_NUM; i++) {
+		index = last_index + REC_FAILED_DEV_NUM - i;
+		index %= REC_FAILED_DEV_NUM;
+		s += sprintf(s, "\t\t%s\n",
+			suspend_stats.failed_devs[index]);
+	}
+
+	if (s != buf)
+		/* convert the last space to a newline */
+		*(s-1) = '\n';
+
+	return s - buf;
+}
+
+static ssize_t suspend_stats_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t n)
+{
+	return n;
+}
+
+power_attr(suspend_stats);
+
 struct kobject *power_kobj;
 
 /**
@@ -194,6 +238,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
 	}
 	if (state < PM_SUSPEND_MAX && *s)
 		error = enter_state(state);
+		if (error)
+			suspend_stats.fail++;
+		else
+			suspend_stats.success++;
 #endif
 
  Exit:
@@ -310,6 +358,7 @@ static struct attribute * g[] = {
 #ifdef CONFIG_PM_DEBUG
 	&pm_test_attr.attr,
 #endif
+	&suspend_stats_attr.attr,
 #endif
 	NULL,
 };
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad..9bb4281 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -106,6 +106,8 @@ static int suspend_prepare(void)
 	error = suspend_freeze_processes();
 	if (!error)
 		return 0;
+	else
+		suspend_stats.failed_freeze++;
 
 	suspend_thaw_processes();
 	usermodehelper_enable();
@@ -315,8 +317,15 @@ int enter_state(suspend_state_t state)
  */
 int pm_suspend(suspend_state_t state)
 {
-	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
-		return enter_state(state);
+	int ret;
+	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) {
+		ret = enter_state(state);
+		if (ret)
+			suspend_stats.fail++;
+		else
+			suspend_stats.success++;
+		return ret;
+	}
 	return -EINVAL;
 }
 EXPORT_SYMBOL(pm_suspend);
-- 
1.7.1

^ permalink raw reply related

* [PATCH] PM: add statistics sysfs file for suspend to ram
From: Liu, ShuoX @ 2011-08-04  5:09 UTC (permalink / raw)
  To: Brown, Len, pavel@ucw.cz, rjw@sisk.pl, gregkh@suse.de
  Cc: linux-pm@lists.linux-foundation.org


[-- Attachment #1.1: Type: text/plain, Size: 7944 bytes --]

>From a906b0b5b4ff3414ceb9fc7a69a3d7c9d66e46b1 Mon Sep 17 00:00:00 2001
From: ShuoX Liu <shuox.liu@intel.com>
Date: Thu, 28 Jul 2011 10:54:22 +0800
Subject: [PATCH] PM: add statistics sysfs file for suspend to ram.

Record S3 failure time about each reason and the latest two failed
devices' name in S3 progress.
We can check it through /sys/power/suspend_stats.

Change-Id: Ieed7fd74e27d3b482675a20cb0bb26b9054a1624
Signed-off-by: ShuoX Liu <shuox.liu@intel.com>
---
 drivers/base/power/main.c |   31 +++++++++++++++++++++++++--
 include/linux/suspend.h   |   16 ++++++++++++++
 kernel/power/main.c       |   49 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/suspend.c    |   13 ++++++++++-
 4 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index a854591..da1c561 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -46,6 +46,7 @@ LIST_HEAD(dpm_prepared_list);
 LIST_HEAD(dpm_suspended_list);
 LIST_HEAD(dpm_noirq_list);

+struct suspend_stats suspend_stats;
 static DEFINE_MUTEX(dpm_list_mtx);
 static pm_message_t pm_transition;

@@ -180,6 +181,15 @@ static void initcall_debug_report(struct device *dev, ktime_t calltime,
    }
 }

+static void dpm_save_dev_name(const char *name)
+{
+   strlcpy(suspend_stats.failed_devs[suspend_stats.last_failed],
+       name,
+       sizeof(suspend_stats.failed_devs[0]));
+   suspend_stats.last_failed++;
+   suspend_stats.last_failed %= REC_FAILED_DEV_NUM;
+}
+
 /**
  * dpm_wait - Wait for a PM operation to complete.
  * @dev: Device to wait for.
@@ -464,8 +474,11 @@ void dpm_resume_noirq(pm_message_t state)
        mutex_unlock(&dpm_list_mtx);

        error = device_resume_noirq(dev, state);
-       if (error)
+       if (error) {
+           suspend_stats.failed_resume_noirq++;
+           dpm_save_dev_name(dev_name(dev));
            pm_dev_err(dev, state, " early", error);
+       }

        mutex_lock(&dpm_list_mtx);
        put_device(dev);
@@ -626,8 +639,11 @@ void dpm_resume(pm_message_t state)
            mutex_unlock(&dpm_list_mtx);

            error = device_resume(dev, state, false);
-           if (error)
+           if (error) {
+               suspend_stats.failed_resume++;
+               dpm_save_dev_name(dev_name(dev));
                pm_dev_err(dev, state, "", error);
+           }

            mutex_lock(&dpm_list_mtx);
        }
@@ -802,6 +818,8 @@ int dpm_suspend_noirq(pm_message_t state)
        mutex_lock(&dpm_list_mtx);
        if (error) {
            pm_dev_err(dev, state, " late", error);
+           suspend_stats.failed_suspend_noirq++;
+           dpm_save_dev_name(dev_name(dev));
            put_device(dev);
            break;
        }
@@ -923,8 +941,10 @@ static void async_suspend(void *data, async_cookie_t cookie)
    int error;

    error = __device_suspend(dev, pm_transition, true);
-   if (error)
+   if (error) {
+       dpm_save_dev_name(dev_name(dev));
        pm_dev_err(dev, pm_transition, " async", error);
+   }

    put_device(dev);
 }
@@ -967,6 +987,7 @@ int dpm_suspend(pm_message_t state)
        mutex_lock(&dpm_list_mtx);
        if (error) {
            pm_dev_err(dev, state, "", error);
+           dpm_save_dev_name(dev_name(dev));
            put_device(dev);
            break;
        }
@@ -982,6 +1003,8 @@ int dpm_suspend(pm_message_t state)
        error = async_error;
    if (!error)
        dpm_show_time(starttime, state, NULL);
+   else
+       suspend_stats.failed_suspend++;
    return error;
 }

@@ -1090,6 +1113,8 @@ int dpm_suspend_start(pm_message_t state)
    error = dpm_prepare(state);
    if (!error)
        error = dpm_suspend(state);
+   else
+       suspend_stats.failed_prepare++;
    return error;
 }
 EXPORT_SYMBOL_GPL(dpm_suspend_start);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 6bbcef2..6a8ff23 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -34,6 +34,22 @@ typedef int __bitwise suspend_state_t;
 #define PM_SUSPEND_MEM     ((__force suspend_state_t) 3)
 #define PM_SUSPEND_MAX     ((__force suspend_state_t) 4)

+struct suspend_stats {
+   int success;
+   int fail;
+   int failed_freeze;
+   int failed_prepare;
+   int failed_suspend;
+   int failed_suspend_noirq;
+   int failed_resume;
+   int failed_resume_noirq;
+#define    REC_FAILED_DEV_NUM  2
+   char    failed_devs[REC_FAILED_DEV_NUM][40];
+   int last_failed;
+};
+
+extern struct suspend_stats suspend_stats;
+
 /**
  * struct platform_suspend_ops - Callbacks for managing platform dependent
  * system sleep states.
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f8..32eb67b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -133,6 +133,50 @@ power_attr(pm_test);

 #endif /* CONFIG_PM_SLEEP */

+static ssize_t suspend_stats_show(struct kobject *kobj,
+               struct kobj_attribute *attr, char *buf)
+{
+   int i, index, last_index;
+   char *s = buf;
+
+   last_index = suspend_stats.last_failed + REC_FAILED_DEV_NUM - 1;
+   last_index %= REC_FAILED_DEV_NUM;
+   s += sprintf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
+           "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
+           "success", suspend_stats.success,
+           "fail", suspend_stats.fail,
+           "failed_freeze", suspend_stats.failed_freeze,
+           "failed_prepare", suspend_stats.failed_prepare,
+           "failed_suspend", suspend_stats.failed_suspend,
+           "failed_suspend_noirq",
+               suspend_stats.failed_suspend_noirq,
+           "failed_resume", suspend_stats.failed_resume,
+           "failed_resume_noirq",
+               suspend_stats.failed_resume_noirq);
+   s += sprintf(s, "failed_devs:\n  last_failed:\t%s\n",
+           suspend_stats.failed_devs[last_index]);
+   for (i = 1; i < REC_FAILED_DEV_NUM; i++) {
+       index = last_index + REC_FAILED_DEV_NUM - i;
+       index %= REC_FAILED_DEV_NUM;
+       s += sprintf(s, "\t\t%s\n",
+           suspend_stats.failed_devs[index]);
+   }
+
+   if (s != buf)
+       /* convert the last space to a newline */
+       *(s-1) = '\n';
+
+   return s - buf;
+}
+
+static ssize_t suspend_stats_store(struct kobject *kobj,
+       struct kobj_attribute *attr, const char *buf, size_t n)
+{
+   return n;
+}
+
+power_attr(suspend_stats);
+
 struct kobject *power_kobj;

 /**
@@ -194,6 +238,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
    }
    if (state < PM_SUSPEND_MAX && *s)
        error = enter_state(state);
+       if (error)
+           suspend_stats.fail++;
+       else
+           suspend_stats.success++;
 #endif

  Exit:
@@ -310,6 +358,7 @@ static struct attribute * g[] = {
 #ifdef CONFIG_PM_DEBUG
    &pm_test_attr.attr,
 #endif
+   &suspend_stats_attr.attr,
 #endif
    NULL,
 };
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad..9bb4281 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -106,6 +106,8 @@ static int suspend_prepare(void)
    error = suspend_freeze_processes();
    if (!error)
        return 0;
+   else
+       suspend_stats.failed_freeze++;

    suspend_thaw_processes();
    usermodehelper_enable();
@@ -315,8 +317,15 @@ int enter_state(suspend_state_t state)
  */
 int pm_suspend(suspend_state_t state)
 {
-   if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
-       return enter_state(state);
+   int ret;
+   if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) {
+       ret = enter_state(state);
+       if (ret)
+           suspend_stats.fail++;
+       else
+           suspend_stats.success++;
+       return ret;
+   }
    return -EINVAL;
 }
 EXPORT_SYMBOL(pm_suspend);
--
1.7.1



[-- Attachment #1.2: Type: text/html, Size: 16111 bytes --]

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply related

* Re: [GIT PULL] idle patches for Linux 3.1
From: Len Brown @ 2011-08-04  5:03 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-pm, linux-kernel
In-Reply-To: <CA+55aFxikiro4cEwGEOMCGUswyPsaAwyspP1cFUviJ5OUKOZJw@mail.gmail.com>

> On Wed, Aug 3, 2011 at 10:54 AM, Len Brown <lenb@kernel.org> wrote:
> >
> > please pull from:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-idle-2.6.git idle-release
> 
> As with the APEI tree, the commits are all recent and cannot have been
> in -next.  It's the last day before the merge window closes, please
> tell me why I should take this in this cycle at all?

I re-based this branch immediately before pushing
in order to add "Acked-by" annotations.

The cpuidle patches were in linux-next back in April.
But they were on a branch with other patches that got
deferred till 2012 b/c some folks were uneasy with
me deleting so much code so suddenly.  I cherry-picked
some simple ones here b/c IBM asked me to in order to
make their upcoming p-series cpuidle patches cleaner.

The mrst pmu.c driver is new.  MRST doesn't work without it.
Of course, if you don't have CONFIG_X86_MRST=y, you don't care.

Yes, this pull request is late, and if you skip it,
life will still go on.  You'll simply see the same
patches waiting for you in 10 weeks.  On the other hand,
some developers will have a happier summer if these get upstream now,
and I'm here to support them if anything goes wrong.

thanks,
-Len

^ permalink raw reply

* Re: [GIT PULL] idle patches for Linux 3.1
From: Linus Torvalds @ 2011-08-04  0:55 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-pm, linux-kernel
In-Reply-To: <alpine.LFD.2.02.1108031627060.19398@x980>

On Wed, Aug 3, 2011 at 10:54 AM, Len Brown <lenb@kernel.org> wrote:
>
> please pull from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-idle-2.6.git idle-release

As with the APEI tree, the commits are all recent and cannot have been
in -next.  It's the last day before the merge window closes, please
tell me why I should take this in this cycle at all?

                Linus

^ permalink raw reply

* Re: [PATCH 1/5] mrst_pmu: driver for Intel Moorestown Power Management Unit
From: H. Peter Anvin @ 2011-08-03 23:04 UTC (permalink / raw)
  To: Len Brown; +Cc: Len Brown, linux-pm, linux-kernel
In-Reply-To: <8e24ed2af46a5d008d85bfeb128d25efc394a3fd.1312400543.git.len.brown@intel.com>

On 08/03/2011 12:44 PM, Len Brown wrote:
> From: Len Brown <len.brown@intel.com>
> 
> The Moorestown (MRST) Power Management Unit (PMU) driver
> directs the SOC power states in the "Langwell" south complex (SCU).
> 
> It hooks pci_platform_pm_ops[] and thus observes all PCI ".set_state"
> requests.  For devices in the SC, the pmu driver translates those
> PCI requests into the appropriate commands for the SCU.
> 
> The PMU driver helps implement S0i3, a deep system idle power idle state.
> Entry into S0i3 is via cpuidle, just like regular processor c-states.
> S0i3 depends on pre-conditions including uni-processor, graphics off,
> and certain IO devices in the SC must be off.  If those pre-conditions
> are met, then the PMU allows cpuidle to enter S0i3, otherwise such requests
> are demoted, either to Atom C4 or Atom C6.
> 
> This driver is based on prototype work by Bruce Flemming,
> Illyas Mansoor, Rajeev D. Muralidhar, Vishwesh M. Rudramuni,
> Hari Seshadri and Sujith Thomas.  The current driver also
> includes contributions from H. Peter Anvin, Arjan van de Ven,
> Kristen Accardi, and Yong Wang.
> 
> Thanks for additional review feedback from Alan Cox and Randy Dunlap.
> 
> Signed-off-by: Len Brown <len.brown@intel.com>
> Acked-by: Alan Cox <alan@linux.intel.com>
> ---
>  MAINTAINERS                     |    6 +
>  arch/x86/platform/mrst/Makefile |    1 +
>  arch/x86/platform/mrst/pmu.c    |  817 +++++++++++++++++++++++++++++++++++++++
>  arch/x86/platform/mrst/pmu.h    |  234 +++++++++++
>  4 files changed, 1058 insertions(+), 0 deletions(-)
>  create mode 100644 arch/x86/platform/mrst/pmu.c
>  create mode 100644 arch/x86/platform/mrst/pmu.h
> 

Acked-by: H. Peter Anvin <hpa@linux.intel.com>

	-hpa

^ permalink raw reply

* Re: [PATCH 5/5] cpuidle: stop depending on pm_idle
From: H. Peter Anvin @ 2011-08-03 22:36 UTC (permalink / raw)
  To: Len Brown; +Cc: Len Brown, Kevin Hilman, x86, linux-kernel, linux-pm
In-Reply-To: <619b3f9e65307529dd4bbc98efe9d2f3b632646c.1312400543.git.len.brown@intel.com>

On 08/03/2011 12:44 PM, Len Brown wrote:
> From: Len Brown <len.brown@intel.com>
> 
> cpuidle users should call cpuidle_call_idle() directly
> rather than via (pm_idle)() function pointer.
> 
> Architecture may choose to continue using (pm_idle)(),
> but cpuidle need not depend on it:
> 
>   my_arch_cpu_idle()
> 	...
> 	if(cpuidle_call_idle())
> 		pm_idle();
> 
> cc: x86@kernel.org
> cc: Kevin Hilman <khilman@deeprootsystems.com>
> cc: Paul Mundt <lethal@linux-sh.org>
> Signed-off-by: Len Brown <len.brown@intel.com>

Acked-by: H. Peter Anvin <hpa@linux.intel.com>

^ permalink raw reply

* Re: [PATCH 4/5] x86 idle: move mwait_idle_with_hints() to where it is used
From: H. Peter Anvin @ 2011-08-03 22:36 UTC (permalink / raw)
  To: Len Brown; +Cc: Len Brown, linux-pm, x86, linux-kernel
In-Reply-To: <779d881fbcdf840e23a9bd3e237aebca4e0376e8.1312400543.git.len.brown@intel.com>

On 08/03/2011 12:44 PM, Len Brown wrote:
> From: Len Brown <len.brown@intel.com>
> 
> ...and make it static
> 
> no functional change
> 
> cc: x86@kernel.org
> Signed-off-by: Len Brown <len.brown@intel.com>
> ---
>  arch/x86/include/asm/processor.h |    2 --
>  arch/x86/kernel/acpi/cstate.c    |   23 +++++++++++++++++++++++
>  arch/x86/kernel/process.c        |   23 -----------------------
>  3 files changed, 23 insertions(+), 25 deletions(-)
> 

Acked-by: H. Peter Anvin <hpa@linux.intel.com>

^ permalink raw reply

* Re: [PATCH 3/5] cpuidle: replace xen access to x86 pm_idle and default_idle
From: H. Peter Anvin @ 2011-08-03 22:35 UTC (permalink / raw)
  To: Len Brown; +Cc: Len Brown, linux-pm, xen-devel, linux-kernel
In-Reply-To: <0c70c6b12c37ececbfa9b7734495835514f273a4.1312400543.git.len.brown@intel.com>

On 08/03/2011 12:44 PM, Len Brown wrote:
> From: Len Brown <len.brown@intel.com>
> 
> When a Xen Dom0 kernel boots on a hypervisor, it gets access
> to the raw-hardware ACPI tables.  While it parses the idle tables
> for the hypervisor's benefit, it uses HLT for its own idle.
> 
> Rather than have xen scribble on pm_idle and access default_idle,
> have it simply disable_cpuidle() so acpi_idle will not load and
> architecture default HLT will be used.
> 
> cc: xen-devel@lists.xensource.com
> Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> Signed-off-by: Len Brown <len.brown@intel.com>
> ---
>  arch/x86/xen/setup.c      |    3 ++-
>  drivers/cpuidle/cpuidle.c |    4 ++++
>  include/linux/cpuidle.h   |    2 ++
>  3 files changed, 8 insertions(+), 1 deletions(-)
> 

Acked-by: H. Peter Anvin <hpa@linux.intel.com>

^ permalink raw reply

* [RFC][PATCH] PM / Freezer: Freeze filesystems along with freezing processes (was: Re: PM / hibernate xfs lock up / xfs_reclaim_inodes_ag)
From: Rafael J. Wysocki @ 2011-08-03 21:15 UTC (permalink / raw)
  To: Nigel Cunningham
  Cc: Christoph, Dave Chinner, LKML, xfs, Christoph Hellwig,
	Linux PM mailing list
In-Reply-To: <4E300317.7090108@tuxonice.net>

On Wednesday, July 27, 2011, Nigel Cunningham wrote:
> Hi.
> 
> On 27/07/11 20:33, Christoph Hellwig wrote:
> > On Wed, Jul 27, 2011 at 11:35:13AM +0200, Rafael J. Wysocki wrote:
> >> The Pavel's objection, if I remember it correctly, was that some
> >> (or the majority of?) filesystems didn't implement the freezing operation,
> >> so they would be more vulnerable to data loss in case of a failing hibernation
> >> after this change.  However, that's better than actively causing pain to XFS
> >> users.
> > 
> > The objection never made sense and only means he never read the code.
> > freeze_super (or freeze_bdev back then) always does a sync_filesystem
> > before even checking if we have a freeze method, and sync_filesystem is
> > what we iterate over for each superblock in sync().
> 
> I've had freezing supers in TOI for a couple of years now and it has
> only ever helped. To be honest, if you have a ton of dirty pages, it
> does result in a big delay, but that's the worst of it.

OK, so below is the revived patch.

To be precise, we don't call sys_sync() from the freezer any more
(evidently, I'd removed it myself, but later forgot about that), so
it only adds freeze_filesystems() and thaw_filesystems().

Comments welcome.

Thanks,
Rafael

---

Freeze all filesystems during the freezing of tasks by calling
freeze_bdev() for each of them and thaw them during the thawing
of tasks with the help of thaw_bdev().

This is needed by hibernation, because some filesystems (e.g. XFS)
deadlock with the preallocation of memory used by it if the memory
pressure caused by it is too heavy.

The additional benefit of this change is that, if something goes
wrong after filesystems have been frozen, they will stay in a
consistent state and journal replays won't be necessary (e.g. after
a failing suspend or resume).  In particular, this should help to
solve a long-standing issue that in some cases during resume from
hibernation the boot loader causes the journal to be replayed for the
filesystem containing the kernel image and initrd causing it to
become inconsistent with the information stored in the hibernation
image.

This change is based on earlier work by Nigel Cunningham.

Not-really-signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 fs/block_dev.c         |   43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h     |    6 ++++++
 kernel/power/process.c |    7 ++++++-
 3 files changed, 55 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -211,6 +211,7 @@ struct inodes_stat_t {
 #define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
 #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
+#define MS_FROZEN	(1<<25) /* bdev has been frozen */
 #define MS_NOSEC	(1<<28)
 #define MS_BORN		(1<<29)
 #define MS_ACTIVE	(1<<30)
@@ -2047,6 +2048,8 @@ extern struct super_block *freeze_bdev(s
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
+extern void freeze_filesystems(void);
+extern void thaw_filesystems(void);
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
@@ -2061,6 +2064,9 @@ static inline int thaw_bdev(struct block
 {
 	return 0;
 }
+
+static inline void freeze_filesystems(void) {}
+static inline void thaw_filesystems(void) {}
 #endif
 extern int sync_filesystem(struct super_block *);
 extern const struct file_operations def_blk_fops;
Index: linux-2.6/fs/block_dev.c
===================================================================
--- linux-2.6.orig/fs/block_dev.c
+++ linux-2.6/fs/block_dev.c
@@ -314,6 +314,49 @@ out:
 }
 EXPORT_SYMBOL(thaw_bdev);
 
+/**
+ * freeze_filesystems - Force all filesystems into a consistent state.
+ */
+void freeze_filesystems(void)
+{
+	struct super_block *sb;
+
+	lockdep_off();
+	/*
+	 * Freeze in reverse order so filesystems dependant upon others are
+	 * frozen in the right order (eg. loopback on ext3).
+	 */
+	list_for_each_entry_reverse(sb, &super_blocks, s_list) {
+		if (!sb->s_root || !sb->s_bdev ||
+		    (sb->s_frozen == SB_FREEZE_TRANS) ||
+		    (sb->s_flags & MS_RDONLY) ||
+		    (sb->s_flags & MS_FROZEN))
+			continue;
+
+		freeze_bdev(sb->s_bdev);
+		sb->s_flags |= MS_FROZEN;
+	}
+	lockdep_on();
+}
+
+/**
+ * thaw_filesystems - Make all filesystems active again.
+ */
+void thaw_filesystems(void)
+{
+	struct super_block *sb;
+
+	lockdep_off();
+
+	list_for_each_entry(sb, &super_blocks, s_list)
+		if (sb->s_flags & MS_FROZEN) {
+			sb->s_flags &= ~MS_FROZEN;
+			thaw_bdev(sb->s_bdev, sb);
+		}
+
+	lockdep_on();
+}
+
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, blkdev_get_block, wbc);
Index: linux-2.6/kernel/power/process.c
===================================================================
--- linux-2.6.orig/kernel/power/process.c
+++ linux-2.6/kernel/power/process.c
@@ -12,10 +12,10 @@
 #include <linux/oom.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
-#include <linux/syscalls.h>
 #include <linux/freezer.h>
 #include <linux/delay.h>
 #include <linux/workqueue.h>
+#include <linux/fs.h>
 
 /* 
  * Timeout for stopping processes
@@ -147,6 +147,10 @@ int freeze_processes(void)
 		goto Exit;
 	printk("done.\n");
 
+	pr_info("Freezing filesystems ... ");
+	freeze_filesystems();
+	pr_info("done.\n");
+
 	printk("Freezing remaining freezable tasks ... ");
 	error = try_to_freeze_tasks(false);
 	if (error)
@@ -188,6 +192,7 @@ void thaw_processes(void)
 	printk("Restarting tasks ... ");
 	thaw_workqueues();
 	thaw_tasks(true);
+	thaw_filesystems();
 	thaw_tasks(false);
 	schedule();
 	printk("done.\n");

^ permalink raw reply

* [GIT PULL] idle patches for Linux 3.1
From: Len Brown @ 2011-08-03 20:54 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-pm, linux-kernel

Hi Linus,

please pull from: 

git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-idle-2.6.git idle-release

A new power management driver for Moorestown
(essentially a platform idle driver).

Plus some small cpuidle cleanups that were on the list in April.

The cpuidle patches set the stage for adding p-series
support to cpuidle -- possibly by 3.2.

This will update the files shown below.

thanks!

Len Brown
Intel Open Source Technology Center

ps. individual patches are available on linux-pm@lists.linux-foundation.org

 Documentation/kernel-parameters.txt |    3 +
 MAINTAINERS                         |    6 +
 arch/arm/kernel/process.c           |    4 +-
 arch/sh/kernel/idle.c               |    6 +-
 arch/x86/include/asm/processor.h    |    2 -
 arch/x86/kernel/acpi/cstate.c       |   23 +
 arch/x86/kernel/process.c           |   23 -
 arch/x86/kernel/process_32.c        |    4 +-
 arch/x86/kernel/process_64.c        |    4 +-
 arch/x86/platform/mrst/Makefile     |    1 +
 arch/x86/platform/mrst/pmu.c        |  817 +++++++++++++++++++++++++++++++++++
 arch/x86/platform/mrst/pmu.h        |  234 ++++++++++
 arch/x86/xen/setup.c                |    3 +-
 drivers/cpuidle/cpuidle.c           |   50 ++-
 drivers/cpuidle/cpuidle.h           |    1 +
 drivers/cpuidle/driver.c            |    3 +
 drivers/cpuidle/governor.c          |    3 +
 include/linux/cpuidle.h             |    4 +
 18 files changed, 1141 insertions(+), 50 deletions(-)
 create mode 100644 arch/x86/platform/mrst/pmu.c
 create mode 100644 arch/x86/platform/mrst/pmu.h

through these commits:

Len Brown (5):
      mrst_pmu: driver for Intel Moorestown Power Management Unit
      cpuidle: create bootparam "cpuidle.off=1"
      cpuidle: replace xen access to x86 pm_idle and default_idle
      x86 idle: move mwait_idle_with_hints() to where it is used
      cpuidle: stop depending on pm_idle

with this log:

commit 619b3f9e65307529dd4bbc98efe9d2f3b632646c
Author: Len Brown <len.brown@intel.com>
Date:   Fri Apr 1 19:34:59 2011 -0400

    cpuidle: stop depending on pm_idle
    
    cpuidle users should call cpuidle_call_idle() directly
    rather than via (pm_idle)() function pointer.
    
    Architecture may choose to continue using (pm_idle)(),
    but cpuidle need not depend on it:
    
      my_arch_cpu_idle()
    	...
    	if(cpuidle_call_idle())
    		pm_idle();
    
    cc: x86@kernel.org
    cc: Kevin Hilman <khilman@deeprootsystems.com>
    cc: Paul Mundt <lethal@linux-sh.org>
    Signed-off-by: Len Brown <len.brown@intel.com>

commit 779d881fbcdf840e23a9bd3e237aebca4e0376e8
Author: Len Brown <len.brown@intel.com>
Date:   Wed Mar 30 23:52:29 2011 -0400

    x86 idle: move mwait_idle_with_hints() to where it is used
    
    ...and make it static
    
    no functional change
    
    cc: x86@kernel.org
    Signed-off-by: Len Brown <len.brown@intel.com>

commit 0c70c6b12c37ececbfa9b7734495835514f273a4
Author: Len Brown <len.brown@intel.com>
Date:   Fri Apr 1 18:28:35 2011 -0400

    cpuidle: replace xen access to x86 pm_idle and default_idle
    
    When a Xen Dom0 kernel boots on a hypervisor, it gets access
    to the raw-hardware ACPI tables.  While it parses the idle tables
    for the hypervisor's benefit, it uses HLT for its own idle.
    
    Rather than have xen scribble on pm_idle and access default_idle,
    have it simply disable_cpuidle() so acpi_idle will not load and
    architecture default HLT will be used.
    
    cc: xen-devel@lists.xensource.com
    Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
    Signed-off-by: Len Brown <len.brown@intel.com>

commit 3928f8d7ba2df07e391e390f9526c06774b84063
Author: Len Brown <len.brown@intel.com>
Date:   Fri Apr 1 18:13:10 2011 -0400

    cpuidle: create bootparam "cpuidle.off=1"
    
    useful for disabling cpuidle to fall back
    to architecture-default idle loop
    
    cpuidle drivers and governors will fail to register.
    on x86 they'll say so:
    
    intel_idle: intel_idle yielding to (null)
    ACPI: acpi_idle yielding to (null)
    
    Signed-off-by: Len Brown <len.brown@intel.com>

commit 8e24ed2af46a5d008d85bfeb128d25efc394a3fd
Author: Len Brown <len.brown@intel.com>
Date:   Tue Jul 12 22:29:32 2011 -0400

    mrst_pmu: driver for Intel Moorestown Power Management Unit
    
    The Moorestown (MRST) Power Management Unit (PMU) driver
    directs the SOC power states in the "Langwell" south complex (SCU).
    
    It hooks pci_platform_pm_ops[] and thus observes all PCI ".set_state"
    requests.  For devices in the SC, the pmu driver translates those
    PCI requests into the appropriate commands for the SCU.
    
    The PMU driver helps implement S0i3, a deep system idle power idle state.
    Entry into S0i3 is via cpuidle, just like regular processor c-states.
    S0i3 depends on pre-conditions including uni-processor, graphics off,
    and certain IO devices in the SC must be off.  If those pre-conditions
    are met, then the PMU allows cpuidle to enter S0i3, otherwise such requests
    are demoted, either to Atom C4 or Atom C6.
    
    This driver is based on prototype work by Bruce Flemming,
    Illyas Mansoor, Rajeev D. Muralidhar, Vishwesh M. Rudramuni,
    Hari Seshadri and Sujith Thomas.  The current driver also
    includes contributions from H. Peter Anvin, Arjan van de Ven,
    Kristen Accardi, and Yong Wang.
    
    Thanks for additional review feedback from Alan Cox and Randy Dunlap.
    
    Signed-off-by: Len Brown <len.brown@intel.com>
    Acked-by: Alan Cox <alan@linux.intel.com>

^ permalink raw reply

* Re: [PATCH v4 1/2] Input: enable i8042-level wakeup control
From: Daniel Drake @ 2011-08-03 19:51 UTC (permalink / raw)
  To: Dmitry Torokhov; +Cc: linux-pm, dilinger, linux-input
In-Reply-To: <20110803193814.GA17944@core.coreip.homeip.net>

On Wed, Aug 3, 2011 at 8:38 PM, Dmitry Torokhov
<dmitry.torokhov@gmail.com> wrote:
> I believe we can and do mark devices such as USB as wakeup capable on
> other arches, you do not have control here. That is why I am uneasy with
> doing this in input core.

That's possible, but do note that the latest iteration of the patch
only looks at the wakeup capability of the struct input_device. It is
unlikely that another part of the kernel marks it as wakeup-capable
without taking these considerations into account.

It is more likely that, in the USB case, the wakeup field of the
struct usb_device is marked wakeup capable. With Alan's ACPI case
earlier in the discussion, this was true: we found that only the ACPI
device gets marked as wakeup-capable/wakeup-enabled, and not the input
device or even the i8042/serio devices, even though it is implementing
(some form of) keyboard wakeup.

Anyway, I'll look at implementing it according to how you suggested,
which wouldn't involve such wide-reaching changes.

> Is there any keys that are not autorepeating. For example regular
> (non-OLPC) laptops usually do not repeat suspend and other special keys.
> In fact, they quite often forget to send release events for them ;)

I just tested, and all of our keys autorepeat in that fashion. Even
the odd keys like "rotate" and "change language". I guess that makes
us irregular ;)

Daniel

^ permalink raw reply

* [PATCH 5/5] cpuidle: stop depending on pm_idle
From: Len Brown @ 2011-08-03 19:44 UTC (permalink / raw)
  To: linux-pm, linux-kernel; +Cc: Kevin Hilman, Len Brown, x86
In-Reply-To: <8e24ed2af46a5d008d85bfeb128d25efc394a3fd.1312400543.git.len.brown@intel.com>

From: Len Brown <len.brown@intel.com>

cpuidle users should call cpuidle_idle_call() directly
rather than via (pm_idle)() function pointer.

Architecture may choose to continue using (pm_idle)(),
but cpuidle need not depend on it:

  my_arch_cpu_idle()
	...
	if(cpuidle_idle_call())
		pm_idle();

cc: x86@kernel.org
cc: Kevin Hilman <khilman@deeprootsystems.com>
cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/arm/kernel/process.c    |    4 +++-
 arch/sh/kernel/idle.c        |    6 ++++--
 arch/x86/kernel/process_32.c |    4 +++-
 arch/x86/kernel/process_64.c |    4 +++-
 drivers/cpuidle/cpuidle.c    |   38 ++++++++++++++++++--------------------
 include/linux/cpuidle.h      |    2 ++
 6 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 5e1e541..d7ee0d4 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -30,6 +30,7 @@
 #include <linux/uaccess.h>
 #include <linux/random.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/cpuidle.h>
 
 #include <asm/cacheflush.h>
 #include <asm/leds.h>
@@ -196,7 +197,8 @@ void cpu_idle(void)
 				cpu_relax();
 			} else {
 				stop_critical_timings();
-				pm_idle();
+				if (cpuidle_call_idle())
+					pm_idle();
 				start_critical_timings();
 				/*
 				 * This will eventually be removed - pm_idle
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 425d604..9c7099e 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -16,12 +16,13 @@
 #include <linux/thread_info.h>
 #include <linux/irqflags.h>
 #include <linux/smp.h>
+#include <linux/cpuidle.h>
 #include <asm/pgalloc.h>
 #include <asm/system.h>
 #include <asm/atomic.h>
 #include <asm/smp.h>
 
-void (*pm_idle)(void) = NULL;
+static void (*pm_idle)(void);
 
 static int hlt_counter;
 
@@ -100,7 +101,8 @@ void cpu_idle(void)
 			local_irq_disable();
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
-			pm_idle();
+			if (cpuidle_call_idle())
+				pm_idle();
 			/*
 			 * Sanity check to ensure that pm_idle() returns
 			 * with IRQs enabled
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a3d0dc5..7a3b651 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -109,7 +110,8 @@ void cpu_idle(void)
 			local_irq_disable();
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
-			pm_idle();
+			if (cpuidle_idle_call())
+				pm_idle();
 			start_critical_timings();
 		}
 		tick_nohz_restart_sched_tick();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca6f7ab..f693e44 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/ftrace.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -136,7 +137,8 @@ void cpu_idle(void)
 			enter_idle();
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
-			pm_idle();
+			if (cpuidle_idle_call())
+				pm_idle();
 			start_critical_timings();
 
 			/* In many cases the interrupt that ended idle
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 041df0b..d4c5423 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -25,10 +25,10 @@ DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
 
 DEFINE_MUTEX(cpuidle_lock);
 LIST_HEAD(cpuidle_detected_devices);
-static void (*pm_idle_old)(void);
 
 static int enabled_devices;
 static int off __read_mostly;
+static int initialized __read_mostly;
 
 int cpuidle_disabled(void)
 {
@@ -56,25 +56,23 @@ static int __cpuidle_register_device(struct cpuidle_device *dev);
  * cpuidle_idle_call - the main idle loop
  *
  * NOTE: no locks or semaphores should be used here
+ * return non-zero on failure
  */
-static void cpuidle_idle_call(void)
+int cpuidle_idle_call(void)
 {
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
 	struct cpuidle_state *target_state;
 	int next_state;
 
+	if (off)
+		return -ENODEV;
+
+	if (!initialized)
+		return -ENODEV;
+
 	/* check if the device is ready */
-	if (!dev || !dev->enabled) {
-		if (pm_idle_old)
-			pm_idle_old();
-		else
-#if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
-			default_idle();
-#else
-			local_irq_enable();
-#endif
-		return;
-	}
+	if (!dev || !dev->enabled)
+		return -EBUSY;
 
 #if 0
 	/* shows regressions, re-enable for 2.6.29 */
@@ -99,7 +97,7 @@ static void cpuidle_idle_call(void)
 	next_state = cpuidle_curr_governor->select(dev);
 	if (need_resched()) {
 		local_irq_enable();
-		return;
+		return 0;
 	}
 
 	target_state = &dev->states[next_state];
@@ -124,6 +122,8 @@ static void cpuidle_idle_call(void)
 	/* give the governor an opportunity to reflect on the outcome */
 	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev);
+
+	return 0;
 }
 
 /**
@@ -131,10 +131,10 @@ static void cpuidle_idle_call(void)
  */
 void cpuidle_install_idle_handler(void)
 {
-	if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
+	if (enabled_devices) {
 		/* Make sure all changes finished before we switch to new idle */
 		smp_wmb();
-		pm_idle = cpuidle_idle_call;
+		initialized = 1;
 	}
 }
 
@@ -143,8 +143,8 @@ void cpuidle_install_idle_handler(void)
  */
 void cpuidle_uninstall_idle_handler(void)
 {
-	if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
-		pm_idle = pm_idle_old;
+	if (enabled_devices) {
+		initialized = 0;
 		cpuidle_kick_cpus();
 	}
 }
@@ -440,8 +440,6 @@ static int __init cpuidle_init(void)
 	if (cpuidle_disabled())
 		return -ENODEV;
 
-	pm_idle_old = pm_idle;
-
 	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
 	if (ret)
 		return ret;
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index b89f67d..b51629e 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -123,6 +123,7 @@ struct cpuidle_driver {
 
 #ifdef CONFIG_CPU_IDLE
 extern void disable_cpuidle(void);
+extern int cpuidle_idle_call(void);
 
 extern int cpuidle_register_driver(struct cpuidle_driver *drv);
 struct cpuidle_driver *cpuidle_get_driver(void);
@@ -137,6 +138,7 @@ extern void cpuidle_disable_device(struct cpuidle_device *dev);
 
 #else
 static inline void disable_cpuidle(void) { }
+static inline int cpuidle_idle_call(void) { return -ENODEV; }
 
 static inline int cpuidle_register_driver(struct cpuidle_driver *drv)
 {return -ENODEV; }
-- 
1.7.6.396.ge0613

^ permalink raw reply related

* [PATCH 4/5] x86 idle: move mwait_idle_with_hints() to where it is used
From: Len Brown @ 2011-08-03 19:44 UTC (permalink / raw)
  To: linux-pm, linux-kernel; +Cc: Len Brown, x86
In-Reply-To: <8e24ed2af46a5d008d85bfeb128d25efc394a3fd.1312400543.git.len.brown@intel.com>

From: Len Brown <len.brown@intel.com>

...and make it static

no functional change

cc: x86@kernel.org
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/include/asm/processor.h |    2 --
 arch/x86/kernel/acpi/cstate.c    |   23 +++++++++++++++++++++++
 arch/x86/kernel/process.c        |   23 -----------------------
 3 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2193715..0d1171c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -751,8 +751,6 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 		     :: "a" (eax), "c" (ecx));
 }
 
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 extern void init_amd_e400_c1e_mask(void);
 
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 5812404..f50e7fb 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -149,6 +149,29 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+	if (!need_resched()) {
+		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+			clflush((void *)&current_thread_info()->flags);
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (!need_resched())
+			__mwait(ax, cx);
+	}
+}
+
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e1ba8cb..e7e3b01 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -438,29 +438,6 @@ void cpu_idle_wait(void)
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-	if (!need_resched()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(ax, cx);
-	}
-}
-
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
-- 
1.7.6.396.ge0613

^ permalink raw reply related

* [PATCH 3/5] cpuidle: replace xen access to x86 pm_idle and default_idle
From: Len Brown @ 2011-08-03 19:44 UTC (permalink / raw)
  To: linux-pm, linux-kernel; +Cc: Len Brown, xen-devel
In-Reply-To: <8e24ed2af46a5d008d85bfeb128d25efc394a3fd.1312400543.git.len.brown@intel.com>

From: Len Brown <len.brown@intel.com>

When a Xen Dom0 kernel boots on a hypervisor, it gets access
to the raw-hardware ACPI tables.  While it parses the idle tables
for the hypervisor's benefit, it uses HLT for its own idle.

Rather than have xen scribble on pm_idle and access default_idle,
have it simply disable_cpuidle() so acpi_idle will not load and
architecture default HLT will be used.

cc: xen-devel@lists.xensource.com
Tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/xen/setup.c      |    3 ++-
 drivers/cpuidle/cpuidle.c |    4 ++++
 include/linux/cpuidle.h   |    2 ++
 3 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 60aeeb5..a9627e2 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/pm.h>
 #include <linux/memblock.h>
+#include <linux/cpuidle.h>
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
@@ -426,7 +427,7 @@ void __init xen_arch_setup(void)
 #ifdef CONFIG_X86_32
 	boot_cpu_data.hlt_works_ok = 1;
 #endif
-	pm_idle = default_idle;
+	disable_cpuidle();
 	boot_option_idle_override = IDLE_HALT;
 
 	fiddle_vdso();
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index faae2c3..041df0b 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -34,6 +34,10 @@ int cpuidle_disabled(void)
 {
 	return off;
 }
+void disable_cpuidle(void)
+{
+	off = 1;
+}
 
 #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
 static void cpuidle_kick_cpus(void)
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 36719ea..b89f67d 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -122,6 +122,7 @@ struct cpuidle_driver {
 };
 
 #ifdef CONFIG_CPU_IDLE
+extern void disable_cpuidle(void);
 
 extern int cpuidle_register_driver(struct cpuidle_driver *drv);
 struct cpuidle_driver *cpuidle_get_driver(void);
@@ -135,6 +136,7 @@ extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 
 #else
+static inline void disable_cpuidle(void) { }
 
 static inline int cpuidle_register_driver(struct cpuidle_driver *drv)
 {return -ENODEV; }
-- 
1.7.6.396.ge0613

^ permalink raw reply related

* [PATCH 2/5] cpuidle: create bootparam "cpuidle.off=1"
From: Len Brown @ 2011-08-03 19:44 UTC (permalink / raw)
  To: linux-pm, linux-kernel; +Cc: Len Brown
In-Reply-To: <8e24ed2af46a5d008d85bfeb128d25efc394a3fd.1312400543.git.len.brown@intel.com>

From: Len Brown <len.brown@intel.com>

useful for disabling cpuidle to fall back
to architecture-default idle loop

cpuidle drivers and governors will fail to register.
on x86 they'll say so:

intel_idle: intel_idle yielding to (null)
ACPI: acpi_idle yielding to (null)

Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/kernel-parameters.txt |    3 +++
 drivers/cpuidle/cpuidle.c           |   10 ++++++++++
 drivers/cpuidle/cpuidle.h           |    1 +
 drivers/cpuidle/driver.c            |    3 +++
 drivers/cpuidle/governor.c          |    3 +++
 5 files changed, 20 insertions(+), 0 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index aa47be7..9b8e62d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -546,6 +546,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			/proc/<pid>/coredump_filter.
 			See also Documentation/filesystems/proc.txt.
 
+	cpuidle.off=1	[CPU_IDLE]
+			disable the cpuidle sub-system
+
 	cpcihp_generic=	[HW,PCI] Generic port I/O CompactPCI driver
 			Format:
 			<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index bf50924..faae2c3 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -28,6 +28,12 @@ LIST_HEAD(cpuidle_detected_devices);
 static void (*pm_idle_old)(void);
 
 static int enabled_devices;
+static int off __read_mostly;
+
+int cpuidle_disabled(void)
+{
+	return off;
+}
 
 #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
 static void cpuidle_kick_cpus(void)
@@ -427,6 +433,9 @@ static int __init cpuidle_init(void)
 {
 	int ret;
 
+	if (cpuidle_disabled())
+		return -ENODEV;
+
 	pm_idle_old = pm_idle;
 
 	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
@@ -438,4 +447,5 @@ static int __init cpuidle_init(void)
 	return 0;
 }
 
+module_param(off, int, 0444);
 core_initcall(cpuidle_init);
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index 33e50d5..38c3fd8 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -13,6 +13,7 @@ extern struct list_head cpuidle_governors;
 extern struct list_head cpuidle_detected_devices;
 extern struct mutex cpuidle_lock;
 extern spinlock_t cpuidle_driver_lock;
+extern int cpuidle_disabled(void);
 
 /* idle loop */
 extern void cpuidle_install_idle_handler(void);
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index fd1601e..3f7e3ce 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -26,6 +26,9 @@ int cpuidle_register_driver(struct cpuidle_driver *drv)
 	if (!drv)
 		return -EINVAL;
 
+	if (cpuidle_disabled())
+		return -ENODEV;
+
 	spin_lock(&cpuidle_driver_lock);
 	if (cpuidle_curr_driver) {
 		spin_unlock(&cpuidle_driver_lock);
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index 724c164..ea2f8e7 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -81,6 +81,9 @@ int cpuidle_register_governor(struct cpuidle_governor *gov)
 	if (!gov || !gov->select)
 		return -EINVAL;
 
+	if (cpuidle_disabled())
+		return -ENODEV;
+
 	mutex_lock(&cpuidle_lock);
 	if (__cpuidle_find_governor(gov->name) == NULL) {
 		ret = 0;
-- 
1.7.6.396.ge0613

^ permalink raw reply related

* [PATCH 1/5] mrst_pmu: driver for Intel Moorestown Power Management Unit
From: Len Brown @ 2011-08-03 19:44 UTC (permalink / raw)
  To: linux-pm, linux-kernel; +Cc: Len Brown
In-Reply-To: <1312400648-17636-1-git-send-email-lenb@kernel.org>

From: Len Brown <len.brown@intel.com>

The Moorestown (MRST) Power Management Unit (PMU) driver
directs the SOC power states in the "Langwell" south complex (SCU).

It hooks pci_platform_pm_ops[] and thus observes all PCI ".set_state"
requests.  For devices in the SC, the pmu driver translates those
PCI requests into the appropriate commands for the SCU.

The PMU driver helps implement S0i3, a deep system idle power state.
Entry into S0i3 is via cpuidle, just like regular processor c-states.
S0i3 depends on pre-conditions including uni-processor, graphics off,
and certain IO devices in the SC being off.  If those pre-conditions
are met, then the PMU allows cpuidle to enter S0i3, otherwise such requests
are demoted, either to Atom C4 or Atom C6.

This driver is based on prototype work by Bruce Flemming,
Illyas Mansoor, Rajeev D. Muralidhar, Vishwesh M. Rudramuni,
Hari Seshadri and Sujith Thomas.  The current driver also
includes contributions from H. Peter Anvin, Arjan van de Ven,
Kristen Accardi, and Yong Wang.

Thanks for additional review feedback from Alan Cox and Randy Dunlap.

Signed-off-by: Len Brown <len.brown@intel.com>
Acked-by: Alan Cox <alan@linux.intel.com>
---
 MAINTAINERS                     |    6 +
 arch/x86/platform/mrst/Makefile |    1 +
 arch/x86/platform/mrst/pmu.c    |  817 +++++++++++++++++++++++++++++++++++++++
 arch/x86/platform/mrst/pmu.h    |  234 +++++++++++
 4 files changed, 1058 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/platform/mrst/pmu.c
 create mode 100644 arch/x86/platform/mrst/pmu.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 187282d..d37b387 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3344,6 +3344,12 @@ F:	drivers/net/ixgb/
 F:	drivers/net/ixgbe/
 F:	drivers/net/ixgbevf/
 
+INTEL MRST PMU DRIVER
+M:	Len Brown <len.brown@intel.com>
+L:	linux-pm@lists.linux-foundation.org
+S:	Supported
+F:	arch/x86/platform/mrst/pmu.*
+
 INTEL PRO/WIRELESS 2100 NETWORK CONNECTION SUPPORT
 L:	linux-wireless@vger.kernel.org
 S:	Orphan
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index f61ccdd..1ea3877 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 obj-$(CONFIG_X86_MRST)		+= vrtc.o
 obj-$(CONFIG_EARLY_PRINTK_MRST)	+= early_printk_mrst.o
+obj-$(CONFIG_X86_MRST)		+= pmu.o
diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c
new file mode 100644
index 0000000..c0cebfb
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.c
@@ -0,0 +1,817 @@
+/*
+ * mrst/pmu.c - driver for MRST Power Management Unit
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/seq_file.h>
+#include <linux/sfi.h>
+#include <asm/intel_scu_ipc.h>
+#include "pmu.h"
+
+#define IPCMSG_FW_REVISION	0xF4
+
+struct mrst_device {
+	u16 pci_dev_num;	/* DEBUG only */
+	u16 lss;
+	u16 latest_request;
+	unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */
+};
+
+/*
+ * complete list of MRST PCI devices
+ */
+static struct mrst_device mrst_devs[] = {
+/*  0 */ { 0x0800, LSS_SPI0 },		/* Moorestown SPI Ctrl 0 */
+/*  1 */ { 0x0801, LSS_SPI1 },		/* Moorestown SPI Ctrl 1 */
+/*  2 */ { 0x0802, LSS_I2C0 },		/* Moorestown I2C 0 */
+/*  3 */ { 0x0803, LSS_I2C1 },		/* Moorestown I2C 1 */
+/*  4 */ { 0x0804, LSS_I2C2 },		/* Moorestown I2C 2 */
+/*  5 */ { 0x0805, LSS_KBD },		/* Moorestown Keyboard Ctrl */
+/*  6 */ { 0x0806, LSS_USB_HC },	/* Moorestown USB Ctrl */
+/*  7 */ { 0x0807, LSS_SD_HC0 },	/* Moorestown SD Host Ctrl 0 */
+/*  8 */ { 0x0808, LSS_SD_HC1 },	/* Moorestown SD Host Ctrl 1 */
+/*  9 */ { 0x0809, LSS_NAND },		/* Moorestown NAND Ctrl */
+/* 10 */ { 0x080a, LSS_AUDIO },		/* Moorestown Audio Ctrl */
+/* 11 */ { 0x080b, LSS_IMAGING },	/* Moorestown ISP */
+/* 12 */ { 0x080c, LSS_SECURITY },	/* Moorestown Security Controller */
+/* 13 */ { 0x080d, LSS_DISPLAY },	/* Moorestown External Displays */
+/* 14 */ { 0x080e, 0 },			/* Moorestown SCU IPC */
+/* 15 */ { 0x080f, LSS_GPIO },		/* Moorestown GPIO Controller */
+/* 16 */ { 0x0810, 0 },			/* Moorestown Power Management Unit */
+/* 17 */ { 0x0811, LSS_USB_OTG },	/* Moorestown OTG Ctrl */
+/* 18 */ { 0x0812, LSS_SPI2 },		/* Moorestown SPI Ctrl 2 */
+/* 19 */ { 0x0813, 0 },			/* Moorestown SC DMA */
+/* 20 */ { 0x0814, LSS_AUDIO_LPE },	/* Moorestown LPE DMA */
+/* 21 */ { 0x0815, LSS_AUDIO_SSP },	/* Moorestown SSP0 */
+
+/* 22 */ { 0x084F, LSS_SD_HC2 },	/* Moorestown SD Host Ctrl 2 */
+
+/* 23 */ { 0x4102, 0 },			/* Lincroft */
+/* 24 */ { 0x4110, 0 },			/* Lincroft */
+};
+
+/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */
+static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0};
+static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803,
+					0x0804, 0x0805, 0x080f, 0};
+
+/* handle concurrent SMP invocations of pmu_pci_set_power_state() */
+static spinlock_t mrst_pmu_power_state_lock;
+
+static unsigned int wake_counters[MRST_NUM_LSS];	/* DEBUG only */
+static unsigned int pmu_irq_stats[INT_INVALID + 1];	/* DEBUG only */
+
+static int graphics_is_off;
+static int lss_s0i3_enabled;
+static bool mrst_pmu_s0i3_enable;
+
+/*  debug counters */
+static u32 pmu_wait_ready_calls;
+static u32 pmu_wait_ready_udelays;
+static u32 pmu_wait_ready_udelays_max;
+static u32 pmu_wait_done_calls;
+static u32 pmu_wait_done_udelays;
+static u32 pmu_wait_done_udelays_max;
+static u32 pmu_set_power_state_entry;
+static u32 pmu_set_power_state_send_cmd;
+
+static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num)
+{
+	int index = 0;
+
+	if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815))
+		index = pci_dev_num - 0x800;
+	else if (pci_dev_num == 0x084F)
+		index = 22;
+	else if (pci_dev_num == 0x4102)
+		index = 23;
+	else if (pci_dev_num == 0x4110)
+		index = 24;
+
+	if (pci_dev_num != mrst_devs[index].pci_dev_num) {
+		WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num);
+		return 0;
+	}
+
+	return &mrst_devs[index];
+}
+
+/**
+ * mrst_pmu_validate_cstates
+ * @dev: cpuidle_device
+ *
+ * Certain states are not appropriate for governor to pick in some cases.
+ * This function will be called as cpuidle_device's prepare callback and
+ * thus tells governor to ignore such states when selecting the next state
+ * to enter.
+ */
+
+#define IDLE_STATE4_IS_C6	4
+#define IDLE_STATE5_IS_S0I3	5
+
+int mrst_pmu_invalid_cstates(void)
+{
+	int cpu = smp_processor_id();
+
+	/*
+	 * Demote to C4 if the PMU is busy.
+	 * Since LSS changes leave the busy bit clear...
+	 * busy means either the PMU is waiting for an ACK-C6 that
+	 * isn't coming due to an MWAIT that returned immediately;
+	 * or we returned from S0i3 successfully, and the PMU
+	 * is not done sending us interrupts.
+	 */
+	if (pmu_read_busy_status())
+		return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3;
+
+	/*
+	 * Disallow S0i3 if: PMU is not initialized, or CPU1 is active,
+	 * or if device LSS is insufficient, or the GPU is active,
+	 * or if it has been explicitly disabled.
+	 */
+	if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) ||
+	    !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable)
+		return 1 << IDLE_STATE5_IS_S0I3;
+	else
+		return 0;
+}
+
+/*
+ * pmu_update_wake_counters(): read PM_WKS, update wake_counters[]
+ * DEBUG only.
+ */
+static void pmu_update_wake_counters(void)
+{
+	int lss;
+	u32 wake_status;
+
+	wake_status = pmu_read_wks();
+
+	for (lss = 0; lss < MRST_NUM_LSS; ++lss) {
+		if (wake_status & (1 << lss))
+			wake_counters[lss]++;
+	}
+}
+
+int mrst_pmu_s0i3_entry(void)
+{
+	int status;
+
+	/* Clear any possible error conditions */
+	pmu_write_ics(0x300);
+
+	/* set wake control to current D-states */
+	pmu_write_wssc(S0I3_SSS_TARGET);
+
+	status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd);
+	pmu_update_wake_counters();
+	return status;
+}
+
+/* poll for maximum of 5ms for busy bit to clear */
+static int pmu_wait_ready(void)
+{
+	int udelays;
+
+	pmu_wait_ready_calls++;
+
+	for (udelays = 0; udelays < 5000; ++udelays) {
+		if (udelays > pmu_wait_ready_udelays_max)
+			pmu_wait_ready_udelays_max = udelays;
+
+		if (pmu_read_busy_status() == 0)
+			return 0;
+
+		udelay(10);
+		pmu_wait_ready_udelays++;
+	}
+
+	/*
+	 * if this fires, observe
+	 * /sys/kernel/debug/mrst_pmu_wait_ready_calls
+	 * /sys/kernel/debug/mrst_pmu_wait_ready_udelays
+	 */
+	WARN_ONCE(1, "SCU not ready for 5ms");
+	return -EBUSY;
+}
+/* poll for maximum of 50ms for busy bit to clear */
+static int pmu_wait_done(void)
+{
+	int udelays;
+
+	pmu_wait_done_calls++;
+
+	for (udelays = 0; udelays < 5000; ++udelays) {
+		if (udelays > pmu_wait_done_udelays_max)
+			pmu_wait_done_udelays_max = udelays;
+
+		if (pmu_read_busy_status() == 0)
+			return 0;
+
+		udelay(100);
+		pmu_wait_done_udelays++;
+	}
+
+	/*
+	 * if this fires, observe
+	 * /sys/kernel/debug/mrst_pmu_wait_done_calls
+	 * /sys/kernel/debug/mrst_pmu_wait_done_udelays
+	 */
+	WARN_ONCE(1, "SCU not done for 50ms");
+	return -EBUSY;
+}
+
+u32 mrst_pmu_msi_is_disabled(void)
+{
+	return pmu_msi_is_disabled();
+}
+
+void mrst_pmu_enable_msi(void)
+{
+	pmu_msi_enable();
+}
+
+/**
+ * pmu_irq - pmu driver interrupt handler
+ * Context: interrupt context
+ */
+static irqreturn_t pmu_irq(int irq, void *dummy)
+{
+	union pmu_pm_ics pmu_ics;
+
+	pmu_ics.value = pmu_read_ics();
+
+	if (!pmu_ics.bits.pending)
+		return IRQ_NONE;
+
+	switch (pmu_ics.bits.cause) {
+	case INT_SPURIOUS:
+	case INT_CMD_DONE:
+	case INT_CMD_ERR:
+	case INT_WAKE_RX:
+	case INT_SS_ERROR:
+	case INT_S0IX_MISS:
+	case INT_NO_ACKC6:
+		pmu_irq_stats[pmu_ics.bits.cause]++;
+		break;
+	default:
+		pmu_irq_stats[INT_INVALID]++;
+	}
+
+	pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Translate PCI power management to MRST LSS D-states
+ */
+static int pci_2_mrst_state(int lss, pci_power_t pci_state)
+{
+	switch (pci_state) {
+	case PCI_D0:
+		if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET)
+			return D0i1;
+		else
+			return D0;
+	case PCI_D1:
+		return D0i1;
+	case PCI_D2:
+		return D0i2;
+	case PCI_D3hot:
+	case PCI_D3cold:
+		return D0i3;
+	default:
+		WARN(1, "pci_state %d\n", pci_state);
+		return 0;
+	}
+}
+
+static int pmu_issue_command(u32 pm_ssc)
+{
+	union pmu_pm_set_cfg_cmd_t command;
+
+	if (pmu_read_busy_status()) {
+		pr_debug("pmu is busy, Operation not permitted\n");
+		return -1;
+	}
+
+	/*
+	 * enable interrupts in PMU so that interrupts are
+	 * propagated when ioc bit for a particular set
+	 * command is set
+	 */
+
+	pmu_irq_enable();
+
+	/* Configure the sub systems for pmu2 */
+
+	pmu_write_ssc(pm_ssc);
+
+	/*
+	 * Send the set config command for pmu its configured
+	 * for mode CM_IMMEDIATE & hence with No Trigger
+	 */
+
+	command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE;
+	command.pmu2_params.d_param.cfg_delay = 0;
+	command.pmu2_params.d_param.rsvd = 0;
+
+	/* construct the command to send SET_CFG to particular PMU */
+	command.pmu2_params.d_param.cmd = SET_CFG_CMD;
+	command.pmu2_params.d_param.ioc = 0;
+	command.pmu2_params.d_param.mode_id = 0;
+	command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0;
+
+	/* write the value of PM_CMD into particular PMU */
+	pr_debug("pmu command being written %x\n",
+			command.pmu_pm_set_cfg_cmd_value);
+
+	pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value);
+
+	return 0;
+}
+
+static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state)
+{
+	u16 existing_request;
+	int i;
+
+	for (i = 0; ids[i]; ++i) {
+		struct mrst_device *mrst_dev;
+
+		mrst_dev = pci_id_2_mrst_dev(ids[i]);
+		if (unlikely(!mrst_dev))
+			continue;
+
+		existing_request = mrst_dev->latest_request;
+		if (existing_request < pci_state)
+			pci_state = existing_request;
+	}
+	return pci_state;
+}
+
+/**
+ * pmu_pci_set_power_state - Callback function is used by all the PCI devices
+ *			for a platform  specific device power on/shutdown.
+ */
+
+int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state)
+{
+	u32 old_sss, new_sss;
+	int status = 0;
+	struct mrst_device *mrst_dev;
+
+	pmu_set_power_state_entry++;
+
+	BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL);
+	BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold);
+
+	mrst_dev = pci_id_2_mrst_dev(pdev->device);
+	if (unlikely(!mrst_dev))
+		return -ENODEV;
+
+	mrst_dev->pci_state_counts[pci_state]++;	/* count invocations */
+
+	/* PMU driver calls self as part of PCI initialization, ignore */
+	if (pdev->device == PCI_DEV_ID_MRST_PMU)
+		return 0;
+
+	BUG_ON(!pmu_reg); /* SW bug if called before initialized */
+
+	spin_lock(&mrst_pmu_power_state_lock);
+
+	if (pdev->d3_delay) {
+		dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n",
+			pdev->d3_delay);
+		pdev->d3_delay = 0;
+	}
+	/*
+	 * If Lincroft graphics, simply remember state
+	 */
+	if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY
+		&& !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) {
+		if (pci_state == PCI_D0)
+			graphics_is_off = 0;
+		else
+			graphics_is_off = 1;
+		goto ret;
+	}
+
+	if (!mrst_dev->lss)
+		goto ret;	/* device with no LSS */
+
+	if (mrst_dev->latest_request == pci_state)
+		goto ret;	/* no change */
+
+	mrst_dev->latest_request = pci_state;	/* record latest request */
+
+	/*
+	 * LSS9 and LSS10 contain multiple PCI devices.
+	 * Use the lowest numbered (highest power) state in the LSS
+	 */
+	if (mrst_dev->lss == 9)
+		pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state);
+	else if (mrst_dev->lss == 10)
+		pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state);
+
+	status = pmu_wait_ready();
+	if (status)
+		goto ret;
+
+	old_sss = pmu_read_sss();
+	new_sss = old_sss & ~SSMSK(3, mrst_dev->lss);
+	new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state),
+			mrst_dev->lss);
+
+	if (new_sss == old_sss)
+		goto ret;	/* nothing to do */
+
+	pmu_set_power_state_send_cmd++;
+
+	status = pmu_issue_command(new_sss);
+
+	if (unlikely(status != 0)) {
+		dev_err(&pdev->dev, "Failed to Issue a PM command\n");
+		goto ret;
+	}
+
+	if (pmu_wait_done())
+		goto ret;
+
+	lss_s0i3_enabled =
+	((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET);
+ret:
+	spin_unlock(&mrst_pmu_power_state_lock);
+	return status;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static inline const char *d0ix_name(int state)
+{
+	return d0ix_names[(int) state];
+}
+
+static int debug_mrst_pmu_show(struct seq_file *s, void *unused)
+{
+	struct pci_dev *pdev = NULL;
+	u32 cur_pmsss;
+	int lss;
+
+	seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET);
+
+	cur_pmsss = pmu_read_sss();
+
+	seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET);
+
+	seq_printf(s, "0x%08X Current SSS ", cur_pmsss);
+	seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n");
+
+	if (cpumask_equal(cpu_online_mask, cpumask_of(0)))
+		seq_printf(s, "cpu0 is only cpu online\n");
+	else
+		seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n");
+
+	seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]");
+
+
+	for_each_pci_dev(pdev) {
+		int pos;
+		u16 pmcsr;
+		struct mrst_device *mrst_dev;
+		int i;
+
+		mrst_dev = pci_id_2_mrst_dev(pdev->device);
+
+		seq_printf(s, "%s %04x/%04X %-16.16s ",
+			dev_name(&pdev->dev),
+			pdev->vendor, pdev->device,
+			dev_driver_string(&pdev->dev));
+
+		if (unlikely (!mrst_dev)) {
+			seq_printf(s, " UNKNOWN\n");
+			continue;
+		}
+
+		if (mrst_dev->lss)
+			seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss,
+				d0ix_name(((cur_pmsss >>
+					(mrst_dev->lss * 2)) & 0x3)));
+		else
+			seq_printf(s, "            ");
+
+		/* PCI PM config space setting */
+		pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
+		if (pos != 0) {
+			pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
+		seq_printf(s, "PCI-%-4s",
+			pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK));
+		} else {
+			seq_printf(s, "        ");
+		}
+
+		seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request));
+		for (i = 0; i <= PCI_D3cold; ++i)
+			seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]);
+
+		if (mrst_dev->lss) {
+			unsigned int lssmask;
+
+			lssmask = SSMSK(D0i3, mrst_dev->lss);
+
+			if ((lssmask & S0I3_SSS_TARGET) &&
+				((lssmask & cur_pmsss) !=
+					(lssmask & S0I3_SSS_TARGET)))
+						seq_printf(s , "[BLOCKS s0i3]");
+		}
+
+		seq_printf(s, "\n");
+	}
+	seq_printf(s, "Wake Counters:\n");
+	for (lss = 0; lss < MRST_NUM_LSS; ++lss)
+		seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]);
+
+	seq_printf(s, "Interrupt Counters:\n");
+	seq_printf(s,
+		"INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n"
+		"INT_CMD_ERR  \t%8u\n" "INT_WAKE_RX  \t%8u\n"
+		"INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n"
+		"INT_NO_ACKC6 \t%8u\n" "INT_INVALID  \t%8u\n",
+		pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE],
+		pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX],
+		pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS],
+		pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]);
+
+	seq_printf(s, "mrst_pmu_wait_ready_calls          %8d\n",
+			pmu_wait_ready_calls);
+	seq_printf(s, "mrst_pmu_wait_ready_udelays        %8d\n",
+			pmu_wait_ready_udelays);
+	seq_printf(s, "mrst_pmu_wait_ready_udelays_max    %8d\n",
+			pmu_wait_ready_udelays_max);
+	seq_printf(s, "mrst_pmu_wait_done_calls           %8d\n",
+			pmu_wait_done_calls);
+	seq_printf(s, "mrst_pmu_wait_done_udelays         %8d\n",
+			pmu_wait_done_udelays);
+	seq_printf(s, "mrst_pmu_wait_done_udelays_max     %8d\n",
+			pmu_wait_done_udelays_max);
+	seq_printf(s, "mrst_pmu_set_power_state_entry     %8d\n",
+			pmu_set_power_state_entry);
+	seq_printf(s, "mrst_pmu_set_power_state_send_cmd  %8d\n",
+			pmu_set_power_state_send_cmd);
+	seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status());
+
+	return 0;
+}
+
+static int debug_mrst_pmu_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, debug_mrst_pmu_show, NULL);
+}
+
+static const struct file_operations devices_state_operations = {
+	.open		= debug_mrst_pmu_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif	/* DEBUG_FS */
+
+/*
+ * Validate SCU PCI shim PCI vendor capability byte
+ * against LSS hard-coded in mrst_devs[] above.
+ * DEBUG only.
+ */
+static void pmu_scu_firmware_debug(void)
+{
+	struct pci_dev *pdev = NULL;
+
+	for_each_pci_dev(pdev) {
+		struct mrst_device *mrst_dev;
+		u8 pci_config_lss;
+		int pos;
+
+		mrst_dev = pci_id_2_mrst_dev(pdev->device);
+		if (unlikely(!mrst_dev)) {
+			printk(KERN_ERR FW_BUG "pmu: Unknown "
+				"PCI device 0x%04X\n", pdev->device);
+			continue;
+		}
+
+		if (mrst_dev->lss == 0)
+			continue;	 /* no LSS in our table */
+
+		pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
+		if (!pos != 0) {
+			printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+				"missing PCI Vendor Capability\n",
+				pdev->device);
+			continue;
+		}
+		pci_read_config_byte(pdev, pos + 4, &pci_config_lss);
+		if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) {
+			printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+				"invalid PCI Vendor Capability 0x%x "
+				" expected LSS 0x%X\n",
+				pdev->device, pci_config_lss, mrst_dev->lss);
+			continue;
+		}
+		pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK;
+
+		if (mrst_dev->lss == pci_config_lss)
+			continue;
+
+		printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n",
+			pdev->device, pci_config_lss, mrst_dev->lss);
+	}
+}
+
+/**
+ * pmu_probe
+ */
+static int __devinit pmu_probe(struct pci_dev *pdev,
+				   const struct pci_device_id *pci_id)
+{
+	int ret;
+	struct mrst_pmu_reg *pmu;
+
+	/* Init the device */
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		dev_err(&pdev->dev, "Unable to Enable PCI device\n");
+		return ret;
+	}
+
+	ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n");
+		goto out_err1;
+	}
+
+	/* Map the memory of PMU reg base */
+	pmu = pci_iomap(pdev, 0, 0);
+	if (!pmu) {
+		dev_err(&pdev->dev, "Unable to map the PMU address space\n");
+		ret = -ENOMEM;
+		goto out_err2;
+	}
+
+#ifdef CONFIG_DEBUG_FS
+	/* /sys/kernel/debug/mrst_pmu */
+	(void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO,
+				NULL, NULL, &devices_state_operations);
+#endif
+	pmu_reg = pmu;	/* success */
+
+	if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) {
+		dev_err(&pdev->dev, "Registering isr has failed\n");
+		ret = -1;
+		goto out_err3;
+	}
+
+	pmu_scu_firmware_debug();
+
+	pmu_write_wkc(S0I3_WAKE_SOURCES);	/* Enable S0i3 wakeup sources */
+
+	pmu_wait_ready();
+
+	pmu_write_ssc(D0I1_ACG_SSS_TARGET);	/* Enable Auto-Clock_Gating */
+	pmu_write_cmd(0x201);
+
+	spin_lock_init(&mrst_pmu_power_state_lock);
+
+	/* Enable the hardware interrupt */
+	pmu_irq_enable();
+	return 0;
+
+out_err3:
+	free_irq(pdev->irq, NULL);
+	pci_iounmap(pdev, pmu_reg);
+	pmu_reg = NULL;
+out_err2:
+	pci_release_region(pdev, 0);
+out_err1:
+	pci_disable_device(pdev);
+	return ret;
+}
+
+static void __devexit pmu_remove(struct pci_dev *pdev)
+{
+	dev_err(&pdev->dev, "Mid PM pmu_remove called\n");
+
+	/* Freeing up the irq */
+	free_irq(pdev->irq, NULL);
+
+	pci_iounmap(pdev, pmu_reg);
+	pmu_reg = NULL;
+
+	/* disable the current PCI device */
+	pci_release_region(pdev, 0);
+	pci_disable_device(pdev);
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = {
+	{ PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(pci, pmu_pci_ids);
+
+static struct pci_driver driver = {
+	.name = MRST_PMU_DRV_NAME,
+	.id_table = pmu_pci_ids,
+	.probe = pmu_probe,
+	.remove = __devexit_p(pmu_remove),
+};
+
+/**
+ * pmu_pci_register - register the PMU driver as PCI device
+ */
+static int __init pmu_pci_register(void)
+{
+	return pci_register_driver(&driver);
+}
+
+/* Register and probe via fs_initcall() to precede device_initcall() */
+fs_initcall(pmu_pci_register);
+
+static void __exit mid_pci_cleanup(void)
+{
+	pci_unregister_driver(&driver);
+}
+
+static int ia_major;
+static int ia_minor;
+
+static int pmu_sfi_parse_oem(struct sfi_table_header *table)
+{
+	struct sfi_table_simple *sb;
+
+	sb = (struct sfi_table_simple *)table;
+	ia_major = (sb->pentry[1] >> 0) & 0xFFFF;
+	ia_minor = (sb->pentry[1] >> 16) & 0xFFFF;
+	printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n",
+		ia_major, ia_minor);
+
+	return 0;
+}
+
+static int __init scu_fw_check(void)
+{
+	int ret;
+	u32 fw_version;
+
+	if (!pmu_reg)
+		return 0;	/* this driver didn't probe-out */
+
+	sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem);
+
+	if (ia_major < 0x6005 || ia_minor < 0x1525) {
+		WARN(1, "mrst_pmu: IA FW version too old\n");
+		return -1;
+	}
+
+	ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0,
+					&fw_version, 1);
+
+	if (ret) {
+		WARN(1, "mrst_pmu: IPC FW version? %d\n", ret);
+	} else {
+		int scu_major = (fw_version >> 8) & 0xFF;
+		int scu_minor = (fw_version >> 0) & 0xFF;
+
+		printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version);
+
+		if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) {
+			printk(KERN_INFO "mrst_pmu: enabling S0i3\n");
+			mrst_pmu_s0i3_enable = true;
+		} else {
+			WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X",
+					scu_major, scu_minor);
+		}
+	}
+	return 0;
+}
+late_initcall(scu_fw_check);
+module_exit(mid_pci_cleanup);
diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h
new file mode 100644
index 0000000..bfbfe64
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.h
@@ -0,0 +1,234 @@
+/*
+ * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _MRST_PMU_H_
+#define _MRST_PMU_H_
+
+#define PCI_DEV_ID_MRST_PMU		0x0810
+#define MRST_PMU_DRV_NAME		"mrst_pmu"
+#define	PCI_SUB_CLASS_MASK		0xFF00
+
+#define	PCI_VENDOR_CAP_LOG_ID_MASK	0x7F
+#define PCI_VENDOR_CAP_LOG_SS_MASK	0x80
+
+#define SUB_SYS_ALL_D0I1	0x01155555	/* SSMSK(D0i1, n) for n = 0-10 and 12; LSS 11 stays 0 */
+#define S0I3_WAKE_SOURCES	0x00001FFF	/* bits 12:0; presumably one per LSS - confirm */
+
+#define PM_S0I3_COMMAND					\
+	((0 << 31) |	/* Reserved */			\
+	(0 << 30) |	/* Core must be idle */		\
+	(0xc2 << 22) |	/* ACK C6 trigger */		\
+	(3 << 19) |	/* Trigger on DMI message */	\
+	(3 << 16) |	/* Enter S0i3 */		\
+	(0 << 13) |	/* Numeric mode ID (sw) */	\
+	(3 << 9) |	/* Trigger mode */		\
+	(0 << 8) |	/* Do not interrupt */		\
+	(1 << 0))	/* Set configuration */
+
+#define	LSS_DMI		0
+#define	LSS_SD_HC0	1
+#define	LSS_SD_HC1	2
+#define	LSS_NAND	3
+#define	LSS_IMAGING	4
+#define	LSS_SECURITY	5
+#define	LSS_DISPLAY	6
+#define	LSS_USB_HC	7
+#define	LSS_USB_OTG	8
+#define	LSS_AUDIO	9
+#define	LSS_AUDIO_LPE	9
+#define	LSS_AUDIO_SSP	9
+#define	LSS_I2C0	10
+#define	LSS_I2C1	10
+#define	LSS_I2C2	10
+#define	LSS_KBD		10
+#define	LSS_SPI0	10
+#define	LSS_SPI1	10
+#define	LSS_SPI2	10
+#define	LSS_GPIO	10
+#define	LSS_SRAM	11	/* used by SCU, do not touch */
+#define	LSS_SD_HC2	12
+/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */
+#define MRST_NUM_LSS	13
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))	/* NB: the selected argument is evaluated twice */
+
+#define	SSMSK(mask, lss) ((mask) << ((lss) * 2))	/* each LSS owns a 2-bit field */
+#define	D0	0	/* D0..D0i3: 2-bit state values passed to SSMSK() */
+#define	D0i1	1
+#define	D0i2	2
+#define	D0i3	3
+
+#define S0I3_SSS_TARGET	(		\
+	SSMSK(D0i1, LSS_DMI) |		\
+	SSMSK(D0i3, LSS_SD_HC0) |	\
+	SSMSK(D0i3, LSS_SD_HC1) |	\
+	SSMSK(D0i3, LSS_NAND) |		\
+	SSMSK(D0i3, LSS_SD_HC2) |	\
+	SSMSK(D0i3, LSS_IMAGING) |	\
+	SSMSK(D0i3, LSS_SECURITY) |	\
+	SSMSK(D0i3, LSS_DISPLAY) |	\
+	SSMSK(D0i3, LSS_USB_HC) |	\
+	SSMSK(D0i3, LSS_USB_OTG) |	\
+	SSMSK(D0i3, LSS_AUDIO) |	\
+	SSMSK(D0i1, LSS_I2C0))
+
+/*
+ * D0i1 on Langwell is Autonomous Clock Gating (ACG).
+ * Enable ACG on every LSS except camera and audio
+ */
+#define D0I1_ACG_SSS_TARGET	 \
+	(SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO))
+
+enum cm_mode {
+	CM_NOP,			/* ignore the config mode value */
+	CM_IMMEDIATE,
+	CM_DELAY,
+	CM_TRIGGER,
+	CM_INVALID
+};
+
+enum sys_state {
+	SYS_STATE_S0I0,
+	SYS_STATE_S0I1,
+	SYS_STATE_S0I2,
+	SYS_STATE_S0I3,
+	SYS_STATE_S3,
+	SYS_STATE_S5
+};
+
+#define SET_CFG_CMD	1
+
+enum int_status {
+	INT_SPURIOUS = 0,
+	INT_CMD_DONE = 1,
+	INT_CMD_ERR = 2,
+	INT_WAKE_RX = 3,
+	INT_SS_ERROR = 4,
+	INT_S0IX_MISS = 5,
+	INT_NO_ACKC6 = 6,
+	INT_INVALID = 7,
+};
+
+/* PMU register interface */
+static struct mrst_pmu_reg {
+	u32 pm_sts;		/* 0x00 */
+	u32 pm_cmd;		/* 0x04 */
+	u32 pm_ics;		/* 0x08 */
+	u32 _resv1;		/* 0x0C */
+	u32 pm_wkc[2];		/* 0x10 */
+	u32 pm_wks[2];		/* 0x18 */
+	u32 pm_ssc[4];		/* 0x20 */
+	u32 pm_sss[4];		/* 0x30 */
+	u32 pm_wssc[4];		/* 0x40 */
+	u32 pm_c3c4;		/* 0x50 */
+	u32 pm_c5c6;		/* 0x54 */
+	u32 pm_msi_disable;	/* 0x58 */
+} *pmu_reg;
+
+static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); }
+static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); }
+static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); }
+static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); }
+
+static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); }
+static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); }
+static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); }
+static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); }
+static inline void pmu_write_wssc(u32 arg)
+					{ writel(arg, &pmu_reg->pm_wssc[0]); }
+
+static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); }
+static inline u32 pmu_msi_is_disabled(void)
+				{ return readl(&pmu_reg->pm_msi_disable); }
+
+union pmu_pm_ics {
+	struct {
+		u32 cause:8;
+		u32 enable:1;
+		u32 pending:1;
+		u32 reserved:22;
+	} bits;
+	u32 value;
+};
+
+static inline void pmu_irq_enable(void)
+{
+	union pmu_pm_ics pmu_ics;
+
+	pmu_ics.value = pmu_read_ics();	/* read-modify-write of pm_ics */
+	pmu_ics.bits.enable = 1;	/* other fields are written back as read */
+	pmu_write_ics(pmu_ics.value);
+}
+
+union pmu_pm_status {
+	struct {
+		u32 pmu_rev:8;
+		u32 pmu_busy:1;
+		u32 mode_id:4;
+		u32 Reserved:19;
+	} pmu_status_parts;
+	u32 pmu_status_value;
+};
+
+static inline int pmu_read_busy_status(void)
+{
+	union pmu_pm_status result;
+
+	result.pmu_status_value = pmu_read_sts();	/* raw pm_sts register */
+
+	return result.pmu_status_parts.pmu_busy;	/* 1-bit field: 0 or 1 */
+}
+
+/* pmu set config parameters */
+struct cfg_delay_param_t {
+	u32 cmd:8;
+	u32 ioc:1;
+	u32 cfg_mode:4;
+	u32 mode_id:3;
+	u32 sys_state:3;
+	u32 cfg_delay:8;
+	u32 rsvd:5;
+};
+
+struct cfg_trig_param_t {
+	u32 cmd:8;
+	u32 ioc:1;
+	u32 cfg_mode:4;
+	u32 mode_id:3;
+	u32 sys_state:3;
+	u32 cfg_trig_type:3;
+	u32 cfg_trig_val:8;
+	u32 cmbi:1;
+	u32 rsvd1:1;
+};
+
+union pmu_pm_set_cfg_cmd_t {
+	union {
+		struct cfg_delay_param_t d_param;
+		struct cfg_trig_param_t t_param;
+	} pmu2_params;
+	u32 pmu_pm_set_cfg_cmd_value;
+};
+
+#ifdef FUTURE_PATCH
+extern int mrst_s0i3_entry(u32 regval, u32 *regaddr);
+#else	/* !FUTURE_PATCH: stub that ignores its arguments and returns -1 */
+static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; }
+#endif	/* FUTURE_PATCH */
+#endif	/* _MRST_PMU_H_ */
-- 
1.7.6.396.ge0613

^ permalink raw reply related

* idle patch queue for Linux 3.1
From: Len Brown @ 2011-08-03 19:44 UTC (permalink / raw)
  To: linux-pm, linux-kernel

Here is the idle patch queue for Linux 3.1
Please let me know if you see troubles with
any of these patches.

thanks,
-Len Brown, Intel Open Source Technology Center

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox