* [PATCH 2/7] x86, acpi, tboot: Have a ACPI sleep override instead of calling tboot_sleep.
From: Konrad Rzeszutek Wilk @ 2011-08-31 18:31 UTC (permalink / raw)
To: x86, tglx, tboot-devel, shane.wang, linux-pm, linux-acpi,
len.brown
Cc: xen-devel, Konrad Rzeszutek Wilk
In-Reply-To: <1314815484-4668-1-git-send-email-konrad.wilk@oracle.com>
The ACPI suspend path makes a call to tboot_sleep right before
it writes the PM1A, PM1B values. We replace the direct call to
tboot with a registration callback similar to __acpi_register_gsi.
CC: Thomas Gleixner <tglx@linutronix.de>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: x86@kernel.org
CC: Len Brown <len.brown@intel.com>
CC: Joseph Cihula <joseph.cihula@intel.com>
CC: Shane Wang <shane.wang@intel.com>
CC: xen-devel@lists.xensource.com
CC: linux-pm@lists.linux-foundation.org
CC: tboot-devel@lists.sourceforge.net
CC: linux-acpi@vger.kernel.org
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/include/asm/acpi.h | 3 +++
arch/x86/kernel/acpi/boot.c | 3 +++
arch/x86/kernel/tboot.c | 13 +++++++++----
drivers/acpi/acpica/hwsleep.c | 12 ++++++++++--
include/linux/tboot.h | 3 ++-
5 files changed, 27 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 610001d..49864a1 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -98,6 +98,9 @@ void acpi_pic_sci_set_trigger(unsigned int, u16);
extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity);
+extern int (*__acpi_override_sleep)(u8 sleep_state, u32 pm1a_ctrl,
+ u32 pm1b_ctrl, bool *skip_rest);
+
static inline void disable_acpi(void)
{
acpi_disabled = 1;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4558f0d..d191b4c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -552,6 +552,9 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity) = acpi_register_gsi_pic;
+int (*__acpi_override_sleep)(u8 sleep_state, u32 pm1a_ctrl,
+ u32 pm1b_ctrl, bool *skip_rest) = NULL;
+
/*
* success: return IRQ number (>=0)
* failure: return < 0
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 30ac65d..a18070c 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -41,7 +41,7 @@
#include <asm/setup.h>
#include <asm/e820.h>
#include <asm/io.h>
-
+#include <linux/acpi.h>
#include "acpi/realmode/wakeup.h"
/* Global pointer to shared data; NULL means no measured launch. */
@@ -270,7 +270,8 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
offsetof(struct acpi_table_facs, firmware_waking_vector);
}
-void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control,
+ bool *skip_rest)
{
static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
/* S0,1,2: */ -1, -1, -1,
@@ -279,7 +280,7 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
/* S5: */ TB_SHUTDOWN_S5 };
if (!tboot_enabled())
- return;
+ return AE_OK;
tboot_copy_fadt(&acpi_gbl_FADT);
tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
@@ -290,10 +291,12 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
if (sleep_state >= ACPI_S_STATE_COUNT ||
acpi_shutdown_map[sleep_state] == -1) {
pr_warning("unsupported sleep state 0x%x\n", sleep_state);
- return;
+ return AE_ERROR;
}
tboot_shutdown(acpi_shutdown_map[sleep_state]);
+
+ return AE_OK;
}
static atomic_t ap_wfs_count;
@@ -343,6 +346,8 @@ static __init int tboot_late_init(void)
atomic_set(&ap_wfs_count, 0);
register_hotcpu_notifier(&tboot_cpu_notifier);
+
+ __acpi_override_sleep = tboot_sleep;
return 0;
}
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index 2ac28bb..31d1198 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -45,7 +45,6 @@
#include <acpi/acpi.h>
#include "accommon.h"
#include "actables.h"
-#include <linux/tboot.h>
#define _COMPONENT ACPI_HARDWARE
ACPI_MODULE_NAME("hwsleep")
@@ -343,8 +342,17 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state)
ACPI_FLUSH_CPU_CACHE();
- tboot_sleep(sleep_state, pm1a_control, pm1b_control);
+ if (__acpi_override_sleep) {
+ bool skip_rest = false;
+ status = __acpi_override_sleep(sleep_state, pm1a_control,
+ pm1b_control, &skip_rest);
+
+ if (ACPI_FAILURE(status))
+ return_ACPI_STATUS(status);
+ if (skip_rest)
+ return_ACPI_STATUS(AE_OK);
+ }
/* Write #2: Write both SLP_TYP + SLP_EN */
status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index 1dba6ee..19badbd 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -143,7 +143,8 @@ static inline int tboot_enabled(void)
extern void tboot_probe(void);
extern void tboot_shutdown(u32 shutdown_type);
-extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control);
+extern int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control,
+ bool *skip);
extern struct acpi_table_header *tboot_get_dmar_table(
struct acpi_table_header *dmar_tbl);
extern int tboot_force_iommu(void);
--
1.7.4.1
^ permalink raw reply related
* [PATCH 1/7] x86: Expand the x86_msi_ops to have a restore MSIs.
From: Konrad Rzeszutek Wilk @ 2011-08-31 18:31 UTC (permalink / raw)
To: x86, tglx, tboot-devel, shane.wang, linux-pm, linux-acpi,
len.brown
Cc: xen-devel, Konrad Rzeszutek Wilk
In-Reply-To: <1314815484-4668-1-git-send-email-konrad.wilk@oracle.com>
The MSI restore function will become a function pointer in an
x86_msi_ops struct. It defaults to the implementation in the
io_apic.c and msi.c. We piggyback on the indirection mechanism
introduced by "x86: Introduce x86_msi_ops".
Cc: x86@kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
arch/x86/include/asm/pci.h | 9 +++++++++
arch/x86/include/asm/x86_init.h | 1 +
arch/x86/kernel/x86_init.c | 1 +
drivers/pci/msi.c | 29 +++++++++++++++++++++++++++--
4 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d498943..df75d07 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq)
{
x86_msi.teardown_msi_irq(irq);
}
+static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ x86_msi.restore_msi_irqs(dev, irq);
+}
#define arch_setup_msi_irqs x86_setup_msi_irqs
#define arch_teardown_msi_irqs x86_teardown_msi_irqs
#define arch_teardown_msi_irq x86_teardown_msi_irq
+#define arch_restore_msi_irqs x86_restore_msi_irqs
/* implemented in arch/x86/kernel/apic/io_apic. */
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev, int irq);
/* default to the implementation in drivers/lib/msi.c */
#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+#define HAVE_DEFAULT_MSI_RESTORE_IRQS
void default_teardown_msi_irqs(struct pci_dev *dev);
+void default_restore_msi_irqs(struct pci_dev *dev, int irq);
#else
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
#define default_teardown_msi_irqs NULL
+#define default_restore_msi_irqs NULL
#endif
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index d3d8590..7af18be 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -174,6 +174,7 @@ struct x86_msi_ops {
int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
void (*teardown_msi_irq)(unsigned int irq);
void (*teardown_msi_irqs)(struct pci_dev *dev);
+ void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
};
extern struct x86_init_ops x86_init;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 6f164bd..bd1fe10 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -110,4 +110,5 @@ struct x86_msi_ops x86_msi = {
.setup_msi_irqs = native_setup_msi_irqs,
.teardown_msi_irq = native_teardown_msi_irq,
.teardown_msi_irqs = default_teardown_msi_irqs,
+ .restore_msi_irqs = default_restore_msi_irqs,
};
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 2f10328..f1fd801 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -85,6 +85,31 @@ void default_teardown_msi_irqs(struct pci_dev *dev)
}
#endif
+#ifndef arch_restore_msi_irqs
+# define arch_restore_msi_irqs default_restore_msi_irqs
+# define HAVE_DEFAULT_MSI_RESTORE_IRQS
+#endif
+
+#ifdef HAVE_DEFAULT_MSI_RESTORE_IRQS
+void default_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ struct msi_desc *entry;
+
+ entry = NULL;
+ if (dev->msix_enabled) {
+ list_for_each_entry(entry, &dev->msi_list, list) {
+ if (irq == entry->irq)
+ break;
+ }
+ } else if (dev->msi_enabled) {
+ entry = irq_get_msi_desc(irq);
+ }
+
+ if (entry)
+ write_msi_msg(irq, &entry->msg);
+}
+#endif
+
static void msi_set_enable(struct pci_dev *dev, int pos, int enable)
{
u16 control;
@@ -359,7 +384,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev)
pci_intx_for_msi(dev, 0);
msi_set_enable(dev, pos, 0);
- write_msi_msg(dev->irq, &entry->msg);
+ arch_restore_msi_irqs(dev, dev->irq);
pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
msi_mask_irq(entry, msi_capable_mask(control), entry->masked);
@@ -387,7 +412,7 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
list_for_each_entry(entry, &dev->msi_list, list) {
- write_msi_msg(entry->irq, &entry->msg);
+ arch_restore_msi_irqs(dev, entry->irq);
msix_mask_irq(entry, entry->masked);
}
--
1.7.4.1
^ permalink raw reply related
* [RFC PATCH v1] ACPI S3 to work under Xen.
From: Konrad Rzeszutek Wilk @ 2011-08-31 18:31 UTC (permalink / raw)
To: x86, tglx, tboot-devel, shane.wang, linux-pm, linux-acpi,
len.brown
Cc: xen-devel
Attached is an RFC set of patches to enable S3 to work with the Xen hypervisor.
The relationship that Xen has with the Linux kernel is symbiotic. The Linux
kernel does the ACPI "stuff" and tells the hypervisor to do the low-level
stuff (such as program the IOAPIC, setup vectors, etc). The realm of
ACPI S3 is more complex as we need to save the CPU state (and Intel TXT
values - which the hypervisor has to do) and then restore them.
The major difficulty we hit was with 'acpi_suspend_lowlevel' - which tweaks
a lot of lowlevel values and some of them are not properly handled by Xen.
Liang Tang has figured out which of them we trip over (read below) - and he
suggested that perhaps we can provide a registration mechanism to abstract
this away.
So the attached patches do exactly that - there are two entry points
in the ACPI.
1). For S3: acpi_suspend_lowlevel -> .. lots of code -> acpi_enter_sleep_state
2). For S1/S4/S5: acpi_enter_sleep_state
The first naive idea was to abstract away the tboot_sleep code in the
'acpi_enter_sleep_state' function so that we can use it too. And lo and
behold - it worked splendidly for powering off (S5 I believe).
For S3 that did not work - during suspend the hypervisor tripped over when
saving cr8. During resume it tripped over at restoring the cr3, cr8, idt,
and gdt values.
What do you guys think? One thought is to use the paravirt interface to
deal with cr3, cr8, idt, gdt for suspend/resume case.. But that is a lot
of extra 'if' in the paravirt code - which the callback registration would
effectively do the same thing as the paravirt - except at a higher level.
Thoughts?
Konrad Rzeszutek Wilk (5):
x86: Expand the x86_msi_ops to have a restore MSIs.
x86, acpi, tboot: Have a ACPI sleep override instead of calling tboot_sleep.
xen: Utilize the restore_msi_irqs hook.
xen/acpi/sleep: Enable ACPI sleep via the __acpi_override_sleep
xen/acpi/sleep: Register to the acpi_suspend_lowlevel a callback.
Liang Tang (1):
x86/acpi/sleep: Provide registration for acpi_suspend_lowlevel.
Yu Ke (1):
xen/acpi: Domain0 acpi parser related platform hypercall
arch/ia64/include/asm/xen/interface.h | 1 +
arch/x86/include/asm/acpi.h | 5 +-
arch/x86/include/asm/pci.h | 9 +
arch/x86/include/asm/x86_init.h | 1 +
arch/x86/include/asm/xen/hypercall.h | 8 +
arch/x86/include/asm/xen/interface.h | 1 +
arch/x86/kernel/acpi/boot.c | 5 +
arch/x86/kernel/acpi/sleep.c | 4 +-
arch/x86/kernel/acpi/sleep.h | 2 +
arch/x86/kernel/tboot.c | 13 +-
arch/x86/kernel/x86_init.c | 1 +
arch/x86/pci/xen.c | 12 ++
arch/x86/xen/enlighten.c | 3 +
drivers/acpi/acpica/hwsleep.c | 12 +-
drivers/acpi/sleep.c | 2 +
drivers/pci/msi.c | 29 +++-
drivers/xen/Makefile | 2 +-
drivers/xen/acpi.c | 25 +++
include/linux/tboot.h | 3 +-
include/xen/acpi.h | 38 ++++
include/xen/interface/physdev.h | 7 +
include/xen/interface/platform.h | 320 +++++++++++++++++++++++++++++++++
include/xen/interface/xen.h | 1 +
23 files changed, 491 insertions(+), 13 deletions(-)
^ permalink raw reply
* Re: [PATCH pm-freezer 1/4] cgroup_freezer: fix freezer->state setting bug in freezer_change_state()
From: Oleg Nesterov @ 2011-08-31 18:08 UTC (permalink / raw)
To: Tejun Heo; +Cc: linux-kernel, Paul Menage, containers, linux-pm
In-Reply-To: <20110831102100.GA2828@mtj.dyndns.org>
On 08/31, Tejun Heo wrote:
>
> I'm in the process of moving and can only use a quite old laptop. I
> tested compile but couldn't really do much else, so please proceed
> with caution. Oleg, can you please ack the patches if you agree with
> the updated versions?
Everything looks fine. But I am already sleeping now ;)
Rafael, Tejun, I'll try to re-read 1-4 tomorrow. I do not expect I'll
find something interesting, just I am paranoid.
Looks like, 1/4 could have an additional note in the changelog, with
this patch we avoid the unnecessary try_to_freeze_cgroup() and this
looks like a win to me...
Oleg.
^ permalink raw reply
* Re: [PATCH 6/6] cgroup: kill subsys->can_attach_task(), pre_attach() and attach_task()
From: Frederic Weisbecker @ 2011-08-31 13:42 UTC (permalink / raw)
To: Tejun Heo; +Cc: containers, lizf, linux-kernel, linux-pm, paul, kamezawa.hiroyu
In-Reply-To: <20110831070313.GA29179@mtj.dyndns.org>
On Wed, Aug 31, 2011 at 09:03:13AM +0200, Tejun Heo wrote:
> Hello, Frederic.
>
> On Tue, Aug 30, 2011 at 10:10:32PM +0200, Frederic Weisbecker wrote:
> > In order to keep the fix queued in -mm (https://lkml.org/lkml/2011/8/26/262)
> > the tasks that have failed to migrate should be removed from the iterator
> > so that they are not included in the batch in ->attach().
>
> I don't think that's a good approach. It breaks the symmetry when
> calling different callbacks. What if ->can_attach() allocates
> per-task resources and the task exits in the middle? I think we
> better lock down fork/exit/exec. I'll send patches but I'm currently
> moving / traveling w/ limited access to my toys so it might take some
> time.
My task counter subsystem patchset brings a cancel_attach_task() callback
that cancels can_attach_task() effects.
I thought that rebased on top of your patch it's going to be merged inside
cancel_attach() but OTOH we can't cancel the effect of failed migration
on a thread that way.
Maybe we need to keep a cancel_attach_task() just for that purpose?
^ permalink raw reply
* wait_event_freezable variant for TASK_KILLABLE?
From: Jeff Layton @ 2011-08-31 12:44 UTC (permalink / raw)
To: linux-pm; +Cc: linux-cifs, linux-kernel
I had a bug reported a while back that cifs mounts were preventing
machines from suspending. I can reproduce this pretty readily by simply
making a cifs mount, leaving it idle for a bit (so that the root dentry
will need to be revalidated) and then attempting to suspend the
machine. When I do that I get the following backtrace:
[ 5323.278130] PM: Syncing filesystems ... done.
[ 5323.313956] PM: Preparing system for mem sleep
[ 5323.435457] Freezing user space processes ...
[ 5343.444237] Freezing of tasks failed after 20.00 seconds (1 tasks refusing to freeze, wq_busy=0):
[ 5343.444335] umount D ffff88011075dc00 0 7400 7383 0x00800084
[ 5343.444342] ffff8800c95e1b08 0000000000000086 ffff8800c95e1b40 ffff880000000001
[ 5343.444348] ffff880117965cc0 ffff8800c95e1fd8 ffff8800c95e1fd8 0000000000012540
[ 5343.444354] ffff8800d5d5c590 ffff880117965cc0 ffff8800c95e1b18 00000001c95e1ad8
[ 5343.444359] Call Trace:
[ 5343.444378] [<ffffffffa044f0ca>] wait_for_response+0x199/0x19e [cifs]
[ 5343.444384] [<ffffffff81070566>] ? remove_wait_queue+0x3a/0x3a
[ 5343.444392] [<ffffffffa044fe93>] SendReceive+0x184/0x285 [cifs]
[ 5343.444399] [<ffffffffa043a51e>] CIFSSMBUnixQPathInfo+0x167/0x212 [cifs]
[ 5343.444407] [<ffffffffa044ae90>] cifs_get_inode_info_unix+0x8e/0x165 [cifs]
[ 5343.444414] [<ffffffffa0444223>] ? build_path_from_dentry+0xe2/0x20d [cifs]
[ 5343.444418] [<ffffffff8111673e>] ? __kmalloc+0x103/0x115
[ 5343.444425] [<ffffffffa0444223>] ? build_path_from_dentry+0xe2/0x20d [cifs]
[ 5343.444431] [<ffffffffa0444223>] ? build_path_from_dentry+0xe2/0x20d [cifs]
[ 5343.444439] [<ffffffffa044c11d>] cifs_revalidate_dentry_attr+0x10b/0x172 [cifs]
[ 5343.444447] [<ffffffffa044c259>] cifs_getattr+0x7a/0xfc [cifs]
[ 5343.444451] [<ffffffff8112a9f7>] vfs_getattr+0x45/0x63
[ 5343.444454] [<ffffffff8112aa6d>] vfs_fstatat+0x58/0x6e
[ 5343.444457] [<ffffffff8112aabe>] vfs_stat+0x1b/0x1d
[ 5343.444460] [<ffffffff8112abbd>] sys_newstat+0x1a/0x33
[ 5343.444463] [<ffffffff8112f9e8>] ? path_put+0x20/0x24
[ 5343.444466] [<ffffffff810a0e84>] ? audit_syscall_entry+0x145/0x171
[ 5343.444469] [<ffffffff811302d1>] ? putname+0x34/0x36
[ 5343.444473] [<ffffffff8148e842>] system_call_fastpath+0x16/0x1b
[ 5343.444476]
[ 5343.444477] Restarting tasks ... done.
wait_for_response basically does this to put a task to sleep while it's
waiting for the server to respond:
error = wait_event_killable(server->response_q,
midQ->midState != MID_REQUEST_SUBMITTED);
NFS does similar sorts of things, and I think it has similar problems
with the freezer.
The problem there is pretty clear. That won't wake up unless you send
it a fatal signal, and we need it to wake up and freeze in that
situation. So, I made a stab at rolling a wait_event_freezekillable()
macro, based on wait_event_freezable.
-----------------------[snip]-----------------------------
#define wait_event_freezekillable(wq, condition) \
({ \
int __retval; \
do { \
__retval = wait_event_killable(wq, \
(condition) || freezing(current)); \
if (__retval && !freezing(current)) \
break; \
else if (!(condition)) \
__retval = -ERESTARTSYS; \
} while (try_to_freeze()); \
__retval; \
})
-----------------------[snip]-----------------------------
However, I still got the same problem when trying to put the task to
sleep. I could dig in and try to figure out why this isn't working like
I expect, but I figured I'd ask here first to see if I can determine
whether linux-pm has advice on how best to approach this. :)
Basically, what we'd like is something akin to wait_event_freezable,
but that only returns -ERESTARTSYS on fatal signals (SIGKILL).
Thoughts?
--
Jeff Layton <jlayton@redhat.com>
^ permalink raw reply
* [PATCH pm-freezer 4/4] freezer: use lock_task_sighand() in fake_signal_wake_up()
From: Tejun Heo @ 2011-08-31 10:22 UTC (permalink / raw)
To: Rafael J. Wysocki, Oleg Nesterov, Paul Menage
Cc: containers, linux-pm, linux-kernel
In-Reply-To: <20110831102210.GC2828@mtj.dyndns.org>
cgroup_freezer calls freeze_task() without holding tasklist_lock and,
if the task is exiting, its ->sighand may be gone by the time
fake_signal_wake_up() is called. Use lock_task_sighand() instead of
accessing ->sighand directly.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Paul Menage <paul@paulmenage.org>
---
kernel/freezer.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
Index: work/kernel/freezer.c
===================================================================
--- work.orig/kernel/freezer.c
+++ work/kernel/freezer.c
@@ -95,9 +95,10 @@ static void fake_signal_wake_up(struct t
{
unsigned long flags;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- signal_wake_up(p, 0);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ if (lock_task_sighand(p, &flags)) {
+ signal_wake_up(p, 0);
+ unlock_task_sighand(p, &flags);
+ }
}
/**
^ permalink raw reply
* [PATCH pm-freezer 3/4] freezer: restructure __refrigerator()
From: Tejun Heo @ 2011-08-31 10:22 UTC (permalink / raw)
To: Rafael J. Wysocki, Oleg Nesterov, Paul Menage
Cc: containers, linux-pm, linux-kernel
In-Reply-To: <20110831102143.GB2828@mtj.dyndns.org>
If another freeze happens before all tasks leave FROZEN state after
being thawed, the freezer can see the existing FROZEN and consider the
tasks to be frozen but they can clear FROZEN without checking the new
freezing().
Oleg suggested restructuring __refrigerator() such that there's a single
condition check section inside freezer_lock and sigpending is cleared
afterwards, which fixes the problem and simplifies the code.
Restructure accordingly.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
---
kernel/freezer.c | 33 ++++++++++++++-------------------
1 file changed, 14 insertions(+), 19 deletions(-)
Index: work/kernel/freezer.c
===================================================================
--- work.orig/kernel/freezer.c
+++ work/kernel/freezer.c
@@ -52,36 +52,31 @@ bool __refrigerator(bool check_kthr_stop
/* Hmm, should we be allowed to suspend when there are realtime
processes around? */
bool was_frozen = false;
- long save;
+ long save = current->state;
- /*
- * No point in checking freezing() again - the caller already did.
- * Proceed to enter FROZEN.
- */
- spin_lock_irq(&freezer_lock);
- current->flags |= PF_FROZEN;
- spin_unlock_irq(&freezer_lock);
-
- save = current->state;
pr_debug("%s entered refrigerator\n", current->comm);
- spin_lock_irq(&current->sighand->siglock);
- recalc_sigpending(); /* We sent fake signal, clean it up */
- spin_unlock_irq(&current->sighand->siglock);
-
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
+
+ spin_lock_irq(&freezer_lock);
+ current->flags |= PF_FROZEN;
if (!freezing(current) ||
- (check_kthr_stop && kthread_should_stop()))
+ (check_kthr_stop && kthread_should_stop())) {
+ current->flags &= ~PF_FROZEN;
+ break;
+ }
+ spin_unlock_irq(&freezer_lock);
+
+ if (!(current->flags & PF_FROZEN))
break;
was_frozen = true;
schedule();
}
- /* leave FROZEN */
- spin_lock_irq(&freezer_lock);
- current->flags &= ~PF_FROZEN;
- spin_unlock_irq(&freezer_lock);
+ spin_lock_irq(&current->sighand->siglock);
+ recalc_sigpending(); /* We sent fake signal, clean it up */
+ spin_unlock_irq(&current->sighand->siglock);
pr_debug("%s left refrigerator\n", current->comm);
^ permalink raw reply
* [PATCH pm-freezer 2/4] freezer: set PF_NOFREEZE on a dying task right before TASK_DEAD
From: Tejun Heo @ 2011-08-31 10:21 UTC (permalink / raw)
To: Rafael J. Wysocki, Oleg Nesterov, Paul Menage
Cc: containers, linux-pm, linux-kernel
In-Reply-To: <20110831102100.GA2828@mtj.dyndns.org>
3fb45733df "freezer: make exiting tasks properly unfreezable" removed
clear_freeze_flag() from exit path and set PF_NOFREEZE right after
PTRACE_EVENT_EXIT; however, Oleg pointed out that the following exit paths
may cause interaction with device drivers after the PM freezer considers
the system frozen.
There's no try_to_freeze() call in the exit path and the only
necessary guarantee is that freezer doesn't hang waiting for zombies.
Set PF_NOFREEZE right before setting tsk->state to TASK_DEAD instead.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
---
kernel/exit.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
Index: work/kernel/exit.c
===================================================================
--- work.orig/kernel/exit.c
+++ work/kernel/exit.c
@@ -913,12 +913,6 @@ NORET_TYPE void do_exit(long code)
ptrace_event(PTRACE_EVENT_EXIT, code);
- /*
- * With ptrace notification done, there's no point in freezing from
- * here on. Disallow freezing.
- */
- current->flags |= PF_NOFREEZE;
-
validate_creds_for_do_exit(tsk);
/*
@@ -1044,6 +1038,10 @@ NORET_TYPE void do_exit(long code)
preempt_disable();
exit_rcu();
+
+ /* this task is now dead and freezer should ignore it */
+ current->flags |= PF_NOFREEZE;
+
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
schedule();
^ permalink raw reply
* [PATCH pm-freezer 1/4] cgroup_freezer: fix freezer->state setting bug in freezer_change_state()
From: Tejun Heo @ 2011-08-31 10:21 UTC (permalink / raw)
To: Rafael J. Wysocki, Oleg Nesterov, Paul Menage
Cc: containers, linux-pm, linux-kernel
d02f52811d0e "cgroup_freezer: prepare for removal of TIF_FREEZE" moved
setting of freezer->state into freezer_change_state(); unfortunately,
while doing so, when it's beginning to freeze tasks, it sets the state
to CGROUP_FROZEN instead of CGROUP_FREEZING ending up skipping the
whole freezing state. Fix it.
-v2: Oleg pointed out that re-freezing FROZEN cgroup could increment
system_freezing_cnt. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Oleg Nesterov <oleg@redhat.com>
Cc: Paul Menage <paul@paulmenage.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
---
I'm in the process of moving and can only use a quite old laptop. I
tested compile but couldn't really do much else, so please proceed
with caution. Oleg, can you please ack the patches if you agree with
the updated versions?
Thanks.
kernel/cgroup_freezer.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
Index: work/kernel/cgroup_freezer.c
===================================================================
--- work.orig/kernel/cgroup_freezer.c
+++ work/kernel/cgroup_freezer.c
@@ -308,24 +308,26 @@ static int freezer_change_state(struct c
spin_lock_irq(&freezer->lock);
update_if_frozen(cgroup, freezer);
- if (goal_state == freezer->state)
- goto out;
-
- freezer->state = goal_state;
switch (goal_state) {
case CGROUP_THAWED:
- atomic_dec(&system_freezing_cnt);
- unfreeze_cgroup(cgroup, freezer);
+ if (freezer->state != CGROUP_THAWED) {
+ freezer->state = CGROUP_THAWED;
+ atomic_dec(&system_freezing_cnt);
+ unfreeze_cgroup(cgroup, freezer);
+ }
break;
case CGROUP_FROZEN:
- atomic_inc(&system_freezing_cnt);
- retval = try_to_freeze_cgroup(cgroup, freezer);
+ if (freezer->state == CGROUP_THAWED) {
+ freezer->state = CGROUP_FREEZING;
+ atomic_inc(&system_freezing_cnt);
+ retval = try_to_freeze_cgroup(cgroup, freezer);
+ }
break;
default:
BUG();
}
-out:
+
spin_unlock_irq(&freezer->lock);
return retval;
^ permalink raw reply
* [PATCH v9 4/4] PM / devfreq: add basic governors
From: MyungJoo Ham @ 2011-08-31 7:29 UTC (permalink / raw)
To: linux-pm; +Cc: Len Brown, Greg Kroah-Hartman, Kyungmin Park, Thomas Gleixner
In-Reply-To: <1314775779-21399-1-git-send-email-myungjoo.ham@samsung.com>
Four cpufreq-like governors are provided as examples.
powersave: use the lowest frequency possible. The user (device) should
set the polling_ms as 0 because polling is useless for this governor.
performance: use the highest frequency possible. The user (device)
should set the polling_ms as 0 because polling is useless for this
governor.
userspace: use the user specified frequency stored at
devfreq.user_set_freq. With sysfs support in the following patch, a user
may set the value with the sysfs interface.
simple_ondemand: simplified version of cpufreq's ondemand governor.
When a user updates OPP entries (enable/disable/add), OPP framework
automatically notifies devfreq to update operating frequency
accordingly. Thus, devfreq users (device drivers) do not need to update
devfreq manually with OPP entry updates or set polling_ms for powersave,
performance, userspace, or any other "static" governors.
Note that these are given only as basic examples for governors and any
devices with devfreq may implement their own governors with the drivers
and use them.
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
---
Changed from v8
- Removed unnecessary header entries
Changed from v7
- Userspace uses its own sysfs interface.
Changed from v5
- Separated governor files from devfreq.c
- Allow simple ondemand to be tuned for each device
---
Documentation/ABI/testing/sysfs-devices-power | 9 ++
drivers/devfreq/Kconfig | 36 +++++++
drivers/devfreq/Makefile | 4 +
drivers/devfreq/governor_performance.c | 24 +++++
drivers/devfreq/governor_powersave.c | 24 +++++
drivers/devfreq/governor_simpleondemand.c | 88 +++++++++++++++++
drivers/devfreq/governor_userspace.c | 126 +++++++++++++++++++++++++
include/linux/devfreq.h | 37 +++++++
8 files changed, 348 insertions(+), 0 deletions(-)
create mode 100644 drivers/devfreq/governor_performance.c
create mode 100644 drivers/devfreq/governor_powersave.c
create mode 100644 drivers/devfreq/governor_simpleondemand.c
create mode 100644 drivers/devfreq/governor_userspace.c
diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power
index 57f4591..c7f6977 100644
--- a/Documentation/ABI/testing/sysfs-devices-power
+++ b/Documentation/ABI/testing/sysfs-devices-power
@@ -202,3 +202,12 @@ Description:
shows the requested polling interval of the corresponding
device. The values are represented in ms. If the value is less
than 1 jiffy, it is considered to be 0, which means no polling.
+
+What: /sys/devices/.../power/devfreq_userspace_set_freq
+Date: August 2011
+Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
+Description:
+ The /sys/devices/.../power/devfreq_userspace_set_freq sets
+ and shows the user specified frequency in kHz. This sysfs
+ entry is created and managed by userspace DEVFREQ governor.
+ If other governors are used, it won't be supported.
diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig
index 1fb42de..643b055 100644
--- a/drivers/devfreq/Kconfig
+++ b/drivers/devfreq/Kconfig
@@ -34,6 +34,42 @@ menuconfig PM_DEVFREQ
if PM_DEVFREQ
+comment "DEVFREQ Governors"
+
+config DEVFREQ_GOV_SIMPLE_ONDEMAND
+ bool "Simple Ondemand"
+ help
+ Chooses frequency based on the recent load on the device. Works
+ similar as ONDEMAND governor of CPUFREQ does. A device with
+ Simple-Ondemand should be able to provide busy/total counter
+ values that imply the usage rate. A device may provide tuned
+ values to the governor with data field at devfreq_add_device().
+
+config DEVFREQ_GOV_PERFORMANCE
+ bool "Performance"
+ help
+ Sets the frequency at the maximum available frequency.
+ This governor always returns UINT_MAX as frequency so that
+ the DEVFREQ framework returns the highest frequency available
+ at any time.
+
+config DEVFREQ_GOV_POWERSAVE
+ bool "Powersave"
+ help
+ Sets the frequency at the minimum available frequency.
+ This governor always returns 0 as frequency so that
+ the DEVFREQ framework returns the lowest frequency available
+ at any time.
+
+config DEVFREQ_GOV_USERSPACE
+ bool "Userspace"
+ help
+ Sets the frequency at the user specified one.
+ This governor returns the user configured frequency if there
+ has been an input to /sys/devices/.../power/devfreq_set_freq.
+ Otherwise, the governor does not change the frequnecy
+ given at the initialization.
+
comment "DEVFREQ Drivers"
endif # PM_DEVFREQ
diff --git a/drivers/devfreq/Makefile b/drivers/devfreq/Makefile
index 168934a..4564a89 100644
--- a/drivers/devfreq/Makefile
+++ b/drivers/devfreq/Makefile
@@ -1 +1,5 @@
obj-$(CONFIG_PM_DEVFREQ) += devfreq.o
+obj-$(CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND) += governor_simpleondemand.o
+obj-$(CONFIG_DEVFREQ_GOV_PERFORMANCE) += governor_performance.o
+obj-$(CONFIG_DEVFREQ_GOV_POWERSAVE) += governor_powersave.o
+obj-$(CONFIG_DEVFREQ_GOV_USERSPACE) += governor_userspace.o
diff --git a/drivers/devfreq/governor_performance.c b/drivers/devfreq/governor_performance.c
new file mode 100644
index 0000000..c47eff8
--- /dev/null
+++ b/drivers/devfreq/governor_performance.c
@@ -0,0 +1,24 @@
+/*
+ * linux/drivers/devfreq/governor_performance.c
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/devfreq.h>
+
+static int devfreq_performance_func(struct devfreq *df,
+ unsigned long *freq)
+{
+ *freq = UINT_MAX; /* devfreq_do will run "floor" */
+ return 0;
+}
+
+struct devfreq_governor devfreq_performance = {
+ .name = "performance",
+ .get_target_freq = devfreq_performance_func,
+};
diff --git a/drivers/devfreq/governor_powersave.c b/drivers/devfreq/governor_powersave.c
new file mode 100644
index 0000000..4f128d8
--- /dev/null
+++ b/drivers/devfreq/governor_powersave.c
@@ -0,0 +1,24 @@
+/*
+ * linux/drivers/devfreq/governor_powersave.c
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/devfreq.h>
+
+static int devfreq_powersave_func(struct devfreq *df,
+ unsigned long *freq)
+{
+ *freq = 0; /* devfreq_do will run "ceiling" to 0 */
+ return 0;
+}
+
+struct devfreq_governor devfreq_powersave = {
+ .name = "powersave",
+ .get_target_freq = devfreq_powersave_func,
+};
diff --git a/drivers/devfreq/governor_simpleondemand.c b/drivers/devfreq/governor_simpleondemand.c
new file mode 100644
index 0000000..18fe8be
--- /dev/null
+++ b/drivers/devfreq/governor_simpleondemand.c
@@ -0,0 +1,88 @@
+/*
+ * linux/drivers/devfreq/governor_simpleondemand.c
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/devfreq.h>
+#include <linux/math64.h>
+
+/* Default constants for DevFreq-Simple-Ondemand (DFSO) */
+#define DFSO_UPTHRESHOLD (90)
+#define DFSO_DOWNDIFFERENCTIAL (5)
+static int devfreq_simple_ondemand_func(struct devfreq *df,
+ unsigned long *freq)
+{
+ struct devfreq_dev_status stat;
+ int err = df->profile->get_dev_status(df->dev, &stat);
+ unsigned long long a, b;
+ unsigned int dfso_upthreshold = DFSO_UPTHRESHOLD;
+ unsigned int dfso_downdifferential = DFSO_DOWNDIFFERENCTIAL;
+ struct devfreq_simple_ondemand_data *data = df->data;
+
+ if (err)
+ return err;
+
+ if (data) {
+ if (data->upthreshold)
+ dfso_upthreshold = data->upthreshold;
+ if (data->downdifferential)
+ dfso_downdifferential = data->downdifferential;
+ }
+ if (dfso_upthreshold > 100 ||
+ dfso_upthreshold < dfso_downdifferential)
+ return -EINVAL;
+
+ /* Assume MAX if it is going to be divided by zero */
+ if (stat.total_time == 0) {
+ *freq = UINT_MAX;
+ return 0;
+ }
+
+ /* Prevent overflow */
+ if (stat.busy_time >= (1 << 24) || stat.total_time >= (1 << 24)) {
+ stat.busy_time >>= 7;
+ stat.total_time >>= 7;
+ }
+
+ /* Set MAX if it's busy enough */
+ if (stat.busy_time * 100 >
+ stat.total_time * dfso_upthreshold) {
+ *freq = UINT_MAX;
+ return 0;
+ }
+
+ /* Set MAX if we do not know the initial frequency */
+ if (stat.current_frequency == 0) {
+ *freq = UINT_MAX;
+ return 0;
+ }
+
+ /* Keep the current frequency */
+ if (stat.busy_time * 100 >
+ stat.total_time * (dfso_upthreshold - dfso_downdifferential)) {
+ *freq = stat.current_frequency;
+ return 0;
+ }
+
+ /* Set the desired frequency based on the load */
+ a = stat.busy_time;
+ a *= stat.current_frequency;
+ b = div_u64(a, stat.total_time);
+ b *= 100;
+ b = div_u64(b, (dfso_upthreshold - dfso_downdifferential / 2));
+ *freq = (unsigned long) b;
+
+ return 0;
+}
+
+struct devfreq_governor devfreq_simple_ondemand = {
+ .name = "simple_ondemand",
+ .get_target_freq = devfreq_simple_ondemand_func,
+};
diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c
new file mode 100644
index 0000000..490167f
--- /dev/null
+++ b/drivers/devfreq/governor_userspace.c
@@ -0,0 +1,126 @@
+/*
+ * linux/drivers/devfreq/governor_userspace.c
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/devfreq.h>
+#include <linux/pm.h>
+#include <linux/mutex.h>
+#include "governor.h"
+
+struct userspace_data {
+ unsigned long user_frequency;
+ bool valid;
+};
+
+static int devfreq_userspace_func(struct devfreq *df, unsigned long *freq)
+{
+ struct userspace_data *data = df->data;
+
+ if (!data->valid)
+ *freq = df->previous_freq; /* No user freq specified yet */
+ else
+ *freq = data->user_frequency;
+ return 0;
+}
+
+static ssize_t store_freq(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct devfreq *devfreq = get_devfreq(dev);
+ struct userspace_data *data;
+ unsigned long wanted;
+ int err = 0;
+
+ if (IS_ERR(devfreq)) {
+ err = PTR_ERR(devfreq);
+ goto out;
+ }
+
+ mutex_lock(&devfreq->lock);
+ data = devfreq->data;
+
+ sscanf(buf, "%lu", &wanted);
+ data->user_frequency = wanted;
+ data->valid = true;
+ err = update_devfreq(devfreq);
+ if (err == 0)
+ err = count;
+ mutex_unlock(&devfreq->lock);
+out:
+ return err;
+}
+
+static ssize_t show_freq(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct devfreq *devfreq = get_devfreq(dev);
+ struct userspace_data *data;
+ int err = 0;
+
+ if (IS_ERR(devfreq)) {
+ err = PTR_ERR(devfreq);
+ goto out;
+ }
+
+ mutex_lock(&devfreq->lock);
+ data = devfreq->data;
+
+ if (data->valid)
+ err = sprintf(buf, "%lu\n", data->user_frequency);
+ else
+ err = sprintf(buf, "undefined\n");
+ mutex_unlock(&devfreq->lock);
+out:
+ return err;
+}
+
+static DEVICE_ATTR(devfreq_userspace_set_freq, 0644, show_freq, store_freq);
+static struct attribute *dev_entries[] = {
+ &dev_attr_devfreq_userspace_set_freq.attr,
+ NULL,
+};
+static struct attribute_group dev_attr_group = {
+ .name = power_group_name,
+ .attrs = dev_entries,
+};
+
+static int userspace_init(struct devfreq *devfreq)
+{
+ int err = 0;
+ struct userspace_data *data = kzalloc(sizeof(struct userspace_data),
+ GFP_KERNEL);
+
+ if (!data) {
+ err = -ENOMEM;
+ goto out;
+ }
+ data->valid = false;
+ devfreq->data = data;
+
+ sysfs_merge_group(&devfreq->dev->kobj, &dev_attr_group);
+out:
+ return err;
+}
+
+static void userspace_exit(struct devfreq *devfreq)
+{
+ sysfs_unmerge_group(&devfreq->dev->kobj, &dev_attr_group);
+ kfree(devfreq->data);
+ devfreq->data = NULL;
+}
+
+struct devfreq_governor devfreq_userspace = {
+ .name = "userspace",
+ .get_target_freq = devfreq_userspace_func,
+ .init = userspace_init,
+ .exit = userspace_exit,
+};
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index f14b57d..5b802a6 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -105,6 +105,37 @@ extern int devfreq_add_device(struct device *dev,
struct devfreq_governor *governor,
void *data);
extern int devfreq_remove_device(struct device *dev);
+
+#ifdef CONFIG_DEVFREQ_GOV_POWERSAVE
+extern struct devfreq_governor devfreq_powersave;
+#endif
+#ifdef CONFIG_DEVFREQ_GOV_PERFORMANCE
+extern struct devfreq_governor devfreq_performance;
+#endif
+#ifdef CONFIG_DEVFREQ_GOV_USERSPACE
+extern struct devfreq_governor devfreq_userspace;
+#endif
+#ifdef CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND
+extern struct devfreq_governor devfreq_simple_ondemand;
+/**
+ * struct devfreq_simple_ondemand_data - void *data fed to struct devfreq
+ * and devfreq_add_device
+ * @ upthreshold If the load is over this value, the frequency jumps.
+ * Specify 0 to use the default. Valid value = 0 to 100.
+ * @ downdifferential If the load is under upthreshold - downdifferential,
+ * the governor may consider slowing the frequency down.
+ * Specify 0 to use the default. Valid value = 0 to 100.
+ * downdifferential < upthreshold must hold.
+ *
+ * If the fed devfreq_simple_ondemand_data pointer is NULL to the governor,
+ * the governor uses the default values.
+ */
+struct devfreq_simple_ondemand_data {
+ unsigned int upthreshold;
+ unsigned int downdifferential;
+};
+#endif
+
#else /* !CONFIG_PM_DEVFREQ */
static int devfreq_add_device(struct device *dev,
struct devfreq_dev_profile *profile,
@@ -118,6 +149,12 @@ static int devfreq_remove_device(struct device *dev)
{
return 0;
}
+
+#define devfreq_powersave NULL
+#define devfreq_performance NULL
+#define devfreq_userspace NULL
+#define devfreq_simple_ondemand NULL
+
#endif /* CONFIG_PM_DEVFREQ */
#endif /* __LINUX_DEVFREQ_H__ */
--
1.7.4.1
^ permalink raw reply related
* [PATCH v9 3/4] PM / devfreq: add common sysfs interfaces
From: MyungJoo Ham @ 2011-08-31 7:29 UTC (permalink / raw)
To: linux-pm; +Cc: Len Brown, Greg Kroah-Hartman, Kyungmin Park, Thomas Gleixner
In-Reply-To: <1314775779-21399-1-git-send-email-myungjoo.ham@samsung.com>
Device specific sysfs interface /sys/devices/.../power/devfreq_*
- governor R: name of governor
- cur_freq R: current frequency
- max_freq R: maximum operable frequency
- min_freq R: minimum operable frequency
- polling_interval R: polling interval in ms given with devfreq profile
W: update polling interval.
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
--
Changes from v8
- applied per-devfreq locking mechanism
Changes from v7
- removed set_freq from the common devfreq interface
- added get_devfreq, a mutex-protected wrapper for find_device_devfreq
(for sysfs interfaces and later with governor-support)
- corrected ABI documentation.
Changes from v6
- polling_interval is writable.
Changes from v5
- updated devfreq_update usage.
Changes from v4
- removed system-wide sysfs interface
- removed tickling sysfs interface
- added set_freq for userspace governor (and any other governors that
require user input)
Changes from v3
- corrected sysfs API usage
- corrected error messages
- moved sysfs entry location
- added sysfs entries
Changes from v2
- add ABI entries for devfreq sysfs interface
---
Documentation/ABI/testing/sysfs-devices-power | 37 +++++
drivers/devfreq/devfreq.c | 203 +++++++++++++++++++++++++
2 files changed, 240 insertions(+), 0 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power
index 8ffbc25..57f4591 100644
--- a/Documentation/ABI/testing/sysfs-devices-power
+++ b/Documentation/ABI/testing/sysfs-devices-power
@@ -165,3 +165,40 @@ Description:
Not all drivers support this attribute. If it isn't supported,
attempts to read or write it will yield I/O errors.
+
+What: /sys/devices/.../power/devfreq_governor
+Date: July 2011
+Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
+Description:
+ The /sys/devices/.../power/devfreq_governor shows the name
+ of the governor used by the corresponding device.
+
+What: /sys/devices/.../power/devfreq_cur_freq
+Date: July 2011
+Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
+Description:
+ The /sys/devices/.../power/devfreq_cur_freq shows the current
+ frequency of the corresponding device.
+
+What: /sys/devices/.../power/devfreq_max_freq
+Date: July 2011
+Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
+Description:
+ The /sys/devices/.../power/devfreq_max_freq shows the
+ maximum operable frequency of the corresponding device.
+
+What: /sys/devices/.../power/devfreq_min_freq
+Date: July 2011
+Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
+Description:
+ The /sys/devices/.../power/devfreq_min_freq shows the
+ minimum operable frequency of the corresponding device.
+
+What: /sys/devices/.../power/devfreq_polling_interval
+Date: July 2011
+Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
+Description:
+ The /sys/devices/.../power/devfreq_polling_interval sets and
+ shows the requested polling interval of the corresponding
+ device. The values are represented in ms. If the value is less
+ than 1 jiffy, it is considered to be 0, which means no polling.
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 621b863..1c46052 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -37,6 +37,8 @@ static struct delayed_work devfreq_work;
static LIST_HEAD(devfreq_list);
static DEFINE_MUTEX(devfreq_list_lock);
+static struct attribute_group dev_attr_group;
+
/**
* find_device_devfreq() - find devfreq struct using device pointer
* @dev: device pointer used to lookup device devfreq.
@@ -191,6 +193,8 @@ static void devfreq_monitor(struct work_struct *work)
dev_err(devfreq->dev, "Due to devfreq_do error(%d), devfreq(%s) is removed from the device\n",
error, devfreq->governor->name);
+ sysfs_unmerge_group(&devfreq->dev->kobj,
+ &dev_attr_group);
list_del(&devfreq->node);
mutex_unlock(&devfreq->lock);
kfree(devfreq);
@@ -293,6 +297,8 @@ int devfreq_add_device(struct device *dev, struct devfreq_dev_profile *profile,
queue_delayed_work(devfreq_wq, &devfreq_work,
devfreq->next_polling);
}
+
+ sysfs_merge_group(&dev->kobj, &dev_attr_group);
mutex_unlock(&devfreq->lock);
goto out;
err_init:
@@ -333,6 +339,8 @@ int devfreq_remove_device(struct device *dev)
goto out;
}
+ sysfs_unmerge_group(&dev->kobj, &dev_attr_group);
+
list_del(&devfreq->node);
if (devfreq->governor->exit)
@@ -346,6 +354,201 @@ out:
return 0;
}
+static ssize_t show_governor(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct devfreq *df;
+ ssize_t ret;
+
+ mutex_lock(&devfreq_list_lock);
+ df = find_device_devfreq(dev);
+ if (IS_ERR(df)) {
+ ret = PTR_ERR(df);
+ goto out;
+ }
+
+ mutex_lock(&df->lock);
+ if (!df->governor) {
+ ret = -EINVAL;
+ goto out_l;
+ }
+
+ ret = sprintf(buf, "%s\n", df->governor->name);
+out_l:
+ mutex_unlock(&df->lock);
+out:
+ mutex_unlock(&devfreq_list_lock);
+ return ret;
+}
+
+static ssize_t show_freq(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct devfreq *df;
+ ssize_t ret;
+
+ mutex_lock(&devfreq_list_lock);
+ df = find_device_devfreq(dev);
+ if (IS_ERR(df)) {
+ ret = PTR_ERR(df);
+ goto out;
+ }
+
+ ret = sprintf(buf, "%lu\n", df->previous_freq);
+out:
+ mutex_unlock(&devfreq_list_lock);
+ return ret;
+}
+
+static ssize_t show_max_freq(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct devfreq *df;
+ ssize_t ret;
+ unsigned long freq = ULONG_MAX;
+ struct opp *opp;
+
+ mutex_lock(&devfreq_list_lock);
+ df = find_device_devfreq(dev);
+ if (IS_ERR(df)) {
+ ret = PTR_ERR(df);
+ goto out;
+ }
+
+ mutex_lock(&df->lock);
+ opp = opp_find_freq_floor(df->dev, &freq);
+ if (IS_ERR(opp)) {
+ ret = PTR_ERR(opp);
+ goto out_l;
+ }
+
+ ret = sprintf(buf, "%lu\n", freq);
+out_l:
+ mutex_unlock(&df->lock);
+out:
+ mutex_unlock(&devfreq_list_lock);
+ return ret;
+}
+
+static ssize_t show_min_freq(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct devfreq *df;
+ ssize_t ret;
+ unsigned long freq = 0;
+ struct opp *opp;
+
+ mutex_lock(&devfreq_list_lock);
+ df = find_device_devfreq(dev);
+ if (IS_ERR(df)) {
+ ret = PTR_ERR(df);
+ goto out;
+ }
+
+ mutex_lock(&df->lock);
+ opp = opp_find_freq_ceil(df->dev, &freq);
+ if (IS_ERR(opp)) {
+ ret = PTR_ERR(opp);
+ goto out_l;
+ }
+
+ ret = sprintf(buf, "%lu\n", freq);
+out_l:
+ mutex_unlock(&df->lock);
+out:
+ mutex_unlock(&devfreq_list_lock);
+ return ret;
+}
+
+static ssize_t show_polling_interval(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct devfreq *df;
+ ssize_t ret;
+
+ mutex_lock(&devfreq_list_lock);
+ df = find_device_devfreq(dev);
+ if (IS_ERR(df)) {
+ ret = PTR_ERR(df);
+ goto out;
+ }
+
+ mutex_lock(&df->lock);
+ if (!df->profile) {
+ ret = -EINVAL;
+ goto out_l;
+ }
+
+ ret = sprintf(buf, "%d\n", df->profile->polling_ms);
+out_l:
+ mutex_unlock(&df->lock);
+out:
+ mutex_unlock(&devfreq_list_lock);
+ return ret;
+}
+
+static ssize_t store_polling_interval(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct devfreq *df;
+ unsigned int value;
+ int ret;
+
+ mutex_lock(&devfreq_list_lock);
+ df = find_device_devfreq(dev);
+ if (IS_ERR(df)) {
+ count = PTR_ERR(df);
+ goto out;
+ }
+ mutex_lock(&df->lock);
+ if (!df->profile) {
+ count = -EINVAL;
+ goto out_l;
+ }
+
+ ret = sscanf(buf, "%u", &value);
+ if (ret != 1) {
+ count = -EINVAL;
+ goto out_l;
+ }
+
+ df->profile->polling_ms = value;
+ df->next_polling = df->polling_jiffies
+ = msecs_to_jiffies(value);
+
+ if (df->next_polling > 0 && !polling) {
+ polling = true;
+ queue_delayed_work(devfreq_wq, &devfreq_work,
+ df->next_polling);
+ }
+out_l:
+ mutex_unlock(&df->lock);
+out:
+ mutex_unlock(&devfreq_list_lock);
+
+ return count;
+}
+
+static DEVICE_ATTR(devfreq_governor, 0444, show_governor, NULL);
+static DEVICE_ATTR(devfreq_cur_freq, 0444, show_freq, NULL);
+static DEVICE_ATTR(devfreq_max_freq, 0444, show_max_freq, NULL);
+static DEVICE_ATTR(devfreq_min_freq, 0444, show_min_freq, NULL);
+static DEVICE_ATTR(devfreq_polling_interval, 0644, show_polling_interval,
+ store_polling_interval);
+static struct attribute *dev_entries[] = {
+ &dev_attr_devfreq_governor.attr,
+ &dev_attr_devfreq_cur_freq.attr,
+ &dev_attr_devfreq_max_freq.attr,
+ &dev_attr_devfreq_min_freq.attr,
+ &dev_attr_devfreq_polling_interval.attr,
+ NULL,
+};
+static struct attribute_group dev_attr_group = {
+ .name = power_group_name,
+ .attrs = dev_entries,
+};
+
/**
* devfreq_init() - Initialize data structure for devfreq framework and
* start polling registered devfreq devices.
--
1.7.4.1
^ permalink raw reply related
* [PATCH v9 2/4] PM: Introduce devfreq: generic DVFS framework with device-specific OPPs
From: MyungJoo Ham @ 2011-08-31 7:29 UTC (permalink / raw)
To: linux-pm; +Cc: Len Brown, Greg Kroah-Hartman, Kyungmin Park, Thomas Gleixner
In-Reply-To: <1314775779-21399-1-git-send-email-myungjoo.ham@samsung.com>
With OPPs, a device may have multiple operable frequency and voltage
sets. However, there can be multiple possible operable sets and a system
will need to choose one from them. In order to reduce the power
consumption (by reducing frequency and voltage) without affecting the
performance too much, a Dynamic Voltage and Frequency Scaling (DVFS)
scheme may be used.
This patch introduces the DVFS capability to non-CPU devices with OPPs.
DVFS is a technique whereby the frequency and supplied voltage of a
device is adjusted on-the-fly. DVFS usually sets the frequency as low
as possible with given conditions (such as QoS assurance) and adjusts
voltage according to the chosen frequency in order to reduce power
consumption and heat dissipation.
The generic DVFS for devices, devfreq, may appear quite similar to
/drivers/cpufreq. However, cpufreq does not allow multiple devices to
be registered and is not suitable for multiple heterogeneous
devices with different (but simple) governors.
Normally, DVFS mechanism controls frequency based on the demand for
the device, and then, chooses voltage based on the chosen frequency.
devfreq also controls the frequency based on the governor's frequency
recommendation and lets OPP pick the pair of frequency and voltage
based on the recommended frequency. Then, the chosen OPP is passed to
device driver's "target" callback.
When PM QoS is going to be used with the devfreq device, the device
driver should enable OPPs that are appropriate with the current PM QoS
requests. In order to do so, the device driver may call opp_enable and
opp_disable at the notifier callback of PM QoS so that PM QoS's
update_target() call enables the appropriate OPPs. Note that at least
one of OPPs should be enabled at any time; be careful when there is a
transition.
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
---
The test code with board support for Exynos4-NURI is at
http://git.infradead.org/users/kmpark/linux-2.6-samsung/shortlog/refs/heads/devfreq
---
Thank you for your valuable comments, Rafael, Greg, Pavel, Colin, Mike,
and Kevin.
Changed from v8
- Merged patch 4/5 of v8 (internal interfaces for governors)
- Added lock (mutex) to struct devfreq
- Uses devfreq->lock to access elements in devfreq.
- Added kerneldoc entries for init/exit callbacks of governors.
- The caller of update_devfreq() in governor.h should lock
devfreq->lock before calling it.
- Added comment on the usage of lock in struct devfreq.
- Revised devfreq_add_device error handling
At v8, there are no changes since v7
Changes from v6
- Type revised for timing variables
- Removed unnecessary code and variable
Changes at v6-resubmit from v6
- Use jiffy directly instead of ktime
- Be prepared for profile->polling_ms changes (not supported fully at
this stage)
Changes from v5
- Uses OPP availability change notifier
- Removed devfreq_interval. Uses one jiffy instead. DEVFREQ adjusts
polling interval based on the interval requirement of DEVFREQ
devices.
- Moved devfreq to /drivers/devfreq to accommodate devfreq-related files
including governors and devfreq drivers.
- Coding style revised.
- Updated devfreq_add_device interface to get tunable values.
Changed from v4
- Removed tickle, which is a duplicated feature; PM QoS can do the same.
- Allow to extend polling interval if devices have longer polling intervals.
- Relocated private data of governors.
- Removed system-wide sysfs
Changed from v3
- In kerneldoc comments, DEVFREQ has been replaced by devfreq
- Revised removing devfreq entries with error mechanism
- Added and revised comments
- Removed unnecessary codes
- Allow to give a name to a governor
- Bugfix: a tickle call may cancel an older tickle call that is still in
effect.
Changed from v2
- Code style revised and cleaned up.
- Remove DEVFREQ entries that incur errors except for EAGAIN
- Bug fixed: tickle for devices without polling governors
Changes from v1(RFC)
- Rename: DVFS --> DEVFREQ
- Revised governor design
. Governor receives the whole struct devfreq
. Governor should gather usage information (thru get_dev_status) itself
- Periodic monitoring runs only when needed.
- DEVFREQ no more deals with voltage information directly
- Removed some printks.
- Some cosmetics update
- Use freezable_wq.
---
drivers/Kconfig | 2 +
drivers/Makefile | 2 +
drivers/devfreq/Kconfig | 39 +++++
drivers/devfreq/Makefile | 1 +
drivers/devfreq/devfreq.c | 364 ++++++++++++++++++++++++++++++++++++++++++++
drivers/devfreq/governor.h | 22 +++
include/linux/devfreq.h | 123 +++++++++++++++
7 files changed, 553 insertions(+), 0 deletions(-)
create mode 100644 drivers/devfreq/Kconfig
create mode 100644 drivers/devfreq/Makefile
create mode 100644 drivers/devfreq/devfreq.c
create mode 100644 drivers/devfreq/governor.h
create mode 100644 include/linux/devfreq.h
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 95b9e7e..a1efd75 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -130,4 +130,6 @@ source "drivers/iommu/Kconfig"
source "drivers/virt/Kconfig"
+source "drivers/devfreq/Kconfig"
+
endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 7fa433a..97c957b 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -127,3 +127,5 @@ obj-$(CONFIG_IOMMU_SUPPORT) += iommu/
# Virtualization drivers
obj-$(CONFIG_VIRT_DRIVERS) += virt/
+
+obj-$(CONFIG_PM_DEVFREQ) += devfreq/
diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig
new file mode 100644
index 0000000..1fb42de
--- /dev/null
+++ b/drivers/devfreq/Kconfig
@@ -0,0 +1,39 @@
+config ARCH_HAS_DEVFREQ
+ bool
+ depends on ARCH_HAS_OPP
+ help
+ Denotes that the architecture supports DEVFREQ. If the architecture
+ supports multiple OPP entries per device and the frequency of the
+ devices with OPPs may be altered dynamically, the architecture
+ supports DEVFREQ.
+
+menuconfig PM_DEVFREQ
+ bool "Generic Dynamic Voltage and Frequency Scaling (DVFS) support"
+ depends on PM_OPP && ARCH_HAS_DEVFREQ
+ help
+ With OPP support, a device may have a list of frequencies and
+ voltages available. DEVFREQ, a generic DVFS framework can be
+ registered for a device with OPP support in order to let the
+ governor provided to DEVFREQ choose an operating frequency
+ based on the OPP's list and the policy given with DEVFREQ.
+
+ Each device may have its own governor and policy. DEVFREQ can
+ reevaluate the device state periodically and/or based on the
+ OPP list changes (each frequency/voltage pair in OPP may be
+ disabled or enabled).
+
+ Like some CPUs with CPUFREQ, a device may have multiple clocks.
+ However, because the clock frequencies of a single device are
+ determined by the single device's state, an instance of DEVFREQ
+ is attached to a single device and returns a "representative"
+ clock frequency from the OPP of the device, which is also attached
+ to a device by 1-to-1. The device registering DEVFREQ takes the
+ responsibility to "interpret" the frequency listed in OPP and
+ to set its every clock accordingly with the "target" callback
+ given to DEVFREQ.
+
+if PM_DEVFREQ
+
+comment "DEVFREQ Drivers"
+
+endif # PM_DEVFREQ
diff --git a/drivers/devfreq/Makefile b/drivers/devfreq/Makefile
new file mode 100644
index 0000000..168934a
--- /dev/null
+++ b/drivers/devfreq/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_PM_DEVFREQ) += devfreq.o
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
new file mode 100644
index 0000000..621b863
--- /dev/null
+++ b/drivers/devfreq/devfreq.c
@@ -0,0 +1,364 @@
+/*
+ * devfreq: Generic Dynamic Voltage and Frequency Scaling (DVFS) Framework
+ * for Non-CPU Devices Based on OPP.
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/opp.h>
+#include <linux/devfreq.h>
+#include <linux/workqueue.h>
+#include <linux/platform_device.h>
+#include <linux/list.h>
+#include <linux/printk.h>
+#include <linux/hrtimer.h>
+
+/*
+ * devfreq_work periodically monitors every registered device.
+ * The minimum polling interval is one jiffy. The polling interval is
+ * determined by the minimum polling period among all polling devfreq
+ * devices. The resolution of polling interval is one jiffy.
+ */
+static bool polling;
+static struct workqueue_struct *devfreq_wq;
+static struct delayed_work devfreq_work;
+
+/* The list of all device-devfreq */
+static LIST_HEAD(devfreq_list);
+static DEFINE_MUTEX(devfreq_list_lock);
+
+/**
+ * find_device_devfreq() - find devfreq struct using device pointer
+ * @dev: device pointer used to lookup device devfreq.
+ *
+ * Search the list of device devfreqs and return the matched device's
+ * devfreq info. devfreq_list_lock should be held by the caller.
+ */
+static struct devfreq *find_device_devfreq(struct device *dev)
+{
+ struct devfreq *tmp_devfreq;
+
+ if (unlikely(IS_ERR_OR_NULL(dev))) {
+ pr_err("DEVFREQ: %s: Invalid parameters\n", __func__);
+ return ERR_PTR(-EINVAL);
+ }
+
+ list_for_each_entry(tmp_devfreq, &devfreq_list, node) {
+ if (tmp_devfreq->dev == dev)
+ return tmp_devfreq;
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+/**
+ * get_devfreq() - find devfreq struct. a wrapped find_device_devfreq()
+ * with mutex protection. exported for governors
+ * @dev: device pointer used to lookup device devfreq.
+ */
+struct devfreq *get_devfreq(struct device *dev)
+{
+ struct devfreq *ret;
+
+ mutex_lock(&devfreq_list_lock);
+ ret = find_device_devfreq(dev);
+ mutex_unlock(&devfreq_list_lock);
+
+ return ret;
+}
+
+/**
+ * devfreq_do() - Check the usage profile of a given device and configure
+ * frequency and voltage accordingly
+ * @devfreq: devfreq info of the given device
+ */
+static int devfreq_do(struct devfreq *devfreq)
+{
+ struct opp *opp;
+ unsigned long freq;
+ int err;
+
+ err = devfreq->governor->get_target_freq(devfreq, &freq);
+ if (err)
+ return err;
+
+ opp = opp_find_freq_ceil(devfreq->dev, &freq);
+ if (opp == ERR_PTR(-ENODEV))
+ opp = opp_find_freq_floor(devfreq->dev, &freq);
+
+ if (IS_ERR(opp))
+ return PTR_ERR(opp);
+
+ if (devfreq->previous_freq == freq)
+ return 0;
+
+ err = devfreq->profile->target(devfreq->dev, opp);
+ if (err)
+ return err;
+
+ devfreq->previous_freq = freq;
+ return 0;
+}
+
+/**
+ * update_devfreq() - Notify that the device OPP or frequency requirement
+ * has been changed. This function is exported for governors.
+ * @devfreq: the devfreq instance.
+ *
+ * Note: lock devfreq->lock before calling update_devfreq
+ */
+int update_devfreq(struct devfreq *devfreq)
+{
+ int err = 0;
+
+ if (!mutex_is_locked(&devfreq->lock)) {
+ WARN(true, "devfreq->lock must be locked by the caller.\n");
+ return -EINVAL;
+ }
+
+ /* Reevaluate the proper frequency */
+ err = devfreq_do(devfreq);
+ return err;
+}
+
+/**
+ * devfreq_update() - Notify that the device OPP has been changed.
+ * @nb: notifier block embedded in the devfreq whose OPP has been changed.
+ *
+ * Called by OPP notifier.
+ */
+static int devfreq_update(struct notifier_block *nb, unsigned long type,
+ void *devp)
+{
+ struct devfreq *devfreq = container_of(nb, struct devfreq, nb);
+ int ret;
+
+ mutex_lock(&devfreq->lock);
+ ret = update_devfreq(devfreq);
+ mutex_unlock(&devfreq->lock);
+
+ return ret;
+}
+
+/**
+ * devfreq_monitor() - Periodically run devfreq_do()
+ * @work: the work struct used to run devfreq_monitor periodically.
+ *
+ */
+static void devfreq_monitor(struct work_struct *work)
+{
+ static unsigned long last_polled_at;
+ struct devfreq *devfreq, *tmp;
+ int error;
+ unsigned long jiffies_passed;
+ unsigned long next_jiffies = ULONG_MAX, now = jiffies;
+
+ /* Initially last_polled_at = 0, polling every device at bootup */
+ jiffies_passed = now - last_polled_at;
+ last_polled_at = now;
+ if (jiffies_passed == 0)
+ jiffies_passed = 1;
+
+ mutex_lock(&devfreq_list_lock);
+
+ list_for_each_entry_safe(devfreq, tmp, &devfreq_list, node) {
+ mutex_lock(&devfreq->lock);
+
+ if (devfreq->next_polling == 0) {
+ mutex_unlock(&devfreq->lock);
+ continue;
+ }
+
+ /*
+ * Reduce more next_polling if devfreq_wq took an extra
+ * delay. (i.e., CPU has been idled.)
+ */
+ if (devfreq->next_polling <= jiffies_passed) {
+ error = devfreq_do(devfreq);
+
+ /* Remove a devfreq with an error. */
+ if (error && error != -EAGAIN) {
+ dev_err(devfreq->dev, "Due to devfreq_do error(%d), devfreq(%s) is removed from the device\n",
+ error, devfreq->governor->name);
+
+ list_del(&devfreq->node);
+ mutex_unlock(&devfreq->lock);
+ kfree(devfreq);
+ continue;
+ }
+ devfreq->next_polling = devfreq->polling_jiffies;
+
+ /* No more polling required (polling_ms changed) */
+ if (devfreq->next_polling == 0) {
+ mutex_unlock(&devfreq->lock);
+ continue;
+ }
+ } else {
+ devfreq->next_polling -= jiffies_passed;
+ }
+
+ next_jiffies = (next_jiffies > devfreq->next_polling) ?
+ devfreq->next_polling : next_jiffies;
+
+ mutex_unlock(&devfreq->lock);
+ }
+
+ if (next_jiffies > 0 && next_jiffies < ULONG_MAX) {
+ polling = true;
+ queue_delayed_work(devfreq_wq, &devfreq_work, next_jiffies);
+ } else {
+ polling = false;
+ }
+
+ mutex_unlock(&devfreq_list_lock);
+}
+
+/**
+ * devfreq_add_device() - Add devfreq feature to the device
+ * @dev: the device to add devfreq feature.
+ * @profile: device-specific profile to run devfreq.
+ * @governor: the policy to choose frequency.
+ * @data: private data for the governor. The devfreq framework does not
+ * touch this value.
+ */
+int devfreq_add_device(struct device *dev, struct devfreq_dev_profile *profile,
+ struct devfreq_governor *governor, void *data)
+{
+ struct devfreq *devfreq;
+ struct srcu_notifier_head *nh;
+ int err = 0;
+
+ if (!dev || !profile || !governor) {
+ dev_err(dev, "%s: Invalid parameters.\n", __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&devfreq_list_lock);
+
+ devfreq = find_device_devfreq(dev);
+ if (!IS_ERR(devfreq)) {
+ dev_err(dev, "%s: Unable to create devfreq for the device. It already has one.\n", __func__);
+ err = -EINVAL;
+ goto out;
+ }
+
+ devfreq = kzalloc(sizeof(struct devfreq), GFP_KERNEL);
+ if (!devfreq) {
+ dev_err(dev, "%s: Unable to create devfreq for the device\n",
+ __func__);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ mutex_init(&devfreq->lock);
+ mutex_lock(&devfreq->lock);
+ devfreq->dev = dev;
+ devfreq->profile = profile;
+ devfreq->governor = governor;
+ devfreq->next_polling = devfreq->polling_jiffies
+ = msecs_to_jiffies(devfreq->profile->polling_ms);
+ devfreq->previous_freq = profile->initial_freq;
+ devfreq->data = data;
+
+ devfreq->nb.notifier_call = devfreq_update;
+
+ nh = opp_get_notifier(dev);
+ if (IS_ERR(nh)) {
+ err = PTR_ERR(nh);
+ goto err_opp;
+ }
+ err = srcu_notifier_chain_register(nh, &devfreq->nb);
+ if (err)
+ goto err_opp;
+
+ if (governor->init)
+ err = governor->init(devfreq);
+ if (err)
+ goto err_init;
+
+ list_add(&devfreq->node, &devfreq_list);
+
+ if (devfreq_wq && devfreq->next_polling && !polling) {
+ polling = true;
+ queue_delayed_work(devfreq_wq, &devfreq_work,
+ devfreq->next_polling);
+ }
+ mutex_unlock(&devfreq->lock);
+ goto out;
+err_init:
+ srcu_notifier_chain_unregister(nh, &devfreq->nb);
+err_opp:
+ mutex_unlock(&devfreq->lock);
+ kfree(devfreq);
+out:
+ mutex_unlock(&devfreq_list_lock);
+ return err;
+}
+
+/**
+ * devfreq_remove_device() - Remove devfreq feature from a device.
+ * @dev: the device to remove devfreq feature.
+ */
+int devfreq_remove_device(struct device *dev)
+{
+ struct devfreq *devfreq;
+ struct srcu_notifier_head *nh;
+ int err = 0;
+
+ if (!dev)
+ return -EINVAL;
+
+ mutex_lock(&devfreq_list_lock);
+ devfreq = find_device_devfreq(dev);
+ if (IS_ERR(devfreq)) {
+ err = PTR_ERR(devfreq);
+ goto out;
+ }
+
+ mutex_lock(&devfreq->lock);
+ nh = opp_get_notifier(dev);
+ if (IS_ERR(nh)) {
+ err = PTR_ERR(nh);
+ mutex_unlock(&devfreq->lock);
+ goto out;
+ }
+
+ list_del(&devfreq->node);
+
+ if (devfreq->governor->exit)
+ devfreq->governor->exit(devfreq);
+
+ srcu_notifier_chain_unregister(nh, &devfreq->nb);
+ mutex_unlock(&devfreq->lock);
+ kfree(devfreq);
+out:
+ mutex_unlock(&devfreq_list_lock);
+ return 0;
+}
+
+/**
+ * devfreq_init() - Initialize data structure for devfreq framework and
+ * start polling registered devfreq devices.
+ */
+static int __init devfreq_init(void)
+{
+ mutex_lock(&devfreq_list_lock);
+ polling = false;
+ devfreq_wq = create_freezable_workqueue("devfreq_wq");
+ INIT_DELAYED_WORK_DEFERRABLE(&devfreq_work, devfreq_monitor);
+ mutex_unlock(&devfreq_list_lock);
+
+ devfreq_monitor(&devfreq_work.work);
+ return 0;
+}
+late_initcall(devfreq_init);
diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h
new file mode 100644
index 0000000..9122090
--- /dev/null
+++ b/drivers/devfreq/governor.h
@@ -0,0 +1,22 @@
+/*
+ * governor.h - internal header for governors.
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This header is for devfreq governors in drivers/devfreq/
+ */
+
+#ifndef _GOVERNOR_H
+#define _GOVERNOR_H
+
+extern struct devfreq *get_devfreq(struct device *dev);
+
+/* (Mandatory) Lock devfreq->lock before calling update_devfreq */
+extern int update_devfreq(struct devfreq *devfreq);
+
+#endif /* _GOVERNOR_H */
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
new file mode 100644
index 0000000..f14b57d
--- /dev/null
+++ b/include/linux/devfreq.h
@@ -0,0 +1,123 @@
+/*
+ * devfreq: Generic Dynamic Voltage and Frequency Scaling (DVFS) Framework
+ * for Non-CPU Devices Based on OPP.
+ *
+ * Copyright (C) 2011 Samsung Electronics
+ * MyungJoo Ham <myungjoo.ham@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_DEVFREQ_H__
+#define __LINUX_DEVFREQ_H__
+
+#include <linux/device.h>
+#include <linux/notifier.h>
+#include <linux/opp.h>
+
+#define DEVFREQ_NAME_LEN 16
+
+struct devfreq;
+struct devfreq_dev_status {
+ /* both since the last measure */
+ unsigned long total_time;
+ unsigned long busy_time;
+ unsigned long current_frequency;
+};
+
+struct devfreq_dev_profile {
+ unsigned long max_freq; /* may be larger than the actual value */
+ unsigned long initial_freq;
+ unsigned int polling_ms; /* 0 for at opp change only */
+
+ int (*target)(struct device *dev, struct opp *opp);
+ int (*get_dev_status)(struct device *dev,
+ struct devfreq_dev_status *stat);
+};
+
+/**
+ * struct devfreq_governor - Devfreq policy governor
+ * @name:		Governor's name
+ * @get_target_freq:	Returns desired operating frequency for the device.
+ *			Basically, get_target_freq will run
+ *			devfreq_dev_profile.get_dev_status() to get the
+ *			status of the device (load = busy_time / total_time).
+ * @init:		Called when the devfreq is being attached to a device
+ * @exit:		Called when the devfreq is being removed from a device
+ *
+ * Note that the callbacks are called with devfreq->lock locked by devfreq.
+ */
+struct devfreq_governor {
+ char name[DEVFREQ_NAME_LEN];
+ int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
+ int (*init)(struct devfreq *this);
+ void (*exit)(struct devfreq *this);
+};
+
+/**
+ * struct devfreq - Device devfreq structure
+ * @node:	list node - contains the devices with devfreq that have been
+ *		registered.
+ * @lock:	a mutex to protect accessing devfreq.
+ * @dev:	device pointer
+ * @profile:	device-specific devfreq profile
+ * @governor:	method how to choose frequency based on the usage.
+ * @nb:		notifier block registered to the corresponding OPP to get
+ *		notified for frequency availability updates.
+ * @polling_jiffies:	interval in jiffies.
+ * @previous_freq:	previously configured frequency value.
+ * @next_polling:	the number of remaining jiffies to poll with
+ *			"devfreq_monitor" executions to reevaluate
+ *			frequency/voltage of the device. Set by
+ *			profile's polling_ms interval.
+ * @data:	Private data of the governor. The devfreq framework does not
+ *		touch this.
+ *
+ * This structure stores the devfreq information for a given device.
+ *
+ * Note that when a governor accesses entries in struct devfreq in its
+ * functions except for the context of callbacks defined in struct
+ * devfreq_governor, the governor should protect its access with the
+ * struct mutex lock in struct devfreq. A governor may use this mutex
+ * to protect its own private data in void *data as well.
+ */
+struct devfreq {
+ struct list_head node;
+
+ struct mutex lock;
+ struct device *dev;
+ struct devfreq_dev_profile *profile;
+ struct devfreq_governor *governor;
+ struct notifier_block nb;
+
+ unsigned long polling_jiffies;
+ unsigned long previous_freq;
+ unsigned int next_polling;
+
+ void *data; /* private data for governors */
+};
+
+#if defined(CONFIG_PM_DEVFREQ)
+extern int devfreq_add_device(struct device *dev,
+ struct devfreq_dev_profile *profile,
+ struct devfreq_governor *governor,
+ void *data);
+extern int devfreq_remove_device(struct device *dev);
+#else /* !CONFIG_PM_DEVFREQ */
+static inline int devfreq_add_device(struct device *dev,
+				     struct devfreq_dev_profile *profile,
+				     struct devfreq_governor *governor,
+				     void *data)
+{
+	return 0;	/* devfreq compiled out: nothing to register */
+}
+
+static inline int devfreq_remove_device(struct device *dev)
+{
+	return 0;	/* devfreq compiled out: nothing to remove */
+}
+#endif /* CONFIG_PM_DEVFREQ */
+
+#endif /* __LINUX_DEVFREQ_H__ */
--
1.7.4.1
^ permalink raw reply related
* [PATCH v9 1/4] PM / OPP: Add OPP availability change notifier.
From: MyungJoo Ham @ 2011-08-31 7:29 UTC (permalink / raw)
To: linux-pm; +Cc: Len Brown, Greg Kroah-Hartman, Kyungmin Park, Thomas Gleixner
In-Reply-To: <1314775779-21399-1-git-send-email-myungjoo.ham@samsung.com>
This patch makes it possible to register a notifier_block for an OPP device in order
to get notified for any changes in the availability of OPPs of the
device. For example, if a new OPP is inserted or enable/disable status
of an OPP is changed, the notifier is executed.
This enables the usage of opp_add, opp_enable, and opp_disable to
directly take effect with any connected entities such as cpufreq or
devfreq.
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Reviewed-by: Mike Turquette <mturquette@ti.com>
---
No changes since v7
Added at devfreq patch set v6 replacing devfreq_update calls at OPP.
---
drivers/base/power/opp.c | 29 +++++++++++++++++++++++++++++
include/linux/opp.h | 12 ++++++++++++
2 files changed, 41 insertions(+), 0 deletions(-)
diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index b23de18..e6b4c89 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -73,6 +73,7 @@ struct opp {
* RCU usage: nodes are not modified in the list of device_opp,
* however addition is possible and is secured by dev_opp_list_lock
* @dev: device pointer
+ * @head: notifier head to notify the OPP availability changes.
* @opp_list: list of opps
*
* This is an internal data structure maintaining the link to opps attached to
@@ -83,6 +84,7 @@ struct device_opp {
struct list_head node;
struct device *dev;
+ struct srcu_notifier_head head;
struct list_head opp_list;
};
@@ -404,6 +406,7 @@ int opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
}
dev_opp->dev = dev;
+ srcu_init_notifier_head(&dev_opp->head);
INIT_LIST_HEAD(&dev_opp->opp_list);
/* Secure the device list modification */
@@ -428,6 +431,11 @@ int opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
list_add_rcu(&new_opp->node, head);
mutex_unlock(&dev_opp_list_lock);
+ /*
+ * Notify the changes in the availability of the operable
+ * frequency/voltage list.
+ */
+ srcu_notifier_call_chain(&dev_opp->head, OPP_EVENT_ADD, new_opp);
return 0;
}
@@ -504,6 +512,14 @@ static int opp_set_availability(struct device *dev, unsigned long freq,
mutex_unlock(&dev_opp_list_lock);
synchronize_rcu();
+ /* Notify the change of the OPP availability */
+ if (availability_req)
+ srcu_notifier_call_chain(&dev_opp->head, OPP_EVENT_ENABLE,
+ new_opp);
+ else
+ srcu_notifier_call_chain(&dev_opp->head, OPP_EVENT_DISABLE,
+ new_opp);
+
/* clean up old opp */
new_opp = opp;
goto out;
@@ -643,3 +659,16 @@ void opp_free_cpufreq_table(struct device *dev,
*table = NULL;
}
#endif /* CONFIG_CPU_FREQ */
+
+/**
+ * opp_get_notifier() - find notifier_head of the device with opp
+ * @dev:	device pointer used to lookup device OPPs.
+ *
+ * Returns the device's SRCU notifier head, or an ERR_PTR() if lookup fails.
+ */
+struct srcu_notifier_head *opp_get_notifier(struct device *dev)
+{
+	struct device_opp *dev_opp = find_device_opp(dev);
+
+	return IS_ERR(dev_opp) ? ERR_CAST(dev_opp) : &dev_opp->head;
+}
diff --git a/include/linux/opp.h b/include/linux/opp.h
index 7020e97..87a9208 100644
--- a/include/linux/opp.h
+++ b/include/linux/opp.h
@@ -16,9 +16,14 @@
#include <linux/err.h>
#include <linux/cpufreq.h>
+#include <linux/notifier.h>
struct opp;
+enum opp_event { /* event codes passed to the per-device OPP notifier chain */
+ OPP_EVENT_ADD, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE,
+};
+
#if defined(CONFIG_PM_OPP)
unsigned long opp_get_voltage(struct opp *opp);
@@ -40,6 +45,8 @@ int opp_enable(struct device *dev, unsigned long freq);
int opp_disable(struct device *dev, unsigned long freq);
+struct srcu_notifier_head *opp_get_notifier(struct device *dev);
+
#else
static inline unsigned long opp_get_voltage(struct opp *opp)
{
@@ -89,6 +96,11 @@ static inline int opp_disable(struct device *dev, unsigned long freq)
{
return 0;
}
+
+static inline struct srcu_notifier_head *opp_get_notifier(struct device *dev)
+{
+	return ERR_PTR(-EINVAL);	/* OPP support compiled out: no notifier head */
+}
#endif /* CONFIG_PM */
#if defined(CONFIG_CPU_FREQ) && defined(CONFIG_PM_OPP)
--
1.7.4.1
^ permalink raw reply related
* [PATCH v9 0/4] Devfreq, DVFS Framework for Non-CPU Devices
From: MyungJoo Ham @ 2011-08-31 7:29 UTC (permalink / raw)
To: linux-pm; +Cc: Len Brown, Greg Kroah-Hartman, Kyungmin Park, Thomas Gleixner
The main update from the patchset v8:
- Add per-devfreq-device locking mechanism (devfreq->lock)
- Provide the per-devfreq-device locking mechanism to governors
- Merged 4/5 patch to 2/5 (devfreq internal interface for governors)
Patch 1/4 has no changes.
Patch 2/4 has major update on synchronization and merged another patch; thus, dropped "Reviewed-By".
Patch 3/4 has minor update (affected by the update on 2/4: mutex added)
Patch 4/4 has minor update (affected by the update on 2/4: mutex added) + removed unused variable.
For a usage example, please look at
http://git.infradead.org/users/kmpark/linux-2.6-samsung/shortlog/refs/heads/devfreq
In the above git tree, DVFS (dynamic voltage and frequency scaling) mechanism
is applied to the memory bus of Exynos4210 for Exynos4210-NURI boards.
In the example, the LPDDR2 DRAM frequency changes between 133, 266, and 400MHz
and other related clocks simply follow the determined DDR RAM clock.
The devfreq driver for Exynos4210 memory bus is at
/drivers/devfreq/exynos4210_memorybus.c in the git tree.
In the dd (writing and reading 360MiB) test with NURI board, the memory
throughput was not changed (the performance is not deteriorated) while
the SoC power consumption has been reduced by 1%. When the memory access
is not that intense while the CPU is heavily used, the SoC power consumption
has been reduced by 6%. The power consumption has been compared with the
case using the conventional Exynos4210 cpufreq driver, which sets memory
bus frequency according to the CPU core frequency. Besides, when the CPU core
is running slow and the memory access is intense, the performance (memory
throughput) has been increased by 11% (with higher SoC power consumption of
5%). The tested governor is "simple-ondemand".
MyungJoo Ham (4):
PM / OPP: Add OPP availability change notifier.
PM: Introduce devfreq: generic DVFS framework with device-specific
OPPs
PM / devfreq: add common sysfs interfaces
PM / devfreq: add basic governors
Documentation/ABI/testing/sysfs-devices-power | 46 ++
drivers/Kconfig | 2 +
drivers/Makefile | 2 +
drivers/base/power/opp.c | 29 ++
drivers/devfreq/Kconfig | 75 ++++
drivers/devfreq/Makefile | 5 +
drivers/devfreq/devfreq.c | 567 +++++++++++++++++++++++++
drivers/devfreq/governor.h | 22 +
drivers/devfreq/governor_performance.c | 24 +
drivers/devfreq/governor_powersave.c | 24 +
drivers/devfreq/governor_simpleondemand.c | 88 ++++
drivers/devfreq/governor_userspace.c | 126 ++++++
include/linux/devfreq.h | 160 +++++++
include/linux/opp.h | 12 +
14 files changed, 1182 insertions(+), 0 deletions(-)
create mode 100644 drivers/devfreq/Kconfig
create mode 100644 drivers/devfreq/Makefile
create mode 100644 drivers/devfreq/devfreq.c
create mode 100644 drivers/devfreq/governor.h
create mode 100644 drivers/devfreq/governor_performance.c
create mode 100644 drivers/devfreq/governor_powersave.c
create mode 100644 drivers/devfreq/governor_simpleondemand.c
create mode 100644 drivers/devfreq/governor_userspace.c
create mode 100644 include/linux/devfreq.h
--
1.7.4.1
^ permalink raw reply
* Re: [PATCH 6/6] cgroup: kill subsys->can_attach_task(), pre_attach() and attach_task()
From: Tejun Heo @ 2011-08-31 7:03 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: containers, lizf, linux-kernel, linux-pm, paul, kamezawa.hiroyu
In-Reply-To: <20110830201030.GC15953@somewhere.redhat.com>
Hello, Frederic.
On Tue, Aug 30, 2011 at 10:10:32PM +0200, Frederic Weisbecker wrote:
> In order to keep the fix queued in -mm (https://lkml.org/lkml/2011/8/26/262)
> the tasks that have failed to migrate should be removed from the iterator
> so that they are not included in the batch in ->attach().
I don't think that's a good approach. It breaks the symmetry when
calling different callbacks. What if ->can_attach() allocates
per-task resources and the task exits in the middle? I think we
better lock down fork/exit/exec. I'll send patches but I'm currently
moving / traveling w/ limited access to my toys so it might take some
time.
Thanks.
--
tejun
^ permalink raw reply
* Re: [PATCH v8 0/5] DEVFREQ, DVFS Framework for Non-CPU Devices.
From: MyungJoo Ham @ 2011-08-31 3:59 UTC (permalink / raw)
To: Kevin Hilman
Cc: Len Brown, Greg Kroah-Hartman, Kyungmin Park, linux-pm,
Thomas Gleixner
In-Reply-To: <871uw2lbls.fsf@ti.com>
On Wed, Aug 31, 2011 at 8:32 AM, Kevin Hilman <khilman@ti.com> wrote:
> MyungJoo Ham <myungjoo.ham@samsung.com> writes:
>
>> The patchset revision v8 has minor updates since v7 and v6.
>> - Allow governors to have their own sysfs interface and init/exit callbacks.
>>
>> The patches 1/5 (OPP notifier) and 2/5 (DEVFREQ core) have no changes since v7.
>> There has been reordering between "add common sysfs interfaces" patch
>> and "add basic governors" (3/5 and 5/5)
>> "add internal interfaces for governors (4/5)" patch has been newly
>> introduced at v8 patchset.
>>
>> For a usage example, please look at
>> http://git.infradead.org/users/kmpark/linux-2.6-samsung/shortlog/refs/heads/devfreq
>>
>> In the above git tree, DVFS (dynamic voltage and frequency scaling) mechanism
>> is applied to the memory bus of Exynos4210 for Exynos4210-NURI boards.
>> In the example, the LPDDR2 DRAM frequency changes between 133, 266, and 400MHz
>> and other related clocks simply follow the determined DDR RAM clock.
>>
>> The DEVFREQ driver for Exynos4210 memory bus is at
>> /drivers/devfreq/exynos4210_memorybus.c in the git tree.
>
> Minor nit: you continue to use DEVFREQ (all caps) throughout subjects
> and changelogs etc. when it should be called devfreq since it's not an
> acronym.
>
> Kevin
>
Fine, I'll regard devfreq as a common noun.
Cheers,
MyungJoo
--
MyungJoo Ham (함명주), Ph.D.
Mobile Software Platform Lab,
Digital Media and Communications (DMC) Business
Samsung Electronics
cell: 82-10-6714-2858
_______________________________________________
linux-pm mailing list
linux-pm@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/linux-pm
^ permalink raw reply
* Re: [PATCH v8 0/5] DEVFREQ, DVFS Framework for Non-CPU Devices.
From: Kevin Hilman @ 2011-08-30 23:32 UTC (permalink / raw)
To: MyungJoo Ham
Cc: Len Brown, Greg Kroah-Hartman, Kyungmin Park, linux-pm,
Thomas Gleixner
In-Reply-To: <1314174131-14194-1-git-send-email-myungjoo.ham@samsung.com>
MyungJoo Ham <myungjoo.ham@samsung.com> writes:
> The patchset revision v8 has minor updates since v7 and v6.
> - Allow governors to have their own sysfs interface and init/exit callbacks.
>
> The patches 1/5 (OPP notifier) and 2/5 (DEVFREQ core) have no changes since v7.
> There has been reordering between "add common sysfs interfaces" patch
> and "add basic governors" (3/5 and 5/5)
> "add internal interfaces for governors (4/5)" patch has been newly
> introduced at v8 patchset.
>
> For a usage example, please look at
> http://git.infradead.org/users/kmpark/linux-2.6-samsung/shortlog/refs/heads/devfreq
>
> In the above git tree, DVFS (dynamic voltage and frequency scaling) mechanism
> is applied to the memory bus of Exynos4210 for Exynos4210-NURI boards.
> In the example, the LPDDR2 DRAM frequency changes between 133, 266, and 400MHz
> and other related clocks simply follow the determined DDR RAM clock.
>
> The DEVFREQ driver for Exynos4210 memory bus is at
> /drivers/devfreq/exynos4210_memorybus.c in the git tree.
Minor nit: you continue to use DEVFREQ (all caps) throughout subjects
and changelogs etc. when it should be called devfreq since it's not an
acronym.
Kevin
^ permalink raw reply
* [Update] LPC2011 Power Management Micro Conf
From: Rafael J. Wysocki @ 2011-08-30 22:39 UTC (permalink / raw)
To: linux-pm
Cc: Arnd Bergmann, Greg Kroah-Hartman, Stephen Boyd, LKML,
Grant Likely, MyungJoo Ham, Jean Pihet, Arjan van de Ven
In-Reply-To: <201107030929.28555.rjw@sisk.pl>
Hi,
On Sunday, July 03, 2011, Rafael J. Wysocki wrote:
> Hi,
>
> Some time ago I sent the following to linux-pm and wasn't really satisfied
> with the response, so here it goes again. It is slightly outdated, because
> we've already started to plan a PM meeting at the Kernel Summit, but I hope
> some of you will be able to participate in the LPC PM Mini Conf too.
>
> In the previous years we used to organize Power Management Mini Summits to
> discuss PM-related issues, the last of which took place during LinuxCon in
> Boston in 2010. Unfortunately, however, it wasn't particularly well attended
> and some of the participants generally felt that it had failed its purpose.
> The power management track at the last LPC, on the other hand, was generally
> regarded as interesting and successful, so there only seems to be room for
> one event devoted to power management a year. For this reason, there won't
> be a PM Mini Summit in 2011 and the LPC PM Mini Conference will play the role
> of it. To this end, in addition to the plenary sessions within the LPC PM
> track there will be a possibility to discuss specific problems related to
> power management in smaller groups.
>
> If there's anything you'd like to discuss or give a presentation on related to
> Power Management, please let me know, preferably by replying to this message.
>
> We have 1.5 hours allocated in the LPC schedule for the plenary session, but
> we will be able to move to another room (or a number of rooms if that's more
> suitable) afterwards to discuss things in detail.
The LPC planned schedule has recently changed so that the PM microconference
is now scheduled on Thursday, Sep. 8, 2011, at 4:45 PM. Since the LPC
organizers have no plans for Thursday evening, we can easily extend the
discussion part pretty much as far as we want.
Please review the updated agenda at
http://wiki.linuxplumbersconf.org/2011:power_management
and adjust your plans. Sorry for the inconveniences, if any.
Thanks,
Rafael
^ permalink raw reply
* Re: [stable] [regression] Re: [213/474] x86, hotplug: Use mwait to offline a processor, fix the legacy case
From: Greg KH @ 2011-08-30 22:34 UTC (permalink / raw)
To: Jonathan Nieder
Cc: Len Brown, x86, Robert Scott, Greg KH, Frédéric Boiteux,
stable-review, Venkatesh Pallipadi, linux-pm, H. Peter Anvin,
stable
In-Reply-To: <20110828184004.GA7690@elie.gateway.2wire.net>
On Sun, Aug 28, 2011 at 01:40:04PM -0500, Jonathan Nieder wrote:
> Hi,
>
> On 2011-03-17, Greg KH wrote:
>
> > 2.6.33-longterm review patch. If anyone has any objections, please let us know.
> [...]
> > From: H. Peter Anvin <hpa@linux.intel.com>
> >
> > upstream ea53069231f9317062910d6e772cca4ce93de8c8
> > x86, hotplug: Use mwait to offline a processor, fix the legacy case
> >
> > Here included also some small follow-on patches to the same code:
> [...]
> > https://bugzilla.kernel.org/show_bug.cgi?id=5471
> >
> > Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
> > Signed-off-by: Len Brown <len.brown@intel.com>
> > Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
>
> Sorry for the sloow response. Unfortunately this is reported to have
> broken hibernation on two machines: an EeePC 1002HA and an Ideapad S10-3.
> Frédéric writes:
>
> | I've observed that when resuming from hibernation, sometimes my computer was
> | returning to Grub2's menu (without any error message), and I had to do a
> | "normal" boot on my Debian Squeeze system (with file systems corrections),
> | losing my hibernation's state.
> | The fail isn't automatic, but seems to happen more frequently after a while
> | my computer was disconnected from AC and battery.
>
> Noticed on Debian squeeze (which is based on v2.6.32.y), confirmed
> with unpatched v2.6.32.45 and v2.6.33.18. Backing out the patch
> mentioned above avoids trouble. A newer kernel (based on v2.6.39.y)
> does _not_ exhibit the same problem, so it looks like the problem was
> introduced in backporting. http://bugs.debian.org/622259 has
> details.
Thanks for letting me know, I'll go revert that patch for the next .32
and .33 longterm releases and let everyone else work out exactly what
the problem is, if they want to.
greg k-h
^ permalink raw reply
* [RFC][PATCH 5/5] PM / Domains: Add default power off governor function
From: Rafael J. Wysocki @ 2011-08-30 22:22 UTC (permalink / raw)
To: Linux PM mailing list; +Cc: LKML, Linux-sh list
In-Reply-To: <201108310017.03103.rjw@sisk.pl>
From: Rafael J. Wysocki <rjw@sisk.pl>
Add a function deciding whether or not a given PM domain should
be powered off on the basis of that domain's devices' PM QoS
constraints.
---
drivers/base/power/domain_governor.c | 96 +++++++++++++++++++++++++++++++++++
include/linux/pm_domain.h | 7 ++
2 files changed, 103 insertions(+)
Index: linux/include/linux/pm_domain.h
===================================================================
--- linux.orig/include/linux/pm_domain.h
+++ linux/include/linux/pm_domain.h
@@ -49,6 +49,10 @@ struct generic_pm_domain {
int (*start_device)(struct device *dev);
int (*stop_device)(struct device *dev);
bool (*active_wakeup)(struct device *dev);
+ ktime_t power_off_latency;
+ ktime_t power_on_latency;
+ s64 break_even_ns;
+ s64 min_delta_ns;
};
static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd)
@@ -64,6 +68,9 @@ struct gpd_link {
};
struct gpd_gov_dev_data {
+ ktime_t start_latency;
+ ktime_t suspend_latency;
+ ktime_t resume_latency;
s64 break_even_ns;
};
Index: linux/drivers/base/power/domain_governor.c
===================================================================
--- linux.orig/drivers/base/power/domain_governor.c
+++ linux/drivers/base/power/domain_governor.c
@@ -35,6 +35,102 @@ static bool default_stop_ok(struct devic
return constraint_ns > gov_data->break_even_ns;
}
+/* Tell if @pd may be powered off; must run under the PM domain's lock. */
+static bool default_power_down_ok(struct dev_pm_domain *pd)
+{
+	struct generic_pm_domain *genpd = pd_to_genpd(pd);
+	struct gpd_link *link;
+	struct pm_domain_data *pdd;
+	ktime_t off_time, on_time;
+	s64 off_on_ns, slack_ns, min_delta_ns;
+
+	on_time = genpd->power_on_latency;
+	/* Check if slave domains can be off for enough time. */
+	off_on_ns = ktime_to_ns(ktime_add(genpd->power_off_latency, on_time));
+	min_delta_ns = LLONG_MAX;	/* unconstrained until a slack is seen */
+	/* All slave domains have been powered off at this point. */
+	list_for_each_entry(link, &genpd->master_links, master_node) {
+		if (off_on_ns > link->slave->min_delta_ns)
+			return false;
+
+		slack_ns = link->slave->min_delta_ns - off_on_ns;
+		if (slack_ns < min_delta_ns)
+			min_delta_ns = slack_ns;
+	}
+
+	genpd->min_delta_ns = min_delta_ns;
+
+	/* Compute the total time needed to power off the domain. */
+	off_time = ktime_set(0, 0);
+	/* All devices have been stopped at this point. */
+	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
+		struct gpd_gov_dev_data *gov_data;
+
+		if (!pdd->dev->driver)
+			continue;
+
+		gov_data = to_gpd_data(pdd)->gov_data;
+		if (!gov_data)
+			continue;
+
+		off_time = ktime_add(off_time, gov_data->suspend_latency);
+	}
+	off_time = ktime_add(off_time, genpd->power_off_latency);
+
+	/*
+	 * For each device with a QoS constraint, compute the slack between
+	 * that constraint and the total time needed to bring the device back
+	 * if the domain is powered off, and track the minimum.  Start from
+	 * LLONG_MAX so that the first real slack always becomes the minimum.
+	 */
+	min_delta_ns = LLONG_MAX;
+	on_time = ktime_add(on_time, off_time);
+	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
+		struct gpd_gov_dev_data *gov_data;
+		struct device *dev = pdd->dev;
+		ktime_t dev_up_time;
+		s32 constraint;
+		s64 constraint_ns;
+
+		if (!dev->driver)
+			continue;
+
+		gov_data = to_gpd_data(pdd)->gov_data;
+		if (gov_data) {
+			dev_up_time = ktime_add(on_time,
+						gov_data->resume_latency);
+			dev_up_time = ktime_add(dev_up_time,
+						gov_data->start_latency);
+		} else {
+			dev_up_time = on_time;
+		}
+
+		constraint = dev_pm_qos_read_value(dev);
+		if (constraint < 0)
+			return false;
+		else if (constraint == 0) /* 0 means "don't care" */
+			continue;
+
+		constraint_ns = constraint;
+		constraint_ns *= NSEC_PER_USEC;
+		slack_ns = constraint_ns - ktime_to_ns(dev_up_time);
+		if (min_delta_ns > slack_ns)
+			min_delta_ns = slack_ns;
+	}
+
+	/* Compare the computed delta with the break even value. */
+	if (min_delta_ns < genpd->break_even_ns)
+		return false;
+
+	/* Store the computed value for the masters to use. */
+	if (genpd->min_delta_ns > min_delta_ns)
+		genpd->min_delta_ns = min_delta_ns;
+
+	/* The domain can be powered off. */
+	return true;
+}
+
struct dev_power_governor default_qos_governor = {
.stop_ok = default_stop_ok,
+ .power_down_ok = default_power_down_ok,
};
^ permalink raw reply
* [RFC][PATCH 4/5] PM / Domains: Add device stop governor function
From: Rafael J. Wysocki @ 2011-08-30 22:21 UTC (permalink / raw)
To: Linux PM mailing list; +Cc: LKML, Linux-sh list
In-Reply-To: <201108310017.03103.rjw@sisk.pl>
From: Rafael J. Wysocki <rjw@sisk.pl>
Add a function deciding whether or not devices should be
stopped in pm_genpd_runtime_suspend() depending on their
PM QoS values.
---
arch/arm/mach-shmobile/pm-sh7372.c | 2 -
drivers/base/power/Makefile | 2 -
drivers/base/power/domain.c | 35 ++++++++++++++++++-----
drivers/base/power/domain_governor.c | 40 ++++++++++++++++++++++++++
include/linux/pm_domain.h | 52 ++++++++++++++++++++++++++++++-----
5 files changed, 115 insertions(+), 16 deletions(-)
Index: linux/include/linux/pm_domain.h
===================================================================
--- linux.orig/include/linux/pm_domain.h
+++ linux/include/linux/pm_domain.h
@@ -21,6 +21,7 @@ enum gpd_status {
struct dev_power_governor {
bool (*power_down_ok)(struct dev_pm_domain *domain);
+ bool (*stop_ok)(struct device *dev);
};
struct generic_pm_domain {
@@ -62,8 +63,13 @@ struct gpd_link {
struct list_head slave_node;
};
+struct gpd_gov_dev_data {
+ s64 break_even_ns;
+};
+
struct generic_pm_domain_data {
struct pm_domain_data base;
+ struct gpd_gov_dev_data *gov_data;
bool need_restore;
};
@@ -73,18 +79,47 @@ static inline struct generic_pm_domain_d
}
#ifdef CONFIG_PM_GENERIC_DOMAINS
-extern int pm_genpd_add_device(struct generic_pm_domain *genpd,
- struct device *dev);
+extern struct dev_power_governor default_qos_governor;
+
+extern struct generic_pm_domain *dev_to_genpd(struct device *dev);
+extern int __pm_genpd_add_device(struct generic_pm_domain *genpd,
+ struct device *dev,
+ struct gpd_gov_dev_data *gov_data);
+
+static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
+ struct device *dev)
+{
+ return __pm_genpd_add_device(genpd, dev, NULL); /* NULL gov_data: no PM QoS parameters attached */
+}
+
extern int pm_genpd_remove_device(struct generic_pm_domain *genpd,
struct device *dev);
extern int pm_genpd_add_subdomain(struct generic_pm_domain *genpd,
struct generic_pm_domain *new_subdomain);
extern int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
struct generic_pm_domain *target);
-extern void pm_genpd_init(struct generic_pm_domain *genpd,
- struct dev_power_governor *gov, bool is_off);
+extern void __pm_genpd_init(struct generic_pm_domain *genpd,
+ struct dev_power_governor *gov, bool is_off);
+
+static inline void pm_genpd_init(struct generic_pm_domain *genpd, bool is_off)
+{
+ __pm_genpd_init(genpd, &default_qos_governor, is_off); /* always installs default_qos_governor */
+}
+
extern int pm_genpd_poweron(struct generic_pm_domain *genpd);
+
#else
+
+static inline struct generic_pm_domain *dev_to_genpd(struct device *dev)
+{
+ return ERR_PTR(-ENOSYS);
+}
+static inline int __pm_genpd_add_device(struct generic_pm_domain *genpd,
+ struct device *dev,
+ struct gpd_gov_dev_data *gov_data)
+{
+ return -ENOSYS;
+}
static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
struct device *dev)
{
@@ -105,8 +140,13 @@ static inline int pm_genpd_remove_subdom
{
return -ENOSYS;
}
-static inline void pm_genpd_init(struct generic_pm_domain *genpd,
- struct dev_power_governor *gov, bool is_off) {}
+static inline void __pm_genpd_init(struct generic_pm_domain *genpd,
+ struct dev_power_governor *gov, bool is_off)
+{
+}
+static inline void pm_genpd_init(struct generic_pm_domain *genpd, bool is_off)
+{
+}
static inline int pm_genpd_poweron(struct generic_pm_domain *genpd)
{
return -ENOSYS;
Index: linux/drivers/base/power/domain.c
===================================================================
--- linux.orig/drivers/base/power/domain.c
+++ linux/drivers/base/power/domain.c
@@ -21,7 +21,7 @@ static DEFINE_MUTEX(gpd_list_lock);
#ifdef CONFIG_PM
-static struct generic_pm_domain *dev_to_genpd(struct device *dev)
+struct generic_pm_domain *dev_to_genpd(struct device *dev)
{
if (IS_ERR_OR_NULL(dev->pm_domain))
return ERR_PTR(-EINVAL);
@@ -403,6 +403,22 @@ static void genpd_power_off_work_fn(stru
}
/**
+ * genpd_stop_dev - Stop a given device if that's beneficial.
+ * @genpd: PM domain the device belongs to.
+ * @dev: Device to stop.
+ */
+static int genpd_stop_dev(struct generic_pm_domain *genpd, struct device *dev)
+{
+	struct dev_power_governor *gov = genpd->gov;
+
+	/* A governor that provides stop_ok() may veto stopping the device. */
+	if (gov && gov->stop_ok && !gov->stop_ok(dev))
+		return -EBUSY;
+
+	return genpd->stop_device(dev);
+}
+
+/**
* pm_genpd_runtime_suspend - Suspend a device belonging to I/O PM domain.
* @dev: Device to suspend.
*
@@ -423,7 +439,7 @@ static int pm_genpd_runtime_suspend(stru
might_sleep_if(!genpd->dev_irq_safe);
if (genpd->stop_device) {
- int ret = genpd->stop_device(dev);
+ int ret = genpd_stop_dev(genpd, dev);
if (ret)
return ret;
}
@@ -495,7 +511,7 @@ static int pm_genpd_runtime_resume(struc
mutex_lock(&genpd->lock);
}
finish_wait(&genpd->status_wait_queue, &wait);
- __pm_genpd_restore_device(dev->power.subsys_data->domain_data, genpd);
+ __pm_genpd_restore_device(dev_to_psd(dev)->domain_data, genpd);
genpd->resume_count--;
genpd_set_active(genpd);
wake_up_all(&genpd->status_wait_queue);
@@ -1076,11 +1092,13 @@ static void pm_genpd_complete(struct dev
#endif /* CONFIG_PM_SLEEP */
/**
- * pm_genpd_add_device - Add a device to an I/O PM domain.
+ * __pm_genpd_add_device - Add a device to an I/O PM domain.
* @genpd: PM domain to add the device to.
* @dev: Device to be added.
+ * @gov_data: Set of PM QoS parameters to attach to the device.
*/
-int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev)
+int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
+ struct gpd_gov_dev_data *gov_data)
{
struct generic_pm_domain_data *gpd_data;
struct pm_domain_data *pdd;
@@ -1123,6 +1141,7 @@ int pm_genpd_add_device(struct generic_p
gpd_data->base.dev = dev;
gpd_data->need_restore = false;
list_add_tail(&gpd_data->base.list_node, &genpd->dev_list);
+ gpd_data->gov_data = gov_data;
out:
genpd_release_lock(genpd);
@@ -1280,13 +1299,13 @@ int pm_genpd_remove_subdomain(struct gen
}
/**
- * pm_genpd_init - Initialize a generic I/O PM domain object.
+ * __pm_genpd_init - Initialize a generic I/O PM domain object.
* @genpd: PM domain object to initialize.
* @gov: PM domain governor to associate with the domain (may be NULL).
* @is_off: Initial value of the domain's power_is_off field.
*/
-void pm_genpd_init(struct generic_pm_domain *genpd,
- struct dev_power_governor *gov, bool is_off)
+void __pm_genpd_init(struct generic_pm_domain *genpd,
+ struct dev_power_governor *gov, bool is_off)
{
if (IS_ERR_OR_NULL(genpd))
return;
Index: linux/drivers/base/power/Makefile
===================================================================
--- linux.orig/drivers/base/power/Makefile
+++ linux/drivers/base/power/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_PM_SLEEP) += main.o wakeup.
obj-$(CONFIG_PM_RUNTIME) += runtime.o
obj-$(CONFIG_PM_TRACE_RTC) += trace.o
obj-$(CONFIG_PM_OPP) += opp.o
-obj-$(CONFIG_PM_GENERIC_DOMAINS) += domain.o
+obj-$(CONFIG_PM_GENERIC_DOMAINS) += domain.o domain_governor.o
obj-$(CONFIG_HAVE_CLK) += clock_ops.o
ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
Index: linux/drivers/base/power/domain_governor.c
===================================================================
--- /dev/null
+++ linux/drivers/base/power/domain_governor.c
@@ -0,0 +1,40 @@
+/*
+ * drivers/base/power/domain_governor.c - Governors for device PM domains.
+ *
+ * Copyright (C) 2011 Rafael J. Wysocki <rjw@sisk.pl>, Renesas Electronics Corp.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_qos.h>
+
+static bool default_stop_ok(struct device *dev)
+{
+ struct gpd_gov_dev_data *gov_data;
+ s64 constraint_ns;
+ s32 constraint;
+
+ dev_dbg(dev, "%s()\n", __func__);
+
+ gov_data = to_gpd_data(dev_to_psd(dev)->domain_data)->gov_data;
+ if (!gov_data)
+ return true;
+
+ constraint = dev_pm_qos_read_value(dev);
+ if (constraint < 0)
+ return false;
+ else if (constraint == 0) /* 0 means "don't care" */
+ return true;
+
+ constraint_ns = constraint;
+ constraint_ns *= NSEC_PER_USEC;
+
+ return constraint_ns > gov_data->break_even_ns;
+}
+
+struct dev_power_governor default_qos_governor = {
+ .stop_ok = default_stop_ok,
+};
Index: linux/arch/arm/mach-shmobile/pm-sh7372.c
===================================================================
--- linux.orig/arch/arm/mach-shmobile/pm-sh7372.c
+++ linux/arch/arm/mach-shmobile/pm-sh7372.c
@@ -100,7 +100,7 @@ void sh7372_init_pm_domain(struct sh7372
{
struct generic_pm_domain *genpd = &sh7372_pd->genpd;
- pm_genpd_init(genpd, NULL, false);
+ pm_genpd_init(genpd, false);
genpd->stop_device = pm_clk_suspend;
genpd->start_device = pm_clk_resume;
genpd->dev_irq_safe = true;
^ permalink raw reply
* [PATCH 3/5] PM / QoS: Add function dev_pm_qos_read_value()
From: Rafael J. Wysocki @ 2011-08-30 22:21 UTC (permalink / raw)
To: Linux PM mailing list; +Cc: LKML, Linux-sh list
In-Reply-To: <201108310017.03103.rjw@sisk.pl>
From: Rafael J. Wysocki <rjw@sisk.pl>
To read the current PM QoS value for a given device we need to
make sure that the device's power.constraints object won't be
removed while we're doing that. For this reason, put the
operation under dev->power.lock and acquire the lock
around the initialization and removal of power.constraints.
Moreover, since we're using the value of power.constraints to
determine whether or not the object is present, the
power.constraints_state field isn't necessary any more and may be
removed. However, dev_pm_qos_add_request() needs to check if the
device is being removed from the system before allocating a new
PM QoS constraints object for it, so it has to use device_pm_lock()
and the device PM QoS initialization and destruction should be done
under device_pm_lock() as well.
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
drivers/base/power/main.c | 4 -
drivers/base/power/qos.c | 167 ++++++++++++++++++++++++++--------------------
include/linux/pm.h | 8 --
include/linux/pm_qos.h | 3
4 files changed, 101 insertions(+), 81 deletions(-)
Index: linux/drivers/base/power/qos.c
===================================================================
--- linux.orig/drivers/base/power/qos.c
+++ linux/drivers/base/power/qos.c
@@ -30,15 +30,6 @@
* . To minimize the data usage by the per-device constraints, the data struct
* is only allocated at the first call to dev_pm_qos_add_request.
* . The data is later free'd when the device is removed from the system.
- * . The constraints_state variable from dev_pm_info tracks the data struct
- * allocation state:
- * DEV_PM_QOS_NO_DEVICE: No device present or device removed, no data
- * allocated,
- * DEV_PM_QOS_DEVICE_PRESENT: Device present, data not allocated and will be
- * allocated at the first call to dev_pm_qos_add_request,
- * DEV_PM_QOS_ALLOCATED: Device present, data allocated. The per-device
- * PM QoS constraints framework is operational and constraints can be
- * added, updated or removed using the dev_pm_qos_* API.
* . A global mutex protects the constraints users from the data being
* allocated and free'd.
*/
@@ -51,8 +42,30 @@
static DEFINE_MUTEX(dev_pm_qos_mtx);
+
static BLOCKING_NOTIFIER_HEAD(dev_pm_notifiers);
+/**
+ * dev_pm_qos_read_value - Get PM QoS constraint for a given device.
+ * @dev: Device to get the PM QoS constraint value for.
+ */
+s32 dev_pm_qos_read_value(struct device *dev)
+{
+ struct pm_qos_constraints *c;
+ unsigned long flags;
+ s32 ret = 0;
+
+ spin_lock_irqsave(&dev->power.lock, flags);
+
+ c = dev->power.constraints;
+ if (c)
+ ret = pm_qos_read_value(c);
+
+ spin_unlock_irqrestore(&dev->power.lock, flags);
+
+ return ret;
+}
+
/*
* apply_constraint
* @req: constraint request to apply
@@ -105,27 +118,37 @@ static int dev_pm_qos_constraints_alloca
}
BLOCKING_INIT_NOTIFIER_HEAD(n);
+ plist_head_init(&c->list);
+ c->target_value = PM_QOS_DEV_LAT_DEFAULT_VALUE;
+ c->default_value = PM_QOS_DEV_LAT_DEFAULT_VALUE;
+ c->type = PM_QOS_MIN;
+ c->notifiers = n;
+
+ spin_lock_irq(&dev->power.lock);
dev->power.constraints = c;
- plist_head_init(&dev->power.constraints->list);
- dev->power.constraints->target_value = PM_QOS_DEV_LAT_DEFAULT_VALUE;
- dev->power.constraints->default_value = PM_QOS_DEV_LAT_DEFAULT_VALUE;
- dev->power.constraints->type = PM_QOS_MIN;
- dev->power.constraints->notifiers = n;
- dev->power.constraints_state = DEV_PM_QOS_ALLOCATED;
+ spin_unlock_irq(&dev->power.lock);
return 0;
}
+static void __dev_pm_qos_constraints_init(struct device *dev)
+{
+ spin_lock_irq(&dev->power.lock);
+ dev->power.constraints = NULL;
+ spin_unlock_irq(&dev->power.lock);
+}
+
/**
- * dev_pm_qos_constraints_init
+ * dev_pm_qos_constraints_init - Initialize device's PM QoS constraints pointer.
* @dev: target device
*
- * Called from the device PM subsystem at device insertion
+ * Called from the device PM subsystem at device insertion under
+ * device_pm_lock().
*/
void dev_pm_qos_constraints_init(struct device *dev)
{
mutex_lock(&dev_pm_qos_mtx);
- dev->power.constraints_state = DEV_PM_QOS_DEVICE_PRESENT;
+ dev->power.constraints = NULL;
mutex_unlock(&dev_pm_qos_mtx);
}
@@ -133,34 +156,35 @@ void dev_pm_qos_constraints_init(struct
* dev_pm_qos_constraints_destroy
* @dev: target device
*
- * Called from the device PM subsystem at device removal
+ * Called from the device PM subsystem at device removal under device_pm_lock().
*/
void dev_pm_qos_constraints_destroy(struct device *dev)
{
struct dev_pm_qos_request *req, *tmp;
+ struct pm_qos_constraints *c;
mutex_lock(&dev_pm_qos_mtx);
- if (dev->power.constraints_state == DEV_PM_QOS_ALLOCATED) {
- /* Flush the constraints list for the device */
- plist_for_each_entry_safe(req, tmp,
- &dev->power.constraints->list,
- node) {
- /*
- * Update constraints list and call the notification
- * callbacks if needed
- */
- apply_constraint(req, PM_QOS_REMOVE_REQ,
- PM_QOS_DEFAULT_VALUE);
- memset(req, 0, sizeof(*req));
- }
+ c = dev->power.constraints;
+ if (!c)
+ goto out;
- kfree(dev->power.constraints->notifiers);
- kfree(dev->power.constraints);
- dev->power.constraints = NULL;
+ /* Flush the constraints list for the device */
+ plist_for_each_entry_safe(req, tmp, &c->list, node) {
+ /*
+ * Update constraints list and call the notification
+ * callbacks if needed
+ */
+ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
+ memset(req, 0, sizeof(*req));
}
- dev->power.constraints_state = DEV_PM_QOS_NO_DEVICE;
+ __dev_pm_qos_constraints_init(dev);
+
+ kfree(c->notifiers);
+ kfree(c);
+
+ out:
mutex_unlock(&dev_pm_qos_mtx);
}
@@ -178,8 +202,8 @@ void dev_pm_qos_constraints_destroy(stru
*
* Returns 1 if the aggregated constraint value has changed,
* 0 if the aggregated constraint value has not changed,
- * -EINVAL in case of wrong parameters, -ENODEV if the device has been
- * removed from the system
+ * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory
+ * to allocate for data structures.
*/
int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
s32 value)
@@ -195,28 +219,35 @@ int dev_pm_qos_add_request(struct device
return -EINVAL;
}
- mutex_lock(&dev_pm_qos_mtx);
req->dev = dev;
- /* Return if the device has been removed */
- if (req->dev->power.constraints_state == DEV_PM_QOS_NO_DEVICE) {
- ret = -ENODEV;
- goto out;
- }
+ device_pm_lock();
+ mutex_lock(&dev_pm_qos_mtx);
- /*
- * Allocate the constraints data on the first call to add_request,
- * i.e. only if the data is not already allocated and if the device has
- * not been removed
- */
- if (dev->power.constraints_state == DEV_PM_QOS_DEVICE_PRESENT)
- ret = dev_pm_qos_constraints_allocate(dev);
+ if (dev->power.constraints) {
+ device_pm_unlock();
+ } else {
+ if (list_empty(&dev->power.entry)) {
+ /* The device has been removed from the system. */
+ device_pm_unlock();
+ goto out;
+ } else {
+ device_pm_unlock();
+ /*
+ * Allocate the constraints data on the first call to
+ * add_request, i.e. only if the data is not already
+ * allocated and if the device has not been removed.
+ */
+ ret = dev_pm_qos_constraints_allocate(dev);
+ }
+ }
if (!ret)
ret = apply_constraint(req, PM_QOS_ADD_REQ, value);
-out:
+ out:
mutex_unlock(&dev_pm_qos_mtx);
+
return ret;
}
EXPORT_SYMBOL_GPL(dev_pm_qos_add_request);
@@ -252,13 +283,13 @@ int dev_pm_qos_update_request(struct dev
mutex_lock(&dev_pm_qos_mtx);
- if (req->dev->power.constraints_state == DEV_PM_QOS_ALLOCATED) {
+ if (req->dev->power.constraints) {
if (new_value != req->node.prio)
ret = apply_constraint(req, PM_QOS_UPDATE_REQ,
new_value);
} else {
/* Return if the device has been removed */
- ret = -ENODEV;
+ ret = -EINVAL;
}
mutex_unlock(&dev_pm_qos_mtx);
@@ -293,7 +324,7 @@ int dev_pm_qos_remove_request(struct dev
mutex_lock(&dev_pm_qos_mtx);
- if (req->dev->power.constraints_state == DEV_PM_QOS_ALLOCATED) {
+ if (req->dev->power.constraints) {
ret = apply_constraint(req, PM_QOS_REMOVE_REQ,
PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
@@ -323,15 +354,12 @@ int dev_pm_qos_add_notifier(struct devic
mutex_lock(&dev_pm_qos_mtx);
- /* Silently return if the device has been removed */
- if (dev->power.constraints_state != DEV_PM_QOS_ALLOCATED)
- goto out;
-
- retval = blocking_notifier_chain_register(
- dev->power.constraints->notifiers,
- notifier);
+ /* Silently return if the constraints object is not present. */
+ if (dev->power.constraints)
+ retval = blocking_notifier_chain_register(
+ dev->power.constraints->notifiers,
+ notifier);
-out:
mutex_unlock(&dev_pm_qos_mtx);
return retval;
}
@@ -354,15 +382,12 @@ int dev_pm_qos_remove_notifier(struct de
mutex_lock(&dev_pm_qos_mtx);
- /* Silently return if the device has been removed */
- if (dev->power.constraints_state != DEV_PM_QOS_ALLOCATED)
- goto out;
-
- retval = blocking_notifier_chain_unregister(
- dev->power.constraints->notifiers,
- notifier);
+ /* Silently return if the constraints object is not present. */
+ if (dev->power.constraints)
+ retval = blocking_notifier_chain_unregister(
+ dev->power.constraints->notifiers,
+ notifier);
-out:
mutex_unlock(&dev_pm_qos_mtx);
return retval;
}
Index: linux/include/linux/pm_qos.h
===================================================================
--- linux.orig/include/linux/pm_qos.h
+++ linux/include/linux/pm_qos.h
@@ -77,6 +77,7 @@ int pm_qos_remove_notifier(int pm_qos_cl
int pm_qos_request_active(struct pm_qos_request *req);
s32 pm_qos_read_value(struct pm_qos_constraints *c);
+s32 dev_pm_qos_read_value(struct device *dev);
int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req,
s32 value);
int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value);
@@ -117,6 +118,8 @@ static inline int pm_qos_request_active(
static inline s32 pm_qos_read_value(struct pm_qos_constraints *c)
{ return 0; }
+static inline s32 dev_pm_qos_read_value(struct device *dev)
+ { return 0; }
static inline int dev_pm_qos_add_request(struct device *dev,
struct dev_pm_qos_request *req,
s32 value)
Index: linux/drivers/base/power/main.c
===================================================================
--- linux.orig/drivers/base/power/main.c
+++ linux/drivers/base/power/main.c
@@ -98,8 +98,8 @@ void device_pm_add(struct device *dev)
dev_warn(dev, "parent %s should not be sleeping\n",
dev_name(dev->parent));
list_add_tail(&dev->power.entry, &dpm_list);
- mutex_unlock(&dpm_list_mtx);
dev_pm_qos_constraints_init(dev);
+ mutex_unlock(&dpm_list_mtx);
}
/**
@@ -110,9 +110,9 @@ void device_pm_remove(struct device *dev
{
pr_debug("PM: Removing info for %s:%s\n",
dev->bus ? dev->bus->name : "No Bus", dev_name(dev));
- dev_pm_qos_constraints_destroy(dev);
complete_all(&dev->power.completion);
mutex_lock(&dpm_list_mtx);
+ dev_pm_qos_constraints_destroy(dev);
list_del_init(&dev->power.entry);
mutex_unlock(&dpm_list_mtx);
device_wakeup_disable(dev);
Index: linux/include/linux/pm.h
===================================================================
--- linux.orig/include/linux/pm.h
+++ linux/include/linux/pm.h
@@ -421,13 +421,6 @@ enum rpm_request {
RPM_REQ_RESUME,
};
-/* Per-device PM QoS constraints data struct state */
-enum dev_pm_qos_state {
- DEV_PM_QOS_NO_DEVICE, /* No device present */
- DEV_PM_QOS_DEVICE_PRESENT, /* Device present, data not allocated */
- DEV_PM_QOS_ALLOCATED, /* Device present, data allocated */
-};
-
struct wakeup_source;
struct pm_domain_data {
@@ -489,7 +482,6 @@ struct dev_pm_info {
#endif
struct pm_subsys_data *subsys_data; /* Owned by the subsystem. */
struct pm_qos_constraints *constraints;
- enum dev_pm_qos_state constraints_state;
};
extern void update_pm_runtime_accounting(struct device *dev);
^ permalink raw reply
* [PATCH 2/5] PM / Runtime: Do not run callbacks under lock for power.irq_safe set
From: Rafael J. Wysocki @ 2011-08-30 22:20 UTC (permalink / raw)
To: Linux PM mailing list; +Cc: LKML, Linux-sh list
In-Reply-To: <201108310017.03103.rjw@sisk.pl>
From: Rafael J. Wysocki <rjw@sisk.pl>
The rpm_suspend() and rpm_resume() routines execute subsystem or PM
domain callbacks under power.lock if power.irq_safe is set for the
given device. This is inconsistent with what rpm_idle() does after
commit 02b2677 (PM / Runtime: Allow _put_sync() from
interrupts-disabled context) and is problematic for subsystems and PM
domains wanting to use power.lock for synchronization in their
runtime PM callbacks. For this reason, make runtime PM core functions
always release power.lock before invoking subsystem or PM domain
callbacks.
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
drivers/base/power/runtime.c | 50 ++++++++++++++++++++++++-------------------
1 file changed, 28 insertions(+), 22 deletions(-)
Index: linux/drivers/base/power/runtime.c
===================================================================
--- linux.orig/drivers/base/power/runtime.c
+++ linux/drivers/base/power/runtime.c
@@ -155,6 +155,31 @@ static int rpm_check_suspend_allowed(str
}
/**
+ * __rpm_callback - Run a given runtime PM callback for a given device.
+ * @cb: Runtime PM callback to run.
+ * @dev: Device to run the callback for.
+ */
+static int __rpm_callback(int (*cb)(struct device *), struct device *dev)
+ __releases(&dev->power.lock) __acquires(&dev->power.lock)
+{
+ int retval;
+
+ if (dev->power.irq_safe)
+ spin_unlock(&dev->power.lock);
+ else
+ spin_unlock_irq(&dev->power.lock);
+
+ retval = cb(dev);
+
+ if (dev->power.irq_safe)
+ spin_lock(&dev->power.lock);
+ else
+ spin_lock_irq(&dev->power.lock);
+
+ return retval;
+}
+
+/**
* rpm_idle - Notify device bus type if the device can be suspended.
* @dev: Device to notify the bus type about.
* @rpmflags: Flag bits.
@@ -225,19 +250,8 @@ static int rpm_idle(struct device *dev,
else
callback = NULL;
- if (callback) {
- if (dev->power.irq_safe)
- spin_unlock(&dev->power.lock);
- else
- spin_unlock_irq(&dev->power.lock);
-
- callback(dev);
-
- if (dev->power.irq_safe)
- spin_lock(&dev->power.lock);
- else
- spin_lock_irq(&dev->power.lock);
- }
+ if (callback)
+ __rpm_callback(callback, dev);
dev->power.idle_notification = false;
wake_up_all(&dev->power.wait_queue);
@@ -252,22 +266,14 @@ static int rpm_idle(struct device *dev,
* @dev: Device to run the callback for.
*/
static int rpm_callback(int (*cb)(struct device *), struct device *dev)
- __releases(&dev->power.lock) __acquires(&dev->power.lock)
{
int retval;
if (!cb)
return -ENOSYS;
- if (dev->power.irq_safe) {
- retval = cb(dev);
- } else {
- spin_unlock_irq(&dev->power.lock);
-
- retval = cb(dev);
+ retval = __rpm_callback(cb, dev);
- spin_lock_irq(&dev->power.lock);
- }
dev->power.runtime_error = retval;
return retval != -EACCES ? retval : -EIO;
}
^ permalink raw reply
* [PATCH 1/5] PM / Domains: Split device PM domain data into base and need_restore
From: Rafael J. Wysocki @ 2011-08-30 22:18 UTC (permalink / raw)
To: Linux PM mailing list; +Cc: LKML, Linux-sh list
In-Reply-To: <201108310017.03103.rjw@sisk.pl>
From: Rafael J. Wysocki <rjw@sisk.pl>
The struct pm_domain_data data type is defined in such a way that
adding new fields specific to the generic PM domains code will
require include/linux/pm.h to be modified. As a result, data types
used only by the generic PM domains code will be defined in two
headers, although they all should be defined in pm_domain.h, and
pm.h will need to include more headers, which won't be very nice.
For this reason change the definition of struct pm_subsys_data
so that its domain_data member is a pointer, which will allow
struct pm_domain_data to be subclassed by various PM domains
implementations. Remove the need_restore member from
struct pm_domain_data and make the generic PM domains code
subclass it by adding the need_restore member to the new data type.
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
drivers/base/power/domain.c | 28 +++++++++++++++++++---------
include/linux/pm.h | 3 +--
include/linux/pm_domain.h | 10 ++++++++++
3 files changed, 30 insertions(+), 11 deletions(-)
Index: linux/include/linux/pm.h
===================================================================
--- linux.orig/include/linux/pm.h
+++ linux/include/linux/pm.h
@@ -433,7 +433,6 @@ struct wakeup_source;
struct pm_domain_data {
struct list_head list_node;
struct device *dev;
- bool need_restore;
};
struct pm_subsys_data {
@@ -443,7 +442,7 @@ struct pm_subsys_data {
struct list_head clock_list;
#endif
#ifdef CONFIG_PM_GENERIC_DOMAINS
- struct pm_domain_data domain_data;
+ struct pm_domain_data *domain_data;
#endif
};
Index: linux/include/linux/pm_domain.h
===================================================================
--- linux.orig/include/linux/pm_domain.h
+++ linux/include/linux/pm_domain.h
@@ -62,6 +62,16 @@ struct gpd_link {
struct list_head slave_node;
};
+struct generic_pm_domain_data {
+ struct pm_domain_data base;
+ bool need_restore;
+};
+
+static inline struct generic_pm_domain_data *to_gpd_data(struct pm_domain_data *pdd)
+{
+ return container_of(pdd, struct generic_pm_domain_data, base);
+}
+
#ifdef CONFIG_PM_GENERIC_DOMAINS
extern int pm_genpd_add_device(struct generic_pm_domain *genpd,
struct device *dev);
Index: linux/drivers/base/power/domain.c
===================================================================
--- linux.orig/drivers/base/power/domain.c
+++ linux/drivers/base/power/domain.c
@@ -188,11 +188,12 @@ static int __pm_genpd_save_device(struct
struct generic_pm_domain *genpd)
__releases(&genpd->lock) __acquires(&genpd->lock)
{
+ struct generic_pm_domain_data *gpd_data = to_gpd_data(pdd);
struct device *dev = pdd->dev;
struct device_driver *drv = dev->driver;
int ret = 0;
- if (pdd->need_restore)
+ if (gpd_data->need_restore)
return 0;
mutex_unlock(&genpd->lock);
@@ -210,7 +211,7 @@ static int __pm_genpd_save_device(struct
mutex_lock(&genpd->lock);
if (!ret)
- pdd->need_restore = true;
+ gpd_data->need_restore = true;
return ret;
}
@@ -224,10 +225,11 @@ static void __pm_genpd_restore_device(st
struct generic_pm_domain *genpd)
__releases(&genpd->lock) __acquires(&genpd->lock)
{
+ struct generic_pm_domain_data *gpd_data = to_gpd_data(pdd);
struct device *dev = pdd->dev;
struct device_driver *drv = dev->driver;
- if (!pdd->need_restore)
+ if (!gpd_data->need_restore)
return;
mutex_unlock(&genpd->lock);
@@ -244,7 +246,7 @@ static void __pm_genpd_restore_device(st
mutex_lock(&genpd->lock);
- pdd->need_restore = false;
+ gpd_data->need_restore = false;
}
/**
@@ -493,7 +495,7 @@ static int pm_genpd_runtime_resume(struc
mutex_lock(&genpd->lock);
}
finish_wait(&genpd->status_wait_queue, &wait);
- __pm_genpd_restore_device(&dev->power.subsys_data->domain_data, genpd);
+ __pm_genpd_restore_device(dev->power.subsys_data->domain_data, genpd);
genpd->resume_count--;
genpd_set_active(genpd);
wake_up_all(&genpd->status_wait_queue);
@@ -1080,6 +1082,7 @@ static void pm_genpd_complete(struct dev
*/
int pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev)
{
+ struct generic_pm_domain_data *gpd_data;
struct pm_domain_data *pdd;
int ret = 0;
@@ -1106,14 +1109,20 @@ int pm_genpd_add_device(struct generic_p
goto out;
}
+ gpd_data = kzalloc(sizeof(*gpd_data), GFP_KERNEL);
+ if (!gpd_data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
genpd->device_count++;
dev->pm_domain = &genpd->domain;
dev_pm_get_subsys_data(dev);
- pdd = &dev->power.subsys_data->domain_data;
- pdd->dev = dev;
- pdd->need_restore = false;
- list_add_tail(&pdd->list_node, &genpd->dev_list);
+ dev->power.subsys_data->domain_data = &gpd_data->base;
+ gpd_data->base.dev = dev;
+ gpd_data->need_restore = false;
+ list_add_tail(&gpd_data->base.list_node, &genpd->dev_list);
out:
genpd_release_lock(genpd);
@@ -1152,6 +1161,7 @@ int pm_genpd_remove_device(struct generi
pdd->dev = NULL;
dev_pm_put_subsys_data(dev);
dev->pm_domain = NULL;
+ kfree(to_gpd_data(pdd));
genpd->device_count--;
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox