LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [v4 PATCH 0/5]: cpuidle/POWER (REDISIGN): Introducing cpuidle to POWER.
From: Arun R Bharadwaj @ 2009-09-01 11:37 UTC (permalink / raw)
  To: Joel Schopp, Benjamin Herrenschmidt, Paul Mackerras,
	Peter Zijlstra, Ingo Molnar, Vaidyanathan Srinivasan,
	Dipankar Sarma, Balbir Singh, Gautham R Shenoy, Arun R Bharadwaj
  Cc: linuxppc-dev, linux-kernel

Hi,

******** This is an RFC, not for inclusion **********

This patchset introduces cpuidle infrastructure to POWER, prototyping
for pseries and currently in the process of porting to x86 and hence
will *not* build on x86/other POWER platforms.

This is to get initial comments on the redesign of my earlier implementation
which can be found at http://lkml.org/lkml/2009/8/27/124

Major changes from last iteration:
----------------------------------

* Cleanup drivers/cpuidle/cpuidle.c
	Currently, the cpuidle implementation has weakness in the
	framework where an exported pm_idle function pointer is
	manipulated by various subsystem. The proposed framework has
	a registration architecture to cleanly add and remove new idle
	routines from different subsystems.

* Introduce [un]register_idle_function() routines
        Implement a LIFO based approach for registering architecture
        dependent idle routines.

* Sample implementation of register_idle_function for pSeries


TODO:
-----

* Extend this prototype to cover x86 and other archs that use cpuidle.
        Currently, in x86, the cpu_idle() idle loop doesn't have a
        default idle loop to fall back to if pm_idle is NULL, unlike
        the corresponding implementation in pseries, where
        ppc_md.power_save can be NULL and there is a fallback.
        So we need to create a similar fork in cpu_idle() idle loop of
        x86.



Patches included in this series:
--------------------------------

1/5 - Cleanup drivers/cpuidle/cpuidle.c
2/5 - Implement routines to register and unregister idle function.
3/5 - Incorporate registering of idle loop for pSeries.
4/5 - Add Kconfig entry to enable cpuidle for POWER.
5/5 - Implement pSeries processor idle module.


Any comments on the design is welcome.

--arun

^ permalink raw reply

* [v4 PATCH 1/5]: cpuidle: Cleanup drivers/cpuidle/cpuidle.c
From: Arun R Bharadwaj @ 2009-09-01 11:38 UTC (permalink / raw)
  To: Joel Schopp, Benjamin Herrenschmidt, Paul Mackerras,
	Peter Zijlstra, Ingo Molnar, Vaidyanathan Srinivasan,
	Dipankar Sarma, Balbir Singh, Gautham R Shenoy, Arun Bharadwaj
  Cc: Arun Bharadwaj, linuxppc-dev, linux-kernel
In-Reply-To: <20090901113704.GG7599@linux.vnet.ibm.com>

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:

Cleanup drivers/cpuidle/cpuidle.c

Cpuidle maintains a pm_idle_old void pointer because, currently in x86
there is no clean way of registering and unregistering a idle function.

So remove pm_idle_old and leave the responsibility of maintaining the
list of registered idle loops to the architecture specific code. If the
architecture registers cpuidle_idle_call as its idle loop, only then
this loop is called.

Also remove unwanted functions cpuidle_[un]install_idle_handler,
cpuidle_kick_cpus()

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
 drivers/cpuidle/cpuidle.c  |   51 +++++++++++++++------------------------------
 drivers/cpuidle/governor.c |    3 --
 2 files changed, 17 insertions(+), 37 deletions(-)

Index: linux.trees.git/drivers/cpuidle/cpuidle.c
===================================================================
--- linux.trees.git.orig/drivers/cpuidle/cpuidle.c
+++ linux.trees.git/drivers/cpuidle/cpuidle.c
@@ -24,9 +24,14 @@ DEFINE_PER_CPU(struct cpuidle_device *, 
 
 DEFINE_MUTEX(cpuidle_lock);
 LIST_HEAD(cpuidle_detected_devices);
-static void (*pm_idle_old)(void);
 
 static int enabled_devices;
+static int idle_function_registered;
+
+struct idle_function_desc cpuidle_idle_desc = {
+	.name           =       "cpuidle_loop",
+	.idle_func      =       cpuidle_idle_call,
+};
 
 #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
 static void cpuidle_kick_cpus(void)
@@ -54,13 +59,10 @@ static void cpuidle_idle_call(void)
 
 	/* check if the device is ready */
 	if (!dev || !dev->enabled) {
-		if (pm_idle_old)
-			pm_idle_old();
-		else
 #if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
-			default_idle();
+		default_idle();
 #else
-			local_irq_enable();
+		local_irq_enable();
 #endif
 		return;
 	}
@@ -94,35 +96,11 @@ static void cpuidle_idle_call(void)
 }
 
 /**
- * cpuidle_install_idle_handler - installs the cpuidle idle loop handler
- */
-void cpuidle_install_idle_handler(void)
-{
-	if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
-		/* Make sure all changes finished before we switch to new idle */
-		smp_wmb();
-		pm_idle = cpuidle_idle_call;
-	}
-}
-
-/**
- * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler
- */
-void cpuidle_uninstall_idle_handler(void)
-{
-	if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
-		pm_idle = pm_idle_old;
-		cpuidle_kick_cpus();
-	}
-}
-
-/**
  * cpuidle_pause_and_lock - temporarily disables CPUIDLE
  */
 void cpuidle_pause_and_lock(void)
 {
 	mutex_lock(&cpuidle_lock);
-	cpuidle_uninstall_idle_handler();
 }
 
 EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
@@ -132,7 +110,6 @@ EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock
  */
 void cpuidle_resume_and_unlock(void)
 {
-	cpuidle_install_idle_handler();
 	mutex_unlock(&cpuidle_lock);
 }
 
@@ -287,6 +264,12 @@ static int __cpuidle_register_device(str
 	return 0;
 }
 
+static void register_cpuidle_idle_function(void)
+{
+	register_idle_function(&cpuidle_idle_desc);
+
+	idle_function_registered = 1;
+}
 /**
  * cpuidle_register_device - registers a CPU's idle PM feature
  * @dev: the cpu
@@ -303,7 +286,9 @@ int cpuidle_register_device(struct cpuid
 	}
 
 	cpuidle_enable_device(dev);
-	cpuidle_install_idle_handler();
+
+	if (!idle_function_registered)
+		register_cpuidle_idle_function();
 
 	mutex_unlock(&cpuidle_lock);
 
@@ -382,8 +367,6 @@ static int __init cpuidle_init(void)
 {
 	int ret;
 
-	pm_idle_old = pm_idle;
-
 	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
 	if (ret)
 		return ret;
Index: linux.trees.git/drivers/cpuidle/governor.c
===================================================================
--- linux.trees.git.orig/drivers/cpuidle/governor.c
+++ linux.trees.git/drivers/cpuidle/governor.c
@@ -48,8 +48,6 @@ int cpuidle_switch_governor(struct cpuid
 	if (gov == cpuidle_curr_governor)
 		return 0;
 
-	cpuidle_uninstall_idle_handler();
-
 	if (cpuidle_curr_governor) {
 		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
 			cpuidle_disable_device(dev);
@@ -63,7 +61,6 @@ int cpuidle_switch_governor(struct cpuid
 			return -EINVAL;
 		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
 			cpuidle_enable_device(dev);
-		cpuidle_install_idle_handler();
 		printk(KERN_INFO "cpuidle: using governor %s\n", gov->name);
 	}
 

^ permalink raw reply

* [v4 PATCH 2/5]: cpuidle: Implement routines to register and unregister idle function.
From: Arun R Bharadwaj @ 2009-09-01 11:39 UTC (permalink / raw)
  To: Joel Schopp, Benjamin Herrenschmidt, Paul Mackerras,
	Peter Zijlstra, Ingo Molnar, Vaidyanathan Srinivasan,
	Dipankar Sarma, Balbir Singh, Gautham R Shenoy, Arun Bharadwaj
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20090901113704.GG7599@linux.vnet.ibm.com>

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:

Implement a LIFO based approach for registering arch dependent
idle routines.

This is a prototype for pseries, needs to be extended
for other platforms.

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/idle.c |    5 +++++
 drivers/cpuidle/cpuidle.c  |   37 +++++++++++++++++++++++++++++++++++++
 include/linux/pm.h         |   10 ++++++++++
 3 files changed, 52 insertions(+)

Index: linux.trees.git/arch/powerpc/kernel/idle.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/kernel/idle.c
+++ linux.trees.git/arch/powerpc/kernel/idle.c
@@ -46,6 +46,11 @@ static int __init powersave_off(char *ar
 }
 __setup("powersave=off", powersave_off);
 
+void set_arch_idle(void (*idle)(void))
+{
+	ppc_md.power_save = idle;
+}
+
 /*
  * The body of the idle task.
  */
Index: linux.trees.git/include/linux/pm.h
===================================================================
--- linux.trees.git.orig/include/linux/pm.h
+++ linux.trees.git/include/linux/pm.h
@@ -30,6 +30,16 @@ extern void (*pm_idle)(void);
 extern void (*pm_power_off)(void);
 extern void (*pm_power_off_prepare)(void);
 
+struct idle_function_desc {
+	char			*name;
+	void			(*idle_func)(void);
+	struct list_head	idle_list;
+};
+
+extern void set_arch_idle(void (*idle)(void));
+extern void register_idle_function(struct idle_function_desc *desc);
+extern void unregister_idle_function(struct idle_function_desc *desc);
+
 /*
  * Device power management
  */
Index: linux.trees.git/drivers/cpuidle/cpuidle.c
===================================================================
--- linux.trees.git.orig/drivers/cpuidle/cpuidle.c
+++ linux.trees.git/drivers/cpuidle/cpuidle.c
@@ -44,6 +44,43 @@ static void cpuidle_kick_cpus(void)
 static void cpuidle_kick_cpus(void) {}
 #endif
 
+LIST_HEAD(idle_function_list);
+static DEFINE_MUTEX(idle_list_mutex);
+
+void register_idle_function(struct idle_function_desc *desc)
+{
+	mutex_lock(&idle_list_mutex);
+
+	list_add(&desc->idle_list, &idle_function_list);
+	set_arch_idle(desc->idle_func);
+	cpuidle_kick_cpus();
+
+	mutex_unlock(&idle_list_mutex);
+}
+
+void unregister_idle_function(struct idle_function_desc *desc)
+{
+	struct list_head *pos;
+	struct idle_function_desc *temp_desc;
+
+	mutex_lock(&idle_list_mutex);
+	WARN_ON_ONCE(list_empty(&desc->idle_list) || desc != NULL);
+
+	list_for_each(pos, &idle_function_list) {
+		temp_desc = container_of(pos, struct idle_function_desc,
+					idle_list);
+		if (temp_desc == desc) {
+			list_del(&temp_desc->idle_list);
+			/* Re-using temp_desc here */
+			temp_desc = list_first_entry(&idle_function_list,
+					struct idle_function_desc, idle_list);
+			set_arch_idle(temp_desc->idle_func);
+			cpuidle_kick_cpus();
+		}
+	}
+	mutex_unlock(&idle_list_mutex);
+}
+
 static int __cpuidle_register_device(struct cpuidle_device *dev);
 
 /**

^ permalink raw reply

* [v4 PATCH 3/5]: pSeries: Incorporate registering of idle loop for pSeries.
From: Arun R Bharadwaj @ 2009-09-01 11:40 UTC (permalink / raw)
  To: Joel Schopp, Benjamin Herrenschmidt, Paul Mackerras,
	Peter Zijlstra, Ingo Molnar, Vaidyanathan Srinivasan,
	Dipankar Sarma, Balbir Singh, Gautham R Shenoy, Arun Bharadwaj
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20090901113704.GG7599@linux.vnet.ibm.com>

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:

Platform needs to register its idle function via register_idle_function()
in order to provide a clean way of handling the ppc_md.power_save

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/setup.c |   13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Index: linux.trees.git/arch/powerpc/platforms/pseries/setup.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/setup.c
+++ linux.trees.git/arch/powerpc/platforms/pseries/setup.c
@@ -280,6 +280,8 @@ static struct notifier_block pci_dn_reco
 
 static void __init pSeries_setup_arch(void)
 {
+	struct idle_function_desc pseries_idle_desc;
+
 	/* Discover PIC type and setup ppc_md accordingly */
 	pseries_discover_pic();
 
@@ -305,10 +307,17 @@ static void __init pSeries_setup_arch(vo
 		vpa_init(boot_cpuid);
 		if (get_lppaca()->shared_proc) {
 			printk(KERN_DEBUG "Using shared processor idle loop\n");
-			ppc_md.power_save = pseries_shared_idle_sleep;
+			//snprintf(pseries_idle_desc.name, 16, "shared_loop");
+			pseries_idle_desc.name = "shared_loop";
+			pseries_idle_desc.idle_func = pseries_shared_idle_sleep;
+			register_idle_function(&pseries_idle_desc);
 		} else {
 			printk(KERN_DEBUG "Using dedicated idle loop\n");
-			ppc_md.power_save = pseries_dedicated_idle_sleep;
+			//snprintf(pseries_idle_desc.name, 16, "dedicated_loop");
+			pseries_idle_desc.name = "dedicated_loop";
+			pseries_idle_desc.idle_func =
+						pseries_dedicated_idle_sleep;
+			register_idle_function(&pseries_idle_desc);
 		}
 	} else {
 		printk(KERN_DEBUG "Using default idle loop\n");

^ permalink raw reply

* [v4 PATCH 4/5]: cpuidle: Add Kconfig entry to enable cpuidle for POWER.
From: Arun R Bharadwaj @ 2009-09-01 11:41 UTC (permalink / raw)
  To: Joel Schopp, Benjamin Herrenschmidt, Paul Mackerras,
	Peter Zijlstra, Ingo Molnar, Vaidyanathan Srinivasan,
	Dipankar Sarma, Balbir Singh, Gautham R Shenoy, Arun Bharadwaj
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20090901113704.GG7599@linux.vnet.ibm.com>

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:

This patch enables the cpuidle option in Kconfig for pSeries.

Currently cpuidle infrastructure is enabled only for x86 and ARM.
This code is almost completely borrowed from x86 to enable
cpuidle for pSeries.

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
 arch/powerpc/Kconfig              |   17 +++++++++++++++++
 arch/powerpc/include/asm/system.h |    2 ++
 arch/powerpc/kernel/idle.c        |   19 +++++++++++++++++++
 3 files changed, 38 insertions(+)

Index: linux.trees.git/arch/powerpc/Kconfig
===================================================================
--- linux.trees.git.orig/arch/powerpc/Kconfig
+++ linux.trees.git/arch/powerpc/Kconfig
@@ -88,6 +88,9 @@ config ARCH_HAS_ILOG2_U64
 	bool
 	default y if 64BIT
 
+config ARCH_HAS_CPU_IDLE_WAIT
+	def_bool y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
@@ -243,6 +246,20 @@ source "kernel/Kconfig.freezer"
 source "arch/powerpc/sysdev/Kconfig"
 source "arch/powerpc/platforms/Kconfig"
 
+menu "Power management options"
+
+source "drivers/cpuidle/Kconfig"
+
+config PSERIES_PROCESSOR_IDLE
+	bool "Idle Power Management Support for pSeries"
+	depends on PPC_PSERIES && CPU_IDLE
+	default y
+	help
+	  Idle Power Management Support for pSeries. This hooks onto cpuidle
+	  infrastructure to help in idle cpu power management.
+
+endmenu
+
 menu "Kernel options"
 
 config HIGHMEM
Index: linux.trees.git/arch/powerpc/include/asm/system.h
===================================================================
--- linux.trees.git.orig/arch/powerpc/include/asm/system.h
+++ linux.trees.git/arch/powerpc/include/asm/system.h
@@ -546,5 +546,7 @@ extern void account_system_vtime(struct 
 
 extern struct dentry *powerpc_debugfs_root;
 
+void cpu_idle_wait(void);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_SYSTEM_H */
Index: linux.trees.git/arch/powerpc/kernel/idle.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/kernel/idle.c
+++ linux.trees.git/arch/powerpc/kernel/idle.c
@@ -107,6 +107,25 @@ void cpu_idle(void)
 	}
 }
 
+static void do_nothing(void *unused)
+{
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * ppc_md.power_save and update to new value.
+ * Required while changing ppc_md.power_save handler on SMP systems.
+ * Caller must have changed ppc_md.power_save to the new value before the call.
+ */
+void cpu_idle_wait(void)
+{
+	/* Ensure that new value of ppc_md.power_save is set */
+	smp_mb();
+	/* kick all the CPUs so that they exit out of ppc_md.power_save */
+	smp_call_function(do_nothing, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
 int powersave_nap;
 
 #ifdef CONFIG_SYSCTL

^ permalink raw reply

* [v4 PATCH 5/5]: pSeries: Implement pSeries processor idle module.
From: Arun R Bharadwaj @ 2009-09-01 11:42 UTC (permalink / raw)
  To: Joel Schopp, Benjamin Herrenschmidt, Paul Mackerras,
	Peter Zijlstra, Ingo Molnar, Vaidyanathan Srinivasan,
	Dipankar Sarma, Balbir Singh, Gautham R Shenoy, Arun Bharadwaj
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20090901113704.GG7599@linux.vnet.ibm.com>

* Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:

This patch creates arch/powerpc/platforms/pseries/processor_idle.c,
which implements the cpuidle infrastructure for pseries.
It implements a pseries_cpuidle_loop() which would be the main idle loop
called from cpu_idle(). It makes decision of entering either snooze or nap
state based on the decision taken by the cpuidle governor.

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/Makefile         |    1 
 arch/powerpc/platforms/pseries/processor_idle.c |  179 ++++++++++++++++++++++++
 arch/powerpc/platforms/pseries/pseries.h        |   14 +
 arch/powerpc/platforms/pseries/setup.c          |    3 
 4 files changed, 194 insertions(+), 3 deletions(-)

Index: linux.trees.git/arch/powerpc/platforms/pseries/Makefile
===================================================================
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/Makefile
+++ linux.trees.git/arch/powerpc/platforms/pseries/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_HCALL_STATS)	+= hvCall_inst
 obj-$(CONFIG_PHYP_DUMP)	+= phyp_dump.o
 obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_DTL)		+= dtl.o
+obj-$(CONFIG_PSERIES_PROCESSOR_IDLE)	+= processor_idle.o
Index: linux.trees.git/arch/powerpc/platforms/pseries/pseries.h
===================================================================
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/pseries.h
+++ linux.trees.git/arch/powerpc/platforms/pseries/pseries.h
@@ -10,6 +10,8 @@
 #ifndef _PSERIES_PSERIES_H
 #define _PSERIES_PSERIES_H
 
+#include <linux/cpuidle.h>
+
 extern void __init fw_feature_init(const char *hypertas, unsigned long len);
 
 struct pt_regs;
@@ -40,4 +42,16 @@ extern unsigned long rtas_poweron_auto;
 
 extern void find_udbg_vterm(void);
 
+DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
+
+#ifdef CONFIG_PSERIES_PROCESSOR_IDLE
+struct pseries_processor_power {
+	struct cpuidle_device dev;
+	int count;
+	int id;
+};
+
+extern struct cpuidle_driver pseries_idle_driver;
+#endif
+
 #endif /* _PSERIES_PSERIES_H */
Index: linux.trees.git/arch/powerpc/platforms/pseries/processor_idle.c
===================================================================
--- /dev/null
+++ linux.trees.git/arch/powerpc/platforms/pseries/processor_idle.c
@@ -0,0 +1,179 @@
+/*
+ *  processor_idle - idle state cpuidle driver.
+ *  Adapted from drivers/acpi/processor_idle.c
+ *
+ *  Arun R Bharadwaj <arun@linux.vnet.ibm.com>
+ *
+ *  Copyright (C) 2009 IBM Corporation.
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+
+#include <asm/paca.h>
+#include <asm/reg.h>
+#include <asm/system.h>
+#include <asm/machdep.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+MODULE_AUTHOR("Arun R Bharadwaj");
+MODULE_DESCRIPTION("pSeries Idle State Driver");
+MODULE_LICENSE("GPL");
+
+struct cpuidle_driver pseries_idle_driver = {
+	.name =		"pseries_idle",
+	.owner =	THIS_MODULE,
+};
+
+DEFINE_PER_CPU(struct pseries_processor_power, power);
+
+#define IDLE_STATE_COUNT	2
+
+static int pseries_idle_init(struct pseries_processor_power *power)
+{
+	return cpuidle_register_device(&power->dev);
+}
+
+static void cede1(void)
+{
+	local_irq_enable();
+	set_thread_flag(TIF_POLLING_NRFLAG);
+	while (!need_resched()) {
+		ppc64_runlatch_off();
+		HMT_low();
+		HMT_very_low();
+	}
+	HMT_medium();
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+	smp_mb();
+	local_irq_disable();
+}
+
+static void cede2(void)
+{
+	ppc64_runlatch_off();
+	HMT_medium();
+	cede_processor();
+}
+
+static int pseries_cpuidle_loop(struct cpuidle_device *dev,
+				struct cpuidle_state *st)
+{
+	ktime_t t1, t2;
+	s64 diff;
+	int ret;
+	unsigned long in_purr, out_purr;
+
+	get_lppaca()->idle = 1;
+	get_lppaca()->donate_dedicated_cpu = 1;
+	in_purr = mfspr(SPRN_PURR);
+
+	t1 = ktime_get();
+
+	if (strcmp(st->desc, "cede1") == 0)
+		cede1();
+	else
+		cede2();
+
+	t2 = ktime_get();
+	diff = ktime_to_us(ktime_sub(t2, t1));
+	if (diff > INT_MAX)
+		diff = INT_MAX;
+
+	ret = (int) diff;
+
+	out_purr = mfspr(SPRN_PURR);
+	get_lppaca()->wait_state_cycles += out_purr - in_purr;
+	get_lppaca()->donate_dedicated_cpu = 0;
+	get_lppaca()->idle = 0;
+
+	return ret;
+}
+
+static int pseries_setup_cpuidle(struct pseries_processor_power *power)
+{
+	int i;
+	struct cpuidle_state *state;
+	struct cpuidle_device *dev = &power->dev;
+
+	dev->cpu = power->id;
+
+	dev->enabled = 0;
+	for (i = 0; i < IDLE_STATE_COUNT; i++) {
+		state = &dev->states[i];
+
+		snprintf(state->name, CPUIDLE_NAME_LEN, "IDLE%d", i);
+		state->enter = pseries_cpuidle_loop;
+
+		switch (i) {
+		case 0:
+			strncpy(state->desc, "cede1", CPUIDLE_DESC_LEN);
+			state->exit_latency = 0;
+			state->target_residency = 0;
+			break;
+
+		case 1:
+			strncpy(state->desc, "cede2", CPUIDLE_DESC_LEN);
+			state->exit_latency = 1;
+			state->target_residency =
+					__get_cpu_var(smt_snooze_delay);
+			break;
+		}
+	}
+
+	power->dev.state_count = i;
+	return 0;
+}
+
+static int pseries_get_power_info(struct pseries_processor_power *power,
+				int cpu)
+{
+	power->id = cpu;
+	power->count = IDLE_STATE_COUNT;
+	return 0;
+}
+
+static int __init pseries_processor_idle_init(void)
+{
+	int cpu;
+	int result = cpuidle_register_driver(&pseries_idle_driver);
+
+	if (result < 0)
+		return result;
+
+	printk(KERN_DEBUG "pSeries idle driver registered\n");
+
+	for_each_online_cpu(cpu) {
+		pseries_get_power_info(&per_cpu(power, cpu), cpu);
+		pseries_setup_cpuidle(&per_cpu(power, cpu));
+		pseries_idle_init(&per_cpu(power, cpu));
+	}
+
+	printk(KERN_DEBUG "Using cpuidle idle loop\n");
+
+	return 0;
+}
+
+late_initcall(pseries_processor_idle_init);
Index: linux.trees.git/arch/powerpc/platforms/pseries/setup.c
===================================================================
--- linux.trees.git.orig/arch/powerpc/platforms/pseries/setup.c
+++ linux.trees.git/arch/powerpc/platforms/pseries/setup.c
@@ -509,9 +509,6 @@ static int __init pSeries_probe(void)
 	return 1;
 }
 
-
-DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
-
 static void pseries_dedicated_idle_sleep(void)
 { 
 	unsigned int cpu = smp_processor_id();

^ permalink raw reply

* [PATCH] IB/ehca: Fix CQE flags reporting
From: Joachim Fenkes @ 2009-09-01 11:55 UTC (permalink / raw)
  To: LinuxPPC-Dev, LKML, OF-General, Roland Dreier, OF-EWG
  Cc: Alexander Schmidt, Christoph Raisch, Stefan Roscher
In-Reply-To: <48499C11.7030504@gmail.com>

Was reporting CQE flags in the wrong bit positions, causing consumers to
miss incoming immediate data.

Signed-off-by: Joachim Fenkes <fenkes@de.ibm.com>
---

Please review and queue for 2.6.32 if you think it's okay. Thanks!
  Joachim

 drivers/infiniband/hw/ehca/ehca_reqs.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index 5a3d96f..8fd88cd 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -786,7 +786,11 @@ repoll:
 	wc->slid = cqe->rlid;
 	wc->dlid_path_bits = cqe->dlid;
 	wc->src_qp = cqe->remote_qp_number;
-	wc->wc_flags = cqe->w_completion_flags;
+	/*
+	 * HW has "Immed data present" and "GRH present" in bits 6 and 5.
+	 * SW defines those in bits 1 and 0, so we can just shift and mask.
+	 */
+	wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
 	wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
 	wc->sl = cqe->service_level;
 
-- 
1.6.0.4

^ permalink raw reply related

* Re: [PATCH] Fix fake numa on ppc
From: Balbir Singh @ 2009-09-01 14:27 UTC (permalink / raw)
  To: Ankita Garg; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20090901092407.GC4076@in.ibm.com>

* Ankita Garg <ankita@in.ibm.com> [2009-09-01 14:54:07]:

> Hi Balbir,
> 
> On Tue, Sep 01, 2009 at 11:27:53AM +0530, Balbir Singh wrote:
> > * Ankita Garg <ankita@in.ibm.com> [2009-09-01 10:33:16]:
> > 
> > > Hello,
> > > 
> > > Below is a patch to fix a couple of issues with fake numa node creation
> > > on ppc:
> > > 
> > > 1) Presently, fake nodes could be created such that real numa node
> > > boundaries are not respected. So a node could have lmbs that belong to
> > > different real nodes.
> > > 
> > > 2) The cpu association is broken. On a JS22 blade for example, which is
> > > a 2-node numa machine, I get the following:
> > > 
> > > # cat /proc/cmdline
> > > root=/dev/sda6  numa=fake=2G,4G,,6G,8G,10G,12G,14G,16G
> > > # cat /sys/devices/system/node/node0/cpulist
> > > 0-3
> > > # cat /sys/devices/system/node/node1/cpulist
> > > 4-7
> > > # cat /sys/devices/system/node/node4/cpulist
> > > 
> > > #
> > > 
> > > So, though the cpus 4-7 should have been associated with node4, they
> > > still belong to node1. The patch works by recording a real numa node
> > > boundary and incrementing the fake node count. At the same time, a
> > > mapping is stored from the real numa node to the first fake node that
> > > gets created on it.
> > >
> > 
> > Some details on how you tested it and results before and after would
> > be nice. Please see git commit 1daa6d08d1257aa61f376c3cc4795660877fb9e3
> > for example
> > 
> >
> 
> Thanks for the quick review of the patch. Here is some information on
> the testing:
> 
> Tested the patch with the following commandlines:
> numa=fake=2G,4G,6G,8G,10G,12G,14G,16G
> numa=fake=3G,6G,10G,16G
> numa=fake=4G
> numa=fake=
> 
> For testing if the fake nodes respect the real node boundaries, I added
> some debug printks in the node creation path. Without the patch, for the
> commandline numa=fake=2G,4G,6G,8G,10G,12G,14G,16G, this is what I got:
> 
> fake id: 1 nid: 0
> fake id: 1 nid: 0
> ...
> fake id: 2 nid: 0
> fake id: 2 nid: 0
> ...
> fake id: 2 nid: 0
> created new fake_node with id 3
> fake id: 3 nid: 0
> fake id: 3 nid: 0
> ...
> fake id: 3 nid: 0
> fake id: 3 nid: 0
> fake id: 3 nid: 1
> fake id: 3 nid: 1
> ...
> created new fake_node with id 4
> fake id: 4 nid: 1
> fake id: 4 nid: 1
> ...
> 
> and so on. So, fake node 3 encompasses real node 0 & 1. Also,
> 
> # cat /sys/devices/system/node/node3/meminfo
> Node 0 MemTotal:        2097152 kB
> ...
> # # cat /sys/devices/system/node/node4/meminfo
> Node 0 MemTotal:        2097152 kB
> ...
> 
> 
> With the patch, I get:
> 
> fake id: 1 nid: 0
> fake id: 1 nid: 0
> ...
> fake id: 2 nid: 0
> fake id: 2 nid: 0
> ...
> fake id: 2 nid: 0
> created new fake_node with id 3
> fake id: 3 nid: 0
> fake id: 3 nid: 0
> ...
> fake id: 3 nid: 0
> fake id: 3 nid: 0
> created new fake_node with id 4
> fake id: 4 nid: 1
> fake id: 4 nid: 1
> ...
> 
> and so on. With the patch, the fake node sizes are slightly different
> from that specified by the user.
> 
> # cat /sys/devices/system/node/node3/meminfo
> Node 3 MemTotal:        1638400 kB
> ...
> # cat /sys/devices/system/node/node4/meminfo
> Node 4 MemTotal:         458752 kB
> ...
> 
> CPU association was tested as mentioned in the previous mail:
> 
> Without the patch,
> 
> # cat /proc/cmdline
> root=/dev/sda6  numa=fake=2G,4G,,6G,8G,10G,12G,14G,16G
> # cat /sys/devices/system/node/node0/cpulist
> 0-3
> # cat /sys/devices/system/node/node1/cpulist
> 4-7
> # cat /sys/devices/system/node/node4/cpulist
> 
> #
> 
> With the patch,
> 
> # cat /proc/cmdline
> root=/dev/sda6  numa=fake=2G,4G,,6G,8G,10G,12G,14G,16G
> # cat /sys/devices/system/node/node0/cpulist
> 0-3
> # cat /sys/devices/system/node/node1/cpulist
> 

Oh! interesting.. cpuless nodes :) I think we need to fix this in the
longer run and distribute cpus between fake numa nodes of a real node
using some acceptable heuristic.

> # cat /sys/devices/system/node/node4/cpulist
> 4-7
> 
> > > 
> > > Signed-off-by: Ankita Garg <ankita@in.ibm.com>
> > > 
> > > Index: linux-2.6.31-rc5/arch/powerpc/mm/numa.c
> > > ===================================================================
> > > --- linux-2.6.31-rc5.orig/arch/powerpc/mm/numa.c
> > > +++ linux-2.6.31-rc5/arch/powerpc/mm/numa.c
> > > @@ -26,6 +26,11 @@
> > >  #include <asm/smp.h>
> > > 
> > >  static int numa_enabled = 1;
> > > +static int fake_enabled = 1;
> > > +
> > > +/* The array maps a real numa node to the first fake node that gets
> > > +created on it */
> > 
> > Coding style is broken
> > 
> 
> Fixed.
> 
> > > +int fake_numa_node_mapping[MAX_NUMNODES];
> > > 
> > >  static char *cmdline __initdata;
> > > 
> > > @@ -49,14 +54,24 @@ static int __cpuinit fake_numa_create_ne
> > >  	unsigned long long mem;
> > >  	char *p = cmdline;
> > >  	static unsigned int fake_nid;
> > > +	static unsigned int orig_nid = 0;
> > 
> > Should we call this prev_nid?
> > 
> 
> Yes, makes sense.
> > >  	static unsigned long long curr_boundary;
> > > 
> > >  	/*
> > >  	 * Modify node id, iff we started creating NUMA nodes
> > >  	 * We want to continue from where we left of the last time
> > >  	 */
> > > -	if (fake_nid)
> > > +	if (fake_nid) {
> > > +		if (orig_nid != *nid) {
> > 
> > OK, so this is called when the real NUMA node changes - comments would
> > be nice
> >
> 
> Thanks, have added the comment.
> 
> > > +			fake_nid++;
> > > +			fake_numa_node_mapping[*nid] = fake_nid;
> > > +			orig_nid = *nid;
> > > +			*nid = fake_nid;
> > > +			return 0;
> > > +		}
> > >  		*nid = fake_nid;
> > > +	}
> > > +
> > >  	/*
> > >  	 * In case there are no more arguments to parse, the
> > >  	 * node_id should be the same as the last fake node id
> > > @@ -440,7 +455,7 @@ static int of_drconf_to_nid_single(struc
> > >   */
> > >  static int __cpuinit numa_setup_cpu(unsigned long lcpu)
> > >  {
> > > -	int nid = 0;
> > > +	int nid = 0, new_nid;
> > >  	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
> > > 
> > >  	if (!cpu) {
> > > @@ -450,8 +465,15 @@ static int __cpuinit numa_setup_cpu(unsi
> > > 
> > >  	nid = of_node_to_nid_single(cpu);
> > > 
> > > +	if (fake_enabled && nid) {
> > > +		new_nid = fake_numa_node_mapping[nid];
> > > +		if (new_nid > 0)
> > > +			nid = new_nid;
> > > +	}
> > > +
> > >  	if (nid < 0 || !node_online(nid))
> > >  		nid = any_online_node(NODE_MASK_ALL);
> > > +
> > >  out:
> > >  	map_cpu_to_node(lcpu, nid);
> > > 
> > > @@ -1005,8 +1027,11 @@ static int __init early_numa(char *p)
> > >  		numa_debug = 1;
> > > 
> > >  	p = strstr(p, "fake=");
> > > -	if (p)
> > > +	if (p) {
> > >  		cmdline = p + strlen("fake=");
> > > +		if (numa_enabled)
> > > +			fake_enabled = 1;
> > 
> > Have you tried passing just numa=fake= without any commandline?
> > That should enable fake_enabled, but I wonder if that negatively
> > impacts numa_setup_cpu(). I wonder if you should look at cmdline
> > to decide on fake_enabled.
> >
> 
> fake_enabled does get set even for numa=fake=. However, it does not
> impact numa_setup_cpu, since fake_numa_node_mapping array would have no
> mapping stored and there is a condition there already to check for the
> value of the mapping. I confirmed this by booting with the above
> parameter as well.
> 
> > > +	}
> > > 
> > >  	return 0;
> > >  }
> > >
> > 
> > Overall, I think this is the right thing to do, we need to move in
> > this direction. 
> > 
> 
> Heres the updated patch:
> 
> Signed-off-by: Ankita Garg <ankita@in.ibm.com> 
> 
> Index: linux-2.6.31-rc5/arch/powerpc/mm/numa.c
> ===================================================================
> --- linux-2.6.31-rc5.orig/arch/powerpc/mm/numa.c
> +++ linux-2.6.31-rc5/arch/powerpc/mm/numa.c
> @@ -26,6 +26,13 @@
>  #include <asm/smp.h>
> 
>  static int numa_enabled = 1;
> +static int fake_enabled = 1;
> +
> +/*
> + * The array maps a real numa node to the first fake node that gets
> + * created on it
> + */
> +int fake_numa_node_mapping[MAX_NUMNODES];
> 
>  static char *cmdline __initdata;
> 
> @@ -49,14 +56,29 @@ static int __cpuinit fake_numa_create_ne
>  	unsigned long long mem;
>  	char *p = cmdline;
>  	static unsigned int fake_nid;
> +	static unsigned int prev_nid = 0;
>  	static unsigned long long curr_boundary;
> 
>  	/*
>  	 * Modify node id, iff we started creating NUMA nodes
>  	 * We want to continue from where we left of the last time
>  	 */
> -	if (fake_nid)
> +	if (fake_nid) {
> +		/*
> +		 * Moved over to the next real numa node, increment fake
> +		 * node number and store the mapping of the real node to
> +		 * the fake node
> +		 */
> +		if (prev_nid != *nid) {
> +			fake_nid++;
> +			fake_numa_node_mapping[*nid] = fake_nid;
> +			prev_nid = *nid;
> +			*nid = fake_nid;
> +			return 0;
> +		}
>  		*nid = fake_nid;
> +	}
> +
>  	/*
>  	 * In case there are no more arguments to parse, the
>  	 * node_id should be the same as the last fake node id
> @@ -440,7 +462,7 @@ static int of_drconf_to_nid_single(struc
>   */
>  static int __cpuinit numa_setup_cpu(unsigned long lcpu)
>  {
> -	int nid = 0;
> +	int nid = 0, new_nid;
>  	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
> 
>  	if (!cpu) {
> @@ -450,8 +472,15 @@ static int __cpuinit numa_setup_cpu(unsi
> 
>  	nid = of_node_to_nid_single(cpu);
> 
> +	if (fake_enabled && nid) {
> +		new_nid = fake_numa_node_mapping[nid];
> +		if (new_nid > 0)
> +			nid = new_nid;
> +	}
> +
>  	if (nid < 0 || !node_online(nid))
>  		nid = any_online_node(NODE_MASK_ALL);
> +
>  out:
>  	map_cpu_to_node(lcpu, nid);
> 
> @@ -1005,8 +1034,12 @@ static int __init early_numa(char *p)
>  		numa_debug = 1;
> 
>  	p = strstr(p, "fake=");
> -	if (p)
> +	if (p) {
>  		cmdline = p + strlen("fake=");
> +		if (numa_enabled) {
> +			fake_enabled = 1;
> +		}
> +	}
> 
>  	return 0;
>  }
>


Looks good to me


Reviewed-by: Balbir Singh <balbir@linux.vnet.ibm.com>
 
 

-- 
	Balbir

^ permalink raw reply

* Re: Question about linux boot procedure (head_64.S)
From: Geoff Levand @ 2009-09-01 17:32 UTC (permalink / raw)
  To: Lee HongWoo; +Cc: Linuxppc-dev
In-Reply-To: <5e2889710909010358v907022cs708dfc0dd3ed7fd0@mail.gmail.com>

On 09/01/2009 03:58 AM, Lee HongWoo wrote:
> __start  (in head_64.S)
>   ---> __start_initialization_multiplatform (in head_64.S)
>     ---> __boot_from_prom (in head_64.S)
>        ---> prom_init ( in prom_init.c)
>          ---> __start ???
> 
> And I don't understand where __start is called, because I can find __start
> only in head_64.S.
> If it calls __start in head_64.S, it's a recursive call.
> 
> Can anybody explain about this precedure ?

In the general case, __start is the entry point of the kernel.
It is where the bootloader or boot wrapper program jumps to
when it transfers control to the kernel.

-Geoff

^ permalink raw reply

* Re: [PATCH] IB/ehca: Fix CQE flags reporting
From: Roland Dreier @ 2009-09-01 19:55 UTC (permalink / raw)
  To: Joachim Fenkes
  Cc: LKML, OF-EWG, LinuxPPC-Dev, Christoph Raisch, OF-General,
	Alexander Schmidt, Stefan Roscher
In-Reply-To: <200909011355.34319.fenkes@de.ibm.com>

applied, thanks

^ permalink raw reply

* [RFC][POWERPC] WDT: added support for the WDT Chain driver.
From: Vitaly Bordug @ 2009-09-01 20:49 UTC (permalink / raw)
  To: Wim Van Sebroeck; +Cc: linuxppc-dev, Heiko Schocher, lkml


From: Heiko Schocher <hs@denx.de>

[POWERPC] WDT: added support for the WDT Chain driver.

    This new driver implements a character device with major number 10
    and minor number 130.  It is a software abstraction of the hardware
    watchdog with two different APIs.  While the driver periodically
    triggers the hardware watchdog, the software can setup independent
    timeout periods.

    More info in Documentation/watchdog/wdt_chain.txt

    Signed-off-by: Heiko Schocher <hs@denx.de>
    Signed-off-by: Vitaly Bordug <vitb@kernel.crashing.org>
---
This code was (and is) originally residing in DENX public git repo. I
think it would be useful upstream, to prevent reinventing the same
thing. 

The BSP files are optional but included so that to keep consistency
with original patch. Direct immr dereference will not work now of
course, but I'd like to know if the entire idea and common stuff look
OK for mainline, before adding anything ontop.

TIA

 Documentation/watchdog/00-INDEX          |    2 
 Documentation/watchdog/wdt_chain.txt     |  200 ++++++
 drivers/watchdog/Kconfig                 |   11 
 drivers/watchdog/Makefile                |    3 
 drivers/watchdog/wdt_chain_hwl_mpc82xx.c |  102 +++
 drivers/watchdog/wdt_chain_hwl_mpc8xx.c  |  148 ++++
 drivers/watchdog/wdt_chains.c            | 1013
 ++++++++++++++++++++++++++++++
 include/linux/wdt_chains.h               |  102 +++ 8 files changed,
 1581 insertions(+), 0 deletions(-) create mode 100644
 Documentation/watchdog/wdt_chain.txt create mode 100644
 drivers/watchdog/wdt_chain_hwl_mpc82xx.c create mode 100644
 drivers/watchdog/wdt_chain_hwl_mpc8xx.c create mode 100644
 drivers/watchdog/wdt_chains.c create mode 100644
 include/linux/wdt_chains.h


diff --git a/Documentation/watchdog/00-INDEX
b/Documentation/watchdog/00-INDEX index c3ea47e..f8e1ad7 100644
--- a/Documentation/watchdog/00-INDEX
+++ b/Documentation/watchdog/00-INDEX
@@ -8,3 +8,5 @@ watchdog-api.txt
 	- description of the Linux Watchdog driver API.
 wdt.txt
 	- description of the Watchdog Timer Interfaces for Linux.
+wdt_chain.txt
+	- description of the Watchdog Chain Timer Interfaces for Linux.
diff --git a/Documentation/watchdog/wdt_chain.txt
b/Documentation/watchdog/wdt_chain.txt new file mode 100644
index 0000000..78cf4cb
--- /dev/null
+++ b/Documentation/watchdog/wdt_chain.txt
@@ -0,0 +1,200 @@
+Last Reviewed: 29/08/2008
+
+	WDT Watchdog CHAIN Timer Interfaces For The Linux Operating
System
+		Heiko Schocher <hs@denx.de>
+
+driver implements a character device with major number 10 and minor
+number 130.  It is a software abstraction of the hardware watchdog
+with two different APIs.  While the driver periodically triggers the
+hardware watchdog, the software can setup independent timeout periods.
+
+"REGULAR API"
+The regular API provides a facility to setup a watchdog behaviour
+shared by all processes using the driver.  This interface uses read(2),
+write(2) and the first two ioctl(2) calls listed below.  The
+parameterless ioctl(2) calls select the operational mode of the
+driver, which can be
+	open-only
+or
+	always.
+
+In open-only mode, the watchdog will not expire if the device file is
+not opened by any process, while in always mode the behaviour is
+independent of the device file being opened.
+
+Reading from the device file will return an unsigned integer denoting
+the number of seconds left till the watchdog expires.  Writing an
+unsigned integer to the device file will set the expiration period in
+seconds.  Note that the hardware watchdog will be triggered
+independently with a configurable period.  See the section
+CONFIGURATION for details.
+
+An expiration of the watchdog will trigger a hard-reset of the machine.
+
+"CHAIN API"
+The second API, which is implemented only through calls to ioctl(2),
+can be used to register configurable
+	watchdog chains
+from either user or kernel space.  A watchdog chain is identified by
+an unsigned integer and can contain up to three action stages.
+
+"time interval"	in seconds and an "action"
+
+is associated with each stage.  When the chain is not reset before the
+interval elapses, the associated action is triggered and the chain
+moves on to the next stage.
+
+A chain can request to kill the registering process if the interval
+elapses.  In this case a restarted process can register with the
+driver giving the same identifier and reset the chain.  This is the
+main reason why there is no association between chains and processes
+or open device files.
+
+For a detailed description of the possible chain configurations, see
+the description of the WDT_CHAIN_REGISTER ioctl call.
+
+Note that when mixing the two interfaces, the second API takes
+precedence.  That is, expiry of the interval set by writing to the
+device file while a chain is registered, will not trigger any actions.
+
+Also note that the default operational mode of the driver,
+i.e. open-only or always can only be configured in the source-code.
+
+IOCTLS
+
+  WDT_CHAIN_OPEN_ONLY
+    This parameterless call selects the "open-only"
+    operational mode of the driver as described above.
+
+
+  WDT_CHAIN_ALWAYS
+    Also a parameterless call, this sets the driver to the "always"
+    operational mode.
+
+
+  WDT_CHAIN_REGISTER
+    This and the two following ioctls constitute the "chain interface"
+    described above.  The parameter given to the call is a pointer to a
+    structure with the following layout:
+
+    typedef struct wdt_chain_param {
+      unsigned chainid;
+      unsigned long timer_count[3];
+      int action[3];
+      int signal;
+    } wdt_chain_param_t;
+
+  Each stage is configured with entries in the arrays
+    "timer_count"
+  and
+    "action."
+
+  The timer_count contains the length of the interval in seconds
+  while action contains one of the constants
+
+  WDT_CHAIN_ACTION_SIGNAL, WDT_CHAIN_ACTION_KILL,
+  WDT_CHAIN_ACTION_REBOOT
+  and
+  WDT_CHAIN_ACTION_RESET.
+
+  A timer_count of zero signals the end of the chain.
+
+  The ACTION_SIGNAL will send the configurable signal with number
+  "signal" to the registering process, while ACTION_KILL signals
SIGKILL which
+  can not be caught by the registered process.
+
+  ACTION_REBOOT tries a soft reboot and ACTION_RESET
+  triggers a hard-reset of the machine.
+
+  When stages of the chain are to be left unused, they should be filled
+  with zero entries.
+
+  Note that internally a hard-reset stage is appended as a stop entry
+  ensuring a chain will never exceed its stages.
+
+
+WDT_CHAIN_RESET
+  This call resets the chain denoted by the unsigned integer passed to
+  it.  When reset, a chain will expire beginning with stage zero again.
+
+
+WDT_CHAIN_UNREGISTER
+  As closing the device file will not have any effect on chains, a
+  process must unregister a chain if the service is no longer needed.
+  This can be done with this ioctl taking an unsigned integer as a
+  parameter denoting the chain to be unregistered.
+
+
+"IOCTL RESULT VALUES"
+On successful completion, the above calls to ioctl(2) return 0.  When
+invalid parameters are provided or an error occurs, a negative value
+will be returned and "errno" set accordingly.  Specifically
+"EINVAL, EFAULT, ENOMEM"
+can be returned.
+
+
+"KERNEL INTERFACE"
+Modules can also register with the chain API of the watchdog driver.
+This the three functions (wdt_chain_register_mon_chain,
wdt_chain_reset_mon_chain +and wdt_chain_unregister_mon_chain) are
exported from the driver. The +first function takes one argument,
namely a pointer to a "wdt_chain_param" +structure. The other two calls
take a pointer to an unsigned integer as a +parameter, namely the chain
id of the chain to be reset or unregistered. +
+
+CONFIGURATION
+The driver is configurable through parameters passed to the driver
+through the Linux commandline as
+
+"wdt_chain=<opts>".
+
+Multiple options can be seperated by commas, as usual.
+
+timeout:<n>
+  will set the expiry period of the regular driver API to <n> seconds.
+
+period:<n>
+  sets the period with which the hardware watchdog is triggered to <n>
+  jiffies.  This usually means 1/100th of a second.
+
+off
+  will disable the software APIs of the driver but still trigger the
+  hardware watchdog as described previously.
+
+
+EXAMPLE
+The following code snippet registers a watchdog chain whose first
+stage will expire after 3 seconds and send the SIGUSR1 signal to the
+process.  When 5 seconds after this the chain is not reset, the
+machine will do a hard-reset.
+
+  wdt_chain_param_t param;
+
+  /* Setup signal handling */
+  signal(SIGUSR1, got_signal);
+
+  param.chainid=823;
+  param.timer_count[0]=3;
+  param.action[0]=WDT_CHAIN_ACTION_SIGNAL;
+  param.signal=SIGUSR1;
+  param.timer_count[1]=5;
+  param.action[1]=WDT_CHAIN_ACTION_RESET;
+
+  /* Register chain */
+  ioctl(fd, WDT_CHAIN_REGISTER, &param);
+  ..
+  /* Reset chain    */
+  ioctl(fd, WDT_CHAIN_RESET, &param.chainid);
+
+
+FILES
+ /dev/watchdog
+
+SUPPORTED HARDWARE
+  The Hardwaredependent functions are seperated, so that it should be
+  easy to support new Hardware. Actual the following Hardware are
supported: +
+  Hardware		File
+  MPC82XX		wdt_chain_hwl_mpc82xx.c
+  MPC8XX		wdt_chain_hwl_mpc8xx.c
+
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index b1ccc04..0908c64 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -831,6 +831,17 @@ config 8xxx_WDT
 
 	  For BookE processors (MPC85xx) use the BOOKE_WDT driver
instead. 
+config WDT_CHAIN
+	tristate "WDT Chain"
+
+config WDT_CHAIN_HWL_MPC8XX
+	tristate "WDT CHAIN Hardwarelayer for MPC8xx"
+	depends on WDT_CHAIN && PPC_8xx
+
+config WDT_CHAIN_HWL_MPC82XX
+	tristate "WDT CHAIN Hardwarelayer for MPC82xx"
+	depends on WDT_CHAIN
+
 config MV64X60_WDT
 	tristate "MV64X60 (Marvell Discovery) Watchdog Timer"
 	depends on MV64X60
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 3d77429..ce08a34 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -120,6 +120,9 @@ obj-$(CONFIG_8xxx_WDT) += mpc8xxx_wdt.o
 obj-$(CONFIG_MV64X60_WDT) += mv64x60_wdt.o
 obj-$(CONFIG_PIKA_WDT) += pika_wdt.o
 obj-$(CONFIG_BOOKE_WDT) += booke_wdt.o
+obj-$(CONFIG_WDT_CHAIN) += wdt_chains.o
+obj-$(CONFIG_WDT_CHAIN_HWL_MPC8XX) += wdt_chain_hwl_mpc8xx.o
+obj-$(CONFIG_WDT_CHAIN_HWL_MPC82XX) += wdt_chain_hwl_mpc82xx.o
 
 # PPC64 Architecture
 obj-$(CONFIG_WATCHDOG_RTAS) += wdrtas.o
diff --git a/drivers/watchdog/wdt_chain_hwl_mpc82xx.c
b/drivers/watchdog/wdt_chain_hwl_mpc82xx.c new file mode 100644
index 0000000..e4a50f2
--- /dev/null
+++ b/drivers/watchdog/wdt_chain_hwl_mpc82xx.c
@@ -0,0 +1,102 @@
+/*
+ * (C) Copyright Heiko Schocher <hs@denx.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <linux/wdt_chains.h>
+#include <asm/cpm2.h>
+
+#ifdef DEBUG
+# define debugk(fmt, args...) printk(fmt, ##args)
+#else
+# define debugk(fmt, args...)
+#endif
+
+
+int wdt_chain_hwl_start(void)
+{
+	return 0;
+}
+
+int wdt_chain_hwl_stop(void)
+{
+	return 0;
+}
+
+/***********************************************************************
+F* Function:     int __init wdt_hwl_init (void) P*A*Z*
+ *
+P* Parameters:   none
+P*
+P* Returnvalue:  int
+P*                - 0 success
+P*                  -ENXIO  The watchdog is not enabled by firmware
+ *
+Z* Intention:    Initialize the Hardwaredependent functions for the WDT
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+#define MPC82XX_SYPCR_SWE	0x00000004 +#define
MPC82XX_SYPCR_SWRI	0x00000002 +#define MPC82XX_SYPCR_SWP
0x00000001 +
+int __init wdt_hwl_init(void)
+{
+	/* using MPC82XX internal WDT */
+	debugk("%s: SYPCR=0x%x\n", __func__,
+
cpm2_immr->im_siu_conf.siu_82xx.sc_sypcr);
+	debugk("%s: Should be 0x02 (Hardreset) | 0x01 prescaled | \
+						0x04 active\n",
__func__);
+	if ((cpm2_immr->im_siu_conf.siu_82xx.sc_sypcr &
MPC82XX_SYPCR_SWE)
+							!=
MPC82XX_SYPCR_SWE) {
+		printk("WDT not enabled by firmware, SYPCR=0x%x\n",
+			cpm2_immr->im_siu_conf.siu_82xx.sc_sypcr);
+		return -ENXIO;
+	}
+
+	/* trigger now */
+	cpm2_immr->im_siu_conf.siu_82xx.sc_swsr = 0x556C;
+	cpm2_immr->im_siu_conf.siu_82xx.sc_swsr = 0xAA39;
+	return 0;
+}
+
+/***********************************************************************
+F* Function:	 void wdt_hwl_reset (void) P*A*Z*
+ *
+P* Parameters:	 none
+P*
+P* Returnvalue:	 none
+ *
+Z* Intention:	 Reset the hardware watchdog.
+ *
+D* Design:	 wd@denx.de
+C* Coding:	 wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+void wdt_hwl_reset(void) +{
+	cpm2_immr->im_siu_conf.siu_82xx.sc_swsr = 0x556C;
+	cpm2_immr->im_siu_conf.siu_82xx.sc_swsr = 0xAA39;
+/*	debugk ("%s: WDT serviced\n", __FUNCTION__); */
+}
+EXPORT_SYMBOL(wdt_hwl_reset);
+
diff --git a/drivers/watchdog/wdt_chain_hwl_mpc8xx.c
b/drivers/watchdog/wdt_chain_hwl_mpc8xx.c new file mode 100644
index 0000000..32abefb
--- /dev/null
+++ b/drivers/watchdog/wdt_chain_hwl_mpc8xx.c
@@ -0,0 +1,148 @@
+/*
+ * (C) Copyright Heiko Schocher <hs@denx.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>			/* for character
devices	*/ +#include <linux/version.h>
+#include <linux/init.h>			/* for
__initfunc		*/ +#include
<linux/slab.h>			/* for kmalloc and friends
*/ + +#include <linux/wdt_chains.h>
+#include <asm/8xx_immap.h>
+#include <asm/mpc8xx.h>
+
+#ifdef CONFIG_8xx
+static immap_t *immr;	/* pointer to register-structure */
+#endif
+
+#ifdef DEBUG
+# define debugk(fmt, args...) printk(fmt, ##args)
+#else
+# define debugk(fmt, args...)
+#endif
+
+
+int wdt_chain_hwl_start(void)
+{
+	return 0;
+}
+
+int wdt_chain_hwl_stop(void)
+{
+	return 0;
+}
+
+#define IMAP_ADDR_CHAIN 0xf0000000
+/***********************************************************************
+F* Function:     int __init wdt_hwl_init (void) P*A*Z*
+ *
+P* Parameters:   none
+P*
+P* Returnvalue:  int
+P*                - 0 success
+P*                  -ENXIO  (LWMON) The watchdog is not enabled by
firmware
+ *
+Z* Intention:    Initialize the Hardwaredependent functions for the WDT
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+int __init wdt_hwl_init(void) +{
+	/* get pointer for SYPCR and SWSR */
+	if (!immr)
+		immr = (immap_t *) IMAP_ADDR_CHAIN;
+
+	debugk("%s: SYPCR=0x%x\n", __func__,
immr->im_siu_conf.sc_sypcr); +
+#ifndef	CONFIG_LWMON	/* LWMON uses external MAX708TESA
watchdog */
+	if ((immr->im_siu_conf.sc_sypcr & 0x00000004) == 0) {
+		printk("WDT_8xx: SWT not enabled by firmware,
SYPCR=0x%x\n",
+			immr->im_siu_conf.sc_sypcr);
+		return -ENXIO;
+	}
+#endif
+
+#ifdef CONFIG_LWMON	/* LWMON uses external MAX708TESA watchdog
*/ +
+	immr->im_ioport.iop_padat ^= 0x0100;
+
+#else			/* use MPC8xx internal SWT */
+	/* set SYPCR[SWRI]...wdt-timeout causes reset */
+	immr->im_siu_conf.sc_sypcr |= (
+				0x00000002	/* SWRI - SWT causes
HRESET  */
+				|
+				0x00000001	/* SWP  - activate
prescaler */
+				|
+				0xFFFF0000	/* SWTC - max. timer
count   */
+				);
+
+	debugk("%s: SYPCR=0x%x\n", __func__,
immr->im_siu_conf.sc_sypcr); +
+	/* trigger now */
+	immr->im_siu_conf.sc_swsr = 0x556C;
+	immr->im_siu_conf.sc_swsr = 0xAA39;
+#endif
+
+	return 0;
+}
+
+/***********************************************************************
+F* Function:	 void wdt_hwl_reset (void) P*A*Z*
+ *
+P* Parameters:	 none
+P*
+P* Returnvalue:	 none
+ *
+Z* Intention:	 Reset the hardware watchdog.
+ *
+D* Design:	 wd@denx.de
+C* Coding:	 wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+void wdt_hwl_reset(void) +{
+	immap_t *imap = (immap_t *) IMAP_ADDR_CHAIN;
+#if defined(CONFIG_LWMON)
+	/*
+	 * The LWMON board uses a MAX706TESA Watchdog
+	 * with the trigger pin connected to port PA.7
+	 *
+	 * The port has already been set up in the firmware,
+	 * so we just have to toggle it.
+	 */
+	imap->im_ioport.iop_padat ^= 0x0100;
+	/*
+	 * Do NOT add a call to "debugk()" here;
+	 * it would be called TOO often.
+	 */
+#else
+	/*
+	 * All other boards use the MPC8xx Internal Watchdog
+	 */
+	imap->im_siu_conf.sc_swsr = 0x556C;
+	imap->im_siu_conf.sc_swsr = 0xAA39;
+	debugk("%s: WDT serviced\n", __func__);
+#endif /* CONFIG_LWMON */
+}
+EXPORT_SYMBOL(wdt_hwl_reset);
+
diff --git a/drivers/watchdog/wdt_chains.c
b/drivers/watchdog/wdt_chains.c new file mode 100644
index 0000000..e7a5054
--- /dev/null
+++ b/drivers/watchdog/wdt_chains.c
@@ -0,0 +1,1013 @@
+/*
+ * (C) Copyright 2000
+ * Jorg Haider, SIEMENS AG
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ *
+ * (C) 2008 Heiko Scocher, hs@denx.de -- ported for 2.6
+ *                                       seperated Hardwarelayer, now
not
+ *                                       only for MPC8xx.
+ * (C) 2002 Detlev Zundel, dzu@denx.de -- Added "watchdog chains"
+ * (C) 2001 Wolfgang Denk, wd@denx.de -- Cleanup, Modifications for
2.4 kernels
+ * (C) 2001 Wolfgang Denk, wd@denx.de -- Adaption for MAX706TESA
Watchdog
+ * (C) 2001 Steven Hein,  ssh@sgi.com -- Added timeout configuration
option
+ * (C) 2001 Robert Enzinger, Robert.Enzinger.extern@icn.siemens.de,
+ *		-- added ioctl() for dynamic configuration
+ *
+M* Modul:         wdt_mpc8xx.c
+M*
+M* Content:       Linux kernel driver for the watchdog.
+ *
+ */
+
+/*---------------------------- Headerfiles
----------------------------*/ +#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>			/* for character
devices	*/ +#include <linux/miscdevice.h>		/*
driver is a misc device	*/ +#include <linux/version.h>
+#include <linux/init.h>			/* for
__initfunc		*/ +#include
<linux/spinlock.h>		/* for spinlocks                */
+#include <linux/list.h>			/* for linked-list
macros       */ +#include <linux/sched.h> +#include
<linux/slab.h>			/* for kmalloc and friends
*/ +#include <linux/reboot.h>		/* for
sys_reboot               */ +#include <linux/wdt_chains.h> +#include
<linux/uaccess.h>		/* for put_user
*/ + +/*----------------- Local vars, datatypes and macros
------------------*/ +/* #define DEBUG	1	//	*/
+
+#ifdef DEBUG
+# define debugk(fmt, args...) printk(fmt, ##args)
+#else
+# define debugk(fmt, args...)
+#endif
+
+#define	WDT_VERSION	"0.6"	/* version of this
device driver	*/ +
+#ifdef	CONFIG_CHAIN_WDT_TIMEOUT
+#define TIMEOUT_VALUE CONFIG_CHAIN_WDT_TIMEOUT	/* configurable
timeout	*/ +#else
+#define TIMEOUT_VALUE 300	/* reset after five minutes = 300
seconds*/ +#endif
+
+#ifdef CONFIG_CHAIN_WDT_TIMEOUT_OPEN_ONLY
+static int timeout_open_only = 1;
+#else
+static int timeout_open_only;
+#endif
+
+/*
+ * The shortest watchdog period of all boards is (so far) approx. 1
sec,
+ * thus re-trigger watchdog by default every 500 ms = HZ / 2,
+ * except for the LWMON board, which needs 50ms = HZ / 20.
+ * The default can be overwritten using a boot argument.
+ *
+ * The external interface is in seconds, but internal all calculation
+ * is done in jiffies.
+ */
+#ifdef	CONFIG_TRIGGER_PERIOD
+# define TRIGGER_PERIOD   (CONFIG_TRIGGER_PERIOD)
+#else
+# define TRIGGER_PERIOD   (HZ/2)
+#endif
+
+typedef struct monitored_chain {
+	struct list_head list;
+	unsigned int chainid;
+	pid_t pid;
+	int escalation;
+	unsigned long expires;
+	unsigned long timer_count[4];
+	int action[4];
+	int signal;
+} monitored_chain_t;
+
+static struct list_head mon_list;
+static spinlock_t mon_lock;	/* lock for the monitored chain
list */ +static int mon_chains;
+
+struct timer_list wd_timer;	/* structure for timer
administration */ +static unsigned long wdt_chain_dummy;
+
+/*
+ * Watchdog active? When disabled, it will get re-triggered
+ * automatically without timeout, so it appears to be switched off
+ * although actually it is still running.
+ */
+static int enabled = 1;
+
+static unsigned long timer_count = TIMEOUT_VALUE *
HZ;	/*remaining time*/ +static unsigned long timer_period =
TRIGGER_PERIOD;	/*period to trigger WD*/ +
+static int device_open;	/* to implement "run while open"
mode	*/ +
+
+/*------------------------- Extern prototypes
-------------------------*/ +extern inline void sync(void);
+extern long sys_reboot(int, int, unsigned int, void *);
+/*----------------------- Interface prototypes
------------------------*/ +void wdt_chain_handler(unsigned long);
+/*------------------------- Local prototypes
--------------------------*/ +static int wdt_chain_open(struct inode *,
struct file *); +static int wdt_chain_release(struct inode *, struct
file *); +static ssize_t wdt_chain_read(struct file *, char *, size_t,
loff_t *); +static ssize_t wdt_chain_write(struct file *, const char *,
size_t, loff_t *); +static int wdt_chain_ioctl(struct inode *, struct
file *,
+			     unsigned int, unsigned long);
+
+static int register_mon_chain(wdt_chain_param_t *, int);
+/*---------------------Kernel interface prototypes
--------------------*/ +int
wdt_chain_register_mon_chain(wdt_chain_param_t *); +int
wdt_chain_unregister_mon_chain(unsigned int); +int
wdt_chain_reset_mon_chain(int); +static int process_mon_chains(void);
+static monitored_chain_t *find_mon_chain_by_chainid(unsigned int);
+static void insert_mon_chain(monitored_chain_t *);
+#ifdef MODULE
+static void free_mon_list(void);
+#endif
+
+const struct file_operations wdt_chain_ops = {
+	.owner = THIS_MODULE,
+	.open = wdt_chain_open,
+	.release = wdt_chain_release,
+	.read = wdt_chain_read,
+	.write = wdt_chain_write,
+	.ioctl = wdt_chain_ioctl,
+};
+
+static struct miscdevice wdt_miscdev =	/* driver is a misc
device */ +{
+	WATCHDOG_MINOR,
+	"watchdog",
+	&wdt_chain_ops
+};
+
+/*
----------------------------------------------------------------------
*/ +int __init wdt_chain_init(void) +{
+	int rc;
+
+	rc = wdt_hwl_init();
+	debugk("WDT_CHAIN HWL init: %d\n", rc);
+
+	if (rc)
+		return rc;
+
+	/* register misc device */
+	rc = misc_register(&wdt_miscdev);
+	if (rc < 0) {
+		printk("%s: failed with %d\n", __func__, rc);
+		return rc;
+	}
+
+	debugk("WDT_CHAIN registered: major=%d minor=%d\n",
+		MISC_MAJOR, WATCHDOG_MINOR);
+
+	INIT_LIST_HEAD(&mon_list);
+	spin_lock_init(&mon_lock);
+
+	init_timer(&wd_timer);		/* initialize
timer-structure...*/
+	wd_timer.function = wdt_chain_handler;
+	wd_timer.data = (unsigned long) &wdt_chain_dummy;
+	wd_timer.expires = jiffies + timer_period;
+
+	debugk("%s: watchdog timer initialized - timer_count = %d \
+		 period = %d\n",
+		__func__, timer_count / HZ, HZ / timer_period);
+
+	add_timer(&wd_timer);			/* ...and
activate timer */ +
+	debugk("%s: timer activated\n", __func__);
+
+	pr_info("WDT_CHAIN: Software Watchdog Timer version "
WDT_VERSION); +
+	if (enabled)
+		pr_info(", timeout %ld sec.\n", timer_count / HZ);
+	else
+		pr_info(" (disabled)\n");
+	return 0;
+}
+fs_initcall(wdt_chain_init);
+
+/***********************************************************************
+F* Function:     int __init wdt_chain_setup (char *options) P*A*Z*
+ *
+P* Parameters:   char *options
+P*                - Options to parse
+P*
+P* Returnvalue:  int
+P*                - 0 is always returned
+ *
+Z* Intention:    Parse the options passed on the linux command line
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+int __init wdt_chain_setup(char *options) +{
+	while (options && *options) {
+		if (strncmp(options, "off", 3) == 0) {
+			options += 3;
+			enabled = 0;
+			if (*options != ',')
+				return 0;
+			options++;
+		}
+
+		if (strncmp(options, "timeout:", 8) == 0) {
+			options += 8;
+			if (!*options)
+				return 0;
+
+			timer_count = HZ * strict_strtoul(options,
&options, 0);
+			if (*options != ',')
+				return 0;
+			options++;
+		}
+
+		if (strncmp(options, "period:", 7) == 0) {
+			options += 7;
+			if (!*options)
+				return 0;
+
+			timer_period = strict_strtoul(options,
&options, 0);
+			if (*options != ',')
+				return 0;
+			options++;
+		}
+	}
+
+	return 0;
+}
+__setup("wdt_chain=", wdt_chain_setup);
+
+
+/***********************************************************************
+F* Function:     static int wdt_chain_open (struct inode *inode,
+F*                                           struct file *filp) P*A*Z*
+ *
+P* Parameters:   struct inode *inode
+P*                - Inode of the device file being opened
+P*               struct file *file
+P*                - Passed by the kernel, but not used
+P*
+P* Returnvalue:  int - 0 success
+P*                    <0 Errorcondition, which can be
+P*                     -ENXIO  Watchdog is not enabled
+ *
+Z* Intention:    This function is called by the kernel when a device
file +Z*               for the driver is opened by open(2).
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+static int wdt_chain_open(struct inode *inode, struct file *filp) +{
+	debugk("ENTER %s (%p, %p)\n", __func__, inode, filp);
+
+	if (!enabled)		/* user interface disabled */
+		return -ENXIO;
+
+	device_open++;			/* increment usage
counter */
+	debugk("%s: WDT is open\n", __func__);
+	return 0;
+}
+
+
+/***********************************************************************
+F* Function:     static int wdt_chain_release (struct inode *inode,
+F*                                              struct file *filp)
P*A*Z*
+ *
+P* Parameters:   struct inode *inode
+P*                - Inode of the device file being closed
+P*               struct file *file
+P*                - Passed by the kernel, but not used
+P*
+P* Returnvalue:  int
+P*                - 0 is always returned
+ *
+Z* Intention:    This function is called by the kernel when a device
file +Z*               of the driver is closed with close(2).
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+static int wdt_chain_release(struct inode *inode, struct file *filp) +{
+	debugk("ENTER %s (%p, %p)\n", __func__, inode, filp);
+
+	device_open--;			/* decrement usage
counter */
+	debugk("%s: WDT is closed.\n", __func__);
+
+	return 0;
+}
+
+
+/***********************************************************************
+F* Function:     static ssize_t wdt_chain_read (struct file *filp,
char *buffer,
+				size_t length, loff_t *offset) P*A*Z*
+ *
+P* Parameters:   struct file *file
+P*                - Passed by the kernel, pointer to the file structure
+P*                  for the device file
+P*               char *buf
+P*                - Pointer to buffer in userspace
+P*               size_t count
+P*                - Number of bytes to read
+P*               loff_t *ppos
+P*                - Offset for the read - ignored.
+P*
+P* Returnvalue:  int
+P*                 - >0 number of bytes read, i.e. 4
+P*                   <0 Errorcondition, which can be
+P*                    -EINVAL  When trying to read fewer bytes than the
+P*                             rest counter occupies, i.e.
sizeof(unsigned) +P*                    -EFAULT  A user-provided
pointer is invalid
+ *
+Z* Intention:    Read the rest counter from the device.
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+static ssize_t wdt_chain_read(struct file *filp, char *buffer,
+				size_t length,   loff_t *offset)
+{
+	unsigned int rest_count = timer_count / HZ;
+	int rc;
+
+	debugk("ENTER %s (%p, %p, %d, %p)\n",
+		__func__, filp, buffer, length,  offset);
+
+	if (length < sizeof(rest_count)) {
+		debugk("wdt_chain_release/kernel: invalid argument\n");
+
+		return -EINVAL;
+	}
+	/* copy value into userspace */
+	rc = put_user(rest_count, (int *) buffer);
+	if (rc)
+		return rc;
+
+	debugk("%s: rest_count=%i\n", __func__, rest_count);
+
+	return sizeof(rest_count);	/* read always exactly 4
bytes */ +}
+
+
+/***********************************************************************
+F* Function:     static ssize_t wdt_chain_write (struct file *filp,
const char *buffer,
+ *			size_t length, loff_t *offset) P*A*Z*
+ *
+P* Parameters:   struct file *file
+P*                - Passed by the kernel, pointer to the file structure
+P*                  for the device file
+P*               char *buf
+P*                - Pointer to buffer in userspace
+P*               size_t count
+P*                - Number of bytes to write
+P*               loff_t *ppos
+P*                - Offset for the write - ignored.
+P*
+P* Returnvalue:  int
+P*                 - >0 number of bytes actually written, i.e. 4
+P*                   <0 Errorcondition, which can be
+P*                    -EINVAL  When trying to write more or less than 4
+P*                             bytes, i.e. sizeof(unsigned)
+P*                    -EFAULT  A user-provided pointer is invalid
+ *
+Z* Intention:    Set the rest counter to the value provided by the user
+Z*               process interpreted as a number of seconds.
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+static ssize_t wdt_chain_write(struct file *filp, const char *buffer,
+				 size_t length, loff_t *offset)
+{
+	int error;
+	unsigned int new_count;
+
+	debugk("ENTER %s (%p, %p, %d, %p)\n",
+		__func__, filp, buffer, length,  offset);
+
+	if (length != sizeof(new_count)) {
+		debugk("%s: invalid length (%d instead of %d)\n",
+			__func__, length, sizeof(new_count));
+
+		return -EINVAL;
+	}
+
+	/* copy count value into kernel space */
+	error = get_user(new_count, (int *)buffer);
+	if (error != 0) {
+		debugk("%s: get_user failed: rc=%d\n",
+			__func__, error);
+
+		return error;
+	}
+
+	timer_count = HZ * new_count;
+
+	return sizeof(new_count);
+}
+
+/***********************************************************************
+F* Function:     static int wdt_chain_ioctl (struct inode *node,
struct file *filp,
+		unsigned int cmd, unsigned long arg) P*A*Z*
+ *
+P* Parameters:   struct inode *inode
+P*                - Passed by the kernel, inode of the device file
being +P*                  operated on
+P*               struct file *file
+P*                - Passed by the kernel, but not used
+P*               unsigned int cmd
+P*                - ioctl command number
+P*               unsigned long arg
+P*                - Pointer to arguments cast to unsigned long.
+P*                  The actual parameter depends on the command, see
+P*                  wdt_mpc8xx(4).
+P*
+P* Returnvalue:  int
+P*                 - 0 => success
+P*                  <0 Errorcondition, which can be
+P*                  -EINTR  the call was interrupted
+P*                  -EFAULT the pointer passed as arg is invalid
+P*                  -EINVAL a parameter was invalid
+ *
+Z* Intention:    This is the entry point for the ioctl() commands.
+Z*               For a detailed description see the man-page
wdt_mpc8xx(4).
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+static int wdt_chain_ioctl(struct inode *node, struct file *filp,
+			     unsigned int cmd, unsigned long arg)
+{
+	wdt_chain_param_t param;
+	int chainid;
+	int	res;
+
+	switch (cmd) {
+	case WDT_CHAIN_OPEN_ONLY:
+		timeout_open_only = 1;
+		break;
+	case WDT_CHAIN_ALWAYS:
+		timeout_open_only = 0;
+		break;
+	case WDT_CHAIN_REGISTER:
+		if (copy_from_user(&param, (void *)arg, sizeof(param)))
+			return -EFAULT;
+		res = register_mon_chain(&param, 1);
+		return res;
+		break;
+	case WDT_CHAIN_RESET:
+		if (copy_from_user(&chainid, (void *)arg,
sizeof(chainid)))
+			return -EFAULT;
+		return wdt_chain_reset_mon_chain(chainid);
+		break;
+	case WDT_CHAIN_UNREGISTER:
+		if (copy_from_user(&chainid, (void *)arg,
sizeof(chainid)))
+			return -EFAULT;
+		return wdt_chain_unregister_mon_chain(chainid);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+#ifdef MODULE
+
+/***********************************************************************
+F* Function:     int init_module (void) P*A*Z*
+ *
+P* Parameters:   none
+P*
+P* Returnvalue:  int
+P*                - see wdt_chain_init()
+ *
+Z* Intention:    This is a wrapper function for the module
initialization +Z*               simply calling wdt_chain_init().
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+int init_module(void) +{
+	debugk("%s: initialize WDT_CHAIN\n", __func__);
+
+	return wdt_chain_init();
+}
+
+
+/***********************************************************************
+F* Function:     void cleanup_module (void) P*A*Z*
+ *
+P* Parameters:   none
+P*
+P* Returnvalue:  none
+ *
+Z* Intention:    Cleanup and shutdown the driver to allow unloading
+Z*               of the module.
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+void cleanup_module(void) +{
+	debugk("%s: cleanup WDT_CHAIN\n", __func__);
+
+	del_timer(&wd_timer);
+	free_mon_list();
+
+	misc_deregister(&wdt_miscdev);
+}
+
+#endif	/* MODULE */
+
+/***********************************************************************
+F* Function:     void wdt_chain_handler (unsigned long ptr) P*A*Z*
+ *
+P* Parameters:   unsigned long ptr
+P*                - Parameter passed in from the timer invocation,
ignored +P*
+P* Returnvalue:  none
+ *
+Z* Intention:    This is the core functionality of the watchdog.  It is
+Z*               called from the timer wd_timer and handles the
necessary +Z*               processing, including resetting the
hardware watchdog. +Z*               When chains are registered, they
override the +Z*               default behaviour and are processed in
process_mon_chains().
+ *
+D* Design:       Haider / wd@denx.de
+C* Coding:       Haider / wd@denx.de
+V* Verification: wd@denx.de / dzu@denx.de
+
***********************************************************************/
+void wdt_chain_handler(unsigned long ptr) +{
+
+	debugk("%s: timer_count=%d jiffies active chains: %d\n",
+					__func__, timer_count,
mon_chains); +
+	if ((timer_count == 0) && enabled) {	/* try "graceful"
shutdown */
+		pr_info("WDT_CHAIN: Reset system...\n");
+		machine_restart(NULL);
+	}
+
+	if ((timer_count > 0) || (!enabled)) {
+
+		/* execute WDT service sequence */
+		wdt_hwl_reset();
+
+		wd_timer.expires = jiffies + timer_period;
+		add_timer(&wd_timer);	/* ...re-activate timer */
+
+		/*
+		 * process the monitor list
+		 */
+		process_mon_chains();
+
+		/*
+		 * don't timeout if disabled
+		 */
+		if (!enabled)
+			return;
+
+		/*
+		 * don't timeout if new interface is used or if device
+		 * is not opened
+		 */
+		if ((timeout_open_only && (!device_open)) ||
mon_chains)
+			return;
+
+		/* decrement variable for timer-control */
+		if (timer_count > timer_period)
+			timer_count -= timer_period;
+		else
+			timer_count = 0;
+
+		if (timer_count <= 0)
+			pr_info("WDT_CHAIN: watchdog about to
expire\n"); +
+	}
+
+	return;
+}
+
+/***********************************************************************
+F* Function:     static int register_mon_chain(wdt_chain_param_t
*param, +F*                                             int userproc)
P*A*Z*
+ *
+P* Parameters:   wdt_chain_param_t *param
+P*                - The parameters for the chain to be registered.
+P*                  Unused stages should be cleared with 0's.
+P*               int userproc
+P*                - Flag whether we are called from user or kernel
space +P*
+P* Returnvalue:  int
+P*                - 0  success
+P*                  -EINVAL  invalid parameters
+P*                  -ENOMEM  out of memory
+ *
+Z* Intention:    This is the main interface to register a watchdog
chain +Z*               either from userspace throught ioctl() or from
kernel +Z*               space through the wrapper function
+Z*               wdt_mpc8x_register_mon_chain().
+Z*               Re-registering an existing chain is explicitely ok, as
+Z*               a restarted process has to go through this.  This
+Z*               effectively resets the corresponding chain.
+Z*               For a detailed description of the parameters, see the
+Z*               wdt_mpc8xx(4) manpage.
+ *
+D* Design:       dzu@denx.de
+C* Coding:       dzu@denx.de
+V* Verification: wd@denx.de
+
***********************************************************************/
+static int register_mon_chain(wdt_chain_param_t *param, int userproc)
+{
+	monitored_chain_t *entry;
+	int result = 0, i;
+
+	/* Before kmallocing storage we first check the parameters */
+	for (i = 0; (i < 3) && (param->timer_count[i]); i++)
+		if ((param->action[i] < WDT_CHAIN_ACTION_NO) ||
+		    (param->action[i] > WDT_CHAIN_ACTION_RESET))
+			return -EINVAL;
+
+	debugk("%s: registering CHAIN monitor\n", __func__);
+
+	spin_lock(&mon_lock);
+	entry = find_mon_chain_by_chainid(param->chainid);
+	if (entry == NULL) {
+		/* New chain-id so allocate list entry */
+		entry = kmalloc(sizeof(monitored_chain_t), GFP_KERNEL);
+		if (entry == NULL) {
+			result = -ENOMEM;
+			goto out;
+		}
+
+		/* Copy request data to internal format */
+		if (userproc)
+			entry->pid = current->pid;
+		else
+			entry->pid = 0;
+		entry->chainid = param->chainid;
+		entry->signal = param->signal;
+		for (i = 0; i < 2 ; i++) {
+			if (param->action[i] != 0) {
+				entry->timer_count[i] =
param->timer_count[i];
+				entry->action[i] = param->action[i];
+			} else {
+				/* Fill with stop entries */
+				entry->timer_count[i] = 2;
+				entry->action[i] =
WDT_CHAIN_ACTION_RESET;
+			}
+		}
+
+		/* This is a final stop entry */
+		entry->timer_count[3] = 2;
+		entry->action[3] = WDT_CHAIN_RESET;
+
+		/* Initialize internal data */
+		entry->escalation = 0;
+		entry->expires = jiffies + HZ * entry->timer_count[0];
+		insert_mon_chain(entry);
+		mon_chains++;
+	} else {
+		/* Re-registering of active monitor */
+		entry->pid = current->pid;
+		entry->escalation = 0;
+		entry->expires = jiffies + HZ * entry->timer_count[0];
+		entry->escalation = 0;
+		list_del(&entry->list);
+		insert_mon_chain(entry);
+	}
+
+ out:
+	spin_unlock(&mon_lock);
+
+	return result;
+}
+
+/*
+ * The next three functions form the external interface for kernel
modules
+ */
+
+/***********************************************************************
+F* Function:     int wdt_chain_register_mon_chain(wdt_chain_param_t
*param) P*A*Z*
+ *
+P* Parameters:   wdt_chain_param_t *param
+P*
+P* Returnvalue:  int
+P*                - See description of register_mon_chain()
+ *
+Z* Intention:    This is only a wrapper function around
register_mon_chain() +Z*               exported to the kernel name
space.  It only augments the +Z*               parameter with a flag
informing register_mon_chain() that +Z*               it was called
through this interface and not from a +Z*               user space
program.
+ *
+D* Design:       dzu@denx.de
+C* Coding:       dzu@denx.de:
+V* Verification: wd@denx.de
+
***********************************************************************/
+int wdt_chain_register_mon_chain(wdt_chain_param_t *param) +{
+	return register_mon_chain(param, 0);
+}
+EXPORT_SYMBOL(wdt_chain_register_mon_chain);
+
+/***********************************************************************
+F* Function:     int wdt_chain_unregister_mon_chain(unsigned int
chainid) P*A*Z*
+ *
+P* Parameters:   unsigned int chainid
+P*                - The id of the chain to unregister
+P*
+P* Returnvalue:  int
+P*                - 0 The chain was unregistered successfully
+P*                  -EINVAL  The chainid is unknown
+ *
+Z* Intention:    When the watchdog functionality is no longer needed,
+Z*               chains can be unregistered through this call.
+Z*               The function is called through the ioctl() mechanism
+Z*               or directly from other kernel proper, as it is
exported.
+ *
+D* Design:       dzu@denx.de
+C* Coding:       dzu@denx.de
+V* Verification: wd@denx.de
+
***********************************************************************/
+int wdt_chain_unregister_mon_chain(unsigned int chainid) +{
+	monitored_chain_t *entry = find_mon_chain_by_chainid(chainid);
+
+	if (entry == NULL)
+		return -EINVAL;
+
+	debugk("%s: CHAIN unregistering monitor for id %d\n",
+					__func__, entry->chainid);
+
+	spin_lock(&mon_lock);
+
+	list_del(&entry->list);
+	kfree(entry);
+	mon_chains--;
+
+	spin_unlock(&mon_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(wdt_chain_unregister_mon_chain);
+
+/***********************************************************************
+F* Function:     int wdt_chain_reset_mon_chain(int chainid) P*A*Z*
+ *
+P* Parameters:   int chainid
+P*                - The id of the chain to reset
+P*
+P* Returnvalue:  int
+P*                - 0 The chain was reset suiccessfully
+P*                 <0 Errorcondition, which can be
+P*                  -EINVAL The supplied id is unknown.
+ *
+Z* Intention:    This function resets a chain to its initial state.
+Z*               The function is called through the ioctl() mechanism
+Z*               to reset or trigger a chain or directly from other
+Z*               kernel proper, as it is exported.
+ *
+D* Design:       dzu@denx.de
+C* Coding:       dzu@denx.de
+V* Verification: wd@denx.de
+
***********************************************************************/
+int wdt_chain_reset_mon_chain(int chainid) +{
+	monitored_chain_t *entry;
+	int result = 0;
+
+	debugk("%s: CHAIN monitor reset for id %d\n", __func__,
chainid); +
+	spin_lock(&mon_lock);
+
+	entry = find_mon_chain_by_chainid(chainid);
+	if (entry == NULL) {
+		result = -EINVAL;
+		goto out;
+	}
+	entry->escalation = 0;
+	entry->expires = jiffies + HZ * entry->timer_count[0];
+	list_del(&entry->list);
+	insert_mon_chain(entry);
+
+ out:
+	spin_unlock(&mon_lock);
+
+	return result;
+}
+EXPORT_SYMBOL(wdt_chain_reset_mon_chain);
+
+#ifdef MODULE
+/***********************************************************************
+F* Function:     static void free_mon_list(void) P*A*Z*
+ *
+P* Parameters:   none
+P*
+P* Returnvalue:  none
+ *
+Z* Intention:    This function frees the entire kmalloc'ed list of
+Z*               monitored chains in case the module is unloaded.
+ *
+D* Design:       dzu@denx.de
+C* Coding:       dzu@denx.de
+V* Verification: wd@denx.de
+
***********************************************************************/
+static void free_mon_list(void) +{
+	struct list_head *ptr, *n;
+	monitored_chain_t *entry;
+
+	debugk("%s: CHAIN freeing monitor list\n", __func__);
+
+	spin_lock(&mon_lock);
+
+	for (ptr = mon_list.next, n = ptr->next; ptr != &mon_list; ptr
= n) {
+		entry = list_entry(ptr, monitored_chain_t, list);
+		kfree(entry);
+	}
+
+	spin_unlock(&mon_lock);
+}
+#endif
+
+/***********************************************************************
+F* Function:     static int process_mon_chains(void) P*A*Z*
+ *
+P* Parameters:   none
+P*
+P* Returnvalue:  int
+P*                - 0 if the function returns at all
+ *
+Z* Intention:    This is the core function of the chain functionality.
+Z*               The list with the monitored chain is processed and
+Z*               expired entries handled appropriately by stepping up
+Z*               the escalation ladder.  The escalation actions are
+Z*               triggered from here.
+ *
+D* Design:       dzu@denx.de
+C* Coding:       dzu@denx.de
+V* Verification: wd@denx.de
+
***********************************************************************/
+static int process_mon_chains(void) +{
+	struct list_head *ptr;
+	monitored_chain_t *entry;
+	int sig;
+
+	spin_lock(&mon_lock);
+
+	for (ptr = mon_list.next; ptr != &mon_list; ptr = ptr->next) {
+		entry = list_entry(ptr, monitored_chain_t, list);
+		if (entry->expires <= jiffies) {
+			debugk("%s: WDT_CHAIN monitor expired for id
%d\n",
+						__func__,
entry->chainid);
+			switch (entry->action[entry->escalation]) {
+			case WDT_CHAIN_ACTION_SIGNAL:
+				debugk("WDT_CHAIN: sending user signal
\
+					for key %d...\n",
entry->chainid);
+				sig = (entry->signal) ?
entry->signal : SIGTERM;
+				if (entry->pid)
+					kill_proc_info(sig,
+						SEND_SIG_PRIV,
entry->pid);
+				break;
+
+			case WDT_CHAIN_ACTION_KILL:
+				debugk("WDT_CHAIN: sending KILL signal
\
+					for key %d...\n",
entry->chainid);
+				if (entry->pid) {
+					pid_t pid = entry->pid;
+				/* Deregister the monitor chain in
release ! */
+					kill_proc_info(SIGKILL,
+							SEND_SIG_PRIV,
pid);
+				}
+				break;
+
+			case WDT_CHAIN_ACTION_REBOOT:
+				spin_unlock(&mon_lock);
+
wdt_chain_unregister_mon_chain(entry->chainid);
+				pr_info("WDT_CHAIN: Rebooting system \
+						for key %d...\n",
+
entry->chainid);
+				sys_reboot(LINUX_REBOOT_MAGIC1,
+					   LINUX_REBOOT_MAGIC2,
+					   LINUX_REBOOT_CMD_RESTART,
NULL);
+				break;
+
+			case WDT_CHAIN_ACTION_RESET:
+				pr_info("WDT_CHAIN: Resetting system \
+					for key %d...\n",
entry->chainid);
+				machine_restart(NULL);
+				break;
+
+			default:
+				debugk("WDT_CHAIN: undefined action
%d\n",
+
entry->action[entry->escalation]);
+				break;
+			}
+			entry->escalation++;
+			entry->expires = jiffies + HZ *
+				entry->timer_count[entry->escalation];
+			list_del(&entry->list);
+			insert_mon_chain(entry);
+		} else
+			/* The list is sorted, so we can stop here */
+			break;
+	}
+
+	spin_unlock(&mon_lock);
+
+	return 0;
+}
+
+/***********************************************************************
+F* Function:     static monitored_chain_t
*find_mon_chain_by_chainid(unsigned int id) P*A*Z*
+ *
+P* Parameters:   unsigned int id
+P*                - The ID of the chain to find
+P*
+P* Returnvalue:  monitored_chain_t *
+P*                - The entry for the chain with id id, or NULL if not
+P*                  found
+ *
+Z* Intention:    Find an entry in the list of monitored chains by
+Z*               searching for a specified id.
+ *
+D* Design:       dzu@denx.de
+C* Coding:       dzu@denx.de
+V* Verification: wd@denx.de
+
***********************************************************************/
+static monitored_chain_t *find_mon_chain_by_chainid(unsigned int id) +{
+	struct list_head *ptr;
+	monitored_chain_t *entry;
+
+	for (ptr = mon_list.next; ptr != &mon_list; ptr = ptr->next) {
+		entry = list_entry(ptr, monitored_chain_t, list);
+		if (entry->chainid == id)
+			return entry;
+	}
+	return NULL;
+}
+
+/***********************************************************************
+F* Function:     static void insert_mon_chain(monitored_chain_t *new)
P*A*Z*
+ *
+P* Parameters:   monitored_chain_t *new
+P*                - The entry to insert into the list
+P*
+P* Returnvalue:  none
+ *
+Z* Intention:    Insert an entry for a monitor chain at the correct
+Z*               position into the module-global list.  Keeping the
+Z*               list sorted with respect to the expiratoin avoids
+Z*               unneccessary processing.
+ *
+D* Design:       dzu@denx.de
+C* Coding:       dzu@denx.de
+V* Verification: wd@denx.de
+
***********************************************************************/
+static void insert_mon_chain(monitored_chain_t *new) +{
+	struct list_head *ptr;
+	monitored_chain_t *entry;
+
+	for (ptr = mon_list.next; ptr != &mon_list; ptr = ptr->next) {
+		entry = list_entry(ptr, monitored_chain_t, list);
+		if (entry->expires >= new->expires) {
+			list_add(&new->list, ptr);
+			return;
+		}
+	}
+	list_add_tail(&new->list, &mon_list);
+}
diff --git a/include/linux/wdt_chains.h b/include/linux/wdt_chains.h
new file mode 100644
index 0000000..8202a4f
--- /dev/null
+++ b/include/linux/wdt_chains.h
@@ -0,0 +1,102 @@
+/*
+ * (C) Copyright 2000
+ * Jorg Haider, SIEMENS AG
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ *
+ * (C) 2002 Detlev Zundel, dzu@denx.de - Added "watchdog chains"
+ * (C) 2001 Wolfgang Denk, wd@denx.de - Cleanup, Modifications for 2.4
kernels
+ * (C) 2001 Wolfgang Denk, wd@denx.de - Adaption for MAX706TESA
Watchdog
+ * (C) 2001 Steven Hein,  ssh@sgi.com - Added timeout configuration
option
+ */
+
+/*
+ * The purpose of this header file is to provide an interface for the
+ * driver of the watchdog chain timer wdt_chain. In essence this
interface
+ * consists of the three macros WDT_CHAIN_INIT, WDT_CHAIN_SERVICE,
+ * WDT_CHAIN_CLOSE, and its functionality is described as follows:
+ *
+ * WDT_CHAIN_INIT:      opens the driver and initializes the timer to
+ *                      300 seconds;
+ *
+ * WDT_CHAIN_SERVICE:   writes the value defined by the macro
+ *                      WDT_CHAIN_DEF_SERVICE_TIME to the variable,
+ *                      which serves as a timer counter;
+ *
+ * WDT_CHAIN_CLOSE:     closes the watchdog driver;
+ *
+ * Finally there is a macro called WDT_CHAIN_SET_SERVICE_TIME(sec)
+ * for altering the value written to the timer counter to a value,
+ * which is specified by sec.
+ */
+
+
+#ifndef _wdt_chain_h
+#define _wdt_chain_h
+
+typedef struct	wdt_chain_param {
+	unsigned chainid;
+	unsigned long timer_count[3];
+	int action[3];
+	int signal;
+} wdt_chain_param_t;
+
+/* Constants for the action[] fields */
+#define WDT_CHAIN_ACTION_NO	0
+#define WDT_CHAIN_ACTION_SIGNAL	1
+#define WDT_CHAIN_ACTION_KILL	2
+#define WDT_CHAIN_ACTION_REBOOT	3
+#define WDT_CHAIN_ACTION_RESET	4
+
+#define	WDT_CHAIN_IOCTL_BASE	'W'
+
+#define WDT_CHAIN_OPEN_ONLY	_IO(WDT_CHAIN_IOCTL_BASE, 0)
+#define WDT_CHAIN_ALWAYS	_IO(WDT_CHAIN_IOCTL_BASE, 1)
+#define WDT_CHAIN_REGISTER	_IOW(WDT_CHAIN_IOCTL_BASE, 2,
wdt_chain_param_t) +#define WDT_CHAIN_RESET
_IOW(WDT_CHAIN_IOCTL_BASE, 3, int) +#define WDT_CHAIN_UNREGISTER
_IOW(WDT_CHAIN_IOCTL_BASE, 4, int) +
+#ifndef	__KERNEL__
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <linux/ioctl.h>
+
+#define WDT_CHAIN_DEVICE "/dev/watchdog"
+#define WDT_CHAIN_DEF_SERVICE_TIME 300
+
+int wdt_chain_fd;
+int wdt_chain_value = WDT_CHAIN_DEF_SERVICE_TIME;
+
+#define WDT_CHAIN_INIT (wdt_chain_fd = open(WDT_CHAIN_DEVICE, O_RDWR,
0)) +
+#define WDT_CHAIN_SET_SERVICE_TIME(sec) wdt_chain_value = (sec);
+
+#define WDT_CHAIN_SERVICE write(wdt_chain_fd, (char *)
&wdt_chain_value, \
+
sizeof(wdt_chain_value)) +
+#define WDT_CHAIN_CLOSE close(wdt_chain_fd)
+
+#else
+
+extern	int wdt_chain_hwl_start(void);
+extern	int wdt_chain_hwl_stop(void);
+extern	int wdt_hwl_init(void);
+extern	void wdt_hwl_reset(void);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* _wdt_chain_h */

^ permalink raw reply related

* Re: Question about linux boot procedure (head_64.S)
From: Lee HongWoo @ 2009-09-02  0:25 UTC (permalink / raw)
  To: Geoff Levand; +Cc: linuxppc-dev
In-Reply-To: <4A9D5A99.5030101@am.sony.com>

[-- Attachment #1: Type: text/plain, Size: 2834 bytes --]

On Wed, Sep 2, 2009 at 2:32 AM, Geoff Levand <geoffrey.levand@am.sony.com>wrote:

> On 09/01/2009 03:58 AM, Lee HongWoo wrote:
> > __start  (in head_64.S)
> >   ---> __start_initialization_multiplatform (in head_64.S)
> >     ---> __boot_from_prom (in head_64.S)
> >        ---> prom_init ( in prom_init.c)
> >          ---> __start ???
> >
> > And I don't understand where __start is called, because I can find
> __start
> > only in head_64.S.
> > If it calls __start in head_64.S, it's a recursive call.
> >
> > Can anybody explain about this precedure ?
>
> In the general case, __start is the entry point of the kernel.
> It is where the bootloader or boot wrapper program jumps to
> when it transfers control to the kernel.
>
> -Geoff
>
>
Thanks Geoff,

I believe __start is the entry point of the kernel in this case.
And the entry point is __GLOBAL(__start) in the head_64.S.

What I asked is where or what __start is called in the prom_init.c
__start(hdr, KERNELBASE + offset, 0);
Below is the simple function call flow of linux kernel boot procedure.

file : head_64.S

_GLOBAL(__start)
        /* NOP this out unconditionally */
BEGIN_FTR_SECTION
        b       .__start_initialization_multiplatform
END_FTR_SECTION(0, 1)

....

_GLOBAL(__start_initialization_multiplatform)
        /*
         * Are we booted from a PROM Of-type client-interface ?
         */
        cmpldi  cr0,r5,0
        bne     .__boot_from_prom               /* yes -> prom */

....

_STATIC(__boot_from_prom)
        /* Save parameters */
        mr      r31,r3
        mr      r30,r4
        mr      r29,r5
        mr      r28,r6
        mr      r27,r7

        /*
         * Align the stack to 16-byte boundary
         * Depending on the size and layout of the ELF sections in the
initial
         * boot binary, the stack pointer will be unalignet on PowerMac
         */
        rldicr  r1,r1,0,59

        /* Make sure we are running in 64 bits mode */
        bl      .enable_64b_mode

        /* put a relocation offset into r3 */
        bl      .reloc_offset

        LOAD_REG_IMMEDIATE(r2,__toc_start)
        addi    r2,r2,0x4000
        addi    r2,r2,0x4000

        /* Relocate the TOC from a virt addr to a real addr */
        add     r2,r2,r3

        /* Restore parameters */
        mr      r3,r31
        mr      r4,r30
        mr      r5,r29
        mr      r6,r28
        mr      r7,r27

        /* Do all of the interaction with OF client interface */
        bl      .prom_init
        /* We never return */
        trap


file : prom_init.c

unsigned long __init prom_init(unsigned long r3, unsigned long r4,
                               unsigned long pp,
                               unsigned long r6, unsigned long r7)
{
    ...
    ...
        __start(hdr, KERNELBASE + offset, 0);

        return 0;
}


HongWoo.

[-- Attachment #2: Type: text/html, Size: 3540 bytes --]

^ permalink raw reply

* Re: Question about linux boot procedure (head_64.S)
From: Michael Ellerman @ 2009-09-02  0:47 UTC (permalink / raw)
  To: Lee HongWoo; +Cc: Linuxppc-dev
In-Reply-To: <5e2889710909010358v907022cs708dfc0dd3ed7fd0@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1602 bytes --]

On Tue, 2009-09-01 at 19:58 +0900, Lee HongWoo wrote:
> Hi ~ 
> 
> This is a boot flow of linux kernel under the arch/powerpc/kernel and
> I'm using pasemi cpu. 
> 
> __start  (in head_64.S) 
>   ---> __start_initialization_multiplatform (in head_64.S) 
>     ---> __boot_from_prom (in head_64.S) 
>        ---> prom_init ( in prom_init.c) 
>          ---> __start ???
> 
> And I don't understand where __start is called, because I can find
> __start only in head_64.S. 
> If it calls __start in head_64.S, it's a recursive call. 
> 
> Can anybody explain about this precedure ? 

It calls __start() with different arguments. They are checked in
__start_initialization_multiplatform:

308         /*
309          * Are we booted from a PROM Of-type client-interface ?             
310          */
311         cmpldi  cr0,r5,0
312         beq     1f
313         b       .__boot_from_prom               /* yes -> prom */

The first time through __start we are running under OF. The kernel can
detect this based on the arguments it is passed (r5 in particular).

prom_init() deals with talking to OF and flattening the OF device tree.
We then call back into __start but this time r5 is 0:

2555         __start(hdr, kbase, 0);                                            

So the second time through we don't call into prom_init(), instead the
kernel continues using the flattened device tree.

The startup is structured this way so that the kernel can boot either
from OF (in which case we call prom_init()), or directly with a
flattened device tree.

cheers

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply

* Re: [PATCH] [V3] net: add Xilinx emac lite device driver
From: David Miller @ 2009-09-02  0:51 UTC (permalink / raw)
  To: michal.simek
  Cc: sadanan, netdev, linuxppc-dev, jgarzik, john.linn, john.williams
In-Reply-To: <4A9BCDB4.8040406@petalogix.com>

From: Michal Simek <michal.simek@petalogix.com>
Date: Mon, 31 Aug 2009 15:18:44 +0200

> I see that John's patch has wrong file permission
> -rwxr-xr-x 	xilinx_emaclite.c
 ...
> should be 644.
> 
> Please fix it in your repo.

Done, thanks!

^ permalink raw reply

* Re: Question about linux boot procedure (head_64.S)
From: Geoff Levand @ 2009-09-02  1:19 UTC (permalink / raw)
  To: Lee HongWoo; +Cc: linuxppc-dev
In-Reply-To: <5e2889710909011725t275dcdc8oa8be58b6e32e51f4@mail.gmail.com>

On 09/01/2009 05:25 PM, Lee HongWoo wrote:
> On Wed, Sep 2, 2009 at 2:32 AM, Geoff Levand <geoffrey.levand@am.sony.com>wrote:
>> In the general case, __start is the entry point of the kernel.
>> It is where the bootloader or boot wrapper program jumps to
>> when it transfers control to the kernel.
> 
> I believe __start is the entry point of the kernel in this case.
> And the entry point is __GLOBAL(__start) in the head_64.S.
> 
> What I asked is where or what __start is called in the prom_init.c
> __start(hdr, KERNELBASE + offset, 0);

I think Michael answered this.  Just FYI, more info about the powerpc
boot is in the kernel source file:

  Documentation/powerpc/booting-without-of.txt

-Geoff

^ permalink raw reply

* [PATCH] powerpc/book3e: Add missing page sizes
From: Kumar Gala @ 2009-09-02  1:43 UTC (permalink / raw)
  To: linuxppc-dev

Add defines for the other page sizes.  Even if HW doesn't support them
we made them use them for hugetlbfs support.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/pte-book3e.h |   10 ++++++++++
 1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h
index 9800565..b82b9dc 100644
--- a/arch/powerpc/include/asm/pte-book3e.h
+++ b/arch/powerpc/include/asm/pte-book3e.h
@@ -20,9 +20,19 @@
 #define _PAGE_BAP_UX	0x000080
 #define _PAGE_PSIZE_MSK	0x000f00
 #define _PAGE_PSIZE_4K	0x000200
+#define _PAGE_PSIZE_8K	0x000300
+#define _PAGE_PSIZE_16K	0x000400
+#define _PAGE_PSIZE_32K	0x000500
 #define _PAGE_PSIZE_64K	0x000600
+#define _PAGE_PSIZE_128K	0x000700
+#define _PAGE_PSIZE_256K	0x000800
+#define _PAGE_PSIZE_512K	0x000900
 #define _PAGE_PSIZE_1M	0x000a00
+#define _PAGE_PSIZE_2M	0x000b00
+#define _PAGE_PSIZE_4M	0x000c00
+#define _PAGE_PSIZE_8M	0x000d00
 #define _PAGE_PSIZE_16M	0x000e00
+#define _PAGE_PSIZE_32M	0x000f00
 #define _PAGE_DIRTY	0x001000 /* C: page changed */
 #define _PAGE_SW0	0x002000
 #define _PAGE_U3	0x004000
-- 
1.6.0.6

^ permalink raw reply related

* [PATCH] powerpc/fsl-booke: Use HW PTE format if CONFIG_PTE_64BIT
From: Kumar Gala @ 2009-09-02  1:48 UTC (permalink / raw)
  To: linuxppc-dev

Switch to using the Power ISA defined PTE format when we have a 64-bit
PTE.  This makes the code handling between fsl-booke and book3e-64
similiar for TLB faults.

Additionally this lets use take advantage of the page size encodings and
full permissions that the HW PTE defines.

Also defined _PMD_PRESENT, _PMD_PRESENT_MASK, and _PMD_BAD since the
32-bit ppc arch code expects them.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/include/asm/pgtable-ppc32.h |    2 +
 arch/powerpc/include/asm/pte-book3e.h    |    3 ++
 arch/powerpc/include/asm/pte-fsl-booke.h |    7 -----
 arch/powerpc/kernel/head_fsl_booke.S     |   36 ++++++++++++++++++++---------
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index f2c52e2..55646ad 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -111,6 +111,8 @@ extern int icache_44x_need_flush;
 #include <asm/pte-40x.h>
 #elif defined(CONFIG_44x)
 #include <asm/pte-44x.h>
+#elif defined(CONFIG_FSL_BOOKE) && defined(CONFIG_PTE_64BIT)
+#include <asm/pte-book3e.h>
 #elif defined(CONFIG_FSL_BOOKE)
 #include <asm/pte-fsl-booke.h>
 #elif defined(CONFIG_8xx)
diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h
index b82b9dc..082d515 100644
--- a/arch/powerpc/include/asm/pte-book3e.h
+++ b/arch/powerpc/include/asm/pte-book3e.h
@@ -75,6 +75,9 @@
 /* On 32-bit, we never clear the top part of the PTE */
 #ifdef CONFIG_PPC32
 #define _PTE_NONE_MASK	0xffffffff00000000ULL
+#define _PMD_PRESENT	0
+#define _PMD_PRESENT_MASK (PAGE_MASK)
+#define _PMD_BAD	(~PAGE_MASK)
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pte-fsl-booke.h b/arch/powerpc/include/asm/pte-fsl-booke.h
index ce8a9e9..2c12be5 100644
--- a/arch/powerpc/include/asm/pte-fsl-booke.h
+++ b/arch/powerpc/include/asm/pte-fsl-booke.h
@@ -33,13 +33,6 @@
 #define _PAGE_WRITETHRU	0x00400	/* H: W bit */
 #define _PAGE_SPECIAL	0x00800 /* S: Special page */
 
-#ifdef CONFIG_PTE_64BIT
-/* ERPN in a PTE never gets cleared, ignore it */
-#define _PTE_NONE_MASK	0xffffffffffff0000ULL
-/* We extend the size of the PTE flags area when using 64-bit PTEs */
-#define PTE_RPN_SHIFT	(PAGE_SHIFT + 8)
-#endif
-
 #define _PMD_PRESENT	0
 #define _PMD_PRESENT_MASK (PAGE_MASK)
 #define _PMD_BAD	(~PAGE_MASK)
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 2c5af52..975788c 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -575,7 +575,12 @@ interrupt_base:
 	 *       place or can we save a couple of instructions here ?
 	 */
 	mfspr	r12,SPRN_ESR
+#ifdef CONFIG_PTE_64BIT
+	li	r13,_PAGE_PRESENT
+	oris	r13,r13,_PAGE_ACCESSED@h
+#else
 	li	r13,_PAGE_PRESENT|_PAGE_ACCESSED
+#endif
 	rlwimi	r13,r12,11,29,29
 
 	FIND_PTE
@@ -643,7 +648,12 @@ interrupt_base:
 
 4:
 	/* Make up the required permissions */
+#ifdef CONFIG_PTE_64BIT
+	li	r13,_PAGE_PRESENT | _PAGE_EXEC
+	oris	r13,r13,_PAGE_ACCESSED@h
+#else
 	li	r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
+#endif
 
 	FIND_PTE
 	andc.	r13,r13,r11		/* Check permission */
@@ -733,7 +743,7 @@ finish_tlb_load:
 
 	mfspr	r12, SPRN_MAS2
 #ifdef CONFIG_PTE_64BIT
-	rlwimi	r12, r11, 26, 24, 31	/* extract ...WIMGE from pte */
+	rlwimi	r12, r11, 32-19, 27, 31	/* extract WIMGE from pte */
 #else
 	rlwimi	r12, r11, 26, 27, 31	/* extract WIMGE from pte */
 #endif
@@ -742,6 +752,20 @@ finish_tlb_load:
 #endif
 	mtspr	SPRN_MAS2, r12
 
+#ifdef CONFIG_PTE_64BIT
+	rlwinm	r12, r11, 32-2, 26, 31	/* Move in perm bits */
+	andi.	r10, r11, _PAGE_DIRTY
+	bne	1f
+	li	r10, MAS3_SW | MAS3_UW
+	andc	r12, r12, r10
+1:	rlwimi	r12, r13, 20, 0, 11	/* grab RPN[32:43] */
+	rlwimi	r12, r11, 20, 12, 19	/* grab RPN[44:51] */
+	mtspr	SPRN_MAS3, r12
+BEGIN_MMU_FTR_SECTION
+	srwi	r10, r13, 12		/* grab RPN[12:31] */
+	mtspr	SPRN_MAS7, r10
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
+#else
 	li	r10, (_PAGE_EXEC | _PAGE_PRESENT)
 	rlwimi	r10, r11, 31, 29, 29	/* extract _PAGE_DIRTY into SW */
 	and	r12, r11, r10
@@ -749,16 +773,6 @@ finish_tlb_load:
 	slwi	r10, r12, 1
 	or	r10, r10, r12
 	iseleq	r12, r12, r10
-	
-#ifdef CONFIG_PTE_64BIT
-	rlwimi	r12, r13, 24, 0, 7	/* grab RPN[32:39] */
-	rlwimi	r12, r11, 24, 8, 19	/* grab RPN[40:51] */
-	mtspr	SPRN_MAS3, r12
-BEGIN_MMU_FTR_SECTION
-	srwi	r10, r13, 8		/* grab RPN[8:31] */
-	mtspr	SPRN_MAS7, r10
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
-#else
 	rlwimi	r11, r12, 0, 20, 31	/* Extract RPN from PTE and merge with perms */
 	mtspr	SPRN_MAS3, r11
 #endif
-- 
1.6.0.6

^ permalink raw reply related

* Re: [v4 PATCH 1/5]: cpuidle: Cleanup drivers/cpuidle/cpuidle.c
From: Balbir Singh @ 2009-09-01 17:28 UTC (permalink / raw)
  To: Arun R Bharadwaj
  Cc: Peter Zijlstra, Gautham R Shenoy, linux-kernel, Paul Mackerras,
	Ingo Molnar, linuxppc-dev
In-Reply-To: <20090901113840.GH7599@linux.vnet.ibm.com>

* Arun R B <arun@linux.vnet.ibm.com> [2009-09-01 17:08:40]:

> * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:
> 
> Cleanup drivers/cpuidle/cpuidle.c
> 
> Cpuidle maintains a pm_idle_old void pointer because, currently in x86
> there is no clean way of registering and unregistering a idle function.
>
> So remove pm_idle_old and leave the responsibility of maintaining the
> list of registered idle loops to the architecture specific code. If the
> architecture registers cpuidle_idle_call as its idle loop, only then
> this loop is called.
> 

It sounds as if there is a side-effect of this
patch on x86 (am I reading it incorrectly), which can be fixed, but
it will need a patch or so to get back the old behaviour on x86.
 
> Also remove unwanted functions cpuidle_[un]install_idle_handler,
> cpuidle_kick_cpus()
>
> Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
> ---
>  drivers/cpuidle/cpuidle.c  |   51 +++++++++++++++------------------------------
>  drivers/cpuidle/governor.c |    3 --
>  2 files changed, 17 insertions(+), 37 deletions(-)
> 
> Index: linux.trees.git/drivers/cpuidle/cpuidle.c
> ===================================================================
> --- linux.trees.git.orig/drivers/cpuidle/cpuidle.c
> +++ linux.trees.git/drivers/cpuidle/cpuidle.c
> @@ -24,9 +24,14 @@ DEFINE_PER_CPU(struct cpuidle_device *, 
> 
>  DEFINE_MUTEX(cpuidle_lock);
>  LIST_HEAD(cpuidle_detected_devices);
> -static void (*pm_idle_old)(void);
> 
>  static int enabled_devices;
> +static int idle_function_registered;
> +
> +struct idle_function_desc cpuidle_idle_desc = {
> +	.name           =       "cpuidle_loop",
> +	.idle_func      =       cpuidle_idle_call,
> +};
> 
>  #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
>  static void cpuidle_kick_cpus(void)
> @@ -54,13 +59,10 @@ static void cpuidle_idle_call(void)
> 
>  	/* check if the device is ready */
>  	if (!dev || !dev->enabled) {
> -		if (pm_idle_old)
> -			pm_idle_old();
> -		else
>  #if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
> -			default_idle();
> +		default_idle();
>  #else
> -			local_irq_enable();
> +		local_irq_enable();
>  #endif
>  		return;
>  	}
> @@ -94,35 +96,11 @@ static void cpuidle_idle_call(void)
>  }
> 
>  /**
> - * cpuidle_install_idle_handler - installs the cpuidle idle loop handler
> - */
> -void cpuidle_install_idle_handler(void)
> -{
> -	if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
> -		/* Make sure all changes finished before we switch to new idle */
> -		smp_wmb();
> -		pm_idle = cpuidle_idle_call;
> -	}
> -}
> -
> -/**
> - * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler
> - */
> -void cpuidle_uninstall_idle_handler(void)
> -{
> -	if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
> -		pm_idle = pm_idle_old;
> -		cpuidle_kick_cpus();
> -	}
> -}
> -
> -/**
>   * cpuidle_pause_and_lock - temporarily disables CPUIDLE
>   */
>  void cpuidle_pause_and_lock(void)
>  {
>  	mutex_lock(&cpuidle_lock);
> -	cpuidle_uninstall_idle_handler();
>  }
> 
>  EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
> @@ -132,7 +110,6 @@ EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock
>   */
>  void cpuidle_resume_and_unlock(void)
>  {
> -	cpuidle_install_idle_handler();
>  	mutex_unlock(&cpuidle_lock);
>  }
> 

What does this mean for users of cpuidle_pause_and_lock/unlock?
Should we be calling register/unregister_idle_function here?


> @@ -287,6 +264,12 @@ static int __cpuidle_register_device(str
>  	return 0;
>  }
> 
> +static void register_cpuidle_idle_function(void)
> +{
> +	register_idle_function(&cpuidle_idle_desc);
> +
> +	idle_function_registered = 1;

Use booleans if possible, unless you intend to extend the meaning of
registered someday.

> +}
>  /**
>   * cpuidle_register_device - registers a CPU's idle PM feature
>   * @dev: the cpu
> @@ -303,7 +286,9 @@ int cpuidle_register_device(struct cpuid
>  	}
> 
>  	cpuidle_enable_device(dev);
> -	cpuidle_install_idle_handler();
> +
> +	if (!idle_function_registered)
> +		register_cpuidle_idle_function();
> 
>  	mutex_unlock(&cpuidle_lock);
> 
> @@ -382,8 +367,6 @@ static int __init cpuidle_init(void)
>  {
>  	int ret;
> 
> -	pm_idle_old = pm_idle;
> -
>  	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
>  	if (ret)
>  		return ret;
> Index: linux.trees.git/drivers/cpuidle/governor.c
> ===================================================================
> --- linux.trees.git.orig/drivers/cpuidle/governor.c
> +++ linux.trees.git/drivers/cpuidle/governor.c
> @@ -48,8 +48,6 @@ int cpuidle_switch_governor(struct cpuid
>  	if (gov == cpuidle_curr_governor)
>  		return 0;
> 
> -	cpuidle_uninstall_idle_handler();
> -
>  	if (cpuidle_curr_governor) {
>  		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
>  			cpuidle_disable_device(dev);
> @@ -63,7 +61,6 @@ int cpuidle_switch_governor(struct cpuid
>  			return -EINVAL;
>  		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
>  			cpuidle_enable_device(dev);
> -		cpuidle_install_idle_handler();
>  		printk(KERN_INFO "cpuidle: using governor %s\n", gov->name);
>  	}
> 

-- 
	Balbir

^ permalink raw reply

* Re: [PATCH v2 1/2] cpu: Offline state Framework.
From: Andrew Morton @ 2009-09-02  4:49 UTC (permalink / raw)
  To: Gautham R Shenoy
  Cc: Peter Zijlstra, Venkatesh Pallipadi, linux-kernel, linuxppc-dev,
	Darrick J. Wong
In-Reply-To: <20090828100016.10641.62621.stgit@sofia.in.ibm.com>

On Fri, 28 Aug 2009 15:30:16 +0530 Gautham R Shenoy <ego@in.ibm.com> wrote:

> Provide an interface by which the system administrator can decide what state
> should the CPU go to when it is offlined.
> 
> To query the hotplug states, on needs to perform a read on the sysfs tunable:
> 	/sys/devices/system/cpu/cpu<number>/available_hotplug_states
> 
> To query or set the current state for a particular CPU, one needs to
> use the sysfs interface:
> 	/sys/devices/system/cpu/cpu<number>/current_state
> 
> This patch implements the architecture independent bits of the
> cpu-offline-state framework.
> 
> Architectures which want to expose the multiple offline-states to the
> userspace are expected to write a driver which can register
> with this framework.
> 
> Such a driver should:
> - Implement the callbacks defined in the structure struct cpu_offline_driver
>   which can be called into by this framework when the corresponding
>   sysfs interfaces are read or written into.
> 
> - Ensure that the following operation puts the CPU in the same state
>   as it did in the absence of the driver.
> 	echo 0 > /sys/devices/system/cpu/cpu<number>/online
> 
> This framework also serializes the writes to the "current_state"
> with respect to with the writes to the "online" sysfs tunable.
> 

It would be nice to document this new userspace interface somewhere.


> +struct cpu_offline_driver *cpu_offline_driver;
> +static DEFINE_MUTEX(cpu_offline_driver_lock);
> +
> +ssize_t show_available_states(struct sys_device *dev,
> +			struct sysdev_attribute *attr, char *buf)
> +{
> +	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
> +	int cpu_num = cpu->sysdev.id;
> +	ssize_t ret;
> +
> +	mutex_lock(&cpu_offline_driver_lock);
> +	if (!cpu_offline_driver) {
> +		ret = -EEXIST;
> +		goto out_unlock;
> +	}
> +
> +	ret = cpu_offline_driver->read_available_states(cpu_num, buf);
> +
> +out_unlock:
> +	mutex_unlock(&cpu_offline_driver_lock);
> +
> +	return ret;
> +
> +}

The patch adds boatloads of global symbols which do not have names
which are appropriate for global symbols.

> +ssize_t show_current_state(struct sys_device *dev,
> +			struct sysdev_attribute *attr, char *buf)

Like that.

> +ssize_t store_current_state(struct sys_device *dev,
> +			struct sysdev_attribute *attr,
> +			const char *buf, size_t count)

And that.

> +
> +static SYSDEV_ATTR(available_hotplug_states, 0444, show_available_states,
> +								NULL);
> +static SYSDEV_ATTR(current_state, 0644, show_current_state,
> +						store_current_state);
> +
> +/* Should be called with cpu_offline_driver_lock held */
> +void cpu_offline_driver_add_cpu(struct sys_device *cpu_sys_dev)
> +{
> +	if (!cpu_offline_driver || !cpu_sys_dev)
> +		return;
> +
> +	sysdev_create_file(cpu_sys_dev, &attr_available_hotplug_states);
> +	sysdev_create_file(cpu_sys_dev, &attr_current_state);
> +}
> +
> +/* Should be called with cpu_offline_driver_lock held */
> +void cpu_offline_driver_remove_cpu(struct sys_device *cpu_sys_dev)
> +{
> +	if (!cpu_offline_driver || !cpu_sys_dev)
> +		return;
> +
> +	sysdev_remove_file(cpu_sys_dev, &attr_available_hotplug_states);
> +	sysdev_remove_file(cpu_sys_dev, &attr_current_state);
> +
> +}

Please don't just ignore possible error returns.

> +int register_cpu_offline_driver(struct cpu_offline_driver *arch_cpu_driver)
> +{
> +	int ret = 0;
> +	int cpu;
> +	mutex_lock(&cpu_offline_driver_lock);
> +

The blank line goes after end-of-locals and before start-of-code.

> +	if (cpu_offline_driver != NULL) {
> +		ret = -EEXIST;
> +		goto out_unlock;
> +	}
> +
> +	if (!(arch_cpu_driver->read_available_states &&
> +	      arch_cpu_driver->read_current_state &&
> +	      arch_cpu_driver->write_current_state)) {
> +		ret = -EINVAL;
> +		goto out_unlock;

This seems pretty pointless.  Just let the code oops - the developer
will notice fairly quickly.

> +	}
> +
> +	cpu_offline_driver = arch_cpu_driver;
> +
> +	for_each_possible_cpu(cpu)
> +		cpu_offline_driver_add_cpu(get_cpu_sysdev(cpu));
> +
> +out_unlock:
> +	mutex_unlock(&cpu_offline_driver_lock);
> +	return ret;
> +}
> +
> +void unregister_cpu_offline_driver(struct cpu_offline_driver *arch_cpu_driver)
> +{
> +	int cpu;
> +	mutex_lock(&cpu_offline_driver_lock);
> +
> +	if (!cpu_offline_driver) {
> +		WARN_ON(1);

	if (WARN_ON(!cpu_offline_driver)) {

> +		mutex_unlock(&cpu_offline_driver_lock);
> +		return;
> +	}
> +
> +	for_each_possible_cpu(cpu)
> +		cpu_offline_driver_remove_cpu(get_cpu_sysdev(cpu));
> +
> +	cpu_offline_driver = NULL;
> +	mutex_unlock(&cpu_offline_driver_lock);
> +}
> +
> +

^ permalink raw reply

* Re: [v4 PATCH 1/5]: cpuidle: Cleanup drivers/cpuidle/cpuidle.c
From: Arun R Bharadwaj @ 2009-09-02  5:21 UTC (permalink / raw)
  To: Balbir Singh
  Cc: Peter Zijlstra, Gautham R Shenoy, linux-kernel, Paul Mackerras,
	Arun Bharadwaj, Ingo Molnar, linuxppc-dev
In-Reply-To: <20090901172825.GA6780@balbir.in.ibm.com>

* Balbir Singh <balbir@linux.vnet.ibm.com> [2009-09-01 22:58:25]:

> * Arun R B <arun@linux.vnet.ibm.com> [2009-09-01 17:08:40]:
> 
> > * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:
> > 
> > Cleanup drivers/cpuidle/cpuidle.c
> > 
> > Cpuidle maintains a pm_idle_old void pointer because, currently in x86
> > there is no clean way of registering and unregistering a idle function.
> >
> > So remove pm_idle_old and leave the responsibility of maintaining the
> > list of registered idle loops to the architecture specific code. If the
> > architecture registers cpuidle_idle_call as its idle loop, only then
> > this loop is called.
> > 
> 
> It sounds as if there is a side-effect of this
> patch on x86 (am I reading it incorrectly), which can be fixed, but
> it will need a patch or so to get back the old behaviour on x86.
> 

Hi Balbir,

Yes, your understanding is correct. Currently, x86 exports pm_idle and
this pm_idle is set to cpuidle_idle_call inside cpuidle.c

So instead of that x86 should just export a function called
set_arch_idle() which will be called from within
register_idle_function() and set pm_idle to the idle handler which is
currently being registered.

I have implemented this for pseries, and in the process of doing it
for x86 too.

> > Also remove unwanted functions cpuidle_[un]install_idle_handler,
> > cpuidle_kick_cpus()
> >
> > Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
> > ---
> >  drivers/cpuidle/cpuidle.c  |   51 +++++++++++++++------------------------------
> >  drivers/cpuidle/governor.c |    3 --
> >  2 files changed, 17 insertions(+), 37 deletions(-)
> > 
> > Index: linux.trees.git/drivers/cpuidle/cpuidle.c
> > ===================================================================
> > --- linux.trees.git.orig/drivers/cpuidle/cpuidle.c
> > +++ linux.trees.git/drivers/cpuidle/cpuidle.c
> > @@ -24,9 +24,14 @@ DEFINE_PER_CPU(struct cpuidle_device *, 
> > 
> >  DEFINE_MUTEX(cpuidle_lock);
> >  LIST_HEAD(cpuidle_detected_devices);
> > -static void (*pm_idle_old)(void);
> > 
> >  static int enabled_devices;
> > +static int idle_function_registered;
> > +
> > +struct idle_function_desc cpuidle_idle_desc = {
> > +	.name           =       "cpuidle_loop",
> > +	.idle_func      =       cpuidle_idle_call,
> > +};
> > 
> >  #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
> >  static void cpuidle_kick_cpus(void)
> > @@ -54,13 +59,10 @@ static void cpuidle_idle_call(void)
> > 
> >  	/* check if the device is ready */
> >  	if (!dev || !dev->enabled) {
> > -		if (pm_idle_old)
> > -			pm_idle_old();
> > -		else
> >  #if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
> > -			default_idle();
> > +		default_idle();
> >  #else
> > -			local_irq_enable();
> > +		local_irq_enable();
> >  #endif
> >  		return;
> >  	}
> > @@ -94,35 +96,11 @@ static void cpuidle_idle_call(void)
> >  }
> > 
> >  /**
> > - * cpuidle_install_idle_handler - installs the cpuidle idle loop handler
> > - */
> > -void cpuidle_install_idle_handler(void)
> > -{
> > -	if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
> > -		/* Make sure all changes finished before we switch to new idle */
> > -		smp_wmb();
> > -		pm_idle = cpuidle_idle_call;
> > -	}
> > -}
> > -
> > -/**
> > - * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler
> > - */
> > -void cpuidle_uninstall_idle_handler(void)
> > -{
> > -	if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
> > -		pm_idle = pm_idle_old;
> > -		cpuidle_kick_cpus();
> > -	}
> > -}
> > -
> > -/**
> >   * cpuidle_pause_and_lock - temporarily disables CPUIDLE
> >   */
> >  void cpuidle_pause_and_lock(void)
> >  {
> >  	mutex_lock(&cpuidle_lock);
> > -	cpuidle_uninstall_idle_handler();
> >  }
> > 
> >  EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
> > @@ -132,7 +110,6 @@ EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock
> >   */
> >  void cpuidle_resume_and_unlock(void)
> >  {
> > -	cpuidle_install_idle_handler();
> >  	mutex_unlock(&cpuidle_lock);
> >  }
> > 
> 
> What does this mean for users of cpuidle_pause_and_lock/unlock?
> Should we be calling register/unregister_idle_function here?
>

Yes, you are right. I have missed out on this part.
register/unregister_idle_function should replace
install/uninstall_idle_handler at those places. Thanks.

> 
> > @@ -287,6 +264,12 @@ static int __cpuidle_register_device(str
> >  	return 0;
> >  }
> > 
> > +static void register_cpuidle_idle_function(void)
> > +{
> > +	register_idle_function(&cpuidle_idle_desc);
> > +
> > +	idle_function_registered = 1;
> 
> Use booleans if possible, unless you intend to extend the meaning of
> registered someday.
>

I don't intend to extend the meaning of idle_function_registered.
Will use boolean here.

> > +}
> >  /**
> >   * cpuidle_register_device - registers a CPU's idle PM feature
> >   * @dev: the cpu
> > @@ -303,7 +286,9 @@ int cpuidle_register_device(struct cpuid
> >  	}
> > 
> >  	cpuidle_enable_device(dev);
> > -	cpuidle_install_idle_handler();
> > +
> > +	if (!idle_function_registered)
> > +		register_cpuidle_idle_function();
> > 
> >  	mutex_unlock(&cpuidle_lock);
> > 
> > @@ -382,8 +367,6 @@ static int __init cpuidle_init(void)
> >  {
> >  	int ret;
> > 
> > -	pm_idle_old = pm_idle;
> > -
> >  	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
> >  	if (ret)
> >  		return ret;
> > Index: linux.trees.git/drivers/cpuidle/governor.c
> > ===================================================================
> > --- linux.trees.git.orig/drivers/cpuidle/governor.c
> > +++ linux.trees.git/drivers/cpuidle/governor.c
> > @@ -48,8 +48,6 @@ int cpuidle_switch_governor(struct cpuid
> >  	if (gov == cpuidle_curr_governor)
> >  		return 0;
> > 
> > -	cpuidle_uninstall_idle_handler();
> > -
> >  	if (cpuidle_curr_governor) {
> >  		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
> >  			cpuidle_disable_device(dev);
> > @@ -63,7 +61,6 @@ int cpuidle_switch_governor(struct cpuid
> >  			return -EINVAL;
> >  		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
> >  			cpuidle_enable_device(dev);
> > -		cpuidle_install_idle_handler();
> >  		printk(KERN_INFO "cpuidle: using governor %s\n", gov->name);
> >  	}
> > 
> 
> -- 
> 	Balbir

Thanks for the review!
--arun

^ permalink raw reply

* Re: [PATCH] Fix fake numa on ppc
From: Ankita Garg @ 2009-09-02  5:36 UTC (permalink / raw)
  To: Balbir Singh; +Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20090901142729.GA5022@balbir.in.ibm.com>

Hi Balbir,

On Tue, Sep 01, 2009 at 07:57:29PM +0530, Balbir Singh wrote:
> * Ankita Garg <ankita@in.ibm.com> [2009-09-01 14:54:07]:
> 
> > Hi Balbir,
> > 
> > On Tue, Sep 01, 2009 at 11:27:53AM +0530, Balbir Singh wrote:
> > > * Ankita Garg <ankita@in.ibm.com> [2009-09-01 10:33:16]:
> > > 
> > > > Hello,
> > > > 
> > > > Below is a patch to fix a couple of issues with fake numa node creation
> > > > on ppc:
> > > > 
> > > > 1) Presently, fake nodes could be created such that real numa node
> > > > boundaries are not respected. So a node could have lmbs that belong to
> > > > different real nodes.
> > > > 
> > > > 2) The cpu association is broken. On a JS22 blade for example, which is
> > > > a 2-node numa machine, I get the following:
> > > > 
> > > > # cat /proc/cmdline
> > > > root=/dev/sda6  numa=fake=2G,4G,,6G,8G,10G,12G,14G,16G
> > > > # cat /sys/devices/system/node/node0/cpulist
> > > > 0-3
> > > > # cat /sys/devices/system/node/node1/cpulist
> > > > 4-7
> > > > # cat /sys/devices/system/node/node4/cpulist
> > > > 
> > > > #
> > > > 
> > > > So, though the cpus 4-7 should have been associated with node4, they
> > > > still belong to node1. The patch works by recording a real numa node
> > > > boundary and incrementing the fake node count. At the same time, a
> > > > mapping is stored from the real numa node to the first fake node that
> > > > gets created on it.
> > > >
> > > 
> > > Some details on how you tested it and results before and after would
> > > be nice. Please see git commit 1daa6d08d1257aa61f376c3cc4795660877fb9e3
> > > for example
> > > 
> > >
> > 
> > Thanks for the quick review of the patch. Here is some information on
> > the testing:
> > 
> > Tested the patch with the following commandlines:
> > numa=fake=2G,4G,6G,8G,10G,12G,14G,16G
> > numa=fake=3G,6G,10G,16G
> > numa=fake=4G
> > numa=fake=
> > 
> > For testing if the fake nodes respect the real node boundaries, I added
> > some debug printks in the node creation path. Without the patch, for the
> > commandline numa=fake=2G,4G,6G,8G,10G,12G,14G,16G, this is what I got:
> > 
> > fake id: 1 nid: 0
> > fake id: 1 nid: 0
> > ...
> > fake id: 2 nid: 0
> > fake id: 2 nid: 0
> > ...
> > fake id: 2 nid: 0
> > created new fake_node with id 3
> > fake id: 3 nid: 0
> > fake id: 3 nid: 0
> > ...
> > fake id: 3 nid: 0
> > fake id: 3 nid: 0
> > fake id: 3 nid: 1
> > fake id: 3 nid: 1
> > ...
> > created new fake_node with id 4
> > fake id: 4 nid: 1
> > fake id: 4 nid: 1
> > ...
> > 
> > and so on. So, fake node 3 encompasses real node 0 & 1. Also,
> > 
> > # cat /sys/devices/system/node/node3/meminfo
> > Node 0 MemTotal:        2097152 kB
> > ...
> > # # cat /sys/devices/system/node/node4/meminfo
> > Node 0 MemTotal:        2097152 kB
> > ...
> > 
> > 
> > With the patch, I get:
> > 
> > fake id: 1 nid: 0
> > fake id: 1 nid: 0
> > ...
> > fake id: 2 nid: 0
> > fake id: 2 nid: 0
> > ...
> > fake id: 2 nid: 0
> > created new fake_node with id 3
> > fake id: 3 nid: 0
> > fake id: 3 nid: 0
> > ...
> > fake id: 3 nid: 0
> > fake id: 3 nid: 0
> > created new fake_node with id 4
> > fake id: 4 nid: 1
> > fake id: 4 nid: 1
> > ...
> > 
> > and so on. With the patch, the fake node sizes are slightly different
> > from that specified by the user.
> > 
> > # cat /sys/devices/system/node/node3/meminfo
> > Node 3 MemTotal:        1638400 kB
> > ...
> > # cat /sys/devices/system/node/node4/meminfo
> > Node 4 MemTotal:         458752 kB
> > ...
> > 
> > CPU association was tested as mentioned in the previous mail:
> > 
> > Without the patch,
> > 
> > # cat /proc/cmdline
> > root=/dev/sda6  numa=fake=2G,4G,,6G,8G,10G,12G,14G,16G
> > # cat /sys/devices/system/node/node0/cpulist
> > 0-3
> > # cat /sys/devices/system/node/node1/cpulist
> > 4-7
> > # cat /sys/devices/system/node/node4/cpulist
> > 
> > #
> > 
> > With the patch,
> > 
> > # cat /proc/cmdline
> > root=/dev/sda6  numa=fake=2G,4G,,6G,8G,10G,12G,14G,16G
> > # cat /sys/devices/system/node/node0/cpulist
> > 0-3
> > # cat /sys/devices/system/node/node1/cpulist
> > 
> 
> Oh! interesting.. cpuless nodes :) I think we need to fix this in the
> longer run and distribute cpus between fake numa nodes of a real node
> using some acceptable heuristic.
>

True. Presently this is broken on both x86 and ppc systems. It would be
interesting to find a way to map, for example, 4 cpus to >4 number of
fake nodes created from a single real numa node!
 
> > # cat /sys/devices/system/node/node4/cpulist
> > 4-7
> > 
> > > > 
> > > > Signed-off-by: Ankita Garg <ankita@in.ibm.com>
> > > > 
> > > > Index: linux-2.6.31-rc5/arch/powerpc/mm/numa.c
> > > > ===================================================================
> > > > --- linux-2.6.31-rc5.orig/arch/powerpc/mm/numa.c
> > > > +++ linux-2.6.31-rc5/arch/powerpc/mm/numa.c
> > > > @@ -26,6 +26,11 @@
> > > >  #include <asm/smp.h>
> > > > 
> > > >  static int numa_enabled = 1;
> > > > +static int fake_enabled = 1;
> > > > +
> > > > +/* The array maps a real numa node to the first fake node that gets
> > > > +created on it */
> > > 
> > > Coding style is broken
> > > 
> > 
> > Fixed.
> > 
> > > > +int fake_numa_node_mapping[MAX_NUMNODES];
> > > > 
> > > >  static char *cmdline __initdata;
> > > > 
> > > > @@ -49,14 +54,24 @@ static int __cpuinit fake_numa_create_ne
> > > >  	unsigned long long mem;
> > > >  	char *p = cmdline;
> > > >  	static unsigned int fake_nid;
> > > > +	static unsigned int orig_nid = 0;
> > > 
> > > Should we call this prev_nid?
> > > 
> > 
> > Yes, makes sense.
> > > >  	static unsigned long long curr_boundary;
> > > > 
> > > >  	/*
> > > >  	 * Modify node id, iff we started creating NUMA nodes
> > > >  	 * We want to continue from where we left of the last time
> > > >  	 */
> > > > -	if (fake_nid)
> > > > +	if (fake_nid) {
> > > > +		if (orig_nid != *nid) {
> > > 
> > > OK, so this is called when the real NUMA node changes - comments would
> > > be nice
> > >
> > 
> > Thanks, have added the comment.
> > 
> > > > +			fake_nid++;
> > > > +			fake_numa_node_mapping[*nid] = fake_nid;
> > > > +			orig_nid = *nid;
> > > > +			*nid = fake_nid;
> > > > +			return 0;
> > > > +		}
> > > >  		*nid = fake_nid;
> > > > +	}
> > > > +
> > > >  	/*
> > > >  	 * In case there are no more arguments to parse, the
> > > >  	 * node_id should be the same as the last fake node id
> > > > @@ -440,7 +455,7 @@ static int of_drconf_to_nid_single(struc
> > > >   */
> > > >  static int __cpuinit numa_setup_cpu(unsigned long lcpu)
> > > >  {
> > > > -	int nid = 0;
> > > > +	int nid = 0, new_nid;
> > > >  	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
> > > > 
> > > >  	if (!cpu) {
> > > > @@ -450,8 +465,15 @@ static int __cpuinit numa_setup_cpu(unsi
> > > > 
> > > >  	nid = of_node_to_nid_single(cpu);
> > > > 
> > > > +	if (fake_enabled && nid) {
> > > > +		new_nid = fake_numa_node_mapping[nid];
> > > > +		if (new_nid > 0)
> > > > +			nid = new_nid;
> > > > +	}
> > > > +
> > > >  	if (nid < 0 || !node_online(nid))
> > > >  		nid = any_online_node(NODE_MASK_ALL);
> > > > +
> > > >  out:
> > > >  	map_cpu_to_node(lcpu, nid);
> > > > 
> > > > @@ -1005,8 +1027,11 @@ static int __init early_numa(char *p)
> > > >  		numa_debug = 1;
> > > > 
> > > >  	p = strstr(p, "fake=");
> > > > -	if (p)
> > > > +	if (p) {
> > > >  		cmdline = p + strlen("fake=");
> > > > +		if (numa_enabled)
> > > > +			fake_enabled = 1;
> > > 
> > > Have you tried passing just numa=fake= without any commandline?
> > > That should enable fake_enabled, but I wonder if that negatively
> > > impacts numa_setup_cpu(). I wonder if you should look at cmdline
> > > to decide on fake_enabled.
> > >
> > 
> > fake_enabled does get set even for numa=fake=. However, it does not
> > impact numa_setup_cpu, since fake_numa_node_mapping array would have no
> > mapping stored and there is a condition there already to check for the
> > value of the mapping. I confirmed this by booting with the above
> > parameter as well.
> > 
> > > > +	}
> > > > 
> > > >  	return 0;
> > > >  }
> > > >
> > > 
> > > Overall, I think this is the right thing to do, we need to move in
> > > this direction. 
> > > 
> > 
> > Heres the updated patch:
> > 
> > Signed-off-by: Ankita Garg <ankita@in.ibm.com> 
> > 
> > Index: linux-2.6.31-rc5/arch/powerpc/mm/numa.c
> > ===================================================================
> > --- linux-2.6.31-rc5.orig/arch/powerpc/mm/numa.c
> > +++ linux-2.6.31-rc5/arch/powerpc/mm/numa.c
> > @@ -26,6 +26,13 @@
> >  #include <asm/smp.h>
> > 
> >  static int numa_enabled = 1;
> > +static int fake_enabled = 1;
> > +
> > +/*
> > + * The array maps a real numa node to the first fake node that gets
> > + * created on it
> > + */
> > +int fake_numa_node_mapping[MAX_NUMNODES];
> > 
> >  static char *cmdline __initdata;
> > 
> > @@ -49,14 +56,29 @@ static int __cpuinit fake_numa_create_ne
> >  	unsigned long long mem;
> >  	char *p = cmdline;
> >  	static unsigned int fake_nid;
> > +	static unsigned int prev_nid = 0;
> >  	static unsigned long long curr_boundary;
> > 
> >  	/*
> >  	 * Modify node id, iff we started creating NUMA nodes
> >  	 * We want to continue from where we left of the last time
> >  	 */
> > -	if (fake_nid)
> > +	if (fake_nid) {
> > +		/*
> > +		 * Moved over to the next real numa node, increment fake
> > +		 * node number and store the mapping of the real node to
> > +		 * the fake node
> > +		 */
> > +		if (prev_nid != *nid) {
> > +			fake_nid++;
> > +			fake_numa_node_mapping[*nid] = fake_nid;
> > +			prev_nid = *nid;
> > +			*nid = fake_nid;
> > +			return 0;
> > +		}
> >  		*nid = fake_nid;
> > +	}
> > +
> >  	/*
> >  	 * In case there are no more arguments to parse, the
> >  	 * node_id should be the same as the last fake node id
> > @@ -440,7 +462,7 @@ static int of_drconf_to_nid_single(struc
> >   */
> >  static int __cpuinit numa_setup_cpu(unsigned long lcpu)
> >  {
> > -	int nid = 0;
> > +	int nid = 0, new_nid;
> >  	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
> > 
> >  	if (!cpu) {
> > @@ -450,8 +472,15 @@ static int __cpuinit numa_setup_cpu(unsi
> > 
> >  	nid = of_node_to_nid_single(cpu);
> > 
> > +	if (fake_enabled && nid) {
> > +		new_nid = fake_numa_node_mapping[nid];
> > +		if (new_nid > 0)
> > +			nid = new_nid;
> > +	}
> > +
> >  	if (nid < 0 || !node_online(nid))
> >  		nid = any_online_node(NODE_MASK_ALL);
> > +
> >  out:
> >  	map_cpu_to_node(lcpu, nid);
> > 
> > @@ -1005,8 +1034,12 @@ static int __init early_numa(char *p)
> >  		numa_debug = 1;
> > 
> >  	p = strstr(p, "fake=");
> > -	if (p)
> > +	if (p) {
> >  		cmdline = p + strlen("fake=");
> > +		if (numa_enabled) {
> > +			fake_enabled = 1;
> > +		}
> > +	}
> > 
> >  	return 0;
> >  }
> >
> 
> 
> Looks good to me
> 
> 
> Reviewed-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> 
> 
> 
> -- 
> 	Balbir

-- 
Regards,
Ankita Garg (ankita@in.ibm.com)
Linux Technology Center
IBM India Systems & Technology Labs, 
Bangalore, India   

^ permalink raw reply

* Re: [PATCH v2 0/2] cpu: pseries: Offline state framework.
From: Peter Zijlstra @ 2009-09-02  5:33 UTC (permalink / raw)
  To: Gautham R Shenoy
  Cc: linux-kernel, Venkatesh Pallipadi, linuxppc-dev, Darrick J. Wong
In-Reply-To: <20090828095741.10641.32053.stgit@sofia.in.ibm.com>

On Fri, 2009-08-28 at 15:30 +0530, Gautham R Shenoy wrote:
> Hi,
> 
> This is the version 2 of the patch series to provide a cpu-offline framework
> that enables the administrators choose the state the offline CPU must be put
> into when multiple such states are exposed by the underlying architecture.
> 
> Version 1 of the Patch can be found here:
> http://lkml.org/lkml/2009/8/6/236
> 
> The patch-series exposes the following sysfs tunables to
> allow the system-adminstrator to choose the state of a CPU:
> 
> To query the available hotplug states, one needs to read the sysfs tunable:
> 	/sys/devices/system/cpu/cpu<number>/available_hotplug_states
> To query or set the current state, on needs to read/write the sysfs tunable:
> 	/sys/devices/system/cpu/cpu<number>/current_states
> 
> The patchset ensures that the writes to the "current_state" sysfs file are
> serialized against the writes to the "online" file.
> 
> This patchset also contains the offline state driver implemented for
> pSeries. For pSeries, we define three available_hotplug_states. They are:
> 
> 	online: The processor is online.
> 
> 	deallocate: This is the the default behaviour when the cpu is offlined
> 	even in the absense of this driver. The CPU would call make an
> 	rtas_stop_self() call and hand over the CPU back to the resource pool,
> 	thereby effectively deallocating that vCPU from the LPAR.
> 	NOTE: This would result in a configuration change to the LPAR
> 	which is visible to the outside world.
> 
> 	deactivate: This cedes the vCPU to the hypervisor which
> 	in turn can put the vCPU time to the best use.
> 	NOTE: This option DOES NOT result in a configuration change
> 	and the vCPU would be still entitled to the LPAR to which it earlier
> 	belong to.
> 
> Awaiting your feedback.

I'm still thinking this is a bad idea.

The OS should only know about online/offline.

Use the hypervisor interface to deal with the cpu once its offline.

That is, I think this interface you propose is a layering violation.

^ permalink raw reply

* Re: [v4 PATCH 1/5]: cpuidle: Cleanup drivers/cpuidle/cpuidle.c
From: Arun R Bharadwaj @ 2009-09-02  5:45 UTC (permalink / raw)
  To: Balbir Singh
  Cc: Peter Zijlstra, Gautham R Shenoy, aun, linux-kernel,
	Paul Mackerras, Ingo Molnar, linuxppc-dev
In-Reply-To: <20090901172825.GA6780@balbir.in.ibm.com>

* Balbir Singh <balbir@linux.vnet.ibm.com> [2009-09-01 22:58:25]:

> * Arun R B <arun@linux.vnet.ibm.com> [2009-09-01 17:08:40]:
> 
> > * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:
> > 
> > Cleanup drivers/cpuidle/cpuidle.c
> > 
> > Cpuidle maintains a pm_idle_old void pointer because, currently in x86
> > there is no clean way of registering and unregistering a idle function.
> >
> > So remove pm_idle_old and leave the responsibility of maintaining the
> > list of registered idle loops to the architecture specific code. If the
> > architecture registers cpuidle_idle_call as its idle loop, only then
> > this loop is called.
> > 
> 
> It sounds as if there is a side-effect of this
> patch on x86 (am I reading it incorrectly), which can be fixed, but
> it will need a patch or so to get back the old behaviour on x86.
> 
> > Also remove unwanted functions cpuidle_[un]install_idle_handler,
> > cpuidle_kick_cpus()
> >
> > Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
> > ---
> >  drivers/cpuidle/cpuidle.c  |   51 +++++++++++++++------------------------------
> >  drivers/cpuidle/governor.c |    3 --
> >  2 files changed, 17 insertions(+), 37 deletions(-)
> > 
> > Index: linux.trees.git/drivers/cpuidle/cpuidle.c
> > ===================================================================
> > --- linux.trees.git.orig/drivers/cpuidle/cpuidle.c
> > +++ linux.trees.git/drivers/cpuidle/cpuidle.c
> > @@ -24,9 +24,14 @@ DEFINE_PER_CPU(struct cpuidle_device *, 
> > 
> >  DEFINE_MUTEX(cpuidle_lock);
> >  LIST_HEAD(cpuidle_detected_devices);
> > -static void (*pm_idle_old)(void);
> > 
> >  static int enabled_devices;
> > +static int idle_function_registered;
> > +
> > +struct idle_function_desc cpuidle_idle_desc = {
> > +	.name           =       "cpuidle_loop",
> > +	.idle_func      =       cpuidle_idle_call,
> > +};
> > 
> >  #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
> >  static void cpuidle_kick_cpus(void)
> > @@ -54,13 +59,10 @@ static void cpuidle_idle_call(void)
> > 
> >  	/* check if the device is ready */
> >  	if (!dev || !dev->enabled) {
> > -		if (pm_idle_old)
> > -			pm_idle_old();
> > -		else
> >  #if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
> > -			default_idle();
> > +		default_idle();
> >  #else
> > -			local_irq_enable();
> > +		local_irq_enable();
> >  #endif
> >  		return;
> >  	}
> > @@ -94,35 +96,11 @@ static void cpuidle_idle_call(void)
> >  }
> > 
> >  /**
> > - * cpuidle_install_idle_handler - installs the cpuidle idle loop handler
> > - */
> > -void cpuidle_install_idle_handler(void)
> > -{
> > -	if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
> > -		/* Make sure all changes finished before we switch to new idle */
> > -		smp_wmb();
> > -		pm_idle = cpuidle_idle_call;
> > -	}
> > -}
> > -
> > -/**
> > - * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler
> > - */
> > -void cpuidle_uninstall_idle_handler(void)
> > -{
> > -	if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
> > -		pm_idle = pm_idle_old;
> > -		cpuidle_kick_cpus();
> > -	}
> > -}
> > -
> > -/**
> >   * cpuidle_pause_and_lock - temporarily disables CPUIDLE
> >   */
> >  void cpuidle_pause_and_lock(void)
> >  {
> >  	mutex_lock(&cpuidle_lock);
> > -	cpuidle_uninstall_idle_handler();
> >  }
> > 
> >  EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
> > @@ -132,7 +110,6 @@ EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock
> >   */
> >  void cpuidle_resume_and_unlock(void)
> >  {
> > -	cpuidle_install_idle_handler();
> >  	mutex_unlock(&cpuidle_lock);
> >  }
> > 
> 
> What does this mean for users of cpuidle_pause_and_lock/unlock?
> Should we be calling register/unregister_idle_function here?
> 

Just observed the use case for cpuidle_pause_and_lock/unlock.
It is not clear as to why we need to switch back to the old idle
handler and then again back to cpuidle's idle handler. Wouldn't it
make more sense to just register the idle handler when the first
cpuidle device is being registered and unregister the idle handler
when the last cpuidle device is unregistered?

--arun

> 
> > @@ -287,6 +264,12 @@ static int __cpuidle_register_device(str
> >  	return 0;
> >  }
> > 
> > +static void register_cpuidle_idle_function(void)
> > +{
> > +	register_idle_function(&cpuidle_idle_desc);
> > +
> > +	idle_function_registered = 1;
> 
> Use booleans if possible, unless you intend to extend the meaning of
> registered someday.
> 
> > +}
> >  /**
> >   * cpuidle_register_device - registers a CPU's idle PM feature
> >   * @dev: the cpu
> > @@ -303,7 +286,9 @@ int cpuidle_register_device(struct cpuid
> >  	}
> > 
> >  	cpuidle_enable_device(dev);
> > -	cpuidle_install_idle_handler();
> > +
> > +	if (!idle_function_registered)
> > +		register_cpuidle_idle_function();
> > 
> >  	mutex_unlock(&cpuidle_lock);
> > 
> > @@ -382,8 +367,6 @@ static int __init cpuidle_init(void)
> >  {
> >  	int ret;
> > 
> > -	pm_idle_old = pm_idle;
> > -
> >  	ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
> >  	if (ret)
> >  		return ret;
> > Index: linux.trees.git/drivers/cpuidle/governor.c
> > ===================================================================
> > --- linux.trees.git.orig/drivers/cpuidle/governor.c
> > +++ linux.trees.git/drivers/cpuidle/governor.c
> > @@ -48,8 +48,6 @@ int cpuidle_switch_governor(struct cpuid
> >  	if (gov == cpuidle_curr_governor)
> >  		return 0;
> > 
> > -	cpuidle_uninstall_idle_handler();
> > -
> >  	if (cpuidle_curr_governor) {
> >  		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
> >  			cpuidle_disable_device(dev);
> > @@ -63,7 +61,6 @@ int cpuidle_switch_governor(struct cpuid
> >  			return -EINVAL;
> >  		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
> >  			cpuidle_enable_device(dev);
> > -		cpuidle_install_idle_handler();
> >  		printk(KERN_INFO "cpuidle: using governor %s\n", gov->name);
> >  	}
> > 
> 
> -- 
> 	Balbir

^ permalink raw reply

* Re: [v4 PATCH 1/5]: cpuidle: Cleanup drivers/cpuidle/cpuidle.c
From: Peter Zijlstra @ 2009-09-02  5:42 UTC (permalink / raw)
  To: arun
  Cc: Gautham R Shenoy, linux-kernel, Paul Mackerras, Ingo Molnar,
	linuxppc-dev
In-Reply-To: <20090901113840.GH7599@linux.vnet.ibm.com>

On Tue, 2009-09-01 at 17:08 +0530, Arun R Bharadwaj wrote:
> * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-01 17:07:04]:
> 
> Cleanup drivers/cpuidle/cpuidle.c
> 
> Cpuidle maintains a pm_idle_old void pointer because, currently in x86
> there is no clean way of registering and unregistering a idle function.

Right, and instead of fixing that, they build this cpuidle crap on top,
instead of replacing the current crap with it.

> So remove pm_idle_old and leave the responsibility of maintaining the
> list of registered idle loops to the architecture specific code. If the
> architecture registers cpuidle_idle_call as its idle loop, only then
> this loop is called.

OK, that's a start I guess. Best would be to replace all of pm_idle with
cpuidle, which is what should have been done from the very start.

If cpuidle cannot fully replace the pm_idle functionality, then it needs
to fix that. But having two layers of idle functions is just silly.

Looking at patch 2 and 3, you're making the same mistake on power, after
those patches there are multiple ways of registering idle functions, one
through some native interface and one through cpuidle, this strikes me
as undesirable.

If cpuidle is a good idle function manager, then it should be good
enough to be the sole one, if its not, then why bother with it at all.

^ permalink raw reply

* Re: [PATCH] powerpc/fsl-booke: Use HW PTE format if CONFIG_PTE_64BIT
From: Benjamin Herrenschmidt @ 2009-09-02  5:48 UTC (permalink / raw)
  To: Kumar Gala; +Cc: linuxppc-dev
In-Reply-To: <1251856122-24560-1-git-send-email-galak@kernel.crashing.org>

On Tue, 2009-09-01 at 20:48 -0500, Kumar Gala wrote:
> Switch to using the Power ISA defined PTE format when we have a 64-bit
> PTE.  This makes the code handling between fsl-booke and book3e-64
> similiar for TLB faults.
> 
> Additionally this lets use take advantage of the page size encodings and
> full permissions that the HW PTE defines.
> 
> Also defined _PMD_PRESENT, _PMD_PRESENT_MASK, and _PMD_BAD since the
> 32-bit ppc arch code expects them.

No immediate problem with the patch, though I can't test it so I
assume you did :-)

Is this 2.6.32 material ? I'm going to stick it into my test branch
for now regardless.

Cheers,
Ben.

> Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
> ---
>  arch/powerpc/include/asm/pgtable-ppc32.h |    2 +
>  arch/powerpc/include/asm/pte-book3e.h    |    3 ++
>  arch/powerpc/include/asm/pte-fsl-booke.h |    7 -----
>  arch/powerpc/kernel/head_fsl_booke.S     |   36 ++++++++++++++++++++---------
>  4 files changed, 30 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
> index f2c52e2..55646ad 100644
> --- a/arch/powerpc/include/asm/pgtable-ppc32.h
> +++ b/arch/powerpc/include/asm/pgtable-ppc32.h
> @@ -111,6 +111,8 @@ extern int icache_44x_need_flush;
>  #include <asm/pte-40x.h>
>  #elif defined(CONFIG_44x)
>  #include <asm/pte-44x.h>
> +#elif defined(CONFIG_FSL_BOOKE) && defined(CONFIG_PTE_64BIT)
> +#include <asm/pte-book3e.h>
>  #elif defined(CONFIG_FSL_BOOKE)
>  #include <asm/pte-fsl-booke.h>
>  #elif defined(CONFIG_8xx)
> diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h
> index b82b9dc..082d515 100644
> --- a/arch/powerpc/include/asm/pte-book3e.h
> +++ b/arch/powerpc/include/asm/pte-book3e.h
> @@ -75,6 +75,9 @@
>  /* On 32-bit, we never clear the top part of the PTE */
>  #ifdef CONFIG_PPC32
>  #define _PTE_NONE_MASK	0xffffffff00000000ULL
> +#define _PMD_PRESENT	0
> +#define _PMD_PRESENT_MASK (PAGE_MASK)
> +#define _PMD_BAD	(~PAGE_MASK)
>  #endif
>  
>  #endif /* __KERNEL__ */
> diff --git a/arch/powerpc/include/asm/pte-fsl-booke.h b/arch/powerpc/include/asm/pte-fsl-booke.h
> index ce8a9e9..2c12be5 100644
> --- a/arch/powerpc/include/asm/pte-fsl-booke.h
> +++ b/arch/powerpc/include/asm/pte-fsl-booke.h
> @@ -33,13 +33,6 @@
>  #define _PAGE_WRITETHRU	0x00400	/* H: W bit */
>  #define _PAGE_SPECIAL	0x00800 /* S: Special page */
>  
> -#ifdef CONFIG_PTE_64BIT
> -/* ERPN in a PTE never gets cleared, ignore it */
> -#define _PTE_NONE_MASK	0xffffffffffff0000ULL
> -/* We extend the size of the PTE flags area when using 64-bit PTEs */
> -#define PTE_RPN_SHIFT	(PAGE_SHIFT + 8)
> -#endif
> -
>  #define _PMD_PRESENT	0
>  #define _PMD_PRESENT_MASK (PAGE_MASK)
>  #define _PMD_BAD	(~PAGE_MASK)
> diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
> index 2c5af52..975788c 100644
> --- a/arch/powerpc/kernel/head_fsl_booke.S
> +++ b/arch/powerpc/kernel/head_fsl_booke.S
> @@ -575,7 +575,12 @@ interrupt_base:
>  	 *       place or can we save a couple of instructions here ?
>  	 */
>  	mfspr	r12,SPRN_ESR
> +#ifdef CONFIG_PTE_64BIT
> +	li	r13,_PAGE_PRESENT
> +	oris	r13,r13,_PAGE_ACCESSED@h
> +#else
>  	li	r13,_PAGE_PRESENT|_PAGE_ACCESSED
> +#endif
>  	rlwimi	r13,r12,11,29,29
>  
>  	FIND_PTE
> @@ -643,7 +648,12 @@ interrupt_base:
>  
>  4:
>  	/* Make up the required permissions */
> +#ifdef CONFIG_PTE_64BIT
> +	li	r13,_PAGE_PRESENT | _PAGE_EXEC
> +	oris	r13,r13,_PAGE_ACCESSED@h
> +#else
>  	li	r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC
> +#endif
>  
>  	FIND_PTE
>  	andc.	r13,r13,r11		/* Check permission */
> @@ -733,7 +743,7 @@ finish_tlb_load:
>  
>  	mfspr	r12, SPRN_MAS2
>  #ifdef CONFIG_PTE_64BIT
> -	rlwimi	r12, r11, 26, 24, 31	/* extract ...WIMGE from pte */
> +	rlwimi	r12, r11, 32-19, 27, 31	/* extract WIMGE from pte */
>  #else
>  	rlwimi	r12, r11, 26, 27, 31	/* extract WIMGE from pte */
>  #endif
> @@ -742,6 +752,20 @@ finish_tlb_load:
>  #endif
>  	mtspr	SPRN_MAS2, r12
>  
> +#ifdef CONFIG_PTE_64BIT
> +	rlwinm	r12, r11, 32-2, 26, 31	/* Move in perm bits */
> +	andi.	r10, r11, _PAGE_DIRTY
> +	bne	1f
> +	li	r10, MAS3_SW | MAS3_UW
> +	andc	r12, r12, r10
> +1:	rlwimi	r12, r13, 20, 0, 11	/* grab RPN[32:43] */
> +	rlwimi	r12, r11, 20, 12, 19	/* grab RPN[44:51] */
> +	mtspr	SPRN_MAS3, r12
> +BEGIN_MMU_FTR_SECTION
> +	srwi	r10, r13, 12		/* grab RPN[12:31] */
> +	mtspr	SPRN_MAS7, r10
> +END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
> +#else
>  	li	r10, (_PAGE_EXEC | _PAGE_PRESENT)
>  	rlwimi	r10, r11, 31, 29, 29	/* extract _PAGE_DIRTY into SW */
>  	and	r12, r11, r10
> @@ -749,16 +773,6 @@ finish_tlb_load:
>  	slwi	r10, r12, 1
>  	or	r10, r10, r12
>  	iseleq	r12, r12, r10
> -	
> -#ifdef CONFIG_PTE_64BIT
> -	rlwimi	r12, r13, 24, 0, 7	/* grab RPN[32:39] */
> -	rlwimi	r12, r11, 24, 8, 19	/* grab RPN[40:51] */
> -	mtspr	SPRN_MAS3, r12
> -BEGIN_MMU_FTR_SECTION
> -	srwi	r10, r13, 8		/* grab RPN[8:31] */
> -	mtspr	SPRN_MAS7, r10
> -END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
> -#else
>  	rlwimi	r11, r12, 0, 20, 31	/* Extract RPN from PTE and merge with perms */
>  	mtspr	SPRN_MAS3, r11
>  #endif

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox