LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH v4 08/10] fadump: Invalidate registration and release reserved memory for general use.
From: Mahesh J Salgaonkar @ 2011-11-07  9:56 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Amerigo Wang, Milton Miller, Eric W. Biederman, Anton Blanchard
In-Reply-To: <20111107095215.1997.14866.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

This patch introduces a sysfs interface '/sys/kernel/fadump_release_mem' to
invalidate the last fadump registration, invalidate '/proc/vmcore', release
the reserved memory for general use and re-register for future kernel dump.
Once the dump is copied to the disk, the userspace tool will echo 1 to
'/sys/kernel/fadump_release_mem'.

Release the reserved memory region excluding the size of the memory required
for future kernel dump registration.

Change in v3:
- Synchronize the fadump invalidation step to handle simultaneous writes to
  /sys/kernel/fadump_release_mem.

Change in v2:
- Introduced cpu_notes_buf_free() function to free memory allocated for
  cpu notes buffer.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/fadump.h |    3 +
 arch/powerpc/kernel/fadump.c      |  157 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 156 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 0c14097..8ddfbc7 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -202,6 +202,9 @@ extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
 extern void crash_fadump(struct pt_regs *, const char *);
+extern void fadump_cleanup(void);
+
+extern void vmcore_cleanup(void);
 #else	/* CONFIG_FA_DUMP */
 static inline int is_fadump_active(void) { return 0; }
 #endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index e68ee3a..b449b55 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -33,6 +33,8 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/crash_dump.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
 
 #include <asm/page.h>
 #include <asm/prom.h>
@@ -986,6 +988,131 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
 	return 0;
 }
 
+static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Invalidating firmware-assisted dump registration\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_INVALIDATE, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		printk(KERN_ERR "Failed to invalidate firmware-assisted dump "
+			"rgistration. unexpected error(%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_active = 0;
+	fdm_active = NULL;
+	return 0;
+}
+
+void fadump_cleanup(void)
+{
+	/* Invalidate the registration only if dump is active. */
+	if (fw_dump.dump_active) {
+		init_fadump_mem_struct(&fdm,
+			fdm_active->cpu_state_data.destination_address);
+		fadump_invalidate_dump(&fdm);
+	}
+}
+
+/*
+ * Release the memory that was reserved in early boot to preserve the memory
+ * contents. The released memory will be available for general use.
+ */
+static void fadump_release_memory(unsigned long begin, unsigned long end)
+{
+	unsigned long addr;
+	unsigned long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE) {
+		/*
+		 * exclude the dump reserve area. Will reuse it for next
+		 * fadump registration.
+		 */
+		if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start))
+			continue;
+
+		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
+		init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
+		free_page((unsigned long)__va(addr));
+		totalram_pages++;
+	}
+}
+
+static void fadump_invalidate_release_mem(void)
+{
+	unsigned long reserved_area_start, reserved_area_end;
+	unsigned long destination_address;
+
+	mutex_lock(&fadump_mutex);
+	if (!fw_dump.dump_active) {
+		mutex_unlock(&fadump_mutex);
+		return;
+	}
+
+	destination_address = fdm_active->cpu_state_data.destination_address;
+	fadump_cleanup();
+	mutex_unlock(&fadump_mutex);
+
+	/*
+	 * Save the current reserved memory bounds we will require them
+	 * later for releasing the memory for general use.
+	 */
+	reserved_area_start = fw_dump.reserve_dump_area_start;
+	reserved_area_end = reserved_area_start +
+			fw_dump.reserve_dump_area_size;
+	/*
+	 * Setup reserve_dump_area_start and its size so that we can
+	 * reuse this reserved memory for Re-registration.
+	 */
+	fw_dump.reserve_dump_area_start = destination_address;
+	fw_dump.reserve_dump_area_size = get_dump_area_size();
+
+	fadump_release_memory(reserved_area_start, reserved_area_end);
+	if (fw_dump.cpu_notes_buf) {
+		cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
+					fw_dump.cpu_notes_buf_size);
+		fw_dump.cpu_notes_buf = 0;
+		fw_dump.cpu_notes_buf_size = 0;
+	}
+	/* Initialize the kernel dump memory structure for FAD registration. */
+	init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
+}
+
+static ssize_t fadump_release_memory_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	if (!fw_dump.dump_active)
+		return -EPERM;
+
+	if (buf[0] == '1') {
+		/*
+		 * Take away the '/proc/vmcore'. We are releasing the dump
+		 * memory, hence it will not be valid anymore.
+		 */
+		vmcore_cleanup();
+		fadump_invalidate_release_mem();
+
+	} else
+		return -EINVAL;
+	return count;
+}
+
 static ssize_t fadump_enabled_show(struct kobject *kobj,
 					struct kobj_attribute *attr,
 					char *buf)
@@ -1045,10 +1172,13 @@ static int fadump_region_show(struct seq_file *m, void *private)
 	if (!fw_dump.fadump_enabled)
 		return 0;
 
+	mutex_lock(&fadump_mutex);
 	if (fdm_active)
 		fdm_ptr = fdm_active;
-	else
+	else {
+		mutex_unlock(&fadump_mutex);
 		fdm_ptr = &fdm;
+	}
 
 	seq_printf(m,
 			"CPU : [%#016llx-%#016llx] %#llx bytes, "
@@ -1078,7 +1208,7 @@ static int fadump_region_show(struct seq_file *m, void *private)
 	if (!fdm_active ||
 		(fw_dump.reserve_dump_area_start ==
 		fdm_ptr->cpu_state_data.destination_address))
-		return 0;
+		goto out;
 
 	/* Dump is active. Show reserved memory region. */
 	seq_printf(m,
@@ -1090,9 +1220,15 @@ static int fadump_region_show(struct seq_file *m, void *private)
 			fw_dump.reserve_dump_area_start,
 			fdm_ptr->cpu_state_data.destination_address -
 			fw_dump.reserve_dump_area_start);
+out:
+	if (fdm_active)
+		mutex_unlock(&fadump_mutex);
 	return 0;
 }
 
+static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem,
+						0200, NULL,
+						fadump_release_memory_store);
 static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled,
 						0444, fadump_enabled_show,
 						NULL);
@@ -1133,6 +1269,13 @@ static void fadump_init_files(void)
 	if (!debugfs_file)
 		printk(KERN_ERR "fadump: unable to create debugfs file"
 				" fadump_region\n");
+
+	if (fw_dump.dump_active) {
+		rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr);
+		if (rc)
+			printk(KERN_ERR "fadump: unable to create sysfs file"
+				" fadump_release_mem (%d)\n", rc);
+	}
 	return;
 }
 
@@ -1152,8 +1295,14 @@ int __init setup_fadump(void)
 	 * If dump data is available then see if it is valid and prepare for
 	 * saving it to the disk.
 	 */
-	if (fw_dump.dump_active)
-		process_fadump(fdm_active);
+	if (fw_dump.dump_active) {
+		/*
+		 * if dump process fails then invalidate the registration
+		 * and release memory before proceeding for re-registration.
+		 */
+		if (process_fadump(fdm_active) < 0)
+			fadump_invalidate_release_mem();
+	}
 	/* Initialize the kernel dump memory structure for FAD registration. */
 	else if (fw_dump.reserve_dump_area_size)
 		init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);

^ permalink raw reply related

* [RFC PATCH v4 09/10] fadump: Invalidate the fadump registration during machine shutdown.
From: Mahesh J Salgaonkar @ 2011-11-07  9:56 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Amerigo Wang, Milton Miller, Eric W. Biederman, Anton Blanchard
In-Reply-To: <20111107095215.1997.14866.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

If dump is active during system reboot, shutdown or halt then invalidate
the fadump registration as it does not get invalidated automatically.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/setup-common.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index ce35aaf..67e5caa 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -110,6 +110,14 @@ EXPORT_SYMBOL(ppc_do_canonicalize_irqs);
 /* also used by kexec */
 void machine_shutdown(void)
 {
+#ifdef CONFIG_FA_DUMP
+	/*
+	 * if fadump is active, cleanup the fadump registration before we
+	 * shutdown.
+	 */
+	fadump_cleanup();
+#endif
+
 	if (ppc_md.machine_shutdown)
 		ppc_md.machine_shutdown();
 }

^ permalink raw reply related

* [RFC PATCH v4 10/10] fadump: Introduce config option for firmware assisted dump feature
From: Mahesh J Salgaonkar @ 2011-11-07  9:56 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Amerigo Wang, Milton Miller, Eric W. Biederman, Anton Blanchard
In-Reply-To: <20111107095215.1997.14866.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

This patch introduces a new config option CONFIG_FA_DUMP for firmware
assisted dump feature on Powerpc (ppc64) architecture.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/Kconfig |   13 +++++++++++++
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6926b61..7ce773c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -379,6 +379,19 @@ config PHYP_DUMP

 	  If unsure, say "N"

+config FA_DUMP
+	bool "Firmware-assisted dump"
+	depends on PPC64 && PPC_RTAS && CRASH_DUMP
+	help
+	  A robust mechanism to get reliable kernel crash dump with
+	  assistance from firmware. This approach does not use kexec,
+	  instead firmware assists in booting the kdump kernel
+	  while preserving memory contents. Firmware-assisted dump
+	  is meant to be a kdump replacement offering robustness and
+	  speed not possible without system firmware assistance.
+
+	  If unsure, say "N"
+
 config PPCBUG_NVRAM
 	bool "Enable reading PPCBUG NVRAM during boot" if PPLUS || LOPEC
 	default y if PPC_PREP

^ permalink raw reply related

* [RFC PATCH v4 01/10] fadump: Add documentation for firmware-assisted dump.
From: Mahesh J Salgaonkar @ 2011-11-07  9:55 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Amerigo Wang, Milton Miller, Eric W. Biederman, Anton Blanchard
In-Reply-To: <20111107095215.1997.14866.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Documentation for firmware-assisted dump. This document is based on the
original documentation written for phyp assisted dump by Linas Vepstas
and Manish Ahuja, with few changes to reflect the current implementation.

Change in v3:
- Modified the documentation to reflect introduction of fadump_registered
  sysfs file and few minor changes.

Change in v2:
- Modified the documentation to reflect the change of fadump_region
  file under debugfs filesystem.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 Documentation/powerpc/firmware-assisted-dump.txt |  262 ++++++++++++++++++++++
 1 files changed, 262 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/powerpc/firmware-assisted-dump.txt

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
new file mode 100644
index 0000000..ba6724a
--- /dev/null
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -0,0 +1,262 @@
+
+                   Firmware-Assisted Dump
+                   ------------------------
+                       July 2011
+
+The goal of firmware-assisted dump is to enable the dump of
+a crashed system, and to do so from a fully-reset system, and
+to minimize the total elapsed time until the system is back
+in production use.
+
+As compared to kdump or other strategies, firmware-assisted
+dump offers several strong, practical advantages:
+
+-- Unlike kdump, the system has been reset, and loaded
+   with a fresh copy of the kernel.  In particular,
+   PCI and I/O devices have been reinitialized and are
+   in a clean, consistent state.
+-- Once the dump is copied out, the memory that held the dump
+   is immediately available to the running kernel. A further
+   reboot isn't required.
+
+The above can only be accomplished by coordination with,
+and assistance from the Power firmware. The procedure is
+as follows:
+
+-- The first kernel registers the sections of memory with the
+   Power firmware for dump preservation during OS initialization.
+   These registered sections of memory are reserved by the first
+   kernel during early boot.
+
+-- When a system crashes, the Power firmware will save
+   the low memory (boot memory, sized to the larger of 5% of system RAM
+   or 256MB) to a previously registered save region. It
+   will also save system registers, and hardware PTE's.
+
+   NOTE: The term 'boot memory' means size of the low memory chunk
+         that is required for a kernel to boot successfully when
+         booted with restricted memory. By default, the boot memory
+         size will be calculated as the larger of 5% of system RAM or
+         256MB. Alternatively, user can also specify boot memory
+         size through boot parameter 'fadump_reserve_mem=' which
+         will override the default calculated size.
+
+-- After the low memory (boot memory) area has been saved, the
+   firmware will reset PCI and other hardware state.  It will
+   *not* clear the RAM. It will then launch the bootloader, as
+   normal.
+
+-- The freshly booted kernel will notice that there is a new
+   node (ibm,dump-kernel) in the device tree, indicating that
+   there is crash data available from a previous boot. During
+   early boot, the OS will reserve the rest of the memory above the
+   boot memory size, effectively booting with restricted memory
+   size. This will make sure that the second kernel will not
+   touch any of the dump memory area.
+
+-- Userspace tools will read /proc/vmcore to obtain the contents
+   of memory, which holds the previous crashed kernel dump in ELF
+   format. The userspace tools may copy this info to disk, or
+   network, nas, san, iscsi, etc. as desired.
+
+-- Once the userspace tool is done saving dump, it will echo
+   '1' to /sys/kernel/fadump_release_mem to release the reserved
+   memory back to general use, except the memory required for
+   next firmware-assisted dump registration.
+
+   e.g.
+     # echo 1 > /sys/kernel/fadump_release_mem
+
+Please note that the firmware-assisted dump feature
+is only available on Power6 and above systems with recent
+firmware versions.
+
+Implementation details:
+----------------------
+
+During boot, a check is made to see if firmware supports
+this feature on that particular machine. If it does, then
+we check to see if an active dump is waiting for us. If yes
+then everything but boot memory size of RAM is reserved during
+early boot (See Fig. 2). This area is released once we collect a
+dump from user land scripts (kdump scripts) that are run. If
+there is dump data, then the /sys/kernel/fadump_release_mem
+file is created, and the reserved memory is held.
+
+If there is no waiting dump data, then only the memory required
+to hold CPU state, HPTE region, boot memory dump and elfcore
+header, is reserved at the top of memory (see Fig. 1). This area
+is *not* released: this region will be kept permanently reserved,
+so that it can act as a receptacle for a copy of the boot memory
+content in addition to CPU state and HPTE region, in the case a
+crash does occur.
+
+  o Memory Reservation during first kernel
+
+  Low memory                                        Top of memory
+  0      boot memory size                                       |
+  |           |                       |<--Reserved dump area -->|
+  V           V                       |   Permanent Reservation V
+  +-----------+----------/ /----------+---+----+-----------+----+
+  |           |                       |CPU|HPTE|  DUMP     |ELF |
+  +-----------+----------/ /----------+---+----+-----------+----+
+        |                                           ^
+        |                                           |
+        \                                           /
+         -------------------------------------------
+          Boot memory content gets transferred to
+          reserved area by firmware at the time of
+          crash
+                   Fig. 1
+
+  o Memory Reservation during second kernel after crash
+
+  Low memory                                        Top of memory
+  0      boot memory size                                       |
+  |           |<------------- Reserved dump area ----------- -->|
+  V           V                                                 V
+  +-----------+----------/ /----------+---+----+-----------+----+
+  |           |                       |CPU|HPTE|  DUMP     |ELF |
+  +-----------+----------/ /----------+---+----+-----------+----+
+        |                                                    |
+        V                                                    V
+   Used by second                                    /proc/vmcore
+   kernel to boot
+                   Fig. 2
+
+Currently the dump will be copied from /proc/vmcore to
+a new file upon user intervention. The dump data available through
+/proc/vmcore will be in ELF format. Hence the existing kdump
+infrastructure (kdump scripts) to save the dump works fine
+with minor modifications. The kdump script requires following
+modifications:
+-- During service kdump start if /proc/vmcore entry is not present,
+   look for the existence of /sys/kernel/fadump_enabled and read
+   value exported by it. If value is set to '0' then fallback to
+   existing kexec based kdump. If value is set to '1' then check the
+   value exported by /sys/kernel/fadump_registered. If value is set
+   to '1' then print success otherwise register for fadump by
+   echo'ing 1 > /sys/kernel/fadump_registered file.
+
+-- During service kdump start if /proc/vmcore entry is present,
+   execute the existing routine to save the dump. Once the dump
+   is saved, echo 1 > /sys/kernel/fadump_release_mem (if the
+   file exists) to release the reserved memory for general use
+   and continue without rebooting. At this point the memory
+   reservation map will look as shown in Fig. 1. If the file
+   /sys/kernel/fadump_release_mem is not present then follow
+   the existing routine to reboot into new kernel.
+
+-- During service kdump stop echo 0 > /sys/kernel/fadump_registered
+   to un-register the fadump.
+
+The tools to examine the dump will be same as the ones
+used for kdump.
+
+How to enable firmware-assisted dump (fadump):
+-------------------------------------
+
+1. Set config option CONFIG_FA_DUMP=y and build kernel.
+2. Boot into linux kernel with 'fadump=1' kernel cmdline option.
+3. Optionally, user can also set 'fadump_reserve_mem=' kernel cmdline
+   to specify size of the memory to reserve for boot memory dump
+   preservation.
+
+NOTE: If firmware-assisted dump fails to reserve memory then it will
+   fallback to existing kdump mechanism if 'crashkernel=' option
+   is set at kernel cmdline.
+
+Sysfs/debugfs files:
+------------
+
+Firmware-assisted dump feature uses sysfs file system to hold
+the control files and debugfs file to display memory reserved region.
+
+Here is the list of files under kernel sysfs:
+
+ /sys/kernel/fadump_enabled
+
+    This is used to display the fadump status.
+    0 = fadump is disabled
+    1 = fadump is enabled
+
+ /sys/kernel/fadump_registered
+
+    This is used to display the fadump registration status as well
+    as to control (start/stop) the fadump registration.
+    0 = fadump is not registered.
+    1 = fadump is registered and ready to handle system crash.
+
+    To register fadump echo 1 > /sys/kernel/fadump_registered and
+    echo 0 > /sys/kernel/fadump_registered to un-register and stop the
+    fadump. Once the fadump is un-registered, the system crash will not
+    be handled and vmcore will not be captured.
+
+ /sys/kernel/fadump_release_mem
+
+    This file is available only when fadump is active during
+    second kernel. This is used to release the reserved memory
+    regions that are held for saving the crash dump. To release the
+    reserved memory echo 1 to it:
+
+    echo 1  > /sys/kernel/fadump_release_mem
+
+    After echo 1, the content of the /sys/kernel/debug/powerpc/fadump_region
+    file will change to reflect the new memory reservations.
+
+Here is the list of files under powerpc debugfs:
+(Assuming debugfs is mounted on /sys/kernel/debug directory.)
+
+ /sys/kernel/debug/powerpc/fadump_region
+
+    This file shows the reserved memory regions if fadump is
+    enabled otherwise this file is empty. The output format
+    is:
+    <region>: [<start>-<end>] <reserved-size> bytes, Dumped: <dump-size>
+
+    e.g.
+    Contents when fadump is registered during first kernel
+
+    # cat /sys/kernel/debug/powerpc/fadump_region
+    CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0
+    HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0
+    DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0
+
+    Contents when fadump is active during second kernel
+
+    # cat /sys/kernel/debug/powerpc/fadump_region
+    CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020
+    HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x1000
+    DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000
+        : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000
+
+NOTE: Please refer to debugfs documentation on how to mount the debugfs
+      filesystem.
+
+
+TODO:
+-----
+ o Need to come up with a better approach to find out a more
+   accurate boot memory size that is required for a kernel to
+   boot successfully when booted with restricted memory.
+ o The fadump implementation introduces a fadump crash info structure
+   in the scratch area before the ELF core header. The idea of introducing
+   this structure is to pass some important crash info data to the second
+   kernel which will help second kernel to populate ELF core header with
+   correct data before it gets exported through /proc/vmcore. The current
+   design implementation does not address a possibility of introducing
+   additional fields (in future) to this structure without affecting
+   compatibility. Need to come up with a better approach to address this.
+   The possible approaches are:
+	1. Introduce version field for version tracking, bump up the version
+	whenever a new field is added to the structure in future. The version
+	field can be used to find out what fields are valid for the current
+	version of the structure.
+	2. Reserve the area of predefined size (say PAGE_SIZE) for this
+	structure and have unused area as reserved (initialized to zero)
+	for future field additions.
+   The advantage of approach 1 over 2 is we don't need to reserve extra space.
+---
+Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+This document is based on the original documentation written for phyp
+assisted dump by Linas Vepstas and Manish Ahuja.

^ permalink raw reply related

* [PATCH][v2] powerpc/usb: fix type cast for address of ioremap to compatible with 64-bit
From: Shaohui Xie @ 2011-11-07  8:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-usb, Shaohui Xie

Below is the code for accessing the USB sysif_regs in the driver:

usb_sys_regs = (struct usb_sys_interface *)
	((u32)dr_regs + USB_DR_SYS_OFFSET);

This code works on 32-bit, but on 64-bit casting the ioremap()ed address to
u32 is not right, and accessing members of 'usb_sys_regs' will then cause a
call trace, so use (void *) arithmetic, which works on both 32-bit and 64-bit.

Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
---
changes for v2:
1. use (void *) instead of unsigned long and the double cast according
to Timur's comment.

 drivers/usb/gadget/fsl_udc_core.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/gadget/fsl_udc_core.c b/drivers/usb/gadget/fsl_udc_core.c
index c81fbad..398c5e6 100644
--- a/drivers/usb/gadget/fsl_udc_core.c
+++ b/drivers/usb/gadget/fsl_udc_core.c
@@ -2497,8 +2497,7 @@ static int __init fsl_udc_probe(struct platform_device *pdev)
 
 #ifndef CONFIG_ARCH_MXC
 	if (pdata->have_sysif_regs)
-		usb_sys_regs = (struct usb_sys_interface *)
-				((u32)dr_regs + USB_DR_SYS_OFFSET);
+		usb_sys_regs = (void *)dr_regs + USB_DR_SYS_OFFSET;
 #endif
 
 	/* Initialize USB clocks */
-- 
1.6.4

^ permalink raw reply related

* [RFC PATCH v4 02/10] fadump: Reserve the memory for firmware assisted dump.
From: Mahesh J Salgaonkar @ 2011-11-07  9:55 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Amerigo Wang, Milton Miller, Eric W. Biederman, Anton Blanchard
In-Reply-To: <20111107095215.1997.14866.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Reserve the memory during early boot to preserve CPU state data, HPTE region
and RMR region data in case of kernel crash. At the time of crash, powerpc
firmware will store CPU state data, HPTE region data and move RMR region
data to the reserved memory area.

If the firmware-assisted dump fails to reserve the memory, then fallback
to existing kexec-based kdump.

Most of the code to reserve memory has been adapted from the phyp
assisted dump implementation written by Linas Vepstas and Manish
Ahuja.

Change in v2:
- Modified to use standard pr_debug() macro.
- Modified early_init_dt_scan_fw_dump() to get the size of
  "ibm,configure-kernel-dump-sizes" property and use it to iterate through
  an array of dump sections.
- Introduced boot option 'fadump_reserve_mem=' to let user specify the
  fadump boot memory to be reserved.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/fadump.h |   65 ++++++++++
 arch/powerpc/kernel/Makefile      |    1 
 arch/powerpc/kernel/fadump.c      |  250 +++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/prom.c        |   15 ++
 4 files changed, 330 insertions(+), 1 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fadump.h
 create mode 100644 arch/powerpc/kernel/fadump.c

diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
new file mode 100644
index 0000000..0b040c1
--- /dev/null
+++ b/arch/powerpc/include/asm/fadump.h
@@ -0,0 +1,65 @@
+/*
+ * Firmware Assisted dump header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __PPC64_FA_DUMP_H__
+#define __PPC64_FA_DUMP_H__
+
+#ifdef CONFIG_FA_DUMP
+
+/*
+ * The RMR region will be saved for later dumping when kernel crashes.
+ * Set this to 256MB.
+ */
+#define RMR_START	0x0
+#define RMR_END		(ppc64_rma_size)
+
+/*
+ * On some Power systems where RMO is 128MB, it still requires minimum of
+ * 256MB for kernel to boot successfully.
+ */
+#define MIN_BOOT_MEM	((RMR_END < (0x1UL << 28)) ? (0x1UL << 28) : RMR_END)
+
+/* Firmware provided dump sections */
+#define FADUMP_CPU_STATE_DATA	0x0001
+#define FADUMP_HPTE_REGION	0x0002
+#define FADUMP_REAL_MODE_REGION	0x0011
+
+struct fw_dump {
+	unsigned long	cpu_state_data_size;
+	unsigned long	hpte_region_size;
+	unsigned long	boot_memory_size;
+	unsigned long	reserve_dump_area_start;
+	unsigned long	reserve_dump_area_size;
+	/* cmd line option during boot */
+	unsigned long	reserve_bootvar;
+
+	int		ibm_configure_kernel_dump;
+
+	unsigned long	fadump_enabled:1;
+	unsigned long	fadump_supported:1;
+	unsigned long	dump_active:1;
+};
+
+extern int early_init_dt_scan_fw_dump(unsigned long node,
+		const char *uname, int depth, void *data);
+extern int fadump_reserve_mem(void);
+#endif
+#endif
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index ce4f7f1..59b549c 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_IBMVIO)		+= vio.o
 obj-$(CONFIG_IBMEBUS)           += ibmebus.o
 obj-$(CONFIG_GENERIC_TBSYNC)	+= smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
+obj-$(CONFIG_FA_DUMP)		+= fadump.o
 ifeq ($(CONFIG_PPC32),y)
 obj-$(CONFIG_E500)		+= idle_e500.o
 endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
new file mode 100644
index 0000000..05dffc0
--- /dev/null
+++ b/arch/powerpc/kernel/fadump.c
@@ -0,0 +1,250 @@
+/*
+ * Firmware Assisted dump: A robust mechanism to get reliable kernel crash
+ * dump with assistance from firmware. This approach does not use kexec,
+ * instead firmware assists in booting the kdump kernel while preserving
+ * memory contents. The most of the code implementation has been adapted
+ * from phyp assisted dump implementation written by Linas Vepstas and
+ * Manish Ahuja
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/memblock.h>
+
+#include <asm/page.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/fadump.h>
+
+/*
+ * The RTAS property "ibm,configure-kernel-dump-sizes" returns dump
+ * sizes for the firmware provided dump sections (cpu state data
+ * and hpte region).
+ */
+struct dump_section {
+	u32		dump_section;
+	unsigned long	section_size;
+} __packed;
+
+static struct fw_dump fw_dump;
+
+/* Scan the Firmware Assisted dump configuration details. */
+int __init early_init_dt_scan_fw_dump(unsigned long node,
+			const char *uname, int depth, void *data)
+{
+	const struct dump_section *sections;
+	int i, num_sections;
+	unsigned long size;
+	const int *token;
+
+	if (depth != 1 || strcmp(uname, "rtas") != 0)
+		return 0;
+
+	/*
+	 * Check if Firmware Assisted dump is supported. if yes, check
+	 * if dump has been initiated on last reboot.
+	 */
+	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+	if (!token)
+		return 0;
+
+	fw_dump.fadump_supported = 1;
+	fw_dump.ibm_configure_kernel_dump = *token;
+
+	/*
+	 * The 'ibm,kernel-dump' rtas node is present only if there is
+	 * dump data waiting for us.
+	 */
+	if (of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL))
+		fw_dump.dump_active = 1;
+
+	/* Get the sizes required to store dump data for the firmware provided
+	 * dump sections.
+	 */
+	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+					&size);
+
+	if (!sections)
+		return 0;
+
+	num_sections = size / sizeof(struct dump_section);
+
+	for (i = 0; i < num_sections; i++) {
+		switch (sections[i].dump_section) {
+		case FADUMP_CPU_STATE_DATA:
+			fw_dump.cpu_state_data_size = sections[i].section_size;
+			break;
+		case FADUMP_HPTE_REGION:
+			fw_dump.hpte_region_size = sections[i].section_size;
+			break;
+		}
+	}
+	return 1;
+}
+
+/**
+ * calculate_reserve_size() - reserve variable boot area 5% of System RAM
+ *
+ * Function to find the largest memory size we need to reserve during early
+ * boot process. This will be the size of the memory that is required for a
+ * kernel to boot successfully.
+ *
+ * This function has been taken from phyp-assisted dump feature implementation.
+ *
+ * returns larger of 256MB or 5% rounded down to multiples of 256MB.
+ *
+ * TODO: Come up with better approach to find out more accurate memory size
+ * that is required for a kernel to boot successfully.
+ *
+ */
+static inline unsigned long calculate_reserve_size(void)
+{
+	unsigned long size;
+
+	/*
+	 * Check if the size is specified through fadump_reserve_mem= cmdline
+	 * option. If yes, then use that.
+	 */
+	if (fw_dump.reserve_bootvar)
+		return fw_dump.reserve_bootvar;
+
+	/* divide by 20 to get 5% of value */
+	size = memblock_end_of_DRAM();
+	do_div(size, 20);
+
+	/* round it down in multiples of 256 */
+	size = size & ~0x0FFFFFFFUL;
+
+	/* Truncate to memory_limit. We don't want to over reserve the memory.*/
+	if (memory_limit && size > memory_limit)
+		size = memory_limit;
+
+	return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM);
+}
+
+/*
+ * Calculate the total memory size required to be reserved for
+ * firmware-assisted dump registration.
+ */
+static unsigned long get_dump_area_size(void)
+{
+	unsigned long size = 0;
+
+	size += fw_dump.cpu_state_data_size;
+	size += fw_dump.hpte_region_size;
+	size += fw_dump.boot_memory_size;
+
+	size = PAGE_ALIGN(size);
+	return size;
+}
+
+int __init fadump_reserve_mem(void)
+{
+	unsigned long base, size, memory_boundary;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	if (!fw_dump.fadump_supported) {
+		printk(KERN_ERR "Firmware-assisted dump is not supported on"
+				" this hardware\n");
+		fw_dump.fadump_enabled = 0;
+		return 0;
+	}
+	/* Initialize boot memory size */
+	fw_dump.boot_memory_size = calculate_reserve_size();
+
+	/*
+	 * Calculate the memory boundary.
+	 * If memory_limit is less than actual memory boundary then reserve
+	 * the memory for fadump beyond the memory_limit and adjust the
+	 * memory_limit accordingly, so that the running kernel can run with
+	 * specified memory_limit.
+	 */
+	if (memory_limit && memory_limit < memblock_end_of_DRAM()) {
+		size = get_dump_area_size();
+		if ((memory_limit + size) < memblock_end_of_DRAM())
+			memory_limit += size;
+		else
+			memory_limit = memblock_end_of_DRAM();
+		printk(KERN_INFO "Adjusted memory_limit for firmware-assisted"
+				" dump, now %#016llx\n",
+				(unsigned long long)memory_limit);
+	}
+	if (memory_limit)
+		memory_boundary = memory_limit;
+	else
+		memory_boundary = memblock_end_of_DRAM();
+
+	if (fw_dump.dump_active) {
+		printk(KERN_INFO "Firmware-assisted dump is active.\n");
+		/*
+		 * If last boot has crashed then reserve all the memory
+		 * above boot_memory_size so that we don't touch it until
+		 * dump is written to disk by userspace tool. This memory
+		 * will be released for general use once the dump is saved.
+		 */
+		base = fw_dump.boot_memory_size;
+		size = memory_boundary - base;
+		memblock_reserve(base, size);
+		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+				"for saving crash dump\n",
+				(unsigned long)(size >> 20),
+				(unsigned long)(base >> 20));
+	} else {
+		/* Reserve the memory at the top of memory. */
+		size = get_dump_area_size();
+		base = memory_boundary - size;
+		memblock_reserve(base, size);
+		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
+				"for firmware-assisted dump\n",
+				(unsigned long)(size >> 20),
+				(unsigned long)(base >> 20));
+	}
+	fw_dump.reserve_dump_area_start = base;
+	fw_dump.reserve_dump_area_size = size;
+	return 1;
+}
+
+/* Look for fadump= cmdline option. */
+static int __init early_fadump_param(char *p)
+{
+	if (!p)
+		return 1;
+
+	if (p[0] == '1')
+		fw_dump.fadump_enabled = 1;
+	else if (p[0] == '0')
+		fw_dump.fadump_enabled = 0;
+
+	return 0;
+}
+early_param("fadump", early_fadump_param);
+
+/* Look for fadump_reserve_mem= cmdline option */
+static int __init early_fadump_reserve_mem(char *p)
+{
+	if (p)
+		fw_dump.reserve_bootvar = memparse(p, &p);
+	return 0;
+}
+early_param("fadump_reserve_mem", early_fadump_reserve_mem);
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 174e1e9..3fe75eb 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -54,6 +54,7 @@
 #include <asm/pci-bridge.h>
 #include <asm/phyp_dump.h>
 #include <asm/kexec.h>
+#include <asm/fadump.h>
 #include <mm/mmu_decl.h>
 
 #ifdef DEBUG
@@ -712,6 +713,11 @@ void __init early_init_devtree(void *params)
 	of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL);
 #endif
 
+#ifdef CONFIG_FA_DUMP
+	/* scan tree to see if dump is active during last boot */
+	of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL);
+#endif
+
 	/* Retrieve various informations from the /chosen node of the
 	 * device-tree, including the platform type, initrd location and
 	 * size, TCE reserve, and more ...
@@ -735,7 +741,14 @@ void __init early_init_devtree(void *params)
 	if (PHYSICAL_START > MEMORY_START)
 		memblock_reserve(MEMORY_START, 0x8000);
 	reserve_kdump_trampoline();
-	reserve_crashkernel();
+#ifdef CONFIG_FA_DUMP
+	/*
+	 * If we fail to reserve memory for firmware-assisted dump then
+	 * fallback to kexec based kdump.
+	 */
+	if (fadump_reserve_mem() == 0)
+#endif
+		reserve_crashkernel();
 	early_reserve_mem();
 	phyp_dump_reserve_mem();
 

^ permalink raw reply related

* [RFC PATCH v4 07/10] fadump: Introduce cleanup routine to invalidate /proc/vmcore.
From: Mahesh J Salgaonkar @ 2011-11-07  9:56 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Amerigo Wang, Milton Miller, Eric W. Biederman, Anton Blanchard
In-Reply-To: <20111107095215.1997.14866.stgit@mars.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

With the firmware-assisted dump support we don't require a reboot when we
are in second kernel after crash. The second kernel after crash is a normal
kernel boot and has knowledge about entire system RAM with the page tables
initialized for entire system RAM. Hence once the dump is saved to disk, we
can just release the reserved memory area for general use and continue
with second kernel as production kernel.

Hence when we release the reserved memory that contains dump data, the
'/proc/vmcore' will not be valid anymore. Hence this patch introduces
a cleanup routine that invalidates and removes the /proc/vmcore file. This
routine will be invoked before we release the reserved dump memory area.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 fs/proc/vmcore.c |   23 +++++++++++++++++++++++
 1 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cd99bf5..fae5526 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -699,3 +699,26 @@ static int __init vmcore_init(void)
 	return 0;
 }
 module_init(vmcore_init)
+
+/* Cleanup function for vmcore module. */
+void vmcore_cleanup(void)
+{
+	struct list_head *pos, *next;
+
+	if (proc_vmcore) {
+		remove_proc_entry(proc_vmcore->name, proc_vmcore->parent);
+		proc_vmcore = NULL;
+	}
+
+	/* clear the vmcore list. */
+	list_for_each_safe(pos, next, &vmcore_list) {
+		struct vmcore *m;
+
+		m = list_entry(pos, struct vmcore, list);
+		list_del(&m->list);
+		kfree(m);
+	}
+	kfree(elfcorebuf);
+	elfcorebuf = NULL;
+}
+EXPORT_SYMBOL_GPL(vmcore_cleanup);

^ permalink raw reply related

* Re: New location of powerpc git tree
From: Stephen Rothwell @ 2011-11-07 10:26 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <1320622150.2779.6.camel@pasglop>

[-- Attachment #1: Type: text/plain, Size: 430 bytes --]

Hi Ben,

On Mon, 07 Nov 2011 10:29:10 +1100 Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
>
> I've moved the powerpc git tree back to kernel.org. The URL should be
> back to normal for users:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git

OK, I have switched back to that, now.

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 4/7] powerpc/85xx: add support to JOG feature using cpufreq interface
From: Zhao Chenhui @ 2011-11-07 10:27 UTC (permalink / raw)
  To: Scott Wood; +Cc: Jerry Huang, linuxppc-dev
In-Reply-To: <4EB4403E.3040700@freescale.com>

On Fri, Nov 04, 2011 at 02:42:54PM -0500, Scott Wood wrote:
> On 11/04/2011 07:36 AM, Zhao Chenhui wrote:
> > From: Li Yang <leoli@freescale.com>
> > 
> > Some 85xx silicons like MPC8536 and P1022 has the JOG PM feature.
> > 
> > The patch adds the support to change CPU frequency using the standard
> > cpufreq interface. Add the all PLL ratio core support. The ratio CORE
> > to CCB can 1:1, 1.5, 2:1, 2.5:1, 3:1, 3.5:1 and 4:1
> > 
> > Signed-off-by: Dave Liu <daveliu@freescale.com>
> > Signed-off-by: Li Yang <leoli@freescale.com>
> > Signed-off-by: Jerry Huang <Chang-Ming.Huang@freescale.com>
> > Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
> > ---
> >  arch/powerpc/platforms/85xx/Makefile  |    1 +
> >  arch/powerpc/platforms/85xx/cpufreq.c |  255 +++++++++++++++++++++++++++++++++
> >  arch/powerpc/platforms/Kconfig        |    8 +
> >  3 files changed, 264 insertions(+), 0 deletions(-)
> >  create mode 100644 arch/powerpc/platforms/85xx/cpufreq.c
> 
> Please name this something more specific, such as 85xx/cpufreq-jog.c
> 
> Other 85xx/qoriq chips, such as p4080, have different mechanisms for
> updating CPU frequency.
> 
> > +static struct cpufreq_frequency_table mpc85xx_freqs[] = {
> > +	{2,	0},
> > +	{3,	0},
> > +	{4,	0},
> > +	{5,	0},
> > +	{6,	0},
> > +	{7,	0},
> > +	{8,	0},
> > +	{0,	CPUFREQ_TABLE_END},
> > +};
> 
> Only p1022 can handle 1:1 (index 2).
> 
> > +static void set_pll(unsigned int pll, int cpu)
> > +{
> > +	int shift;
> > +	u32 busfreq, corefreq, val;
> > +	u32 core_spd, mask, tmp;
> > +
> > +	tmp = in_be32(guts + PMJCR);
> > +	shift = (cpu == 1) ? CORE1_RATIO_SHIFT : CORE0_RATIO_SHIFT;
> > +	busfreq = fsl_get_sys_freq();
> > +	val = (pll & CORE_RATIO_MASK) << shift;
> > +
> > +	corefreq = ((busfreq * pll) >> 1);
> 
> Use "/ 2", not ">> 1".  Same asm code, more readable.
> 
> > +	/* must set the bit[18/19] if the requested core freq > 533 MHz */
> > +	core_spd = (cpu == 1) ? PMJCR_CORE1_SPD_MASK : PMJCR_CORE0_SPD_MASK;
> > +	if (corefreq > FREQ_533MHz)
> > +		val |= core_spd;
> 
> this is the cutoff for p1022 -- on mpc8536 the manual says the cutoff is
> 800 MHz.
> 
> > +	mask = (cpu == 1) ? (PMJCR_CORE1_RATIO_MASK | PMJCR_CORE1_SPD_MASK) :
> > +		(PMJCR_CORE0_RATIO_MASK | PMJCR_CORE0_SPD_MASK);
> > +	tmp &= ~mask;
> > +	tmp |= val;
> > +	out_be32(guts + PMJCR, tmp);
> 
> clrsetbits_be32()
> 
> > +	val = in_be32(guts + PMJCR);
> > +	out_be32(guts + POWMGTCSR,
> > +			POWMGTCSR_LOSSLESS_MASK | POWMGTCSR_JOG_MASK);
> 
> setbits32()
> 
> > +	pr_debug("PMJCR request %08x at CPU %d\n", tmp, cpu);
> > +}
> > +
> > +static void verify_pll(int cpu)
> > +{
> > +	int shift;
> > +	u32 busfreq, pll, corefreq;
> > +
> > +	shift = (cpu == 1) ? CORE1_RATIO_SHIFT : CORE0_RATIO_SHIFT;
> > +	busfreq = fsl_get_sys_freq();
> > +	pll = (in_be32(guts + PORPLLSR) >> shift) & CORE_RATIO_MASK;
> > +
> > +	corefreq = (busfreq * pll) >> 1;
> > +	corefreq /= 1000000;
> > +	pr_debug("PORPLLSR core freq %dMHz at CPU %d\n", corefreq, cpu);
> > +}
> 
> It looks like the entire point of this function is to make a debug
> print...  #ifdef DEBUG the contents?  Or if we mark fsl_get_sys_freq()
> as __pure (or better, read this once at init, since it involves
> searching the device tree), will it all get optimized away?
> 
> 
> > +	/* initialize frequency table */
> > +	pr_info("core %d frequency table:\n", policy->cpu);
> > +	for (i = 0; mpc85xx_freqs[i].frequency != CPUFREQ_TABLE_END; i++) {
> > +		mpc85xx_freqs[i].frequency =
> > +				(busfreq * mpc85xx_freqs[i].index) >> 1;
> > +		pr_info("%d: %dkHz\n", i, mpc85xx_freqs[i].frequency);
> > +	}
> 
> This should be pr_debug.
> 
> > +	/* the latency of a transition, the unit is ns */
> > +	policy->cpuinfo.transition_latency = 2000;
> > +
> > +	cur_pll = get_pll(policy->cpu);
> > +	pr_debug("current pll is at %d\n", cur_pll);
> > +
> > +	for (i = 0; mpc85xx_freqs[i].frequency != CPUFREQ_TABLE_END; i++) {
> > +		if (mpc85xx_freqs[i].index == cur_pll)
> > +			policy->cur = mpc85xx_freqs[i].frequency;
> > +	}
> 
> You could combine these loops.
> 
> > +	/* this ensures that policy->cpuinfo_min
> > +	 * and policy->cpuinfo_max are set correctly */
> 
> comment style
> 
> > +static int mpc85xx_cpufreq_target(struct cpufreq_policy *policy,
> > +			      unsigned int target_freq,
> > +			      unsigned int relation)
> > +{
> > +	struct cpufreq_freqs freqs;
> > +	unsigned int new;
> > +
> > +	cpufreq_frequency_table_target(policy,
> > +				       mpc85xx_freqs,
> > +				       target_freq,
> > +				       relation,
> > +				       &new);
> > +
> > +	freqs.old = policy->cur;
> > +	freqs.new = mpc85xx_freqs[new].frequency;
> > +	freqs.cpu = policy->cpu;
> > +
> > +	mutex_lock(&mpc85xx_switch_mutex);
> > +	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
> > +
> > +	pr_info("Setting frequency for core %d to %d kHz, " \
> > +		 "PLL ratio is %d/2\n",
> > +		 policy->cpu,
> > +		 mpc85xx_freqs[new].frequency,
> > +		 mpc85xx_freqs[new].index);
> > +
> > +	set_pll(mpc85xx_freqs[new].index, policy->cpu);
> > +
> > +	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
> > +	mutex_unlock(&mpc85xx_switch_mutex);
> > +
> > +	ppc_proc_freq = freqs.new * 1000ul;
> 
> ppc_proc_freq is global -- can CPUs not have their frequencies adjusted
> separately?
> 
> It should be under the lock, if the lock is needed at all.
> 

There is only one ppc_proc_freq. no lock.

> > +/*
> > + * module init and destoy
> > + */
> > +static struct of_device_id mpc85xx_jog_ids[] __initdata = {
> > +	{ .compatible = "fsl,mpc8536-guts", },
> > +	{ .compatible = "fsl,p1022-guts", },
> > +	{}
> > +};
> > +
> > +static int __init mpc85xx_cpufreq_init(void)
> > +{
> > +	struct device_node *np;
> > +
> > +	pr_info("Freescale MPC85xx CPU frequency switching driver\n");
> 
> If you're going to print something here, print it after you find a node
> you can work with -- not on all 85xx/qoriq that have this driver enabled.
> 
> -Scott

Thanks. I will fix them all.

-chenhui

^ permalink raw reply

* [RFC PATCH v4 00/10] fadump: Firmware-assisted dump support for Powerpc.
From: Mahesh J Salgaonkar @ 2011-11-07  9:55 UTC (permalink / raw)
  To: linuxppc-dev, Linux Kernel, Benjamin Herrenschmidt
  Cc: Amerigo Wang, Milton Miller, Eric W. Biederman, Anton Blanchard

Hi All,

Please find the version 4 of the patchset that implements firmware-assisted
dump mechanism to capture kernel crash dump for Powerpc architecture. The
firmware-assisted dump is a robust mechanism to get reliable kernel crash
dump with assistance from firmware. This approach does not use kexec, instead
firmware assists in booting the kdump kernel while preserving memory contents.

Changes in v4:
--------------
patch 04/10:
- Move the init_elfcore_header() function and 'memblock_num_regions' macro
  from generic code to power specific code as these are used only by
  firmware assisted dump implementation which is power specific feature.

patch 05/10:
- Fixes an issue where memblock_free() is invoked from build_cpu_notes()
  function during error_out path. Invoke cpu_notes_buf_free() in error_out
  path instead of memblock_free().

Changes in v3:
-------------
- Re-factored the implementation to work with kdump service start/stop.
  Introduce fadump_registered sysfs control file which will be used by
  kdump init scripts to start/stop firmware assisted dump. echo 1 to
  /sys/kernel/fadump_registered file for fadump registration and
  echo 0 to /sys/kernel/fadump_registered file for fadump un-registration.
- Introduced the locking mechanism to handle simultaneous writes to
  sysfs control files fadump_registered and fadump_release_mem

  Affected patches are: 01/10, 03/10, 08/10.

Changes in v2:
-------------
patch 01/10:
- Modified the documentation to reflect the change of fadump_region
  file under debugfs filesystem.

patch 02/10:
- Modified to use standard pr_debug() macro.
- Modified early_init_dt_scan_fw_dump() to get the size of
  "ibm,configure-kernel-dump-sizes" property and use it to iterate through
  an array of dump sections.
- Introduced boot option 'fadump_reserve_mem=' to let user specify the
  fadump boot memory to be reserved.

patch 03/10:
- Removed few debug print statements.
- Moved the setup_fadump() call out of setup_system(); it is now invoked as a
  subsys_initcall.
- Moved fadump_region attribute under debugfs.
- Clear the TCE entries if firmware assisted dump is active.

patch 05/10:
- Moved the crash_fadump() invocation from generic code to panic notifier.
- Introduced cpu_notes_buf_alloc() function to allocate cpu notes buffer
  using get_free_pages().

patch 08/10:
- Introduced cpu_notes_buf_free() function to free memory allocated for
  cpu notes buffer.

Most of the code implementation has been adapted from phyp assisted dump
implementation written by Linas Vepstas and Manish Ahuja.

The first patch is documentation that describes the firmware-assisted dump
mechanism, its implementation details and the TODO list.

I have tested the patches on following system configuration:
1. LPAR on Power6 with 4GB RAM and 8 CPUs
2. LPAR on Power7 with 2GB RAM and 20 CPUs
3. LPAR on Power7 with 1TB RAM and 896 CPUs

These patches cleanly apply on commit c3b92c878 in linux-2.6 git tree.

Please review the patchset and let me know your comments.

Thanks,
-Mahesh.
---

Mahesh Salgaonkar (10):
      fadump: Add documentation for firmware-assisted dump.
      fadump: Reserve the memory for firmware assisted dump.
      fadump: Register for firmware assisted dump.
      fadump: Initialize elfcore header and add PT_LOAD program headers.
      fadump: Convert firmware-assisted cpu state dump data into elf notes.
      fadump: Add PT_NOTE program header for vmcoreinfo
      fadump: Introduce cleanup routine to invalidate /proc/vmcore.
      fadump: Invalidate registration and release reserved memory for general use.
      fadump: Invalidate the fadump registration during machine shutdown.
      fadump: Introduce config option for firmware assisted dump feature


 Documentation/powerpc/firmware-assisted-dump.txt |  262 ++++
 arch/powerpc/Kconfig                             |   13 
 arch/powerpc/include/asm/fadump.h                |  211 ++++
 arch/powerpc/kernel/Makefile                     |    1 
 arch/powerpc/kernel/fadump.c                     | 1313 ++++++++++++++++++++++
 arch/powerpc/kernel/iommu.c                      |    8 
 arch/powerpc/kernel/prom.c                       |   15 
 arch/powerpc/kernel/setup-common.c               |   16 
 arch/powerpc/kernel/traps.c                      |    5 
 arch/powerpc/mm/hash_utils_64.c                  |   11 
 fs/proc/vmcore.c                                 |   23 
 11 files changed, 1876 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/powerpc/firmware-assisted-dump.txt
 create mode 100644 arch/powerpc/include/asm/fadump.h
 create mode 100644 arch/powerpc/kernel/fadump.c

-- 
Signature

^ permalink raw reply

* suspecting ibm_newemac driver problem.
From: suggest @ 2011-11-07 10:33 UTC (permalink / raw)
  To: linuxppc-dev

Hello All,

I am using a ppc460ex-based custom board running the Linux 2.6.30.2 kernel,
which includes the ibm_newemac driver.

I am using this board as a iSCSI target which has RAID system behind it.

                              ||               ||                  ||
   n/w driver (ibm_newemac)   ||   n/w layer   ||   iSCSI Target   ||   RAID
                              ||               ||                  ||

The problem i am facing is when i start iometer based i/o.
I am doing i/o with following specifications.

Workload type : All in one.
no of outstanding i/o : 32.
no of worker threads : 8.

With this kind of workload the n/w gets stuck partway through and the board
becomes unresponsive.
If I try to ping the board, the ping fails with 100% packet loss.

While debugging the issue i found out that
sock->sk_state_change callback is called from the TCP layer and state of the
socket is not TCP_ESTABLISHED in the iSCSI target driver.

So i am assuming that this is a problem related to n/w driver.

Has anyone faced a similar issue before? Please point me in the right
direction.

Thanks and Regards,
Harshal Shete.
-- 
View this message in context: http://old.nabble.com/suspecting-ibm_newemac-driver-problem.-tp32788715p32788715.html
Sent from the linuxppc-dev mailing list archive at Nabble.com.

^ permalink raw reply

* Re: [PATCH 6/7] fsl_pmc: Add API to enable device as wakeup event source
From: Zhao Chenhui @ 2011-11-07 11:24 UTC (permalink / raw)
  To: Tabi Timur-B04825
  Cc: netdev@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	Li Yang-R58472, Zhao Chenhui-B35336
In-Reply-To: <CAOZdJXXB9zJWqC+kPq7ZDdzePtp8XNBnWcf5UmE8Ye50U-G7Dg@mail.gmail.com>

On Fri, Nov 04, 2011 at 07:08:24PM -0500, Tabi Timur-B04825 wrote:
> On Fri, Nov 4, 2011 at 7:39 AM, Zhao Chenhui <chenhui.zhao@freescale.com> wrote:
>
> > +	if (!pmc_regs) {
> > +		printk(KERN_WARNING "PMC is unavailable\n");
>
> Use pr_warn() and the other pr_xxx functions.
>
> > +	pmcdr_mask = (u32 *)of_get_property(clk_np, "fsl,pmcdr-mask", NULL);
>
> The typecast is unnecessary here.
>
> > +	/* clear to enable clock in low power mode */
> > +	if (enable)
> > +		clrbits32(&pmc_regs->pmcdr, *pmcdr_mask);
> > +	else
> > +		setbits32(&pmc_regs->pmcdr, *pmcdr_mask);
>
> You need to use be32_to_cpup() when dereferencing a pointer to a
> device tree property.
>
> -- 
> Timur Tabi
> Linux kernel developer at Freescale

Thanks. I will fix them all.

-chenhui

^ permalink raw reply

* Re: [PATCH 6/7] fsl_pmc: Add API to enable device as wakeup event source
From: Zhao Chenhui @ 2011-11-07 11:22 UTC (permalink / raw)
  To: Scott Wood; +Cc: netdev, linuxppc-dev
In-Reply-To: <4EB455B1.8030009@freescale.com>

On Fri, Nov 04, 2011 at 04:14:25PM -0500, Scott Wood wrote:
> On 11/04/2011 07:39 AM, Zhao Chenhui wrote:
> > @@ -45,6 +46,72 @@ static int has_lossless;
> >   * code can be compatible with both 32-bit & 36-bit */
> >  extern void mpc85xx_enter_deep_sleep(u64 ccsrbar, u32 powmgtreq);
> >  
> > +#ifdef CONFIG_FSL_PMC
> > +/**
> > + * pmc_enable_wake - enable OF device as wakeup event source
> > + * @pdev: platform device affected
> > + * @state: PM state from which device will issue wakeup events
> > + * @enable: True to enable event generation; false to disable
> > + *
> > + * This enables the device as a wakeup event source, or disables it.
> > + *
> > + * RETURN VALUE:
> > + * 0 is returned on success
> > + * -EINVAL is returned if device is not supposed to wake up the system
> > + * Error code depending on the platform is returned if both the platform and
> > + * the native mechanism fail to enable the generation of wake-up events
> > + */
> > +int pmc_enable_wake(struct platform_device *pdev,
> > +				suspend_state_t state, bool enable)
> 
> "pmc" is too generic for a global function.  If this can be either
> enable or disable, perhaps it should be something like
> mpc85xx_pmc_set_wake().
> 
> > +{
> > +	int ret = 0;
> > +	struct device_node *clk_np;
> > +	u32 *pmcdr_mask;
> > +
> > +	if (!pmc_regs) {
> > +		printk(KERN_WARNING "PMC is unavailable\n");
> > +		return -ENOMEM;
> > +	}
> 
> -ENOMEM is not appropriate here, maybe -ENODEV?
> 
> Should print __func__ so the user knows what's complaining.
> 
> > +	if (enable && !device_may_wakeup(&pdev->dev))
> > +		return -EINVAL;
> > +
> > +	clk_np = of_parse_phandle(pdev->dev.of_node, "clk-handle", 0);
> > +	if (!clk_np)
> > +		return -EINVAL;
> > +
> > +	pmcdr_mask = (u32 *)of_get_property(clk_np, "fsl,pmcdr-mask", NULL);
> > +	if (!pmcdr_mask) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* clear to enable clock in low power mode */
> > +	if (enable)
> > +		clrbits32(&pmc_regs->pmcdr, *pmcdr_mask);
> > +	else
> > +		setbits32(&pmc_regs->pmcdr, *pmcdr_mask);
> 
> We should probably initialize PMCDR to all bits set (or at least all
> ones we know are valid) -- the default should be "not a wakeup source".

I think it should be initialized in u-boot.

> 
> > +/**
> > + * pmc_enable_lossless - enable lossless ethernet in low power mode
> > + * @enable: True to enable event generation; false to disable
> > + */
> > +void pmc_enable_lossless(int enable)
> > +{
> > +	if (enable && has_lossless)
> > +		setbits32(&pmc_regs->pmcsr, PMCSR_LOSSLESS);
> > +	else
> > +		clrbits32(&pmc_regs->pmcsr, PMCSR_LOSSLESS);
> > +}
> > +EXPORT_SYMBOL_GPL(pmc_enable_lossless);
> > +#endif
> 
> Won't we overwrite this later?
> 
> -Scott

Do you have any idea?

-chenhui

^ permalink raw reply

* [PATCH] gpio: mpc8xxx: don't allow input-only pins to be output for MPC5121
From: Wolfram Sang @ 2011-11-07 13:21 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Anatolij Gustschin

Add a 5121-custom reject if an input-only pin is requested to be output
(see 18.3.1.1 in the refman). Also, rewrite mach-specific quirk setup to
consume less lines.

Signed-off-by: Wolfram Sang <w.sang@pengutronix.de>
---
 drivers/gpio/gpio-mpc8xxx.c |   17 ++++++++++++-----
 1 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c
index ec3fcf0..25dc736 100644
--- a/drivers/gpio/gpio-mpc8xxx.c
+++ b/drivers/gpio/gpio-mpc8xxx.c
@@ -115,6 +115,14 @@ static int mpc8xxx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio)
 	return 0;
 }
 
+static int mpc5121_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
+{
+	/* GPIO 28..31 are input only on MPC5121 */
+	if (gpio >= 28)
+		return -EINVAL;
+
+	return mpc8xxx_gpio_dir_out(gc, gpio, val);
+}
 static int mpc8xxx_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val)
 {
 	struct of_mm_gpio_chip *mm = to_of_mm_gpio_chip(gc);
@@ -340,11 +348,10 @@ static void __init mpc8xxx_add_controller(struct device_node *np)
 	mm_gc->save_regs = mpc8xxx_gpio_save_regs;
 	gc->ngpio = MPC8XXX_GPIO_PINS;
 	gc->direction_input = mpc8xxx_gpio_dir_in;
-	gc->direction_output = mpc8xxx_gpio_dir_out;
-	if (of_device_is_compatible(np, "fsl,mpc8572-gpio"))
-		gc->get = mpc8572_gpio_get;
-	else
-		gc->get = mpc8xxx_gpio_get;
+	gc->direction_output = of_device_is_compatible(np, "fsl,mpc5121-gpio") ?
+		mpc5121_gpio_dir_out : mpc8xxx_gpio_dir_out;
+	gc->get = of_device_is_compatible(np, "fsl,mpc8572-gpio") ?
+		mpc8572_gpio_get : mpc8xxx_gpio_get;
 	gc->set = mpc8xxx_gpio_set;
 	gc->to_irq = mpc8xxx_gpio_to_irq;
 
-- 
1.7.7.1

^ permalink raw reply related

* Re: [PATCH v2 1/5] [ppc] Process dynamic relocations for kernel
From: Josh Poimboeuf @ 2011-11-07 15:13 UTC (permalink / raw)
  To: Suzuki Poulose
  Cc: Nathan Miller, Josh Poimboeuf, Dave Hansen, Alan Modra,
	Scott Wood, Paul Mackerras, linuxppc-dev
In-Reply-To: <4EB3A40C.1070802@in.ibm.com>

On Fri, 2011-11-04 at 14:06 +0530, Suzuki Poulose wrote:
> On 11/03/11 05:06, Josh Poimboeuf wrote:
> > On Tue, 2011-10-25 at 17:23 +0530, Suzuki K. Poulose wrote:
> > @@ -137,6 +137,9 @@ get_type:
> >   	lwz	r0, 8(r9)	/* r_addend */
> >   	add	r0, r0, r3	/* final addend */
> >   	stwx	r0, r4, r7	/* memory[r4+r7]) = (u32)r0 */
> > +	dcbst	r4,r7		/* flush dcache line to memory */
> > +	sync			/* wait for flush to complete */
> > +	icbi	r4,r7		/* invalidate icache line */
> 
> Doing it this way has two drawbacks :
> 
> 1) Placing it here in relocate would do the flushing for each and every update.

I agree.  My kernel had around 80,000 relocations, which means 80,000
d-cache line flushes (for a 32k d-cache) and 80,000 i-cache line
invalidates (for a 32k i-cache).  Which is obviously a little overkill.
Although I didn't notice a performance hit during boot.

> 2) I would like to keep this code as generic as possible for the PPC32 code.
> 
> Could we move this to the place from which relocate is called and flush the d-cache and
> i-cache entirely?

Why not put the cache flushing code at the end of relocate?  Would some
of the other PPC32 platforms not require the cache flushing?

My PPC32 knowledge is 4xx-centric, so please feel free to rewrite the
patch as needed to accommodate other PPC32 cores.

Thanks,
Josh

^ permalink raw reply

* RE: [PATCH v2 1/5] [ppc] Process dynamic relocations for kernel
From: David Laight @ 2011-11-07 15:26 UTC (permalink / raw)
  To: Josh Poimboeuf, Suzuki Poulose
  Cc: Nathan Miller, Josh Poimboeuf, Dave Hansen, Alan Modra,
	Scott Wood, Paul Mackerras, linuxppc-dev
In-Reply-To: <1320678819.2750.15.camel@treble>

=20
> On Fri, 2011-11-04 at 14:06 +0530, Suzuki Poulose wrote:
> > On 11/03/11 05:06, Josh Poimboeuf wrote:
> > > On Tue, 2011-10-25 at 17:23 +0530, Suzuki K. Poulose wrote:
> > > @@ -137,6 +137,9 @@ get_type:
> > >   	lwz	r0, 8(r9)	/* r_addend */
> > >   	add	r0, r0, r3	/* final addend */
> > >   	stwx	r0, r4, r7	/* memory[r4+r7]) = (u32)r0 */
> > > +	dcbst	r4,r7		/* flush dcache line to memory */
> > > +	sync			/* wait for flush to complete */
> > > +	icbi	r4,r7		/* invalidate icache line */
> > 
> > Doing it this way has two drawbacks :
> > 
> > 1) Placing it here in relocate would do the flushing for
> each and every update.
> 
> I agree.  My kernel had around 80,000 relocations, which means 80,000
> d-cache line flushes (for a 32k d-cache) and 80,000 i-cache line
> invalidates (for a 32k i-cache).  Which is obviously a little
> overkill.
> Although I didn't notice a performance hit during boot.

The I-cache invalidates shouldn't be needed, the un-relocated
code can't be in the I-cache (on the grounds that executing
it would crash the system).
A single sync at the end is probably enough as well.
I guess it is possible for the cpu to prefetch/preload
into the i-cache through the jump into the relocated code?
So maybe a full i-cache invalidate right at the end? (or
a jump indirect? - which is probably there anyway)

The d-cache will need some kind of flush, since the modified
lines have to be written out; the only time it generates
additional memory cycles is if there are two (or more)
relocations in the same d-cache line. Otherwise the early
write-back might help!

	David

^ permalink raw reply

* Re: [PATCH 6/7] fsl_pmc: Add API to enable device as wakeup event source
From: Scott Wood @ 2011-11-07 15:49 UTC (permalink / raw)
  To: Zhao Chenhui; +Cc: netdev, linuxppc-dev
In-Reply-To: <20111107112236.GB16470@localhost.localdomain>

On 11/07/2011 05:22 AM, Zhao Chenhui wrote:
> On Fri, Nov 04, 2011 at 04:14:25PM -0500, Scott Wood wrote:
>> On 11/04/2011 07:39 AM, Zhao Chenhui wrote:
>>> +	if (enable && !device_may_wakeup(&pdev->dev))
>>> +		return -EINVAL;
>>> +
>>> +	clk_np = of_parse_phandle(pdev->dev.of_node, "clk-handle", 0);
>>> +	if (!clk_np)
>>> +		return -EINVAL;
>>> +
>>> +	pmcdr_mask = (u32 *)of_get_property(clk_np, "fsl,pmcdr-mask", NULL);
>>> +	if (!pmcdr_mask) {
>>> +		ret = -EINVAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	/* clear to enable clock in low power mode */
>>> +	if (enable)
>>> +		clrbits32(&pmc_regs->pmcdr, *pmcdr_mask);
>>> +	else
>>> +		setbits32(&pmc_regs->pmcdr, *pmcdr_mask);
>>
>> We should probably initialize PMCDR to all bits set (or at least all
>> ones we know are valid) -- the default should be "not a wakeup source".
> 
> I think it should be initialized in u-boot.

I don't see it.  If you mean you think this should be added to U-Boot, I
disagree.  U-Boot does not use this, and we should not add gratuitous
U-Boot dependencies to Linux -- especially in cases where there are
existing U-Boots in use for relevant boards, that do not have this.

>>> +/**
>>> + * pmc_enable_lossless - enable lossless ethernet in low power mode
>>> + * @enable: True to enable event generation; false to disable
>>> + */
>>> +void pmc_enable_lossless(int enable)
>>> +{
>>> +	if (enable && has_lossless)
>>> +		setbits32(&pmc_regs->pmcsr, PMCSR_LOSSLESS);
>>> +	else
>>> +		clrbits32(&pmc_regs->pmcsr, PMCSR_LOSSLESS);
>>> +}
>>> +EXPORT_SYMBOL_GPL(pmc_enable_lossless);
>>> +#endif
>>
>> Won't we overwrite this later?
>>
>> -Scott
> 
> Do you have any idea?

Set a flag that the code that enters (deep) sleep can use.

Also, rename function to mpc85xx_pmc_set_lossless_ethernet().

-Scott

^ permalink raw reply

* Re: [PATCH] powerpc: Export PIR data through sysfs
From: Scott Wood @ 2011-11-07 17:18 UTC (permalink / raw)
  To: ananth; +Cc: linuxppc-dev, Anton Blanchard, mahesh
In-Reply-To: <20111107044750.GB4361@in.ibm.com>

On 11/06/2011 10:47 PM, Ananth N Mavinakayanahalli wrote:
> The Processor Identification Register (PIR) on powerpc provides
> information to decode the processor identification tag. Decoding
> this information is platform specific.
> 
> Export PIR data via sysfs.
> 
> (Powerpc manuals state this register is 'optional'. I am not sure
> though if there are any Linux supported powerpc platforms that
> don't have it. Code in the kernel referencing PIR isn't under
> a platform ifdef).

Those references are in platform-specific files, under #ifdef
CONFIG_SMP, often in areas that would only be executed in the presence
of multiple CPUs (e.g. secondary release).  The reference in misc_32.S
is inside #ifdef CONFIG_KEXEC and is fairly recent -- it may not have
been tested on these systems.

I don't see PIR (other than in the acronym definition section) in
manuals for UP-only cores such as e300, 8xx, and 750.

What use does userspace have for this?  If you want to return the
currently executing CPU (which unless you're pinned could change as soon
as the value is read...), why not just return smp_processor_id() or
hard_smp_processor_id()?

-Scott

^ permalink raw reply

* Re: [PATCH 2/4] powerpc/time: Use clocksource_register_hz
From: john stultz @ 2011-11-07 18:26 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, Anton Blanchard
In-Reply-To: <20111105005551.GB31510@bloggs.ozlabs.ibm.com>

On Sat, 2011-11-05 at 11:55 +1100, Paul Mackerras wrote:
> On Thu, Nov 03, 2011 at 09:14:44AM -0400, John Stultz wrote:
> > On Thu, 2011-11-03 at 11:59 +1100, Anton Blanchard wrote:
> > > plain text document attachment (clock3)
> > > Use clocksource_register_hz which calculates the shift/mult
> > > factors for us.
> > > 
> > > Signed-off-by: Anton Blanchard <anton@samba.org>
> > > ---
> > > 
> > > Index: linux-build/arch/powerpc/kernel/time.c
> > > ===================================================================
> > > --- linux-build.orig/arch/powerpc/kernel/time.c	2011-11-03 10:19:59.493679032 +1100
> > > +++ linux-build/arch/powerpc/kernel/time.c	2011-11-03 10:20:00.965704053 +1100
> > > @@ -86,8 +86,6 @@ static struct clocksource clocksource_rt
> > >  	.rating       = 400,
> > >  	.flags        = CLOCK_SOURCE_IS_CONTINUOUS,
> > >  	.mask         = CLOCKSOURCE_MASK(64),
> > > -	.shift        = 22,
> > > -	.mult         = 0,	/* To be filled in */
> > >  	.read         = rtc_read,
> > >  };
> > > 
> > > @@ -97,8 +95,6 @@ static struct clocksource clocksource_ti
> > >  	.rating       = 400,
> > >  	.flags        = CLOCK_SOURCE_IS_CONTINUOUS,
> > >  	.mask         = CLOCKSOURCE_MASK(64),
> > > -	.shift        = 22,
> > > -	.mult         = 0,	/* To be filled in */
> > >  	.read         = timebase_read,
> > >  };
> > 
> > So I've held off on ppc conversion to clocksource_register_hz due to the
> > fact that the ppc vdso gettimeofday at least used to make assumptions
> > that shift was 22.
> > 
> > Is that no longer the case?
> 
> It is still the case; specifically, update_vsyscall() in
> arch/powerpc/kernel/time.c converts a multiplier value to a 'tb_to_xs'
> multiplier (timebase to xsec conversion factor, where 1 xsec = 2^-20
> seconds) using a factor which assumes a shift of 22.  The factor needs
> to be 2^(20 + 64 - shift) / 1e9, so we could accommodate other shift
> values by changing the line that computes new_tb_to_xs to do
> 
>        new_tb_to_xs = (u64) mult * (19342813113834067ULL >> shift);
> 
> assuming the shift value is easily available to update_vsyscall
> (I assume it would be clock->shift).

Ok. That sounds reasonable. clock->shift should be correct there.

thanks
-john

^ permalink raw reply

* Re: [PATCH 6/7] fsl_pmc: Add API to enable device as wakeup event source
From: Scott Wood @ 2011-11-07 18:41 UTC (permalink / raw)
  To: Tabi Timur-B04825
  Cc: netdev@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	Li Yang-R58472, Zhao Chenhui-B35336
In-Reply-To: <CAOZdJXXB9zJWqC+kPq7ZDdzePtp8XNBnWcf5UmE8Ye50U-G7Dg@mail.gmail.com>

On 11/04/2011 07:08 PM, Tabi Timur-B04825 wrote:
> On Fri, Nov 4, 2011 at 7:39 AM, Zhao Chenhui <chenhui.zhao@freescale.com> wrote:
>> +       /* clear to enable clock in low power mode */
>> +       if (enable)
>> +               clrbits32(&pmc_regs->pmcdr, *pmcdr_mask);
>> +       else
>> +               setbits32(&pmc_regs->pmcdr, *pmcdr_mask);
> 
> You need to use be32_to_cpup() when dereferencing a pointer to a
> device tree property.

Or just use of_property_read_u32().

-Scott

^ permalink raw reply

* Re: [PATCH] powerpc/p1023: set IRQ[4:6, 11] to high level sensitive for PCIe
From: Scott Wood @ 2011-11-07 18:44 UTC (permalink / raw)
  To: Roy Zang; +Cc: linuxppc-dev
In-Reply-To: <1320654778-3294-1-git-send-email-tie-fei.zang@freescale.com>

On 11/07/2011 02:32 AM, Roy Zang wrote:
> P1023 external IRQ[4:6, 11] do not pin out, but the interrupts are
> shared with PCIe controller.
> The silicon internally ties the interrupts to L, so change the
> IRQ[4:6,11] to high level sensitive for PCIe.

Some extra commentary on why this works would be nice.

The manual says:

> If a PCI Express INTx interrupt is being used, then the PIC must be configured so that external interrupts
> are level-sensitive (EIVPRn[S] = 1).

and

> In general, these signals should be considered mutually exclusive. If a PCI Express INTx signal is being
> used, the PIC must be configured so that external interrupts are level sensitive (EIVPRn[S] = 1). If an IRQn
> signal is being used as edge-triggered (EIVPRn[S] = 0), the system must not allow inbound PCI Express
> INTx transactions.
>
> Note that it is possible to share IRQn and INTx if the external interrupt is level sensitive; however, if an
> interrupt occurs, the interrupt service routine must poll both the external sources connected to the IRQn
> input and the PCI Express INTx sources to determine from which path the external interrupt came. In any
> case, IRQn should be pulled to the negated state as determined by the associated polarity setting in
> EIVPRn[P].

So it looks like there's some magic whereby the configuration of the
MPIC affects how the PCIe feeds the interrupt in.

Is there (or will there be) an erratum, or anything in the manual
besides not being documented as external interrupts, about these
specific interrupts being tied low in silicon or needing to be active high?

-Scott

^ permalink raw reply

* RE: fpga driver on custom PPC target platform (P4080) ...
From: Robert Sciuk @ 2011-11-07 18:48 UTC (permalink / raw)
  To: David Gibson; +Cc: linuxppc-dev, devicetree-discuss, Tabi Timur-B04825
In-Reply-To: <20111105004050.GA1384@truffala.fritz.box>

> > Ah,  my compatible attribute was wrong:
> >
> > 	Compatible = "nxp,pca9539";
> >
> > 	Should have been:
> >
> > 	Compatible = "nxp,pca953x";
> >
> > The tree now seems to bind the i2c gpio drivers properly ... on to
> > the localbus mappings!
> 
> Ah.  In that case the drivers should probably be extended to recognize
> the first compatible.  "wildcard" compatible strings are a bad idea,
> unfortunately there are some already in the wild, as you've seen.  We
> should try to phase them out though.
> 
> --
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_
> _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson

Thanks, David

It did occur to me that a specific binding should be preferred over a generic one.

Rob.

^ permalink raw reply

* Re: [PATCH 4/7] powerpc/85xx: add support to JOG feature using cpufreq interface
From: Scott Wood @ 2011-11-07 18:50 UTC (permalink / raw)
  To: Zhao Chenhui; +Cc: Jerry Huang, linuxppc-dev
In-Reply-To: <20111107102724.GA16470@localhost.localdomain>

On 11/07/2011 04:27 AM, Zhao Chenhui wrote:
> On Fri, Nov 04, 2011 at 02:42:54PM -0500, Scott Wood wrote:
>> On 11/04/2011 07:36 AM, Zhao Chenhui wrote:
>>> +	cpufreq_frequency_table_target(policy,
>>> +				       mpc85xx_freqs,
>>> +				       target_freq,
>>> +				       relation,
>>> +				       &new);
>>> +
>>> +	freqs.old = policy->cur;
>>> +	freqs.new = mpc85xx_freqs[new].frequency;
>>> +	freqs.cpu = policy->cpu;
>>> +
>>> +	mutex_lock(&mpc85xx_switch_mutex);
>>> +	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
>>> +
>>> +	pr_info("Setting frequency for core %d to %d kHz, " \
>>> +		 "PLL ratio is %d/2\n",
>>> +		 policy->cpu,
>>> +		 mpc85xx_freqs[new].frequency,
>>> +		 mpc85xx_freqs[new].index);
>>> +
>>> +	set_pll(mpc85xx_freqs[new].index, policy->cpu);
>>> +
>>> +	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
>>> +	mutex_unlock(&mpc85xx_switch_mutex);
>>> +
>>> +	ppc_proc_freq = freqs.new * 1000ul;
>>
>> ppc_proc_freq is global -- can CPUs not have their frequencies adjusted
>> separately?
>>
>> It should be under the lock, if the lock is needed at all.
>>
> 
> There is only one ppc_proc_freq. no lock.

I realize there's only one.

I'm asking whether CPUs can have their frequencies set independently --
if the answer is no, and this function is not specific to a CPU, my only
concern is the lock.  Either this function can be called multiple times
in parallel, in which case the ppc_proc_freq update should be inside the
lock, or it can't, in which case why do we need the lock at all?

-Scott

^ permalink raw reply

* RE: fpga driver on custom PPC target platform (P4080) ...
From: Robert Sciuk @ 2011-11-07 20:09 UTC (permalink / raw)
  To: linuxppc-dev, devicetree-discuss
In-Reply-To: <20111105004050.GA1384@truffala.fritz.box>

In my continuing saga of dev/tree driver development, I have a problem
which might be obvious to those who have more experience in such
matters.

I'm a bit perplexed on the tree nodes for the localbus/simplebus nodes
for my FPGA.  CS0 is reserved for booting (from NOR flash as required by
our design), CS1 is tied to an FPGA which will always be present.  CS2
actually is tied to both of two (optional) fpga's, which have been
previously mapped by U-Boot (BRn/ORn configuration).  Should I specify a
ranges command as follows?  This seems somehow wrong, to me, and I'm
wondering if there is an alternative representation which would work
better in this case.  If you recall, the programming control lines are
handled on the I2C bus, via a gpio controller.  In an ideal world, the
optional FPE1 and FPE2 fpgas will have the identical .bts stream, and
should support the option to program both simultaneously, or each
individually, but I'm at a loss as how to best represent this in the
tree.

	        localbus@ffe124000 {
                compatible = "fsl,p4080-elbc", "fsl,elbc", "simple-bus";
                reg = <0xf 0xfe124000 0 0x1000>;
                interrupts = <25 2 0 0>;
                interrupt-parent = <&mpic>;
                #address-cells = <2>;
                #size-cells = <1>;

                /* Local bus region mappings */
                ranges = <0 0 0xf 0xe8000000 0x08000000         /* CS0: Boot flash */
                          1 0 0xf 0xd0000000 0x7fff             /* CS1: FPGA0 -  LIM */
                          2 0 0xf 0xd1000000 0x7fff             /* CS2: FPGA1 -  FPE1 */
                          2 0 0xf 0xd2000000 0x7fff >;          /* CS2: FPGA2 -  FPE2 */

                flash@0,0 {
                        compatible = "cfi-flash";
                        reg = <0 0 0x08000000>;
                        bank-width = <2>;
                        device-width = <2>;
                        #size-cells = <1>;
                        #address-cells = <1>;

                        partition@0 {
                                label = "rcw";
                                reg = <0x0 0x20000>;
                                read-only;
                        };
                        partition@40000 {
                                label = "saveenv";
                                reg = <0x40000 0x20000>;
                        };
                        partition@7000000 {
                                label = "fman-firmware";
                                reg = <0x7000000 0x20000>;
                                read-only;
                        };
                        partition@7f80000 {
                                label = "u-boot";
                                reg = <0x7f80000 0x80000>;
                                read-only;
                        };
                                read-only;
                        };
                };

                lim: fpga@1, {
                }

                fpe1: fpga@2, {
                }

                fpe2: fpga@2, {
	}

Again, any pointers here would be greatly appreciated ...

Cheers,
Rob Sciuk

^ permalink raw reply

* Re: Regression: patch " hvc_console: display printk messages on console." causing infinite loop with 3.2-rc0 + Xen.
From: Greg KH @ 2011-11-07 20:24 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Rusty Russell, xen-devel, Konrad Rzeszutek Wilk, ppc-dev, miche,
	linux-kernel, virtualization, Anton Blanchard, Amit Shah, Linus
In-Reply-To: <20111107171942.fe21429583491475f245aa08@canb.auug.org.au>

On Mon, Nov 07, 2011 at 05:19:42PM +1100, Stephen Rothwell wrote:
> Hi Greg,
> 
> On Wed, 2 Nov 2011 18:30:12 -0700 Greg KH <gregkh@suse.de> wrote:
> >
> > On Wed, Nov 02, 2011 at 12:13:09PM +1100, Stephen Rothwell wrote:
> > > 
> > > On Thu, 27 Oct 2011 07:48:06 +0200 Greg KH <gregkh@suse.de> wrote:
> > > >
> > > > On Thu, Oct 27, 2011 at 01:30:08AM -0400, Konrad Rzeszutek Wilk wrote:
> > > > > Hey Miche.
> > > > > 
> > > > > The git commit 361162459f62dc0826b82c9690a741a940f457f0:
> > > > > 
> > > > >     hvc_console: display printk messages on console.
> > > > > 
> > > > > is causing an infinite loop when booting Linux under Xen, as so:
> > > > 
> > > > Ick, not good, thanks for letting us know.
> > > 
> > > Indeed. I am wondering why it was put in a tree and sent to Linus without
> > > any Acks or even being replied to by anyone.  It appeared in the tty tree
> > > between Oct 14 and Oct 25 (while I was unfortunately on vacation).  If
> > > anyone had tried to boot this on any PowerPC server, it would have been
> > > immediately obvious (as it was when I booted Linus' tree last night).
> > > 
> > > And the original author expressed doubts as to his understanding of how
> > > it should all work anyway.
> > > 
> > > Just a little more care, please.
> > > 
> > > I would vote for reverting the original and having it resubmitted with
> > > corrections at some later date.
> > 
> > You are right, I will go do that, sorry for the problems.
> 
> Ping ...
> 
> Linus can you please just revert 361162459f62dc0826b82c9690a741a940f457f0
> "hvc_console: display printk messages on console" as it breaks consoles
> for all PowerPC server machines.

Thanks for doing this, I was going to include it in my next pull request
after 3.2-rc1 was out, but you are right, it should have gone in sooner.

greg k-h

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox