LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* powerpc: Move /proc/ppc64 to /proc/powerpc and add symlink
From: Benjamin Herrenschmidt @ 2009-09-25  5:29 UTC (permalink / raw)
  To: linuxppc-dev list

Some of the stuff in /proc/ppc64 such as the RTAS bits are actually
useful to some 32-bit platforms. Rename the file, and create a
symlink from /proc/ppc64 on 64-bit for backward compatibility.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b23664a..3faa391 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -42,7 +42,7 @@ obj-$(CONFIG_ALTIVEC)		+= vecemu.o
 obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
 obj-$(CONFIG_PPC_OF)		+= of_device.o of_platform.o prom_parse.o
 obj-$(CONFIG_PPC_CLOCK)		+= clock.o
-procfs-$(CONFIG_PPC64)		:= proc_ppc64.o
+procfs-y			:= proc_powerpc.o
 obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)	:= rtas_pci.o
 obj-$(CONFIG_PPC_RTAS)		+= rtas.o rtas-rtc.o $(rtaspci-y-y)
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index ed0ac4e..79a00bb 100644
--- a/arch/powerpc/kernel/lparcfg.c
+++ b/arch/powerpc/kernel/lparcfg.c
@@ -781,9 +781,9 @@ static int __init lparcfg_init(void)
 			!firmware_has_feature(FW_FEATURE_ISERIES))
 		mode |= S_IWUSR;
 
-	ent = proc_create("ppc64/lparcfg", mode, NULL, &lparcfg_fops);
+	ent = proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops);
 	if (!ent) {
-		printk(KERN_ERR "Failed to create ppc64/lparcfg\n");
+		printk(KERN_ERR "Failed to create powerpc/lparcfg\n");
 		return -EIO;
 	}
 
diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c
new file mode 100644
index 0000000..1ed3b8d
--- /dev/null
+++ b/arch/powerpc/kernel/proc_powerpc.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+
+#include <asm/machdep.h>
+#include <asm/vdso_datapage.h>
+#include <asm/rtas.h>
+#include <asm/uaccess.h>
+#include <asm/prom.h>
+
+#ifdef CONFIG_PPC64
+
+static loff_t page_map_seek( struct file *file, loff_t off, int whence)
+{
+	loff_t new;
+	struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
+
+	switch(whence) {
+	case 0:
+		new = off;
+		break;
+	case 1:
+		new = file->f_pos + off;
+		break;
+	case 2:
+		new = dp->size + off;
+		break;
+	default:
+		return -EINVAL;
+	}
+	if ( new < 0 || new > dp->size )
+		return -EINVAL;
+	return (file->f_pos = new);
+}
+
+static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
+			      loff_t *ppos)
+{
+	struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
+	return simple_read_from_buffer(buf, nbytes, ppos, dp->data, dp->size);
+}
+
+static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
+{
+	struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
+
+	if ((vma->vm_end - vma->vm_start) > dp->size)
+		return -EINVAL;
+
+	remap_pfn_range(vma, vma->vm_start, __pa(dp->data) >> PAGE_SHIFT,
+						dp->size, vma->vm_page_prot);
+	return 0;
+}
+
+static const struct file_operations page_map_fops = {
+	.llseek	= page_map_seek,
+	.read	= page_map_read,
+	.mmap	= page_map_mmap
+};
+
+
+static int __init proc_ppc64_init(void)
+{
+	struct proc_dir_entry *pde;
+
+	pde = proc_create_data("powerpc/systemcfg", S_IFREG|S_IRUGO, NULL,
+			       &page_map_fops, vdso_data);
+	if (!pde)
+		return 1;
+	pde->size = PAGE_SIZE;
+
+	return 0;
+}
+__initcall(proc_ppc64_init);
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * Create the powerpc and powerpc/rtas directories early. This allows us to
+ * assume that they have been previously created in drivers.
+ */
+static int __init proc_ppc64_create(void)
+{
+	struct proc_dir_entry *root;
+
+	root = proc_mkdir("powerpc", NULL);
+	if (!root)
+		return 1;
+
+#ifdef CONFIG_PPC64
+	if (!proc_symlink("ppc64", NULL, "powerpc"))
+		pr_err("Failed to create link /proc/ppc64 -> /proc/powerpc\n");
+#endif
+
+	if (!of_find_node_by_path("/rtas"))
+		return 0;
+
+	if (!proc_mkdir("rtas", root))
+		return 1;
+
+	if (!proc_symlink("rtas", NULL, "powerpc/rtas"))
+		return 1;
+
+	return 0;
+}
+core_initcall(proc_ppc64_create);
diff --git a/arch/powerpc/kernel/proc_ppc64.c b/arch/powerpc/kernel/proc_ppc64.c
deleted file mode 100644
index c647dde..0000000
--- a/arch/powerpc/kernel/proc_ppc64.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <linux/kernel.h>
-
-#include <asm/machdep.h>
-#include <asm/vdso_datapage.h>
-#include <asm/rtas.h>
-#include <asm/uaccess.h>
-#include <asm/prom.h>
-
-static loff_t  page_map_seek( struct file *file, loff_t off, int whence);
-static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
-			      loff_t *ppos);
-static int     page_map_mmap( struct file *file, struct vm_area_struct *vma );
-
-static const struct file_operations page_map_fops = {
-	.llseek	= page_map_seek,
-	.read	= page_map_read,
-	.mmap	= page_map_mmap
-};
-
-/*
- * Create the ppc64 and ppc64/rtas directories early. This allows us to
- * assume that they have been previously created in drivers.
- */
-static int __init proc_ppc64_create(void)
-{
-	struct proc_dir_entry *root;
-
-	root = proc_mkdir("ppc64", NULL);
-	if (!root)
-		return 1;
-
-	if (!of_find_node_by_path("/rtas"))
-		return 0;
-
-	if (!proc_mkdir("rtas", root))
-		return 1;
-
-	if (!proc_symlink("rtas", NULL, "ppc64/rtas"))
-		return 1;
-
-	return 0;
-}
-core_initcall(proc_ppc64_create);
-
-static int __init proc_ppc64_init(void)
-{
-	struct proc_dir_entry *pde;
-
-	pde = proc_create_data("ppc64/systemcfg", S_IFREG|S_IRUGO, NULL,
-			       &page_map_fops, vdso_data);
-	if (!pde)
-		return 1;
-	pde->size = PAGE_SIZE;
-
-	return 0;
-}
-__initcall(proc_ppc64_init);
-
-static loff_t page_map_seek( struct file *file, loff_t off, int whence)
-{
-	loff_t new;
-	struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
-
-	switch(whence) {
-	case 0:
-		new = off;
-		break;
-	case 1:
-		new = file->f_pos + off;
-		break;
-	case 2:
-		new = dp->size + off;
-		break;
-	default:
-		return -EINVAL;
-	}
-	if ( new < 0 || new > dp->size )
-		return -EINVAL;
-	return (file->f_pos = new);
-}
-
-static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
-			      loff_t *ppos)
-{
-	struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
-	return simple_read_from_buffer(buf, nbytes, ppos, dp->data, dp->size);
-}
-
-static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
-{
-	struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
-
-	if ((vma->vm_end - vma->vm_start) > dp->size)
-		return -EINVAL;
-
-	remap_pfn_range(vma, vma->vm_start, __pa(dp->data) >> PAGE_SHIFT,
-						dp->size, vma->vm_page_prot);
-	return 0;
-}
-
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 13011a9..a85117d 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -6,7 +6,7 @@
  *      as published by the Free Software Foundation; either version
  *      2 of the License, or (at your option) any later version.
  *
- * /proc/ppc64/rtas/firmware_flash interface
+ * /proc/powerpc/rtas/firmware_flash interface
  *
  * This file implements a firmware_flash interface to pump a firmware
  * image into the kernel.  At reboot time rtas_restart() will see the
@@ -740,7 +740,7 @@ static int __init rtas_flash_init(void)
 		return 1;
 	}
 
-	firmware_flash_pde = create_flash_pde("ppc64/rtas/"
+	firmware_flash_pde = create_flash_pde("powerpc/rtas/"
 					      FIRMWARE_FLASH_NAME,
 					      &rtas_flash_operations);
 	if (firmware_flash_pde == NULL) {
@@ -754,7 +754,7 @@ static int __init rtas_flash_init(void)
 	if (rc != 0)
 		goto cleanup;
 
-	firmware_update_pde = create_flash_pde("ppc64/rtas/"
+	firmware_update_pde = create_flash_pde("powerpc/rtas/"
 					       FIRMWARE_UPDATE_NAME,
 					       &rtas_flash_operations);
 	if (firmware_update_pde == NULL) {
@@ -768,7 +768,7 @@ static int __init rtas_flash_init(void)
 	if (rc != 0)
 		goto cleanup;
 
-	validate_pde = create_flash_pde("ppc64/rtas/" VALIDATE_FLASH_NAME,
+	validate_pde = create_flash_pde("powerpc/rtas/" VALIDATE_FLASH_NAME,
 			      		&validate_flash_operations);
 	if (validate_pde == NULL) {
 		rc = -ENOMEM;
@@ -781,7 +781,7 @@ static int __init rtas_flash_init(void)
 	if (rc != 0)
 		goto cleanup;
 
-	manage_pde = create_flash_pde("ppc64/rtas/" MANAGE_FLASH_NAME,
+	manage_pde = create_flash_pde("powerpc/rtas/" MANAGE_FLASH_NAME,
 				      &manage_flash_operations);
 	if (manage_pde == NULL) {
 		rc = -ENOMEM;
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
index 2e2bbe1..5182d2b 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -184,7 +184,7 @@ static int pSeries_reconfig_remove_node(struct device_node *np)
 }
 
 /*
- * /proc/ppc64/ofdt - yucky binary interface for adding and removing
+ * /proc/powerpc/ofdt - yucky binary interface for adding and removing
  * OF device nodes.  Should be deprecated as soon as we get an
  * in-kernel wrapper for the RTAS ibm,configure-connector call.
  */
@@ -543,7 +543,7 @@ static const struct file_operations ofdt_fops = {
 	.write = ofdt_write
 };
 
-/* create /proc/ppc64/ofdt write-only by root */
+/* create /proc/powerpc/ofdt write-only by root */
 static int proc_ppc64_create_ofdt(void)
 {
 	struct proc_dir_entry *ent;
@@ -551,7 +551,7 @@ static int proc_ppc64_create_ofdt(void)
 	if (!machine_is(pseries))
 		return 0;
 
-	ent = proc_create("ppc64/ofdt", S_IWUSR, NULL, &ofdt_fops);
+	ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops);
 	if (ent)
 		ent->size = 0;
 
diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c
index 417eca7..1b45c45 100644
--- a/arch/powerpc/platforms/pseries/scanlog.c
+++ b/arch/powerpc/platforms/pseries/scanlog.c
@@ -13,7 +13,7 @@
  * of this data using this driver.  A dump exists if the device-tree
  * /chosen/ibm,scan-log-data property exists.
  *
- * This driver exports /proc/ppc64/scan-log-dump which can be read.
+ * This driver exports /proc/powerpc/scan-log-dump which can be read.
  * The driver supports only sequential reads.
  *
  * The driver looks at a write to the driver for the single word "reset".
@@ -186,7 +186,7 @@ static int __init scanlog_init(void)
 	if (!data)
 		goto err;
 
-	ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL,
+	ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL,
 			       &scanlog_fops, data);
 	if (!ent)
 		goto err;

^ permalink raw reply related

* powerpc/chrp: Use the same RTAS daemon as pSeries
From: Benjamin Herrenschmidt @ 2009-09-25  5:30 UTC (permalink / raw)
  To: linuxppc-dev list

The CHRP code has some fishy timer based code to scan the RTAS event
log, which uses a 1KB stack buffer and doesn't even use the results.

The pSeries code has a nicer daemon that allows userspace to read the
event log and basically uses the same RTAS interface.

This patch moves rtasd.c out of platforms/pseries and makes it usable
by CHRP, after removing the old crufty event log mechanism in there.

The nvram logging part of the daemon is still only available on 64-bit
since the underlying nvram management routines aren't currently shared.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 3faa391..c002b04 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -46,6 +46,7 @@ procfs-y			:= proc_powerpc.o
 obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)	:= rtas_pci.o
 obj-$(CONFIG_PPC_RTAS)		+= rtas.o rtas-rtc.o $(rtaspci-y-y)
+obj-$(CONFIG_PPC_RTAS_DAEMON)	+= rtasd.o
 obj-$(CONFIG_RTAS_FLASH)	+= rtas_flash.o
 obj-$(CONFIG_RTAS_PROC)		+= rtas-proc.o
 obj-$(CONFIG_LPARCFG)		+= lparcfg.o
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
new file mode 100644
index 0000000..2e4832a
--- /dev/null
+++ b/arch/powerpc/kernel/rtasd.c
@@ -0,0 +1,539 @@
+/*
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Communication to userspace based on kernel/printk.c
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/workqueue.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/nvram.h>
+#include <asm/atomic.h>
+#include <asm/machdep.h>
+
+
+static DEFINE_SPINLOCK(rtasd_log_lock);
+
+static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
+
+static char *rtas_log_buf;
+static unsigned long rtas_log_start;
+static unsigned long rtas_log_size;
+
+static int surveillance_timeout = -1;
+
+static unsigned int rtas_error_log_max;
+static unsigned int rtas_error_log_buffer_max;
+
+/* RTAS service tokens */
+static unsigned int event_scan;
+static unsigned int rtas_event_scan_rate;
+
+static int full_rtas_msgs = 0;
+
+/* Stop logging to nvram after first fatal error */
+static int logging_enabled; /* Until we initialize everything,
+                             * make sure we don't try logging
+                             * anything */
+static int error_log_cnt;
+
+/*
+ * Since we use 32 bit RTAS, the physical address of this must be below
+ * 4G or else bad things happen. Allocate this in the kernel data and
+ * make it big enough.
+ */
+static unsigned char logdata[RTAS_ERROR_LOG_MAX];
+
+static char *rtas_type[] = {
+	"Unknown", "Retry", "TCE Error", "Internal Device Failure",
+	"Timeout", "Data Parity", "Address Parity", "Cache Parity",
+	"Address Invalid", "ECC Uncorrected", "ECC Corrupted",
+};
+
+static char *rtas_event_type(int type)
+{
+	if ((type > 0) && (type < 11))
+		return rtas_type[type];
+
+	switch (type) {
+		case RTAS_TYPE_EPOW:
+			return "EPOW";
+		case RTAS_TYPE_PLATFORM:
+			return "Platform Error";
+		case RTAS_TYPE_IO:
+			return "I/O Event";
+		case RTAS_TYPE_INFO:
+			return "Platform Information Event";
+		case RTAS_TYPE_DEALLOC:
+			return "Resource Deallocation Event";
+		case RTAS_TYPE_DUMP:
+			return "Dump Notification Event";
+	}
+
+	return rtas_type[0];
+}
+
+/* To see this info, grep RTAS /var/log/messages and each entry
+ * will be collected together with obvious begin/end.
+ * There will be a unique identifier on the begin and end lines.
+ * This will persist across reboots.
+ *
+ * format of error logs returned from RTAS:
+ * bytes	(size)	: contents
+ * --------------------------------------------------------
+ * 0-7		(8)	: rtas_error_log
+ * 8-47		(40)	: extended info
+ * 48-51	(4)	: vendor id
+ * 52-1023 (vendor specific) : location code and debug data
+ */
+static void printk_log_rtas(char *buf, int len)
+{
+
+	int i,j,n = 0;
+	int perline = 16;
+	char buffer[64];
+	char * str = "RTAS event";
+
+	if (full_rtas_msgs) {
+		printk(RTAS_DEBUG "%d -------- %s begin --------\n",
+		       error_log_cnt, str);
+
+		/*
+		 * Print perline bytes on each line, each line will start
+		 * with RTAS and a changing number, so syslogd will
+		 * print lines that are otherwise the same.  Separate every
+		 * 4 bytes with a space.
+		 */
+		for (i = 0; i < len; i++) {
+			j = i % perline;
+			if (j == 0) {
+				memset(buffer, 0, sizeof(buffer));
+				n = sprintf(buffer, "RTAS %d:", i/perline);
+			}
+
+			if ((i % 4) == 0)
+				n += sprintf(buffer+n, " ");
+
+			n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
+
+			if (j == (perline-1))
+				printk(KERN_DEBUG "%s\n", buffer);
+		}
+		if ((i % perline) != 0)
+			printk(KERN_DEBUG "%s\n", buffer);
+
+		printk(RTAS_DEBUG "%d -------- %s end ----------\n",
+		       error_log_cnt, str);
+	} else {
+		struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
+
+		printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
+		       error_log_cnt, rtas_event_type(errlog->type),
+		       errlog->severity);
+	}
+}
+
+static int log_rtas_len(char * buf)
+{
+	int len;
+	struct rtas_error_log *err;
+
+	/* rtas fixed header */
+	len = 8;
+	err = (struct rtas_error_log *)buf;
+	if (err->extended_log_length) {
+
+		/* extended header */
+		len += err->extended_log_length;
+	}
+
+	if (rtas_error_log_max == 0)
+		rtas_error_log_max = rtas_get_error_log_max();
+
+	if (len > rtas_error_log_max)
+		len = rtas_error_log_max;
+
+	return len;
+}
+
+/*
+ * First write to nvram, if fatal error, that is the only
+ * place we log the info.  The error will be picked up
+ * on the next reboot by rtasd.  If not fatal, run the
+ * method for the type of error.  Currently, only RTAS
+ * errors have methods implemented, but in the future
+ * there might be a need to store data in nvram before a
+ * call to panic().
+ *
+ * XXX We write to nvram periodically, to indicate error has
+ * been written and sync'd, but there is a possibility
+ * that if we don't shutdown correctly, a duplicate error
+ * record will be created on next reboot.
+ */
+void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
+{
+	unsigned long offset;
+	unsigned long s;
+	int len = 0;
+
+	pr_debug("rtasd: logging event\n");
+	if (buf == NULL)
+		return;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+
+	/* get length and increase count */
+	switch (err_type & ERR_TYPE_MASK) {
+	case ERR_TYPE_RTAS_LOG:
+		len = log_rtas_len(buf);
+		if (!(err_type & ERR_FLAG_BOOT))
+			error_log_cnt++;
+		break;
+	case ERR_TYPE_KERNEL_PANIC:
+	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+#ifdef CONFIG_PPC64
+	/* Write error to NVRAM */
+	if (logging_enabled && !(err_type & ERR_FLAG_BOOT))
+		nvram_write_error_log(buf, len, err_type, error_log_cnt);
+#endif /* CONFIG_PPC64 */
+
+	/*
+	 * rtas errors can occur during boot, and we do want to capture
+	 * those somewhere, even if nvram isn't ready (why not?), and even
+	 * if rtasd isn't ready. Put them into the boot log, at least.
+	 */
+	if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
+		printk_log_rtas(buf, len);
+
+	/* Check to see if we need to or have stopped logging */
+	if (fatal || !logging_enabled) {
+		logging_enabled = 0;
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+	/* call type specific method for error */
+	switch (err_type & ERR_TYPE_MASK) {
+	case ERR_TYPE_RTAS_LOG:
+		offset = rtas_error_log_buffer_max *
+			((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
+
+		/* First copy over sequence number */
+		memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
+
+		/* Second copy over error log data */
+		offset += sizeof(int);
+		memcpy(&rtas_log_buf[offset], buf, len);
+
+		if (rtas_log_size < LOG_NUMBER)
+			rtas_log_size += 1;
+		else
+			rtas_log_start += 1;
+
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		wake_up_interruptible(&rtas_log_wait);
+		break;
+	case ERR_TYPE_KERNEL_PANIC:
+	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		return;
+	}
+
+}
+
+static int rtas_log_open(struct inode * inode, struct file * file)
+{
+	return 0;
+}
+
+static int rtas_log_release(struct inode * inode, struct file * file)
+{
+	return 0;
+}
+
+/* This will check if all events are logged, if they are then, we
+ * know that we can safely clear the events in NVRAM.
+ * Next we'll sit and wait for something else to log.
+ */
+static ssize_t rtas_log_read(struct file * file, char __user * buf,
+			 size_t count, loff_t *ppos)
+{
+	int error;
+	char *tmp;
+	unsigned long s;
+	unsigned long offset;
+
+	if (!buf || count < rtas_error_log_buffer_max)
+		return -EINVAL;
+
+	count = rtas_error_log_buffer_max;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+
+	/* if it's 0, then we know we got the last one (the one in NVRAM) */
+	while (rtas_log_size == 0) {
+		if (file->f_flags & O_NONBLOCK) {
+			spin_unlock_irqrestore(&rtasd_log_lock, s);
+			error = -EAGAIN;
+			goto out;
+		}
+
+		if (!logging_enabled) {
+			spin_unlock_irqrestore(&rtasd_log_lock, s);
+			error = -ENODATA;
+			goto out;
+		}
+#ifdef CONFIG_PPC64
+		nvram_clear_error_log();
+#endif /* CONFIG_PPC64 */
+
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+		error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
+		if (error)
+			goto out;
+		spin_lock_irqsave(&rtasd_log_lock, s);
+	}
+
+	offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
+	memcpy(tmp, &rtas_log_buf[offset], count);
+
+	rtas_log_start += 1;
+	rtas_log_size -= 1;
+	spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+	error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
+out:
+	kfree(tmp);
+	return error;
+}
+
+static unsigned int rtas_log_poll(struct file *file, poll_table * wait)
+{
+	poll_wait(file, &rtas_log_wait, wait);
+	if (rtas_log_size)
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+static const struct file_operations proc_rtas_log_operations = {
+	.read =		rtas_log_read,
+	.poll =		rtas_log_poll,
+	.open =		rtas_log_open,
+	.release =	rtas_log_release,
+};
+
+static int enable_surveillance(int timeout)
+{
+	int error;
+
+	error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
+
+	if (error == 0)
+		return 0;
+
+	if (error == -EINVAL) {
+		printk(KERN_DEBUG "rtasd: surveillance not supported\n");
+		return 0;
+	}
+
+	printk(KERN_ERR "rtasd: could not update surveillance\n");
+	return -1;
+}
+
+static void do_event_scan(void)
+{
+	int error;
+	do {
+		memset(logdata, 0, rtas_error_log_max);
+		error = rtas_call(event_scan, 4, 1, NULL,
+				  RTAS_EVENT_SCAN_ALL_EVENTS, 0,
+				  __pa(logdata), rtas_error_log_max);
+		if (error == -1) {
+			printk(KERN_ERR "event-scan failed\n");
+			break;
+		}
+
+		if (error == 0)
+			pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+
+	} while(error == 0);
+}
+
+static void rtas_event_scan(struct work_struct *w);
+DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
+
+/*
+ * Delay should be at least one second since some machines have problems if
+ * we call event-scan too quickly.
+ */
+static unsigned long event_scan_delay = 1*HZ;
+static int first_pass = 1;
+
+static void rtas_event_scan(struct work_struct *w)
+{
+	unsigned int cpu;
+
+	do_event_scan();
+
+	get_online_cpus();
+
+	cpu = next_cpu(smp_processor_id(), cpu_online_map);
+	if (cpu == NR_CPUS) {
+		cpu = first_cpu(cpu_online_map);
+
+		if (first_pass) {
+			first_pass = 0;
+			event_scan_delay = 30*HZ/rtas_event_scan_rate;
+
+			if (surveillance_timeout != -1) {
+				pr_debug("rtasd: enabling surveillance\n");
+				enable_surveillance(surveillance_timeout);
+				pr_debug("rtasd: surveillance enabled\n");
+			}
+		}
+	}
+
+	schedule_delayed_work_on(cpu, &event_scan_work,
+		__round_jiffies_relative(event_scan_delay, cpu));
+
+	put_online_cpus();
+}
+
+#ifdef CONFIG_PPC64
+static void retreive_nvram_error_log(void)
+{
+	unsigned int err_type ;
+	int rc ;
+
+	/* See if we have any error stored in NVRAM */
+	memset(logdata, 0, rtas_error_log_max);
+	rc = nvram_read_error_log(logdata, rtas_error_log_max,
+	                          &err_type, &error_log_cnt);
+	/* We can use rtas_log_buf now */
+	logging_enabled = 1;
+	if (!rc) {
+		if (err_type != ERR_FLAG_ALREADY_LOGGED) {
+			pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
+		}
+	}
+}
+#else /* CONFIG_PPC64 */
+static void retreive_nvram_error_log(void)
+{
+}
+#endif /* CONFIG_PPC64 */
+
+static void start_event_scan(void)
+{
+	printk(KERN_DEBUG "RTAS daemon started\n");
+	pr_debug("rtasd: will sleep for %d milliseconds\n",
+		 (30000 / rtas_event_scan_rate));
+
+	/* Retreive errors from nvram if any */
+	retreive_nvram_error_log();
+
+	schedule_delayed_work_on(first_cpu(cpu_online_map), &event_scan_work,
+				 event_scan_delay);
+}
+
+static int __init rtas_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	if (!machine_is(pseries) && !machine_is(chrp))
+		return 0;
+
+	/* No RTAS */
+	event_scan = rtas_token("event-scan");
+	if (event_scan == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_INFO "rtasd: No event-scan on system\n");
+		return -ENODEV;
+	}
+
+	rtas_event_scan_rate = rtas_token("rtas-event-scan-rate");
+	if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n");
+		return -ENODEV;
+	}
+
+	/* Make room for the sequence number */
+	rtas_error_log_max = rtas_get_error_log_max();
+	rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
+
+	rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER);
+	if (!rtas_log_buf) {
+		printk(KERN_ERR "rtasd: no memory\n");
+		return -ENOMEM;
+	}
+
+	entry = proc_create("powerpc/rtas/error_log", S_IRUSR, NULL,
+			    &proc_rtas_log_operations);
+	if (!entry)
+		printk(KERN_ERR "Failed to create error_log proc entry\n");
+
+	start_event_scan();
+
+	return 0;
+}
+__initcall(rtas_init);
+
+static int __init surveillance_setup(char *str)
+{
+	int i;
+
+	/* We only do surveillance on pseries */
+	if (!machine_is(pseries))
+		return 0;
+
+	if (get_option(&str,&i)) {
+		if (i >= 0 && i <= 255)
+			surveillance_timeout = i;
+	}
+
+	return 1;
+}
+__setup("surveillance=", surveillance_setup);
+
+static int __init rtasmsgs_setup(char *str)
+{
+	if (strcmp(str, "on") == 0)
+		full_rtas_msgs = 1;
+	else if (strcmp(str, "off") == 0)
+		full_rtas_msgs = 0;
+
+	return 1;
+}
+__setup("rtasmsgs=", rtasmsgs_setup);
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 04a8061..56bf126 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -86,6 +86,11 @@ config RTAS_ERROR_LOGGING
 	depends on PPC_RTAS
 	default n
 
+config PPC_RTAS_DAEMON
+	bool
+	depends on PPC_RTAS
+	default n
+
 config RTAS_PROC
 	bool "Proc interface to RTAS"
 	depends on PPC_RTAS
diff --git a/arch/powerpc/platforms/chrp/Kconfig b/arch/powerpc/platforms/chrp/Kconfig
index 37d438b..bc0b0ef 100644
--- a/arch/powerpc/platforms/chrp/Kconfig
+++ b/arch/powerpc/platforms/chrp/Kconfig
@@ -5,6 +5,8 @@ config PPC_CHRP
 	select PPC_I8259
 	select PPC_INDIRECT_PCI
 	select PPC_RTAS
+	select PPC_RTAS_DAEMON
+	select RTAS_ERROR_LOGGING
 	select PPC_MPC106
 	select PPC_UDBG_16550
 	select PPC_NATIVE
diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c
index cd4ad9a..52f3df3 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -364,19 +364,6 @@ void __init chrp_setup_arch(void)
 	if (ppc_md.progress) ppc_md.progress("Linux/PPC "UTS_RELEASE"\n", 0x0);
 }
 
-void
-chrp_event_scan(unsigned long unused)
-{
-	unsigned char log[1024];
-	int ret = 0;
-
-	/* XXX: we should loop until the hardware says no more error logs -- Cort */
-	rtas_call(rtas_token("event-scan"), 4, 1, &ret, 0xffffffff, 0,
-		  __pa(log), 1024);
-	mod_timer(&__get_cpu_var(heartbeat_timer),
-		  jiffies + event_scan_interval);
-}
-
 static void chrp_8259_cascade(unsigned int irq, struct irq_desc *desc)
 {
 	unsigned int cascade_irq = i8259_irq();
@@ -568,9 +555,6 @@ void __init chrp_init_IRQ(void)
 void __init
 chrp_init2(void)
 {
-	struct device_node *device;
-	const unsigned int *p = NULL;
-
 #ifdef CONFIG_NVRAM
 	chrp_nvram_init();
 #endif
@@ -582,40 +566,6 @@ chrp_init2(void)
 	request_region(0x80,0x10,"dma page reg");
 	request_region(0xc0,0x20,"dma2");
 
-	/* Get the event scan rate for the rtas so we know how
-	 * often it expects a heartbeat. -- Cort
-	 */
-	device = of_find_node_by_name(NULL, "rtas");
-	if (device)
-		p = of_get_property(device, "rtas-event-scan-rate", NULL);
-	if (p && *p) {
-		/*
-		 * Arrange to call chrp_event_scan at least *p times
-		 * per minute.  We use 59 rather than 60 here so that
-		 * the rate will be slightly higher than the minimum.
-		 * This all assumes we don't do hotplug CPU on any
-		 * machine that needs the event scans done.
-		 */
-		unsigned long interval, offset;
-		int cpu, ncpus;
-		struct timer_list *timer;
-
-		interval = HZ * 59 / *p;
-		offset = HZ;
-		ncpus = num_online_cpus();
-		event_scan_interval = ncpus * interval;
-		for (cpu = 0; cpu < ncpus; ++cpu) {
-			timer = &per_cpu(heartbeat_timer, cpu);
-			setup_timer(timer, chrp_event_scan, 0);
-			timer->expires = jiffies + offset;
-			add_timer_on(timer, cpu);
-			offset += interval;
-		}
-		printk("RTAS Event Scan Rate: %u (%lu jiffies)\n",
-		       *p, interval);
-	}
-	of_node_put(device);
-
 	if (ppc_md.progress)
 		ppc_md.progress("  Have fun!    ", 0x7777);
 }
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index f0e6f28..26a24bd 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -4,6 +4,7 @@ config PPC_PSERIES
 	select MPIC
 	select PPC_I8259
 	select PPC_RTAS
+	select PPC_RTAS_DAEMON
 	select RTAS_ERROR_LOGGING
 	select PPC_UDBG_16550
 	select PPC_NATIVE
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 790c0b8..4b1c422 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -7,7 +7,7 @@ EXTRA_CFLAGS		+= -DDEBUG
 endif
 
 obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
-			   setup.o iommu.o ras.o rtasd.o \
+			   setup.o iommu.o ras.o \
 			   firmware.o power.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_XICS)	+= xics.o
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
deleted file mode 100644
index b3cbac8..0000000
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ /dev/null
@@ -1,519 +0,0 @@
-/*
- * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Communication to userspace based on kernel/printk.c
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/poll.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/spinlock.h>
-#include <linux/cpu.h>
-#include <linux/workqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/rtas.h>
-#include <asm/prom.h>
-#include <asm/nvram.h>
-#include <asm/atomic.h>
-#include <asm/machdep.h>
-
-
-static DEFINE_SPINLOCK(rtasd_log_lock);
-
-static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
-
-static char *rtas_log_buf;
-static unsigned long rtas_log_start;
-static unsigned long rtas_log_size;
-
-static int surveillance_timeout = -1;
-static unsigned int rtas_error_log_max;
-static unsigned int rtas_error_log_buffer_max;
-
-/* RTAS service tokens */
-static unsigned int event_scan;
-static unsigned int rtas_event_scan_rate;
-
-static int full_rtas_msgs = 0;
-
-/* Stop logging to nvram after first fatal error */
-static int logging_enabled; /* Until we initialize everything,
-                             * make sure we don't try logging
-                             * anything */
-static int error_log_cnt;
-
-/*
- * Since we use 32 bit RTAS, the physical address of this must be below
- * 4G or else bad things happen. Allocate this in the kernel data and
- * make it big enough.
- */
-static unsigned char logdata[RTAS_ERROR_LOG_MAX];
-
-static char *rtas_type[] = {
-	"Unknown", "Retry", "TCE Error", "Internal Device Failure",
-	"Timeout", "Data Parity", "Address Parity", "Cache Parity",
-	"Address Invalid", "ECC Uncorrected", "ECC Corrupted",
-};
-
-static char *rtas_event_type(int type)
-{
-	if ((type > 0) && (type < 11))
-		return rtas_type[type];
-
-	switch (type) {
-		case RTAS_TYPE_EPOW:
-			return "EPOW";
-		case RTAS_TYPE_PLATFORM:
-			return "Platform Error";
-		case RTAS_TYPE_IO:
-			return "I/O Event";
-		case RTAS_TYPE_INFO:
-			return "Platform Information Event";
-		case RTAS_TYPE_DEALLOC:
-			return "Resource Deallocation Event";
-		case RTAS_TYPE_DUMP:
-			return "Dump Notification Event";
-	}
-
-	return rtas_type[0];
-}
-
-/* To see this info, grep RTAS /var/log/messages and each entry
- * will be collected together with obvious begin/end.
- * There will be a unique identifier on the begin and end lines.
- * This will persist across reboots.
- *
- * format of error logs returned from RTAS:
- * bytes	(size)	: contents
- * --------------------------------------------------------
- * 0-7		(8)	: rtas_error_log
- * 8-47		(40)	: extended info
- * 48-51	(4)	: vendor id
- * 52-1023 (vendor specific) : location code and debug data
- */
-static void printk_log_rtas(char *buf, int len)
-{
-
-	int i,j,n = 0;
-	int perline = 16;
-	char buffer[64];
-	char * str = "RTAS event";
-
-	if (full_rtas_msgs) {
-		printk(RTAS_DEBUG "%d -------- %s begin --------\n",
-		       error_log_cnt, str);
-
-		/*
-		 * Print perline bytes on each line, each line will start
-		 * with RTAS and a changing number, so syslogd will
-		 * print lines that are otherwise the same.  Separate every
-		 * 4 bytes with a space.
-		 */
-		for (i = 0; i < len; i++) {
-			j = i % perline;
-			if (j == 0) {
-				memset(buffer, 0, sizeof(buffer));
-				n = sprintf(buffer, "RTAS %d:", i/perline);
-			}
-
-			if ((i % 4) == 0)
-				n += sprintf(buffer+n, " ");
-
-			n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
-
-			if (j == (perline-1))
-				printk(KERN_DEBUG "%s\n", buffer);
-		}
-		if ((i % perline) != 0)
-			printk(KERN_DEBUG "%s\n", buffer);
-
-		printk(RTAS_DEBUG "%d -------- %s end ----------\n",
-		       error_log_cnt, str);
-	} else {
-		struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
-
-		printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
-		       error_log_cnt, rtas_event_type(errlog->type),
-		       errlog->severity);
-	}
-}
-
-static int log_rtas_len(char * buf)
-{
-	int len;
-	struct rtas_error_log *err;
-
-	/* rtas fixed header */
-	len = 8;
-	err = (struct rtas_error_log *)buf;
-	if (err->extended_log_length) {
-
-		/* extended header */
-		len += err->extended_log_length;
-	}
-
-	if (rtas_error_log_max == 0)
-		rtas_error_log_max = rtas_get_error_log_max();
-
-	if (len > rtas_error_log_max)
-		len = rtas_error_log_max;
-
-	return len;
-}
-
-/*
- * First write to nvram, if fatal error, that is the only
- * place we log the info.  The error will be picked up
- * on the next reboot by rtasd.  If not fatal, run the
- * method for the type of error.  Currently, only RTAS
- * errors have methods implemented, but in the future
- * there might be a need to store data in nvram before a
- * call to panic().
- *
- * XXX We write to nvram periodically, to indicate error has
- * been written and sync'd, but there is a possibility
- * that if we don't shutdown correctly, a duplicate error
- * record will be created on next reboot.
- */
-void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
-{
-	unsigned long offset;
-	unsigned long s;
-	int len = 0;
-
-	pr_debug("rtasd: logging event\n");
-	if (buf == NULL)
-		return;
-
-	spin_lock_irqsave(&rtasd_log_lock, s);
-
-	/* get length and increase count */
-	switch (err_type & ERR_TYPE_MASK) {
-	case ERR_TYPE_RTAS_LOG:
-		len = log_rtas_len(buf);
-		if (!(err_type & ERR_FLAG_BOOT))
-			error_log_cnt++;
-		break;
-	case ERR_TYPE_KERNEL_PANIC:
-	default:
-		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		return;
-	}
-
-	/* Write error to NVRAM */
-	if (logging_enabled && !(err_type & ERR_FLAG_BOOT))
-		nvram_write_error_log(buf, len, err_type, error_log_cnt);
-
-	/*
-	 * rtas errors can occur during boot, and we do want to capture
-	 * those somewhere, even if nvram isn't ready (why not?), and even
-	 * if rtasd isn't ready. Put them into the boot log, at least.
-	 */
-	if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
-		printk_log_rtas(buf, len);
-
-	/* Check to see if we need to or have stopped logging */
-	if (fatal || !logging_enabled) {
-		logging_enabled = 0;
-		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		return;
-	}
-
-	/* call type specific method for error */
-	switch (err_type & ERR_TYPE_MASK) {
-	case ERR_TYPE_RTAS_LOG:
-		offset = rtas_error_log_buffer_max *
-			((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
-
-		/* First copy over sequence number */
-		memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
-
-		/* Second copy over error log data */
-		offset += sizeof(int);
-		memcpy(&rtas_log_buf[offset], buf, len);
-
-		if (rtas_log_size < LOG_NUMBER)
-			rtas_log_size += 1;
-		else
-			rtas_log_start += 1;
-
-		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		wake_up_interruptible(&rtas_log_wait);
-		break;
-	case ERR_TYPE_KERNEL_PANIC:
-	default:
-		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		return;
-	}
-
-}
-
-
-static int rtas_log_open(struct inode * inode, struct file * file)
-{
-	return 0;
-}
-
-static int rtas_log_release(struct inode * inode, struct file * file)
-{
-	return 0;
-}
-
-/* This will check if all events are logged, if they are then, we
- * know that we can safely clear the events in NVRAM.
- * Next we'll sit and wait for something else to log.
- */
-static ssize_t rtas_log_read(struct file * file, char __user * buf,
-			 size_t count, loff_t *ppos)
-{
-	int error;
-	char *tmp;
-	unsigned long s;
-	unsigned long offset;
-
-	if (!buf || count < rtas_error_log_buffer_max)
-		return -EINVAL;
-
-	count = rtas_error_log_buffer_max;
-
-	if (!access_ok(VERIFY_WRITE, buf, count))
-		return -EFAULT;
-
-	tmp = kmalloc(count, GFP_KERNEL);
-	if (!tmp)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&rtasd_log_lock, s);
-	/* if it's 0, then we know we got the last one (the one in NVRAM) */
-	while (rtas_log_size == 0) {
-		if (file->f_flags & O_NONBLOCK) {
-			spin_unlock_irqrestore(&rtasd_log_lock, s);
-			error = -EAGAIN;
-			goto out;
-		}
-
-		if (!logging_enabled) {
-			spin_unlock_irqrestore(&rtasd_log_lock, s);
-			error = -ENODATA;
-			goto out;
-		}
-		nvram_clear_error_log();
-
-		spin_unlock_irqrestore(&rtasd_log_lock, s);
-		error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
-		if (error)
-			goto out;
-		spin_lock_irqsave(&rtasd_log_lock, s);
-	}
-
-	offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
-	memcpy(tmp, &rtas_log_buf[offset], count);
-
-	rtas_log_start += 1;
-	rtas_log_size -= 1;
-	spin_unlock_irqrestore(&rtasd_log_lock, s);
-
-	error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
-out:
-	kfree(tmp);
-	return error;
-}
-
-static unsigned int rtas_log_poll(struct file *file, poll_table * wait)
-{
-	poll_wait(file, &rtas_log_wait, wait);
-	if (rtas_log_size)
-		return POLLIN | POLLRDNORM;
-	return 0;
-}
-
-static const struct file_operations proc_rtas_log_operations = {
-	.read =		rtas_log_read,
-	.poll =		rtas_log_poll,
-	.open =		rtas_log_open,
-	.release =	rtas_log_release,
-};
-
-static int enable_surveillance(int timeout)
-{
-	int error;
-
-	error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
-
-	if (error == 0)
-		return 0;
-
-	if (error == -EINVAL) {
-		printk(KERN_DEBUG "rtasd: surveillance not supported\n");
-		return 0;
-	}
-
-	printk(KERN_ERR "rtasd: could not update surveillance\n");
-	return -1;
-}
-
-static void do_event_scan(void)
-{
-	int error;
-	do {
-		memset(logdata, 0, rtas_error_log_max);
-		error = rtas_call(event_scan, 4, 1, NULL,
-				  RTAS_EVENT_SCAN_ALL_EVENTS, 0,
-				  __pa(logdata), rtas_error_log_max);
-		if (error == -1) {
-			printk(KERN_ERR "event-scan failed\n");
-			break;
-		}
-
-		if (error == 0)
-			pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
-
-	} while(error == 0);
-}
-
-static void rtas_event_scan(struct work_struct *w);
-DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
-
-/*
- * Delay should be at least one second since some machines have problems if
- * we call event-scan too quickly.
- */
-static unsigned long event_scan_delay = 1*HZ;
-static int first_pass = 1;
-
-static void rtas_event_scan(struct work_struct *w)
-{
-	unsigned int cpu;
-
-	do_event_scan();
-
-	get_online_cpus();
-
-	cpu = next_cpu(smp_processor_id(), cpu_online_map);
-	if (cpu == NR_CPUS) {
-		cpu = first_cpu(cpu_online_map);
-
-		if (first_pass) {
-			first_pass = 0;
-			event_scan_delay = 30*HZ/rtas_event_scan_rate;
-
-			if (surveillance_timeout != -1) {
-				pr_debug("rtasd: enabling surveillance\n");
-				enable_surveillance(surveillance_timeout);
-				pr_debug("rtasd: surveillance enabled\n");
-			}
-		}
-	}
-
-	schedule_delayed_work_on(cpu, &event_scan_work,
-		__round_jiffies_relative(event_scan_delay, cpu));
-
-	put_online_cpus();
-}
-
-static void start_event_scan(void)
-{
-	unsigned int err_type;
-	int rc;
-
-	printk(KERN_DEBUG "RTAS daemon started\n");
-	pr_debug("rtasd: will sleep for %d milliseconds\n",
-		 (30000 / rtas_event_scan_rate));
-
-	/* See if we have any error stored in NVRAM */
-	memset(logdata, 0, rtas_error_log_max);
-	rc = nvram_read_error_log(logdata, rtas_error_log_max,
-	                          &err_type, &error_log_cnt);
-	/* We can use rtas_log_buf now */
-	logging_enabled = 1;
-
-	if (!rc) {
-		if (err_type != ERR_FLAG_ALREADY_LOGGED) {
-			pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
-		}
-	}
-
-	schedule_delayed_work_on(first_cpu(cpu_online_map), &event_scan_work,
-				 event_scan_delay);
-}
-
-static int __init rtas_init(void)
-{
-	struct proc_dir_entry *entry;
-
-	if (!machine_is(pseries))
-		return 0;
-
-	/* No RTAS */
-	event_scan = rtas_token("event-scan");
-	if (event_scan == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_DEBUG "rtasd: no event-scan on system\n");
-		return -ENODEV;
-	}
-
-	rtas_event_scan_rate = rtas_token("rtas-event-scan-rate");
-	if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) {
-		printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n");
-		return -ENODEV;
-	}
-
-	/* Make room for the sequence number */
-	rtas_error_log_max = rtas_get_error_log_max();
-	rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
-
-	rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER);
-	if (!rtas_log_buf) {
-		printk(KERN_ERR "rtasd: no memory\n");
-		return -ENOMEM;
-	}
-
-	entry = proc_create("ppc64/rtas/error_log", S_IRUSR, NULL,
-			    &proc_rtas_log_operations);
-	if (!entry)
-		printk(KERN_ERR "Failed to create error_log proc entry\n");
-
-	start_event_scan();
-
-	return 0;
-}
-
-static int __init surveillance_setup(char *str)
-{
-	int i;
-
-	if (get_option(&str,&i)) {
-		if (i >= 0 && i <= 255)
-			surveillance_timeout = i;
-	}
-
-	return 1;
-}
-
-static int __init rtasmsgs_setup(char *str)
-{
-	if (strcmp(str, "on") == 0)
-		full_rtas_msgs = 1;
-	else if (strcmp(str, "off") == 0)
-		full_rtas_msgs = 0;
-
-	return 1;
-}
-__initcall(rtas_init);
-__setup("surveillance=", surveillance_setup);
-__setup("rtasmsgs=", rtasmsgs_setup);

^ permalink raw reply related

* Re: powerpc: Move /proc/ppc64 to /proc/powerpc and add symlink
From: Stephen Rothwell @ 2009-09-25  6:28 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev list
In-Reply-To: <1253856553.7103.513.camel@pasglop>

[-- Attachment #1: Type: text/plain, Size: 550 bytes --]

Hi Ben,

On Fri, 25 Sep 2009 15:29:13 +1000 Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
>
>  obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
>  obj-$(CONFIG_PPC_OF)		+= of_device.o of_platform.o prom_parse.o
>  obj-$(CONFIG_PPC_CLOCK)		+= clock.o
> -procfs-$(CONFIG_PPC64)		:= proc_ppc64.o
> +procfs-y			:= proc_powerpc.o
>  obj-$(CONFIG_PROC_FS)		+= $(procfs-y)

Surely just:

obj-$(CONFIG_PROC_FS)		+= proc_powerpc.o

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [v6 PATCH 0/7]: cpuidle/x86/POWER: Cleanup idle power management code in x86, cleanup drivers/cpuidle/cpuidle.c and introduce cpuidle to POWER.
From: Vaidyanathan Srinivasan @ 2009-09-25  7:06 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Peter Zijlstra, Gautham R Shenoy, Venkatesh Pallipadi,
	linux-kernel, linux-acpi, Paul Mackerras, arun, Ingo Molnar,
	Shaohua Li, linuxppc-dev, Len Brown
In-Reply-To: <20090924142228.5a2ddf59@infradead.org>

* Arjan van de Ven <arjan@infradead.org> [2009-09-24 14:22:28]:

> On Thu, 24 Sep 2009 10:42:41 +0530
> Arun R Bharadwaj <arun@linux.vnet.ibm.com> wrote:
> 
> > * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-22 16:55:27]:
> > 
> > Hi Len, (or other acpi folks),
> > 
> > I had a question regarding ACPI-cpuidle interaction in the current
> > implementation.
> > 
> > Currently, every cpu (i.e. acpi_processor) registers to cpuidle as
> > a cpuidle_device. So every cpu has to go through the process of
> > setting up the idle states and then registering as a cpuidle device.
> > 
> > What exactly is the reason behind this?
> > 
> 
> technically a BIOS can opt to give you C states via ACPI on some cpus,
> but not on others.
> 
> in practice when this happens it tends to be a bug.. but it's
> technically a valid configuration

So we will need to keep the per-cpu registration for now because we
may have such buggy BIOSes in the field and we don't want the cpuidle
framework to malfunction there.

--Vaidy

^ permalink raw reply

* Re: 2.6.31-git5 kernel boot hangs on powerpc
From: Sachin Sant @ 2009-09-25  7:15 UTC (permalink / raw)
  To: Tejun Heo; +Cc: David Miller, Linux/PPC Development
In-Reply-To: <4ABC376D.1020704@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2979 bytes --]

Tejun Heo wrote:
> Benjamin Herrenschmidt wrote:
>   
>>> --- Exception: 301 at .memset+0x60/0xfc
>>>     LR = .pcpu_alloc+0x718/0x8fc
>>>       
>> So it's memsetting something that causes it to hash_page(), ie, faulting
>> in pages (vmalloc space ?) so far nothing obviously wrong....
>>     
>
> It's probably memset() call near the end of pcpu_populate_chunk()
> where percpu allocator clears the allocated areas before returning to
> user.  I don't think the first chunk is causing the problem as they're
> all in the linear mapped area.  From the second chunk on, they're on
> vmalloc area and very near to the top of it, so that might be exposing
> a hidden problem in paging code?  BTW, for some reason, the problem is
> not reproducible on my powerstation.
>
> Sachin, can you please apply the attached patch on top of the current
> linus tree, reproduce the hang and report full kernel log?  Let's see
> which address is causing the problem.
>   
Here is the dmesg log captured with the debug patch.

Some of the debug messages related to PERCPU

<6>PERCPU: Embedded 2 pages/cpu @c000000001100000 s97160 r0 d33912 u524288
<6>pcpu-alloc: s97160 r0 d33912 u524288 alloc=1*1048576
<6>pcpu-alloc: [0] 0 1
<4>PERCPU: initialized 19 slots [c000000001120200,c000000001120330)
<4>PERCPU: chunk 0 relocating -1 -> 13 c000000001120380 <c000000001120380:c000000001120380>
<4>PERCPU: relocated <c0000000011202d0:c0000000011202d0>

<4>PERCPU: chunk 0 relocating 13 -> 12 c000000001120380 <c0000000011202d0:c0000000011202d0>
<4>PERCPU: relocated <c0000000011202c0:c0000000011202c0>

<4>PERCPU: chunk 0 relocating 12 -> 11 c000000001120380 <c0000000011202c0:c0000000011202c0>
<4>PERCPU: relocated <c0000000011202b0:c0000000011202b0>

<6>ehea: eth0: Physical port up
<6>ehea: External switch port is backup port
<7>irq: irq 33540 on host null mapped to virtual irq 260
<6>NET: Registered protocol family 10
<4>PERCPU: chunk 0 relocating 11 -> 10 c000000001120380 <c0000000011202b0:c0000000011202b0>
<4>PERCPU: relocated <c0000000011202a0:c0000000011202a0>
<4>PERCPU: chunk 0 relocating 10 -> 9 c000000001120380 <c0000000011202a0:c0000000011202a0>
<4>PERCPU: relocated <c000000001120290:c000000001120290>
<4>PERCPU: chunk 1 relocating -1 -> 18 c0000000db70fb00 <c0000000db70fb00:c0000000db70fb00>
<4>PERCPU: relocated <c000000001120320:c000000001120320>
<4>PERCPU: chunk 1 relocating 18 -> 16 c0000000db70fb00 <c000000001120320:c000000001120320>
<4>PERCPU: relocated <c000000001120300:c000000001120300>
<4>PERCPU: chunk 1, alloc pages [0,1)
<4>PERCPU: chunk 1, map pages [0,1)
<4>PERCPU: map 0xd00007fffff00000, 1 pages 53544
<4>PERCPU: map 0xd00007fffff80000, 1 pages 53545
<4>PERCPU: chunk 1, will clear 4096b/unit d00007fffff00000 d00007fffff80000
<3>INFO: RCU detected CPU 0 stall (t=1000 jiffies)

Thanks
-Sachin



-- 

---------------------------------
Sachin Sant
IBM Linux Technology Center
India Systems and Technology Labs
Bangalore, India
---------------------------------


[-- Attachment #2: pcpu-debug-log --]
[-- Type: text/plain, Size: 12987 bytes --]

1:mon> c 0
0:mon> t
[link register   ] c00000000016d0d0 .pcpu_alloc+0x8a4/0xae4
[c0000000da427960] c00000000016d094 .pcpu_alloc+0x868/0xae4 (unreliable)
[c0000000da427a90] c000000000613c08 .snmp_mib_init+0x34/0x9c
[c0000000da427b20] d00000000262e074 .ipv6_add_dev+0x1d4/0x3e4 [ipv6]
[c0000000da427bc0] d00000000265997c .addrconf_init+0x6c/0x194 [ipv6]
[c0000000da427c50] d00000000265974c .inet6_init+0x1bc/0x34c [ipv6]
[c0000000da427ce0] c0000000000097a4 .do_one_initcall+0x88/0x1bc
[c0000000da427d90] c0000000000d59fc .SyS_init_module+0x118/0x2a4
[c0000000da427e30] c0000000000085b4 syscall_exit+0x0/0x40
--- Exception: c01 (System Call) at 00000fff8350b568
SP (fffdc3a7f50) is in userspace
0:mon> dl
<4>Crash kernel location must be 0x2000000
<6>Reserving 256MB of memory at 32MB for crashkernel (System RAM: 4096MB)
<6>Phyp-dump disabled at boot time
<6>Using pSeries machine description
<7>Page orders: linear mapping = 24, virtual = 16, io = 12
<6>Using 1TB segments
<4>Found initrd at 0xc000000003500000:0xc000000003cca9b1
<6>bootconsole [udbg0] enabled
<6>Partition configured for 2 cpus.
<6>CPU maps initialized for 2 threads per core
<7> (thread shift is 1)
<4>Starting Linux PPC64 #2 SMP Fri Sep 25 12:31:24 IST 2009
<4>-----------------------------------------------------
<4>ppc64_pft_size                = 0x1a
<4>physicalMemorySize            = 0x100000000
<4>htab_hash_mask                = 0x7ffff
<4>-----------------------------------------------------
<6>Initializing cgroup subsys cpuset
<6>Initializing cgroup subsys cpu
<5>Linux version 2.6.31-git15 (root@mpower6lp5) (gcc version 4.3.2 [gcc-4_3-branch revision 141291] (SUSE Linux) ) #2 SMP Fri Sep 25 12:31:24 IST 2009
<4>[boot]0012 Setup Arch
<7>Node 0 Memory:
<7>Node 2 Memory: 0x0-0xe0000000
<7>Node 3 Memory: 0xe0000000-0x100000000
<4>EEH: No capable adapters found
<6>PPC64 nvram contains 15360 bytes
<7>Using shared processor idle loop
<4>Zone PFN ranges:
<4>  DMA      0x00000000 -> 0x00010000
<4>  Normal   0x00010000 -> 0x00010000
<4>Movable zone start PFN for each node
<4>early_node_map[2] active PFN ranges
<4>    2: 0x00000000 -> 0x0000e000
<4>    3: 0x0000e000 -> 0x00010000
<4>Could not find start_pfn for node 0
<7>On node 0 totalpages: 0
<7>On node 2 totalpages: 57344
<7>  DMA zone: 56 pages used for memmap
<7>  DMA zone: 0 pages reserved
<7>  DMA zone: 57288 pages, LIFO batch:1
<7>On node 3 totalpages: 8192
<7>  DMA zone: 8 pages used for memmap
<7>  DMA zone: 0 pages reserved
<7>  DMA zone: 8184 pages, LIFO batch:0
<4>[boot]0015 Setup Done
<6>PERCPU: Embedded 2 pages/cpu @c000000001100000 s97160 r0 d33912 u524288
<6>pcpu-alloc: s97160 r0 d33912 u524288 alloc=1*1048576
<6>pcpu-alloc: [0] 0 1 
<4>PERCPU: initialized 19 slots [c000000001120200,c000000001120330)
<4>PERCPU: chunk 0 relocating -1 -> 13 c000000001120380 <c000000001120380:c000000001120380>
<4>PERCPU: relocated <c0000000011202d0:c0000000011202d0>
<4>Built 3 zonelists in Node order, mobility grouping on.  Total pages: 65472
<4>Policy zone: DMA
<5>Kernel command line: root=/dev/sda3 sysrq=8 insmod=sym53c8xx insmod=ipr crashkernel=512M-:256M xmon=on 
<6>PID hash table entries: 4096 (order: -1, 32768 bytes)
<4>freeing bootmem node 2
<4>freeing bootmem node 3
<6>Memory: 3897152k/4194304k available (9664k kernel code, 297152k reserved, 2944k data, 4274k bss, 576k init)
<6>Hierarchical RCU implementation.
<6>RCU-based detection of stalled CPUs is enabled.
<6>NR_IRQS:512
<4>[boot]0020 XICS Init
<4>[boot]0021 XICS Done
<7>pic: no ISA interrupt controller
<7>time_init: decrementer frequency = 512.000000 MHz
<7>time_init: processor frequency   = 4704.000000 MHz
<6>clocksource: timebase mult[7d0000] shift[22] registered
<7>clockevent: decrementer mult[83126e97] shift[32] cpu[0]
<4>Console: colour dummy device 80x25
<6>console [hvc0] enabled, bootconsole disabled
<6>allocated 2621440 bytes of page_cgroup
<6>please try 'cgroup_disable=memory' option if you don't want memory cgroups
<6>Security Framework initialized
<6>SELinux:  Disabled at boot.
<6>Dentry cache hash table entries: 524288 (order: 6, 4194304 bytes)
<6>Inode-cache hash table entries: 262144 (order: 5, 2097152 bytes)
<4>Mount-cache hash table entries: 4096
<6>Initializing cgroup subsys ns
<6>Initializing cgroup subsys cpuacct
<6>Initializing cgroup subsys memory
<6>Initializing cgroup subsys devices
<6>Initializing cgroup subsys freezer
<7>irq: irq 2 on host null mapped to virtual irq 16
<7>clockevent: decrementer mult[83126e97] shift[32] cpu[1]
<4>Processor 1 found.
<6>Brought up 2 CPUs
<7>Node 0 CPUs: 0-1
<7>Node 2 CPUs:
<7>Node 3 CPUs:
<7>CPU0 attaching sched-domain:
<7> domain 0: span 0-1 level SIBLING
<7>  groups: 0 (cpu_power = 589) 1 (cpu_power = 589)
<7>  domain 1: span 0-1 level CPU
<7>   groups: 0-1 (cpu_power = 1178)
<7>CPU1 attaching sched-domain:
<7> domain 0: span 0-1 level SIBLING
<7>  groups: 1 (cpu_power = 589) 0 (cpu_power = 589)
<7>  domain 1: span 0-1 level CPU
<7>   groups: 0-1 (cpu_power = 1178)
<6>NET: Registered protocol family 16
<6>IBM eBus Device Driver
<6>POWER6 performance monitor hardware support registered
<6>PCI: Probing PCI hardware
<7>PCI: Probing PCI hardware done
<4>bio: create slab <bio-0> at 0
<4>PERCPU: chunk 0 relocating 13 -> 12 c000000001120380 <c0000000011202d0:c0000000011202d0>
<4>PERCPU: relocated <c0000000011202c0:c0000000011202c0>
<6>vgaarb: loaded
<6>usbcore: registered new interface driver usbfs
<6>usbcore: registered new interface driver hub
<6>usbcore: registered new device driver usb
<6>Switching to clocksource timebase
<6>NET: Registered protocol family 2
<6>IP route cache hash table entries: 32768 (order: 2, 262144 bytes)
<6>TCP established hash table entries: 131072 (order: 5, 2097152 bytes)
<6>TCP bind hash table entries: 65536 (order: 5, 2097152 bytes)
<6>TCP: Hash tables configured (established 131072 bind 65536)
<6>TCP reno registered
<6>NET: Registered protocol family 1
<6>Unpacking initramfs...
<7>Switched to high resolution mode on CPU 0
<7>Switched to high resolution mode on CPU 1
<7>irq: irq 655360 on host null mapped to virtual irq 17
<7>irq: irq 655367 on host null mapped to virtual irq 18
<6>IOMMU table initialized, virtual merging enabled
<7>irq: irq 589825 on host null mapped to virtual irq 19
<7>RTAS daemon started
<6>audit: initializing netlink socket (disabled)
<5>type=2000 audit(1253862433.200:1): initialized
<6>Kprobe smoke test started
<6>Kprobe smoke test passed successfully
<6>HugeTLB registered 16 MB page size, pre-allocated 0 pages
<6>HugeTLB registered 16 GB page size, pre-allocated 0 pages
<5>VFS: Disk quotas dquot_6.5.2
<4>Dquot-cache hash table entries: 8192 (order 0, 65536 bytes)
<6>Btrfs loaded
<6>msgmni has been set to 7608
<4>PERCPU: chunk 0 relocating 12 -> 11 c000000001120380 <c0000000011202c0:c0000000011202c0>
<4>PERCPU: relocated <c0000000011202b0:c0000000011202b0>
<6>alg: No test for stdrng (krng)
<6>Block layer SCSI generic (bsg) driver version 0.4 loaded (major 254)
<6>io scheduler noop registered
<6>io scheduler anticipatory registered
<6>io scheduler deadline registered
<6>io scheduler cfq registered (default)
<6>pci_hotplug: PCI Hot Plug PCI Core version: 0.5
<6>rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
<7>vio_register_driver: driver hvc_console registering
<7>HVSI: registered 0 devices
<6>Generic RTC Driver v1.07
<6>Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
<6>pmac_zilog: 0.6 (Benjamin Herrenschmidt <benh@kernel.crashing.org>)
<6>input: Macintosh mouse button emulation as /devices/virtual/input/input0
<6>Uniform Multi-Platform E-IDE driver
<6>ide-gd driver 1.18
<6>IBM eHEA ethernet device driver (Release EHEA_0102)
<7>irq: irq 590088 on host null mapped to virtual irq 264
<6>ehea: eth0: Jumbo frames are disabled
<6>ehea: eth0 -> logical port id #2
<6>ehci_hcd: USB 2.0 'Enhanced' Host Controller (EHCI) Driver
<6>ohci_hcd: USB 1.1 'Open' Host Controller (OHCI) Driver
<6>mice: PS/2 mouse device common for all mice
<6>EDAC MC: Ver: 2.1.0 Sep 25 2009
<6>usbcore: registered new interface driver hiddev
<6>usbcore: registered new interface driver usbhid
<6>usbhid: v2.6:USB HID core driver
<6>TCP cubic registered
<6>NET: Registered protocol family 15
<4>registered taskstats version 1
<4>Freeing unused kernel memory: 576k freed
<6>SysRq : Changing Loglevel
<4>Loglevel set to 8
<5>SCSI subsystem initialized
<7>vio_register_driver: driver ibmvscsi registering
<6>ibmvscsi 30000007: SRP_VERSION: 16.a
<6>scsi0 : IBM POWER Virtual SCSI Adapter 1.5.8
<6>ibmvscsi 30000007: partner initialization complete
<6>ibmvscsi 30000007: host srp version: 16.a, host partition VIO Server (1), OS 3, max io 1048576
<6>ibmvscsi 30000007: Client reserve enabled
<6>ibmvscsi 30000007: sent SRP login
<6>ibmvscsi 30000007: SRP_LOGIN succeeded
<5>scsi 0:0:1:0: Direct-Access     AIX      VDASD            0001 PQ: 0 ANSI: 3
<5>scsi 0:0:2:0: CD-ROM            AIX      VOPTA                 PQ: 0 ANSI: 4
<6>udevd version 128 started
<5>sd 0:0:1:0: [sda] 146800640 512-byte logical blocks: (75.1 GB/70.0 GiB)
<5>sd 0:0:1:0: [sda] Write Protect is off
<7>sd 0:0:1:0: [sda] Mode Sense: 17 00 00 08
<5>sd 0:0:1:0: [sda] Cache data unavailable
<3>sd 0:0:1:0: [sda] Assuming drive cache: write through
<5>sd 0:0:1:0: [sda] Cache data unavailable
<3>sd 0:0:1:0: [sda] Assuming drive cache: write through
<6> sda: sda1 sda2 sda3
<5>sd 0:0:1:0: [sda] Cache data unavailable
<3>sd 0:0:1:0: [sda] Assuming drive cache: write through
<5>sd 0:0:1:0: [sda] Attached SCSI disk
<6>kjournald starting.  Commit interval 5 seconds
<6>EXT3 FS on sda3, internal journal
<6>EXT3-fs: mounted filesystem with writeback data mode.
<6>udevd version 128 started
<5>sd 0:0:1:0: Attached scsi generic sg0 type 0
<5>scsi 0:0:2:0: Attached scsi generic sg1 type 5
<4>sr0: scsi-1 drive
<6>Uniform CD-ROM driver Revision: 3.20
<7>sr 0:0:2:0: Attached scsi CD-ROM sr0
<6>Adding 2096320k swap on /dev/sda2.  Priority:-1 extents:1 across:2096320k 
<6>device-mapper: uevent: version 1.0.3
<6>device-mapper: ioctl: 4.15.0-ioctl (2009-04-01) initialised: dm-devel@redhat.com
<6>loop: module loaded
<6>fuse init (API version 7.13)
<7>irq: irq 33539 on host null mapped to virtual irq 259
<6>ehea: eth0: Physical port up
<6>ehea: External switch port is backup port
<7>irq: irq 33540 on host null mapped to virtual irq 260
<6>NET: Registered protocol family 10
<4>PERCPU: chunk 0 relocating 11 -> 10 c000000001120380 <c0000000011202b0:c0000000011202b0>
<4>PERCPU: relocated <c0000000011202a0:c0000000011202a0>
<4>PERCPU: chunk 0 relocating 10 -> 9 c000000001120380 <c0000000011202a0:c0000000011202a0>
<4>PERCPU: relocated <c000000001120290:c000000001120290>
<4>PERCPU: chunk 1 relocating -1 -> 18 c0000000db70fb00 <c0000000db70fb00:c0000000db70fb00>
<4>PERCPU: relocated <c000000001120320:c000000001120320>
<4>PERCPU: chunk 1 relocating 18 -> 16 c0000000db70fb00 <c000000001120320:c000000001120320>
<4>PERCPU: relocated <c000000001120300:c000000001120300>
<4>PERCPU: chunk 1, alloc pages [0,1)
<4>PERCPU: chunk 1, map pages [0,1)
<4>PERCPU: map 0xd00007fffff00000, 1 pages 53544
<4>PERCPU: map 0xd00007fffff80000, 1 pages 53545
<4>PERCPU: chunk 1, will clear 4096b/unit d00007fffff00000 d00007fffff80000
<3>INFO: RCU detected CPU 0 stall (t=1000 jiffies)
<3>INFO: RCU detected CPU 0 stall (t=4000 jiffies)
<3>BUG: soft lockup - CPU#0 stuck for 61s! [modprobe:1864]
<4>Modules linked in: ipv6(+) fuse loop dm_mod sr_mod cdrom sg sd_mod crc_t10dif ibmvscsic scsi_transport_srp scsi_tgt scsi_mod
<4>NIP: c000000000043140 LR: c00000000016d0d0 CTR: 0000000000000040
<4>REGS: c0000000da4276e0 TRAP: 0901   Not tainted  (2.6.31-git15)
<4>MSR: 8000000000009032 <EE,ME,IR,DR>  CR: 44224880  XER: 20000002
<4>TASK = c0000000dbbf71d0[1864] 'modprobe' THREAD: c0000000da424000 CPU: 0
<4>GPR00: 0000000000000040 c0000000da427960 c000000000b8bca8 d00007fffff00000 
<4>GPR04: 0000000000000000 0000000000000000 d00007fffff00000 80000000565a6cc0 
<4>GPR08: 0000000000000000 c000000001120180 c000000000c45aa0 00000000000003c0 
<4>GPR12: 0000000028224882 c000000000c62600 
<4>NIP [c000000000043140] .memset+0x60/0xfc
<4>LR [c00000000016d0d0] .pcpu_alloc+0x8a4/0xae4
<4>Call Trace:
<4>[c0000000da427960] [c00000000016d094] .pcpu_alloc+0x868/0xae4 (unreliable)
<4>[c0000000da427a90] [c000000000613c08] .snmp_mib_init+0x34/0x9c
<4>[c0000000da427b20] [d00000000262e074] .ipv6_add_dev+0x1d4/0x3e4 [ipv6]
<4>[c0000000da427bc0] [d00000000265997c] .addrconf_init+0x6c/0x194 [ipv6]
<4>[c0000000da427c50] [d00000000265974c] .inet6_init+0x1bc/0x34c [ipv6]
<4>[c0000000da427ce0] [c0000000000097a4] .do_one_initcall+0x88/0x1bc
<4>[c0000000da427d90] [c0000000000d59fc] .SyS_init_module+0x118/0x2a4
<4>[c0000000da427e30] [c0000000000085b4] syscall_exit+0x0/0x40
<4>Instruction dump:
<4>98860000 38c60001 409e000c b0860000 38c60002 409d000c 90860000 38c60004 
<4>78a0d183 78a506a0 7c0903a6 4182002c <f8860000> f8860008 f8860010 f8860018 
<3>INFO: RCU detected CPU 0 stall (t=7000 jiffies)
<3>INFO: RCU detected CPU 0 stall (t=10000 jiffies)
0:mon>



^ permalink raw reply

* Re: [v6 PATCH 0/7]: cpuidle/x86/POWER: Cleanup idle power management code in x86, cleanup drivers/cpuidle/cpuidle.c and introduce cpuidle to POWER.
From: Balbir Singh @ 2009-09-25  7:20 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Shaohua Li, Peter Zijlstra, Gautham R Shenoy, Venkatesh Pallipadi,
	linux-kernel, linux-acpi, Paul Mackerras, arun, Ingo Molnar,
	linuxppc-dev, Len Brown
In-Reply-To: <20090924142228.5a2ddf59@infradead.org>

On Thu, Sep 24, 2009 at 5:52 PM, Arjan van de Ven <arjan@infradead.org> wrote:
> On Thu, 24 Sep 2009 10:42:41 +0530
> Arun R Bharadwaj <arun@linux.vnet.ibm.com> wrote:
>
>> * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-22 16:55:27]:
>>
>> Hi Len, (or other acpi folks),
>>
>> I had a question regarding ACPI-cpuidle interaction in the current
>> implementation.
>>
>> Currently, every cpu (i.e. acpi_processor) registers to cpuidle as
>> a cpuidle_device. So every cpu has to go through the process of
>> setting up the idle states and then registering as a cpuidle device.
>>
>> What exactly is the reason behind this?
>>
>
> technically a BIOS can opt to give you C states via ACPI on some cpus,
> but not on others.
>
> in practice when this happens it tends to be a bug.. but it's
> technically a valid configuration

In this day and age of flashable BIOS with recovery BIOS built in,
can't we just print out a big fat warning, asking users of such
systems to go back to their vendors and ask for updates or find the
updates and apply them? Does the OS have to do the heavy lifting and
allow users to live with buggy BIOSes?

When you say it is a technically valid configuration, you mean that
the ACPI spec allows for such inconsistency?

Balbir Singh

^ permalink raw reply

* Re: [PATCH v2 0/2] cpu: pseries: Offline state framework.
From: Vaidyanathan Srinivasan @ 2009-09-25  7:25 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Peter Zijlstra, Gautham R Shenoy, Venkatesh Pallipadi,
	linux-kernel, linuxppc-dev, Darrick J. Wong
In-Reply-To: <20090924134123.4acd1adf@infradead.org>

* Arjan van de Ven <arjan@infradead.org> [2009-09-24 13:41:23]:

> On Thu, 24 Sep 2009 13:33:07 +0200
> Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > On Thu, 2009-09-24 at 18:38 +1000, Benjamin Herrenschmidt wrote:
> > > On Thu, 2009-09-24 at 09:51 +0200, Peter Zijlstra wrote:
> > > > > I don't quite follow your logic here. This is useful for more
> > > > > than just hypervisors. For example, take the HV out of the
> > > > > picture for a moment and imagine that the HW has the ability to
> > > > > offline CPU in various power levels, with varying latencies to
> > > > > bring them back.
> > > > 
> > > > cpu-hotplug is an utter slow path, anybody saying latency and
> > > > hotplug in the same sentence doesn't seem to grasp either or both
> > > > concepts.
> > > 
> > > Let's forget about latency then. Let's imagine I want to set a CPU
> > > offline to save power, vs. setting it offline -and- opening the back
> > > door of the machine to actually physically replace it :-)
> > 
> > If the hardware is capable of physical hotplug, then surely powering
> > the socket down saves most power and is the preferred mode?
> 
> btw just to take away a perception that generally powering down sockets
> help; it does not help for all cpus. Some cpus are so efficient in idle
> that the incremental gain one would get by "offlining" a core is just
> not worth it
> (in fact, in x86, it's the same thing)
> 
> I obviously can't speak for p-series cpus, just wanted to point out
> that there is no universal truth about "offlining saves power".

Hi Arjan,

As you have said, on some cpus the extra effort of offlining does not
save us any extra power, and the state will be the same as idle.  The
assertion that offlining saves power is still valid, it could be the same
as idle or better depending on the architecture and implementation.

On x86 we still need the code (Venki posted) to take cpus to C6 on
offline to save power or else offlining consumes more power than idle
due to C1/hlt state.  This framework can help here as well if we have
any apprehension on making lowest sleep state as default on x86 and
want the administrator to decide.

--Vaidy

^ permalink raw reply

* Re: 2.6.31-git5 kernel boot hangs on powerpc
From: Tejun Heo @ 2009-09-25  7:39 UTC (permalink / raw)
  To: Sachin Sant; +Cc: David Miller, Linux/PPC Development
In-Reply-To: <4ABC6E25.7090904@in.ibm.com>

Hello,

Sachin Sant wrote:
> <4>PERCPU: chunk 1 relocating -1 -> 18 c0000000db70fb00
> <c0000000db70fb00:c0000000db70fb00>
> <4>PERCPU: relocated <c000000001120320:c000000001120320>
> <4>PERCPU: chunk 1 relocating 18 -> 16 c0000000db70fb00
> <c000000001120320:c000000001120320>
> <4>PERCPU: relocated <c000000001120300:c000000001120300>
> <4>PERCPU: chunk 1, alloc pages [0,1)
> <4>PERCPU: chunk 1, map pages [0,1)
> <4>PERCPU: map 0xd00007fffff00000, 1 pages 53544
> <4>PERCPU: map 0xd00007fffff80000, 1 pages 53545
> <4>PERCPU: chunk 1, will clear 4096b/unit d00007fffff00000 d00007fffff80000
> <3>INFO: RCU detected CPU 0 stall (t=1000 jiffies)

This supports my hypothesis.  This is the first area being allocated
from a dynamic chunk and cleared.  PFN 53544 and 53545 have been
allocated and successfully mapped to 0xd00007fffff00000 and
0xd00007fffff80000 using map_kernel_range_noflush() but when those
addresses are actually accessed, we end up with infinite faults.  The
fault handler probably thinks that the fault has been handled
correctly but, when the control is returned, the processor faults
again.  Benjamin, I'm way out of my depth here, can you please help?

Oh, one more simple experiment.  Sachin, does the following patch make
any difference?

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 69511e6..93d29eb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2102,7 +2102,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     size_t align, gfp_t gfp_mask)
 {
 	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
-	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	//const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	const unsigned long vmalloc_end = vmalloc_start + (512 << 20);
 	struct vmap_area **vas, *prev, *next;
 	struct vm_struct **vms;
 	int area, area2, last_area, term_area;


-- 
tejun

^ permalink raw reply related

* Re: [PATCH v2 0/2] cpu: pseries: Offline state framework.
From: Arjan van de Ven @ 2009-09-25  7:42 UTC (permalink / raw)
  To: svaidy
  Cc: Peter Zijlstra, Gautham R Shenoy, Venkatesh Pallipadi,
	linux-kernel, linuxppc-dev, Darrick J. Wong
In-Reply-To: <20090925072549.GB9562@dirshya.in.ibm.com>

On Fri, 25 Sep 2009 12:55:49 +0530
Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:

> > I obviously can't speak for p-series cpus, just wanted to point out
> > that there is no universal truth about "offlining saves power".
> 
> Hi Arjan,
> 
> As you have said, on some cpus the extra effort of offlining does not
> save us any extra power, and the state will be same as idle.  The
> assertion that offlining saves power is still valid, it could be same
> as idle or better depending on the architecture and implementation.
> 
> On x86 we still need the code (Venki posted) to take cpus to C6 on
> offline to save power or else offlining consumes more power than idle
> due to C1/hlt state.  This framework can help here as well if we have
> any apprehension on making lowest sleep state as default on x86 and
> want the administrator to decide.

even with Venki's patch, all our measurements indicate that taking
cores away is damage on x86. Don't let that stop what you do for
powerpc, but for x86 it's not a win. Linux is good at keeping cores in
C6 long enough that the downside of offlining is bigger...



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply

* Re: 2.6.31-git5 kernel boot hangs on powerpc
From: Tejun Heo @ 2009-09-25  7:43 UTC (permalink / raw)
  To: Sachin Sant; +Cc: David Miller, Linux/PPC Development
In-Reply-To: <4ABC73C7.20403@kernel.org>

Tejun Heo wrote:
> Hello,
> 
> Sachin Sant wrote:
>> <4>PERCPU: chunk 1 relocating -1 -> 18 c0000000db70fb00
>> <c0000000db70fb00:c0000000db70fb00>
>> <4>PERCPU: relocated <c000000001120320:c000000001120320>
>> <4>PERCPU: chunk 1 relocating 18 -> 16 c0000000db70fb00
>> <c000000001120320:c000000001120320>
>> <4>PERCPU: relocated <c000000001120300:c000000001120300>
>> <4>PERCPU: chunk 1, alloc pages [0,1)
>> <4>PERCPU: chunk 1, map pages [0,1)
>> <4>PERCPU: map 0xd00007fffff00000, 1 pages 53544
>> <4>PERCPU: map 0xd00007fffff80000, 1 pages 53545
>> <4>PERCPU: chunk 1, will clear 4096b/unit d00007fffff00000 d00007fffff80000
>> <3>INFO: RCU detected CPU 0 stall (t=1000 jiffies)
> 
> This supports my hypothesis.  This is the first area being allocated
> from a dynamic chunk and cleared.  PFN 53544 and 53545 have been
> allocated and successfully mapped to 0xd00007fffff00000 and
> 0xd00007fffff80000 using map_kernel_range_noflush() but when those
> addresses are actually accessed, we end up with infinite faults.  The
> fault handler probably thinks that the fault has been handled
> correctly but, when the control is returned, the processor faults
> again.  Benjamin, I'm way out of my depth here, can you please help?
> 
> Oh, one more simple experiment.  Sachin, does the following patch make
> any difference?

Oops, the patch should look like the following.

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 69511e6..37ab9e2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2056,7 +2056,8 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
 				       struct vmap_area **pprev,
 				       unsigned long align)
 {
-	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
+	const unsigned long vmalloc_end = vmalloc_start + (512 << 20);
 	unsigned long addr;

 	if (*pnext)
@@ -2102,7 +2103,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				     size_t align, gfp_t gfp_mask)
 {
 	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
-	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	const unsigned long vmalloc_end = vmalloc_start + (512 << 20);
 	struct vmap_area **vas, *prev, *next;
 	struct vm_struct **vms;
 	int area, area2, last_area, term_area;

^ permalink raw reply related

* Re: 2.6.31-git5 kernel boot hangs on powerpc
From: Sachin Sant @ 2009-09-25  8:03 UTC (permalink / raw)
  To: Tejun Heo; +Cc: David Miller, Linux/PPC Development
In-Reply-To: <4ABC7486.8040500@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2821 bytes --]

Tejun Heo wrote:
> Tejun Heo wrote:
>   
>> Hello,
>>
>> Sachin Sant wrote:
>>     
>>> <4>PERCPU: chunk 1 relocating -1 -> 18 c0000000db70fb00
>>> <c0000000db70fb00:c0000000db70fb00>
>>> <4>PERCPU: relocated <c000000001120320:c000000001120320>
>>> <4>PERCPU: chunk 1 relocating 18 -> 16 c0000000db70fb00
>>> <c000000001120320:c000000001120320>
>>> <4>PERCPU: relocated <c000000001120300:c000000001120300>
>>> <4>PERCPU: chunk 1, alloc pages [0,1)
>>> <4>PERCPU: chunk 1, map pages [0,1)
>>> <4>PERCPU: map 0xd00007fffff00000, 1 pages 53544
>>> <4>PERCPU: map 0xd00007fffff80000, 1 pages 53545
>>> <4>PERCPU: chunk 1, will clear 4096b/unit d00007fffff00000 d00007fffff80000
>>> <3>INFO: RCU detected CPU 0 stall (t=1000 jiffies)
>>>       
>> This supports my hypothesis.  This is the first area being allocated
>> from a dynamic chunk and cleared.  PFN 53544 and 53545 have been
>> allocated and successfully mapped to 0xd00007fffff00000 and
>> 0xd00007fffff80000 using map_kernel_range_noflush() but when those
>> addresses are actually accessed, we end up with infinite faults.  The
>> fault handler probably thinks that the fault has been handled
>> correctly but, when the control is returned, the processor faults
>> again.  Benjamin, I'm way out of my depth here, can you please help?
>>
>> Oh, one more simple experiment.  Sachin, does the following patch make
>> any difference?
>>     
With this patch applied the machine boots OK :-)

Have attached the boot log. Note that this boot log is
from a different machine, but the reported problem can be
recreated on this machine as well.

Thanks
-Sachin

>
> Oops, the patch should look like the following.
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 69511e6..37ab9e2 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2056,7 +2056,8 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
>  				       struct vmap_area **pprev,
>  				       unsigned long align)
>  {
> -	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
> +	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
> +	const unsigned long vmalloc_end = vmalloc_start + (512 << 20);
>  	unsigned long addr;
>
>  	if (*pnext)
> @@ -2102,7 +2103,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  				     size_t align, gfp_t gfp_mask)
>  {
>  	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
> -	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
> +	const unsigned long vmalloc_end = vmalloc_start + (512 << 20);
>  	struct vmap_area **vas, *prev, *next;
>  	struct vm_struct **vms;
>  	int area, area2, last_area, term_area;
>
>   


-- 

---------------------------------
Sachin Sant
IBM Linux Technology Center
India Systems and Technology Labs
Bangalore, India
---------------------------------


[-- Attachment #2: boot-log-with-patch-2 --]
[-- Type: text/plain, Size: 11143 bytes --]

Phyp-dump disabled at boot time
Using pSeries machine description
Page orders: linear mapping = 24, virtual = 16, io = 12, vmemmap = 24
Using 1TB segments
Found initrd at 0xc000000003700000:0xc000000003eca37e
bootconsole [udbg0] enabled
Partition configured for 8 cpus.
CPU maps initialized for 2 threads per core
 (thread shift is 1)
Starting Linux PPC64 #3 SMP Fri Sep 25 13:19:46 IST 2009
-----------------------------------------------------
ppc64_pft_size                = 0x19
physicalMemorySize            = 0x80000000
htab_hash_mask                = 0x3ffff
-----------------------------------------------------
Initializing cgroup subsys cpuset
Initializing cgroup subsys cpu
Linux version 2.6.31-git15 (root@mjs22lp5) (gcc version 4.3.2 [gcc-4_3-branch revision 141291] (SUSE Linux) ) #3 SMP Fri Sep 25 13:19:46 IST 2009
[boot]0012 Setup Arch
Node 0 Memory: 0x0-0x42000000
Node 1 Memory: 0x42000000-0x80000000
EEH: No capable adapters found
PPC64 nvram contains 15360 bytes
Using shared processor idle loop
Zone PFN ranges:
  DMA      0x00000000 -> 0x00008000
  Normal   0x00008000 -> 0x00008000
Movable zone start PFN for each node
early_node_map[2] active PFN ranges
    0: 0x00000000 -> 0x00004200
    1: 0x00004200 -> 0x00008000
On node 0 totalpages: 16896
  DMA zone: 15 pages used for memmap
  DMA zone: 0 pages reserved
  DMA zone: 16881 pages, LIFO batch:1
On node 1 totalpages: 15872
  DMA zone: 14 pages used for memmap
  DMA zone: 0 pages reserved
  DMA zone: 15858 pages, LIFO batch:1
[boot]0015 Setup Done
PERCPU: Embedded 2 pages/cpu @c000000001400000 s96744 r0 d34328 u131072
pcpu-alloc: s96744 r0 d34328 u131072 alloc=1*1048576
pcpu-alloc: [0] 0 1 2 3 4 5 6 7 
PERCPU: initialized 17 slots [c000000001500200,c000000001500310)
PERCPU: chunk 0 relocating -1 -> 13 c000000001500380 <c000000001500380:c000000001500380>
PERCPU: relocated <c0000000015002d0:c0000000015002d0>
Built 2 zonelists in Node order, mobility grouping on.  Total pages: 32739
Policy zone: DMA
Kernel command line: root=/dev/sda3 sysrq=8 xmon=on 
PID hash table entries: 4096 (order: -1, 32768 bytes)
freeing bootmem node 0
freeing bootmem node 1
Memory: 2040320k/2097152k available (12800k kernel code, 56832k reserved, 2880k data, 4268k bss, 4800k init)
Hierarchical RCU implementation.
NR_IRQS:512
[boot]0020 XICS Init
[boot]0021 XICS Done
pic: no ISA interrupt controller
time_init: decrementer frequency = 512.000000 MHz
time_init: processor frequency   = 4005.000000 MHz
clocksource: timebase mult[7d0000] shift[22] registered
clockevent: decrementer mult[83126e97] shift[32] cpu[0]
Console: colour dummy device 80x25
console [hvc0] enabled, bootconsole disabled
allocated 1310720 bytes of page_cgroup
please try 'cgroup_disable=memory' option if you don't want memory cgroups
Security Framework initialized
SELinux:  Disabled at boot.
Dentry cache hash table entries: 262144 (order: 5, 2097152 bytes)
Inode-cache hash table entries: 131072 (order: 4, 1048576 bytes)
Mount-cache hash table entries: 4096
Initializing cgroup subsys ns
Initializing cgroup subsys cpuacct
Initializing cgroup subsys memory
Initializing cgroup subsys devices
Initializing cgroup subsys freezer
irq: irq 2 on host null mapped to virtual irq 16
clockevent: decrementer mult[83126e97] shift[32] cpu[1]
Processor 1 found.
clockevent: decrementer mult[83126e97] shift[32] cpu[2]
Processor 2 found.
clockevent: decrementer mult[83126e97] shift[32] cpu[3]
Processor 3 found.
Brought up 4 CPUs
Node 0 CPUs: 0-3
Node 1 CPUs:
CPU0 attaching sched-domain:
 domain 0: span 0-1 level SIBLING
  groups: 0 (cpu_power = 589) 1 (cpu_power = 589)
  domain 1: span 0-3 level CPU
   groups: 0-1 (cpu_power = 1178) 2-3 (cpu_power = 1178)
CPU1 attaching sched-domain:
 domain 0: span 0-1 level SIBLING
  groups: 1 (cpu_power = 589) 0 (cpu_power = 589)
  domain 1: span 0-3 level CPU
   groups: 0-1 (cpu_power = 1178) 2-3 (cpu_power = 1178)
CPU2 attaching sched-domain:
 domain 0: span 2-3 level SIBLING
  groups: 2 (cpu_power = 589) 3 (cpu_power = 589)
  domain 1: span 0-3 level CPU
   groups: 2-3 (cpu_power = 1178) 0-1 (cpu_power = 1178)
CPU3 attaching sched-domain:
 domain 0: span 2-3 level SIBLING
  groups: 3 (cpu_power = 589) 2 (cpu_power = 589)
  domain 1: span 0-3 level CPU
   groups: 2-3 (cpu_power = 1178) 0-1 (cpu_power = 1178)
NET: Registered protocol family 16
IBM eBus Device Driver
POWER6 performance monitor hardware support registered
PCI: Probing PCI hardware
PCI: Probing PCI hardware done
bio: create slab <bio-0> at 0
vgaarb: loaded
usbcore: registered new interface driver usbfs
usbcore: registered new interface driver hub
usbcore: registered new device driver usb
Switching to clocksource timebase
Switched to high resolution mode on CPU 1
Switched to high resolution mode on CPU 2
Switched to high resolution mode on CPU 3
NET: Registered protocol family 2
PERCPU: chunk 0 relocating 13 -> 12 c000000001500380 <c0000000015002d0:c0000000015002d0>
PERCPU: relocated <c0000000015002c0:c0000000015002c0>
IP route cache hash table entries: 16384 (order: 1, 131072 bytes)
TCP established hash table entries: 65536 (order: 4, 1048576 bytes)
TCP bind hash table entries: 65536 (order: 4, 1048576 bytes)
TCP: Hash tables configured (established 65536 bind 65536)
TCP reno registered
NET: Registered protocol family 1
Unpacking initramfs...
Switched to high resolution mode on CPU 0
Freeing initrd memory: 7976k freed
irq: irq 655360 on host null mapped to virtual irq 17
irq: irq 655362 on host null mapped to virtual irq 18
IOMMU table initialized, virtual merging enabled
irq: irq 655364 on host null mapped to virtual irq 19
irq: irq 655365 on host null mapped to virtual irq 20
irq: irq 589825 on host null mapped to virtual irq 21
RTAS daemon started
audit: initializing netlink socket (disabled)
type=2000 audit(1253865170.250:1): initialized
HugeTLB registered 16 MB page size, pre-allocated 0 pages
HugeTLB registered 16 GB page size, pre-allocated 0 pages
VFS: Disk quotas dquot_6.5.2
Dquot-cache hash table entries: 8192 (order 0, 65536 bytes)
msgmni has been set to 4000
alg: No test for stdrng (krng)
Block layer SCSI generic (bsg) driver version 0.4 loaded (major 254)
io scheduler noop registered
io scheduler anticipatory registered
io scheduler deadline registered
io scheduler cfq registered (default)
pci_hotplug: PCI Hot Plug PCI Core version: 0.5
rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
vio_register_driver: driver hvc_console registering
HVSI: registered 0 devices
Generic RTC Driver v1.07
Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
pmac_zilog: 0.6 (Benjamin Herrenschmidt <benh@kernel.crashing.org>)
input: Macintosh mouse button emulation as /devices/virtual/input/input0
Uniform Multi-Platform E-IDE driver
ide-gd driver 1.18
ehci_hcd: USB 2.0 'Enhanced' Host Controller (EHCI) Driver
ohci_hcd: USB 1.1 'Open' Host Controller (OHCI) Driver
mice: PS/2 mouse device common for all mice
EDAC MC: Ver: 2.1.0 Sep 25 2009
usbcore: registered new interface driver hiddev
usbcore: registered new interface driver usbhid
usbhid: v2.6:USB HID core driver
TCP cubic registered
NET: Registered protocol family 15
registered taskstats version 1
Freeing unused kernel memory: 4800k freed
PERCPU: chunk 0 relocating 12 -> 11 c000000001500380 <c0000000015002c0:c0000000015002c0>
PERCPU: relocated <c0000000015002b0:c0000000015002b0>
SysRq : Changing Loglevel
Loglevel set to 8
SCSI subsystem initialized
vio_register_driver: driver ibmvscsi registering
ibmvscsi 30000002: SRP_VERSION: 16.a
scsi0 : IBM POWER Virtual SCSI Adapter 1.5.8
ibmvscsi 30000002: partner initialization complete
ibmvscsi 30000002: host srp version: 16.a, host partition 06-1C12A (1), OS 3, max io 262144
ibmvscsi 30000002: Client reserve enabled
ibmvscsi 30000002: sent SRP login
ibmvscsi 30000002: SRP_LOGIN succeeded
scsi 0:0:1:0: Direct-Access     AIX      VDASD            0001 PQ: 0 ANSI: 3
scsi 0:0:2:0: CD-ROM            AIX      VOPTA                 PQ: 0 ANSI: 4
udevd version 128 started
sd 0:0:1:0: [sda] 33554432 512-byte logical blocks: (17.1 GB/16.0 GiB)
sd 0:0:1:0: [sda] Write Protect is off
sd 0:0:1:0: [sda] Mode Sense: 17 00 00 08
sd 0:0:1:0: [sda] Cache data unavailable
sd 0:0:1:0: [sda] Assuming drive cache: write through
sd 0:0:1:0: [sda] Cache data unavailable
sd 0:0:1:0: [sda] Assuming drive cache: write through
 sda: sda1 sda2 sda3
sd 0:0:1:0: [sda] Cache data unavailable
sd 0:0:1:0: [sda] Assuming drive cache: write through
sd 0:0:1:0: [sda] Attached SCSI disk
kjournald starting.  Commit interval 5 seconds
EXT3 FS on sda3, internal journal
EXT3-fs: mounted filesystem with writeback data mode.
udevd version 128 started
sd 0:0:1:0: Attached scsi generic sg0 type 0
scsi 0:0:2:0: Attached scsi generic sg1 type 5
drivers/net/ibmveth.c: ibmveth: IBM i/pSeries Virtual Ethernet Driver 1.03
vio_register_driver: driver ibmveth registering
IBM eHEA ethernet device driver (Release EHEA_0102)
irq: irq 590080 on host null mapped to virtual irq 256
ehea: eth2: Jumbo frames are enabled
ehea: eth2 -> logical port id #9
ehea: eth3: Jumbo frames are enabled
ehea: eth3 -> logical port id #10
sr0: scsi-1 drive
Uniform CD-ROM driver Revision: 3.20
sr 0:0:2:0: Attached scsi CD-ROM sr0
Adding 1044096k swap on /dev/sda2.  Priority:-1 extents:1 across:1044096k 
device-mapper: uevent: version 1.0.3
device-mapper: ioctl: 4.15.0-ioctl (2009-04-01) initialised: dm-devel@redhat.com
loop: module loaded
fuse init (API version 7.13)
irq: irq 777 on host null mapped to virtual irq 265
ehea: eth2: Physical port up
ehea: External switch port is backup port
irq: irq 778 on host null mapped to virtual irq 266
NET: Registered protocol family 10
PERCPU: chunk 0 relocating 11 -> 10 c000000001500380 <c0000000015002b0:c0000000015002b0>
PERCPU: relocated <c0000000015002a0:c0000000015002a0>
PERCPU: chunk 0 relocating 10 -> 9 c000000001500380 <c0000000015002a0:c0000000015002a0>
PERCPU: relocated <c000000001500290:c000000001500290>
PERCPU: chunk 1 relocating -1 -> 16 c00000003e6d7500 <c00000003e6d7500:c00000003e6d7500>
PERCPU: relocated <c000000001500300:c000000001500300>
PERCPU: chunk 1 relocating 16 -> 14 c00000003e6d7500 <c000000001500300:c000000001500300>
PERCPU: relocated <c0000000015002e0:c0000000015002e0>
PERCPU: chunk 1, alloc pages [0,1)
PERCPU: chunk 1, map pages [0,1)
PERCPU: map 0xd00000001ff00000, 1 pages 14136
PERCPU: map 0xd00000001ff20000, 1 pages 14137
PERCPU: map 0xd00000001ff40000, 1 pages 14159
PERCPU: map 0xd00000001ff60000, 1 pages 14166
PERCPU: map 0xd00000001ff80000, 1 pages 14161
PERCPU: map 0xd00000001ffa0000, 1 pages 14165
PERCPU: map 0xd00000001ffc0000, 1 pages 15732
PERCPU: map 0xd00000001ffe0000, 1 pages 16049
PERCPU: chunk 1, will clear 4096b/unit d00000001ff00000 d00000001ff20000 d00000001ff40000 d00000001ff60000 d00000001ff80000 d00000001ffa0000 d00000001ffc0000 d00000001ffe0000
PERCPU: chunk 0 relocating 9 -> 8 c000000001500380 <c000000001500290:c000000001500290>
PERCPU: relocated <c000000001500280:c000000001500280>
PERCPU: chunk 0 relocating 8 -> 7 c000000001500380 <c000000001500280:c000000001500280>
PERCPU: relocated <c000000001500270:c000000001500270>
eth2: no IPv6 routers present

^ permalink raw reply

* Re: 2.6.31-git5 kernel boot hangs on powerpc
From: Benjamin Herrenschmidt @ 2009-09-25  8:31 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Linux/PPC Development, David Miller
In-Reply-To: <4ABC73C7.20403@kernel.org>

On Fri, 2009-09-25 at 16:39 +0900, Tejun Heo wrote:
> Hello,
> 
> Sachin Sant wrote:
> > <4>PERCPU: chunk 1 relocating -1 -> 18 c0000000db70fb00
> > <c0000000db70fb00:c0000000db70fb00>
> > <4>PERCPU: relocated <c000000001120320:c000000001120320>
> > <4>PERCPU: chunk 1 relocating 18 -> 16 c0000000db70fb00
> > <c000000001120320:c000000001120320>
> > <4>PERCPU: relocated <c000000001120300:c000000001120300>
> > <4>PERCPU: chunk 1, alloc pages [0,1)
> > <4>PERCPU: chunk 1, map pages [0,1)
> > <4>PERCPU: map 0xd00007fffff00000, 1 pages 53544
> > <4>PERCPU: map 0xd00007fffff80000, 1 pages 53545
> > <4>PERCPU: chunk 1, will clear 4096b/unit d00007fffff00000 d00007fffff80000
> > <3>INFO: RCU detected CPU 0 stall (t=1000 jiffies)
> 
> This supports my hypothesis.  This is the first area being allocated
> from a dynamic chunk and cleared.  PFN 53544 and 53545 have been
> allocated and successfully mapped to 0xd00007fffff00000 and
> 0xd00007fffff80000 using map_kernel_range_noflush() but when those
> addresses are actually accessed, we end up with infinite faults.  The
> fault handler probably thinks that the fault has been handled
> correctly but, when the control is returned, the processor faults
> again.  Benjamin, I'm way out of my depth here, can you please help?

Definitely looks like a powerpc mm problem. I'll have a look on monday.

Cheers,
Ben.

> Oh, one more simple experiment.  Sachin, does the following patch make
> any difference?
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 69511e6..93d29eb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2102,7 +2102,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  				     size_t align, gfp_t gfp_mask)
>  {
>  	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
> -	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
> +	//const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
> +	const unsigned long vmalloc_end = vmalloc_start + (512 << 20);
>  	struct vmap_area **vas, *prev, *next;
>  	struct vm_struct **vms;
>  	int area, area2, last_area, term_area;
> 
> 

^ permalink raw reply

* Re: [PATCH] powerpc/8xx: fix regression introduced by cache coherency rewrite
From: Joakim Tjernlund @ 2009-09-25  8:31 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev@ozlabs.org, Rex Feany
In-Reply-To: <1253847827.7103.504.camel@pasglop>

>
>
> > I think there's more finishyness to 8xx than we thought. IE. That
> > tlbil_va might have more reasons to be there than what the comment
> > seems to advertize. Can you try to move it even higher up ? IE.
> > Unconditionally at the beginning of set_pte_filter ?
> >
> > Also, if that doesn't help, can you try putting one in
> > set_access_flags_filter() just below ?
>
> Ok, I got a refresher on the whole concept of "unpopulated TLB entries"
> on 8xx, and that's damn scary. I think what mislead me initially is that
> the comment around the workaround is simply not properly describing the
> extent of the problem :-)
>
> So I'm not going to make the 8xx TLB miss code sane, that's beyond what
> I'm prepare to do with it, but I suspect that this should fix it (on top
> of upstream). Let me know if that's enough or if we also need to put
> one of these in ptep_set_access_flags().
>
> Please let me know if that works for you.
>
> Cheers,
> Ben.
>
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index 5304093..7a8e676 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c
> @@ -170,6 +170,16 @@ struct page * maybe_pte_to_page(pte_t pte)
>
>  static pte_t set_pte_filter(pte_t pte, unsigned long addr)
>  {
> +#ifdef CONFIG_8xx
> +   /* 8xx has a weird concept of "unpopulated" entries. When we take
> +    * a TLB miss for a non-valid PTE, we insert such an entry which
> +    * causes a page fault the next time around. This entry must now
> +    * be kicked out or we'll just fault again
> +    */
> +   /* 8xx doesn't care about PID, size or ind args */
> +   _tlbil_va(addr, 0, 0, 0);
> +#endif /* CONFIG_8xx */
> +

The main problem with 8xx is that it does not update the DAR register in
the TLB Miss/Fault handlers for cache instructions :( It is an old bug
that was found only some years ago.

I think the old comment is correct though, as I recall it was Marcelo
that found the problem and added the workaround.

   Jocke

^ permalink raw reply

* Re: [v6 PATCH 0/7]: cpuidle/x86/POWER: Cleanup idle power management code in x86, cleanup drivers/cpuidle/cpuidle.c and introduce cpuidle to POWER.
From: Peter Zijlstra @ 2009-09-25  8:54 UTC (permalink / raw)
  To: svaidy
  Cc: Shaohua Li, Gautham R Shenoy, Venkatesh Pallipadi, linux-kernel,
	linux-acpi, Paul Mackerras, arun, Ingo Molnar, Arjan van de Ven,
	linuxppc-dev, Len Brown
In-Reply-To: <20090925070623.GH8595@dirshya.in.ibm.com>

On Fri, 2009-09-25 at 12:36 +0530, Vaidyanathan Srinivasan wrote:
> * Arjan van de Ven <arjan@infradead.org> [2009-09-24 14:22:28]:
> 
> > On Thu, 24 Sep 2009 10:42:41 +0530
> > Arun R Bharadwaj <arun@linux.vnet.ibm.com> wrote:
> > 
> > > * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-22 16:55:27]:
> > > 
> > > Hi Len, (or other acpi folks),
> > > 
> > > I had a question regarding ACPI-cpuidle interaction in the current
> > > implementation.
> > > 
> > > Currently, every cpu (i.e. acpi_processor) registers to cpuidle as
> > > a cpuidle_device. So every cpu has to go through the process of
> > > setting up the idle states and then registering as a cpuidle device.
> > > 
> > > What exactly is the reason behind this?
> > > 
> > 
> > technically a BIOS can opt to give you C states via ACPI on some cpus,
> > but not on others.
> > 
> > in practice when this happens it tends to be a bug.. but it's
> > technically a valid configuration
> 
> So we will need to keep the per-cpu registration as of now because we
> may have such buggy BIOS in the field and we don't want the cpuidle
> framework to malfunction there.

If the BIOS doesn't mention a certain C state on a cpu, and you try to
set it anyway, does that go boom?

This whole per-cpu registration thing is horridly ugly, can't you have a
per-cpu C state exception mask and leave it at that -- if it's really
needed?

^ permalink raw reply

* Re: 2.6.31-git5 kernel boot hangs on powerpc
From: Tejun Heo @ 2009-09-25  9:01 UTC (permalink / raw)
  To: Sachin Sant; +Cc: David Miller, Linux/PPC Development
In-Reply-To: <4ABC7955.2070404@in.ibm.com>

Sachin Sant wrote:
> With this patch applied the machine boots OK :-)

Ah... so, the problem really is the too-high address.  If you've got some
time, it might be interesting to find out how high is safe.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [v6 PATCH 0/7]: cpuidle/x86/POWER: Cleanup idle power management code in x86, cleanup drivers/cpuidle/cpuidle.c and introduce cpuidle to POWER.
From: Arjan van de Ven @ 2009-09-25  9:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Shaohua Li, Gautham R Shenoy, Venkatesh Pallipadi, linux-kernel,
	linux-acpi, Paul Mackerras, arun, Ingo Molnar, linuxppc-dev,
	Len Brown
In-Reply-To: <1253868864.10287.3.camel@twins>

On Fri, 25 Sep 2009 10:54:24 +0200
Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> On Fri, 2009-09-25 at 12:36 +0530, Vaidyanathan Srinivasan wrote:
> > * Arjan van de Ven <arjan@infradead.org> [2009-09-24 14:22:28]:
> > 
> > > On Thu, 24 Sep 2009 10:42:41 +0530
> > > Arun R Bharadwaj <arun@linux.vnet.ibm.com> wrote:
> > > 
> > > > * Arun R Bharadwaj <arun@linux.vnet.ibm.com> [2009-09-22
> > > > 16:55:27]:
> > > > 
> > > > Hi Len, (or other acpi folks),
> > > > 
> > > > I had a question regarding ACPI-cpuidle interaction in the
> > > > current implementation.
> > > > 
> > > > Currently, every cpu (i.e. acpi_processor) registers to cpuidle
> > > > as a cpuidle_device. So every cpu has to go through the process
> > > > of setting up the idle states and then registering as a cpuidle
> > > > device.
> > > > 
> > > > What exactly is the reason behind this?
> > > > 
> > > 
> > > technically a BIOS can opt to give you C states via ACPI on some
> > > cpus, but not on others.
> > > 
> > > in practice when this happens it tends to be a bug.. but it's
> > > technically a valid configuration
> > 
> > So we will need to keep the per-cpu registration as of now because
> > we may have such buggy BIOS in the field and we don't want the
> > cpuidle framework to malfunction there.
> 
> If the BIOS doesn't mention a certain C state on a cpu, and you try to
> set it anyway, does that go boom?
> 
> This whole per-cpu registration thing is horridly ugly, can't you
> have a per-cpu C state exception mask and leave it at that -- if its
> really needed?

the real solution is to make the acpi code always know about C1, even
if the bios doesn't.... That's one for Len :)

(C1 is just "hlt", what we do in the other idle loop ;-) 


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply

* Re: [patch] powerpc: build modules outside the kernel tree fails, if it was built using O=
From: Yuri Frolov @ 2009-09-25  9:39 UTC (permalink / raw)
  To: Sam Ravnborg; +Cc: linux-kbuild, linuxppc-dev, rep.dot.nop
In-Reply-To: <20090925043902.GA2484@merkur.ravnborg.org>

On 09/25/2009 08:39 AM, Sam Ravnborg wrote:
> On Fri, Sep 25, 2009 at 11:12:21AM +1000, Benjamin Herrenschmidt wrote:
>> On Thu, 2009-09-24 at 15:28 +0400, Yuri Frolov wrote:
>>> Hello,
>>>
>>> here is a corresponding bug: http://bugzilla.kernel.org/show_bug.cgi?id=11143
>>> This patch should correctly export crtsavres.o in order to make O= option working.
>>> Please, consider to apply.
>>>
>>>
>>> Fix linking modules against crtsavres.o
>> Hi !
>>
>> This is the same patch you already posted as "
>>
>>
>> [PATCH] Fix linking modules against
>> crtsavres.o
>> "
>>
>> Or it's an update ?

It's the same, sorry for not mentioning it. The previous letter contains attachment crap, so I sent the letter with patch in-lined.

>>
>> I've asked Sam to review it already since it affects the main kernel
>> makefiles, waiting for his answer.
> Saw the duplicates. Will get back to it tonight (morning here now).
> 
> 	Sam
Ok, thank you.

^ permalink raw reply

* Re: [PATCH] powerpc/8xx: fix regression introduced by cache coherency rewrite
From: Benjamin Herrenschmidt @ 2009-09-25  9:47 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev@ozlabs.org, Rex Feany
In-Reply-To: <OF028339A2.3A7A8D6F-ONC125763C.002DE73B-C125763C.002ECF90@transmode.se>

On Fri, 2009-09-25 at 10:31 +0200, Joakim Tjernlund wrote:
> 
> The main problem with 8xx it does not update the DAR register in
> the TLB Miss/Fault handlers for cache instructions :( It on old bug
> that was found only some years ago.
> 
> I think the old comment is correct though, as I recall it was Marcelo
> that found the problem and added the workaround.

But the TLB needs flushing on more than just the cache instructions,
no ?

IE. We take a TLB miss, there's no valid PTE, we put one of those
"unpopulated" entries in and get into the page fault, at which point we
do a set_pte, we -still- need to do an invalidation to get rid of the
unpopulated entry so it gets a new TLB miss no ? Without that, it's just
going to fault over and over again...

In any case, I think flushing unconditionally the target address isn't
going to hurt since we are just changing its PTE anyways.

As for the DAR problem, I'm not sure whether we really need a workaround
since I haven't seem much people complaining about it so far :-)

Can you educate me more on the problem ? Can it be fixed without
bloating those handlers to oblivion ?

Cheers,
Ben.

^ permalink raw reply

* Re: 2.6.31-git5 kernel boot hangs on powerpc
From: Benjamin Herrenschmidt @ 2009-09-25  9:48 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Linux/PPC Development, David Miller
In-Reply-To: <4ABC86E0.9090807@kernel.org>

On Fri, 2009-09-25 at 18:01 +0900, Tejun Heo wrote:
> > With this patch applied the machine boots OK :-)
> 
> Ah... so, the problem really is too high address.  If you've got some
> time, it might be interesting to find out how far high is safe.
> 
Might give me a clue about what the problem is but I think I'll just
cook up a test case that forcibly vmap something high up and see how it
goes from there. It could be a very old bug that nobody ever noticed
because our vmalloc space on 64-bit is so huge :-)

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] i2c-mpc: Do not generate STOP after read.
From: Wolfgang Grandegger @ 2009-09-25 10:01 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev, linux-i2c, Esben Haabendal
In-Reply-To: <1253620242-18461-1-git-send-email-Joakim.Tjernlund@transmode.se>

Joakim Tjernlund wrote:
> The driver always ends a read with a STOP condition which
> breaks subsequent I2C reads/writes in the same transaction as
> these expect to do a repeated START(ReSTART).
> 
> This will also help I2C multimaster as the bus will not be released
> after the first read, but when the whole transaction ends.
> 
> Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Tested-by: Wolfgang Grandegger <wg@grandegger.com>

on a MPC8548 board with an up-to-date kernel. I did not realize any
problems.

Wolfgang.

^ permalink raw reply

* Re: [PATCH] powerpc/8xx: fix regression introduced by cache coherency rewrite
From: Joakim Tjernlund @ 2009-09-25 10:21 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev@ozlabs.org, Rex Feany
In-Reply-To: <1253872054.7103.519.camel@pasglop>

[-- Attachment #1: Type: text/plain, Size: 2112 bytes --]

Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote on 25/09/2009 11:47:34:
>
> On Fri, 2009-09-25 at 10:31 +0200, Joakim Tjernlund wrote:
> >
> > The main problem with 8xx it does not update the DAR register in
> > the TLB Miss/Fault handlers for cache instructions :( It on old bug
> > that was found only some years ago.
> >
> > I think the old comment is correct though, as I recall it was Marcelo
> > that found the problem and added the workaround.
>
> But the TLB needs flushing on more than just the cache instructions,
> no ?
>
> IE. We take a TLB miss, there's no valid PTE, we put one of those
> "unpopulated" entries in and get into the page fault, at which point we
> do a set_pte, we -still- need to do an invalidation to get rid of the
> unpopulated entry so it gets a new TLB miss no ? Without that, it's just
> going to fault over and over again...

I don't know enough about 8xx in 2.6 as we still use 2.4 for 8xx to
say for sure.

>
> In any case, I think flushing unconditionally the target address isn't
> going to hurt since we are just changing its PTE anyways.
>
> As for the DAR problem, I'm not sure whether we really need a workaround
> since I haven't seem much people complaining about it so far :-)

I did some years ago on 2.4 but no one cared enough :(
The drawback of not handling this problem is that you have
to be very careful when using cache instructions, and user space must
be specially compiled to avoid using them in optimizations.

>
> Can you educate me more on the problem ? Can it be fixed without
> bloating those handlers to oblivion ?

Yes, I fixed it for myself but the fix was never accepted. Currently
only TLB Error depends on DAR so what I did was to tag DAR with an impossible
value and test for that value in the TLB Error handler. If it matched I
branched to a subroutine that did instruction decoding in assembler to
get at registers used and calculate DAR, then return to the TLB error
handler. In hindsight it would have been better to do this work in
handle_page_fault.

I am attaching my old head_8xx.S for 2.4

 Jocke
(See attached file: head_8xx.S)

[-- Attachment #2: head_8xx.S --]
[-- Type: application/octet-stream, Size: 33768 bytes --]

/*
 *  arch/ppc/kernel/except_8xx.S
 *
 *  PowerPC version 
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *  Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP
 *    Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu>
 *  Low-level exception handlers and MMU support
 *  rewritten by Paul Mackerras.
 *    Copyright (C) 1996 Paul Mackerras.
 *  MPC8xx modifications by Dan Malek
 *    Copyright (C) 1997 Dan Malek (dmalek@jlc.net).
 *
 *  This file contains low-level support and setup for PowerPC 8xx
 *  embedded processors, including trap and interrupt dispatch.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 */

#include <linux/config.h>
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/cache.h>
#include <asm/pgtable.h>
#include <asm/cputable.h>
#include <asm/ppc_asm.h>
#include "ppc_defs.h"

#ifdef CONFIG_8xx_DCBxFIXED
/* These macros are used to tag DAR with a known value so that the
 * DataTLBError can recognize a buggy dcbx instruction and workaround
 * the problem.
 *
 * TAG_DAR_R20 loads TAG_VAL into r20 and writes it to DAR, so r20 is
 * clobbered at every place the macro is used.  When
 * CONFIG_8xx_DCBxFIXED is not set the macro expands to nothing.
 */
	#define TAG_VAL 0x00f0	/*  -1 may also be used */
	#define TAG_DAR_R20 	\
		li	r20, TAG_VAL;\
		mtspr	DAR, r20;
#else
	#define TAG_DAR_R20
#endif
/* Macro to make the code more readable.
 *
 * DO_8xx_CPU6(val, reg) loads the immediate 'val' into 'reg', stores it
 * at offset 12 from a zero base (r0 in the base position) and reads it
 * back into 'reg'.  In this file it is placed directly in front of an
 * mtspr; presumably this is the CPU6 erratum sequence -- confirm against
 * the errata sheet.  'reg' is clobbered.  Without CONFIG_8xx_CPU6 the
 * macro expands to nothing.
 */
#ifdef CONFIG_8xx_CPU6
  #define DO_8xx_CPU6(val, reg) \
	li	reg, val; \
	stw	reg, 12(r0); \
	lwz	reg, 12(r0);
#else
  #define DO_8xx_CPU6(val, reg)
#endif
	.text
	/* _stext: global label emitted at the beginning of this file's
	 * .text output.
	 */
	.globl	_stext
_stext:

/*
 * _start is defined this way because the XCOFF loader in the OpenFirmware
 * on the powermac expects the entry point to be a procedure descriptor.
 */
	.text
	/* _start carries no code of its own; it shares its address with
	 * whatever is assembled next.
	 */
	.globl	_start
_start:

/* MPC8xx
 * This port was done on an MBX board with an 860.  Right now I only
 * support an ELF compressed (zImage) boot from EPPC-Bug because the
 * code there loads up some registers before calling us:
 *   r3: ptr to board info data
 *   r4: initrd_start or if no initrd then 0
 *   r5: initrd_end - unused if r4 is 0
 *   r6: Start of command line string
 *   r7: End of command line string
 *
 * I decided to use conditional compilation instead of checking PVR and
 * adding more processor specific branches around code I don't need.
 * Since this is an embedded processor, I also appreciate any memory
 * savings I can get.
 *
 * The MPC8xx does not have any BATs, but it supports large page sizes.
 * We first initialize the MMU to support 8M byte pages, then load one
 * entry into each of the instruction and data TLBs to map the first
 * 8M 1:1.  I also mapped an additional I/O space 1:1 so we can get to
 * the "internal" processor registers before MMU_init is called.
 *
 * The TLB code currently contains a major hack.  Since I use the condition
 * code register, I have to save and restore it.  I am out of registers, so
 * I just store it in memory location 0 (the TLB handlers are not reentrant).
 * To avoid making any decisions, I need to use the "segment" valid bit
 * in the first level table, but that would require many changes to the
 * Linux page directory/table functions that I don't want to do right now.
 *
 * I used to use SPRG2 for a temporary register in the TLB handler, but it
 * has since been put to other uses.  I now use a hack to save a register
 * and the CCR at memory location 0.....Someday I'll fix this.....
 *	-- Dan
 */

	.globl	__start
__start:
	/* To accommodate some SMP systems that overwrite the first few
	 * locations before cpu 0 starts, the bootloader starts us at 0xc.
	 */
	nop
	nop
	nop
	/* Copy the boot parameters r3..r7 (see the register list above)
	 * into r31..r27.
	 */
	mr	r31,r3			/* save parameters */
	mr	r30,r4
	mr	r29,r5
	mr	r28,r6
	mr	r27,r7
	li	r24,0			/* cpu # */

	/* We have to turn on the MMU right away so we get cache modes
	 * set correctly.
	 */
	bl	initial_mmu

/* We now have the lower 8 Meg mapped into TLB entries, and the caches
 * ready to work.
 *
 * turn_on_mmu builds the target state in SRR0/SRR1 and uses rfi to
 * switch to it: SRR1 gets the current MSR with MSR_DR|MSR_IR set and
 * SRR0 gets the address of start_here.  r0 is used as scratch.
 */

turn_on_mmu:
	mfmsr	r0
	ori	r0,r0,MSR_DR|MSR_IR	/* request data + instruction translation */
	mtspr	SRR1,r0
	lis	r0,start_here@h
	ori	r0,r0,start_here@l
	mtspr	SRR0,r0			/* execution resumes at start_here */
	SYNC
	rfi				/* enables MMU */

/*
 * Exception entry code.  This code runs with address translation
 * turned off, i.e. using physical addresses.
 * We assume sprg3 has the physical address of the current
 * task's thread_struct.
 * NOTE(review): the macro below reads SPRG2, not SPRG3, to pick the
 * exception stack -- confirm which register the sentence above means.
 *
 * What the macro does, in order:
 *   - parks r20/r21 in SPRG0/SPRG1 and copies CR into r20;
 *   - r21 = SPRG2 if that is non-zero, otherwise
 *     tophys(r1) - INT_FRAME_SIZE; r21 is the frame pointer from here on;
 *   - stores CR, r22, r23, the original r20/r21, LR, CTR and XER into
 *     the frame;
 *   - loads r22 = SRR0 and r23 = SRR1;
 *   - stores r0, r1, r2 (and r1 again at offset 0), then sets
 *     r1 = tovirt(r21);
 *   - saves further GPRs through SAVE_4GPRS(3, r21) and SAVE_GPR(7, r21)
 *     (presumably r3-r6 and r7 -- the helper macros are defined elsewhere).
 */
#define EXCEPTION_PROLOG	\
	mtspr	SPRG0,r20;	\
	mtspr	SPRG1,r21;	\
	mfcr	r20;		\
	mfspr	r21,SPRG2;		/* exception stack to use from */ \
	cmpwi	0,r21,0;		/* user mode or RTAS */ \
	bne	1f;		\
	tophys(r21,r1);			/* use tophys(kernel sp) otherwise */ \
	subi	r21,r21,INT_FRAME_SIZE;	/* alloc exc. frame */\
1:	stw	r20,_CCR(r21);		/* save registers */ \
	stw	r22,GPR22(r21);	\
	stw	r23,GPR23(r21);	\
	mfspr	r20,SPRG0;	\
	stw	r20,GPR20(r21);	\
	mfspr	r22,SPRG1;	\
	stw	r22,GPR21(r21);	\
	mflr	r20;		\
	stw	r20,_LINK(r21);	\
	mfctr	r22;		\
	stw	r22,_CTR(r21);	\
	mfspr	r20,XER;	\
	stw	r20,_XER(r21);	\
	mfspr	r22,SRR0;	\
	mfspr	r23,SRR1;	\
	stw	r0,GPR0(r21);	\
	stw	r1,GPR1(r21);	\
	stw	r2,GPR2(r21);	\
	stw	r1,0(r21);	\
	tovirt(r1,r21);			/* set new kernel sp */	\
	SAVE_4GPRS(3, r21);	\
	SAVE_GPR(7, r21);
/*
 * Note: code which follows this uses cr0.eq (set if from kernel),
 * r21, r22 (SRR0), and r23 (SRR1).
 */

/*
 * Exception vectors.
 */

/* FINISH_EXCEPTION(func): branch-and-link to transfer_to_handler, with
 * two words emitted right after the bl: the address of 'func' and the
 * address of ret_from_except.  Presumably transfer_to_handler fetches
 * both through LR -- it is defined outside this section.
 */
#define FINISH_EXCEPTION(func)			\
	bl	transfer_to_handler;		\
	.long	func;				\
	.long	ret_from_except

/* STD_EXCEPTION(n, label, hdlr): place 'label' at offset n, run the
 * common prolog, re-tag DAR (a no-op without CONFIG_8xx_DCBxFIXED), set
 * r3 = r1 + STACK_FRAME_OVERHEAD and r20 = MSR_KERNEL, then hand over to
 * 'hdlr' through FINISH_EXCEPTION.
 */
#define STD_EXCEPTION(n, label, hdlr)		\
	. = n;					\
label:						\
	EXCEPTION_PROLOG;			\
	TAG_DAR_R20;				\
	addi	r3,r1,STACK_FRAME_OVERHEAD;	\
	li	r20,MSR_KERNEL;			\
	FINISH_EXCEPTION(hdlr)

/* System reset: vector 0x100, dispatched to UnknownException */
	STD_EXCEPTION(0x100, Reset, UnknownException)

/* Machine check: vector 0x200, dispatched to MachineCheckException */
	STD_EXCEPTION(0x200, MachineCheck, MachineCheckException)

/* Data access exception.
 * This is "never generated" by the MPC8xx.  We jump to it for other
 * translation errors.
 *
 * The data TLB handlers in this file branch here on their bail-out
 * paths.  The handler passes r3 = r1 + STACK_FRAME_OVERHEAD, r4 = DAR
 * and r5 = DSISR to do_page_fault, and stores DAR/DSISR in the frame.
 */
	. = 0x300
DataAccess:
	EXCEPTION_PROLOG
	mfspr	r20,DSISR
	stw	r20,_DSISR(r21)
	mr	r5,r20
	mfspr	r4,DAR
	stw	r4,_DAR(r21)
	/* Re-tag DAR now that it has been read; clobbers r20, which is
	 * reloaded just below.
	 */
	TAG_DAR_R20
	addi	r3,r1,STACK_FRAME_OVERHEAD
	li	r20,MSR_KERNEL
	rlwimi	r20,r23,0,16,16		/* copy EE bit from saved MSR */
	FINISH_EXCEPTION(do_page_fault)

/* Instruction access exception.
 * This is "never generated" by the MPC8xx.  We jump to it for other
 * translation errors.
 *
 * InstructionTLBMiss (bail-out path) and InstructionTLBError branch
 * here.  do_page_fault receives r3 = r1 + STACK_FRAME_OVERHEAD,
 * r4 = SRR0 and r5 = SRR1 (r22/r23 as left by EXCEPTION_PROLOG).
 */
	. = 0x400
InstructionAccess:
	EXCEPTION_PROLOG
	addi	r3,r1,STACK_FRAME_OVERHEAD
	mr	r4,r22			/* r22 = SRR0 */
	mr	r5,r23			/* r23 = SRR1 */
	li	r20,MSR_KERNEL
	rlwimi	r20,r23,0,16,16		/* copy EE bit from saved MSR */
	FINISH_EXCEPTION(do_page_fault)

/* External interrupt
 *
 * Same shape as FINISH_EXCEPTION, but written out by hand so that the
 * handler word carries the global label do_IRQ_intercept and the return
 * word is ret_from_intercept instead of ret_from_except.  Presumably the
 * global label lets other code patch the handler address -- confirm with
 * its users.  r4 is passed as 0.
 */
	. = 0x500;
HardwareInterrupt:
	EXCEPTION_PROLOG;
	addi	r3,r1,STACK_FRAME_OVERHEAD
	li	r20,MSR_KERNEL
	li	r4,0
	bl	transfer_to_handler
	.globl	do_IRQ_intercept
do_IRQ_intercept:
	.long	do_IRQ;
	.long	ret_from_intercept

/* Alignment exception
 *
 * Saves DAR (r4) and DSISR (r5) into the frame and dispatches to
 * AlignmentException with r3 = r1 + STACK_FRAME_OVERHEAD.
 */
	. = 0x600
Alignment:
	EXCEPTION_PROLOG
	mfspr	r4,DAR
	stw	r4,_DAR(r21)
	/* Re-tag DAR after reading it; clobbers r20, reloaded below */
	TAG_DAR_R20
	mfspr	r5,DSISR
	stw	r5,_DSISR(r21)
	addi	r3,r1,STACK_FRAME_OVERHEAD
	li	r20,MSR_KERNEL
	rlwimi	r20,r23,0,16,16		/* copy EE bit from saved MSR */
	FINISH_EXCEPTION(AlignmentException)

/* Program check exception
 *
 * Dispatches to ProgramCheckException with
 * r3 = r1 + STACK_FRAME_OVERHEAD.
 */
	. = 0x700
ProgramCheck:
	EXCEPTION_PROLOG
	addi	r3,r1,STACK_FRAME_OVERHEAD
	li	r20,MSR_KERNEL
	rlwimi	r20,r23,0,16,16		/* copy EE bit from saved MSR */
	FINISH_EXCEPTION(ProgramCheckException)

/* No FPU on MPC8xx.  This exception is not supposed to happen.
 * Vector 0x800 is therefore routed to UnknownException.
*/
	STD_EXCEPTION(0x800, FPUnavailable, UnknownException)

	. = 0x900
/* Decrementer vector.  Written out by hand like HardwareInterrupt: the
 * handler word carries the global label timer_interrupt_intercept and
 * the return word is ret_from_intercept.
 */
Decrementer:
	EXCEPTION_PROLOG
	addi	r3,r1,STACK_FRAME_OVERHEAD
	li	r20,MSR_KERNEL
	bl	transfer_to_handler
	.globl	timer_interrupt_intercept
timer_interrupt_intercept:
	.long	timer_interrupt
	.long	ret_from_intercept

	STD_EXCEPTION(0xa00, Trap_0a, UnknownException)	/* vector 0xa00 */
	STD_EXCEPTION(0xb00, Trap_0b, UnknownException)	/* vector 0xb00 */

/* System call
 *
 * Stores the incoming r3 in the ORIG_GPR3 slot of the frame and
 * dispatches to DoSyscall.  Unlike the other vectors, r3 is not
 * replaced with a frame pointer here.
 */
	. = 0xc00
SystemCall:
	EXCEPTION_PROLOG
	stw	r3,ORIG_GPR3(r21)
	li	r20,MSR_KERNEL
	rlwimi	r20,r23,0,16,16		/* copy EE bit from saved MSR */
	FINISH_EXCEPTION(DoSyscall)

/* Single step - not used on 601
 * Vector 0xd00, dispatched to SingleStepException.
 */
	STD_EXCEPTION(0xd00, SingleStep, SingleStepException)

	/* Vectors 0xe00 and 0xf00 are routed to UnknownException */
	STD_EXCEPTION(0xe00, Trap_0e, UnknownException)
	STD_EXCEPTION(0xf00, Trap_0f, UnknownException)

/* On the MPC8xx, this is a software emulation interrupt.  It occurs
 * for all unimplemented and illegal instructions.
 * Vector 0x1000, dispatched to SoftwareEmulation.
 */
	STD_EXCEPTION(0x1000, SoftEmu, SoftwareEmulation)

	. = 0x1100
/*
 * For the MPC8xx, this is a software tablewalk to load the instruction
 * TLB.  It is modelled after the example in the Motorola manual.  The task
 * switch loads the M_TWB register with the pointer to the first level table.
 * If we discover there is no second level table (the value is zero), the
 * plan was to load that into the TLB, which causes another fault into the
 * TLB Error interrupt where we can handle such problems.  However, that did
 * not work, so if we discover there is no second level table, we restore
 * registers and branch to the error exception.  We have to use the MD_xxx
 * registers for the tablewalk because the equivalent MI_xxx registers
 * only perform the attribute functions.
 *
 * Scratch usage, as done by the code below: r20 is parked in M_TW, r21 at
 * address 4, CR at address 0 (only in the !CONFIG_PIN_TLB ||
 * CONFIG_MODULES build; otherwise CR is kept in r20 around the compare),
 * and with CONFIG_8xx_CPU6 r3 is parked at address 8.  Each CPU6 block
 * writes a constant to address 12 and reads it back right before the
 * following mtspr.
 */
InstructionTLBMiss:
#ifdef CONFIG_8xx_CPU6
	stw	r3, 8(r0)
	li	r3, 0x3f80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	M_TW, r20	/* Save a couple of working registers */
#if !CONFIG_PIN_TLB || CONFIG_MODULES
	mfcr	r20
	stw	r20, 0(r0)
#endif
	stw	r21, 4(r0)
	mfspr	r20, SRR0	/* Get effective address of fault */
#ifdef CONFIG_8xx_CPU6
	li	r3, 0x3780
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MD_EPN, r20	/* Have to use MD_EPN for walk, MI_EPN can't */
	mfspr	r20, M_TWB	/* Get level 1 table entry address */

#if !CONFIG_PIN_TLB || CONFIG_MODULES
	/* If we are faulting a kernel address, we have to use the
	 * kernel page tables.
	 */
	andi.	r21, r20, 0x0800	/* Address >= 0x80000000 */
	beq	3f
	/* Kernel address: replace bits 2-19 of the level 1 pointer with
	 * those of swapper_pg_dir, keeping the index bits.
	 */
	lis	r21, swapper_pg_dir@h
	ori	r21, r21, swapper_pg_dir@l
	rlwimi	r20, r21, 0, 2, 19
3:
	lwz	r21, 0(r20)	/* Get the level 1 entry */
	rlwinm.	r20, r21,0,0,19	/* Extract page descriptor page address */
	tophys(r21,r21)
	ori	r21,r21,1		/* Set valid bit */
	beq	2f		/* If zero, don't try to find a pte */
#else
	lwz	r21, 0(r20)	/* Get the level 1 entry */
	mfcr	r20		/* keep CR in r20 across the compare */
	cmplwi	cr0,r21,0x0fff	/* Test page descriptor page address */
	tophys(r21,r21)
	ori	r21,r21,1		/* Set valid bit */
	bng-	2f		/* If zero, don't try to find a pte */
	mtcr	r20
#endif

	/* We have a pte table, so load the MI_TWC with the attributes
	 * for this "segment."
	 */
#ifdef CONFIG_8xx_CPU6
	li	r3, 0x2b80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MI_TWC, r21	/* Set segment attributes */
#ifdef CONFIG_8xx_CPU6
	li	r3, 0x3b80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MD_TWC, r21	/* Load pte table base address */
	mfspr	r21, MD_TWC	/* ....and get the pte address */
	lwz	r20, 0(r21)	/* Get the pte */

	/* Mark the pte accessed and write it back to the table */
	ori	r20, r20, _PAGE_ACCESSED
	stw	r20, 0(r21)

	/* The Linux PTE won't go exactly into the MMU TLB.
	 * Software indicator bits 21, 22 and 28 must be clear.
	 * Software indicator bits 24, 25, 26, and 27 must be
	 * set.  All other Linux PTE bits control the behavior
	 * of the MMU.
	 */
	li	r21, 0x00f0
	rlwimi	r20, r21, 0, 24, 28	/* Set 24-27, clear 28 */

#ifdef CONFIG_8xx_CPU6
	li	r3, 0x2d80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MI_RPN, r20	/* Update TLB entry */

	mfspr	r20, M_TW	/* Restore registers */
#if !CONFIG_PIN_TLB || CONFIG_MODULES
	lwz	r21, 0(r0)
	mtcr	r21
#endif
	lwz	r21, 4(r0)
#ifdef CONFIG_8xx_CPU6
	lwz	r3, 8(r0)
#endif
	rfi

	/* Bail-out path: no second level table.  Restore everything that
	 * was parked on entry and continue in InstructionAccess.
	 */
2:	/* Restore registers */
#if !CONFIG_PIN_TLB || CONFIG_MODULES
	lwz	r21, 0(r0)
	mtcr	r21
#else
	mtcr	r20
#endif
	mfspr	r20, M_TW
	lwz	r21, 4(r0)
#ifdef CONFIG_8xx_CPU6
	lwz	r3, 8(r0)
#endif
	b	InstructionAccess

	. = 0x1200
/* Data TLB miss: software tablewalk that loads a data TLB entry.
 *
 * Scratch usage, as done by the code below: r20 is parked in M_TW, CR at
 * address 0, r21 at address 4, and with CONFIG_8xx_CPU6 r3 at address 8.
 * If the level 1 entry has no page table address the handler restores
 * the registers and branches to DataAccess (label 2).
 *
 * Local label 999 is also an entry point for DataTLBError, which
 * branches back to it ("b 999b") with the updated pte in r20 to share
 * the write-back / TLB-load / restore tail.
 */
DataStoreTLBMiss:
#ifdef CONFIG_8xx_CPU6
	stw	r3, 8(r0)
	li	r3, 0x3f80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	M_TW, r20	/* Save a couple of working registers */
	mfcr	r20
	stw	r20, 0(r0)
	stw	r21, 4(r0)
	mfspr	r20, M_TWB	/* Get level 1 table entry address */

	/* If we are faulting a kernel address, we have to use the
	 * kernel page tables.
	 */
	andi.	r21, r20, 0x0800
	beq+	3f
	lis	r21, swapper_pg_dir@h
	ori	r21, r21, swapper_pg_dir@l
	rlwimi r20, r21, 0, 2, 19
3:
	lwz	r21, 0(r20)	/* Get the level 1 entry */
	rlwinm.	r20, r21,0,0,19	/* Extract page descriptor page address */

//	beq	4f		/* If zero, don't try to find a pte */
	/* We have a pte table, so load fetch the pte from the table.
	 */
	/* The two instructions below sit between the rlwinm. above and the
	 * beq- that consumes its cr0 result.
	 */
	tophys(r21, r21)
	ori	r21, r21, 1	/* Set valid bit in physical L2 page */
//	beq-	4f		/* If zero, don't try to find a pte */
	beq-	2f		/* If zero, don't try to find a pte */

#ifdef CONFIG_8xx_CPU6
	li	r3, 0x3b80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MD_TWC, r21	/* Load pte table base address */
	mfspr	r20, MD_TWC	/* ....and get the pte address */
	lwz	r20, 0(r20)	/* Get the pte */

	/* Insert the Guarded flag into the TWC from the Linux PTE.
	 * It is bit 27 of both the Linux PTE and the TWC (at least
	 * I got that right :-).  It will be better when we can put
	 * this into the Linux pgd/pmd and load it in the operation
	 * above.
	 */
	rlwimi	r21, r20, 0, 27, 27
#ifdef CONFIG_8xx_CPU6
	li	r3, 0x3b80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MD_TWC, r21

//	mfspr	r21, MD_TWC	/* get the pte address again */
	ori	r20, r20, _PAGE_ACCESSED
	/* 999: shared tail.  Expects the pte value to store in r20 and the
	 * pte address to be readable from MD_TWC.
	 */
999:	mfspr	r21, MD_TWC	/* get the pte address again */
	stw	r20, 0(r21)

	/* The Linux PTE won't go exactly into the MMU TLB.
	 * Software indicator bits 21, 22 and 28 must be clear.
	 * Software indicator bits 24, 25, 26, and 27 must be
	 * set.  All other Linux PTE bits control the behavior
	 * of the MMU.
	 */
	/* NOTE(review): label 4 is only referenced by the commented-out
	 * branches above.
	 */
4:	li	r21, 0x00f0
	rlwimi	r20, r21, 0, 24, 28	/* Set 24-27, clear 28 */

#ifdef CONFIG_8xx_CPU6
	li	r3, 0x3d80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MD_RPN, r20	/* Update TLB entry */

#ifdef CONFIG_8xx_DCBxFIXED
#if TAG_VAL == 0x00f0 /* Save 1 instr. by reusing the val loaded in r21 above */
	mtspr	DAR, r21
#else
	TAG_DAR_R20
#endif
#endif
	mfspr	r20, M_TW	/* Restore registers */
	lwz	r21, 0(r0)
	mtcr	r21
	lwz	r21, 4(r0)
#ifdef CONFIG_8xx_CPU6
	lwz	r3, 8(r0)
#endif
	rfi

	/* Bail-out path: no page table for this address */
2:
#ifdef CONFIG_8xx_DCBxFIXED
	/* Copy 20 msb from MD_EPN to DAR since the dcxx instructions fails
	 * to update DAR when they cause a DTLB Miss.
	 */
	mfspr	r21, MD_EPN
	mfspr	r20, DAR
	rlwimi	r20, r21, 0, 0, 19
	mtspr	DAR, r20
#endif
	mfspr	r20, M_TW	/* Restore registers */
	lwz	r21, 0(r0)
	mtcr	r21
	lwz	r21, 4(r0)
#ifdef CONFIG_8xx_CPU6
	lwz	r3, 8(r0)
#endif
	b	DataAccess

/* This is an instruction TLB error on the MPC8xx.  This could be due
 * to many reasons, such as executing guarded memory or illegal instruction
 * addresses.  There is nothing to do but handle a big time error fault.
 * The vector is a single branch to InstructionAccess; no registers are
 * touched here.
 */
	. = 0x1300
InstructionTLBError:
	b	InstructionAccess

/* This is the data TLB error on the MPC8xx.  This could be due to
 * many reasons, including a dirty update to a pte.  We can catch that
 * one here, but anything else is an error.  First, we track down the
 * Linux pte.  If it is valid, write access is allowed, but the
 * page dirty bit is not set, we will set it and reload the TLB.  For
 * any other case, we bail out to a higher level function that can
 * handle it.
 *
 * Scratch usage: r20 is parked in M_TW, CR at address 0, r21 at
 * address 4, and with CONFIG_8xx_CPU6 r3 at address 8.
 *
 * Exits:
 *   - success: branch back to local label 999 in DataStoreTLBMiss, which
 *     writes the updated pte (r20) back to the table, loads MD_RPN,
 *     re-tags DAR, restores the parked registers and does the rfi;
 *   - anything else: label 2 restores the parked registers and branches
 *     to DataAccess.
 *
 * Several conditional branches are deliberately placed a few
 * instructions after the record-form instruction that sets cr0; the
 * instructions in between (rlwinm, ori, tophys) are used here on the
 * assumption that they leave cr0 untouched.
 */
	. = 0x1400
DataTLBError:
#ifdef CONFIG_8xx_CPU6
	stw	r3, 8(r0)
	li	r3, 0x3f80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	M_TW, r20	/* Save a couple of working registers */
	mfcr	r20
	stw	r20, 0(r0)
	stw	r21, 4(r0)

	mfspr	r20, DAR
#ifdef  CONFIG_8xx_DCBxFIXED
	/* If DAR contains TAG_VAL implies a buggy dcbx instruction
	 * that did not set DAR.
	 */
	cmpwi	cr0, r20, TAG_VAL
	beq-	100f	/* Branch if TAG_VAL to dcbx workaround procedure */
101:	/* return from dcbx instruction bug workaround, r20 holds value of DAR */
#endif
	/* First, make sure this was a store operation.
	 * cr0 is set here and tested by the "beq- 2f" after MD_EVALID
	 * has been or'ed in below.
	 */
	mfspr	r21, DSISR
	andis.	r21, r21, 0x0200	/* If set, indicates store op */

	/* The EA of a data TLB miss is automatically stored in the MD_EPN
	 * register.  The EA of a data TLB error is automatically stored in
	 * the DAR, but not the MD_EPN register.  We must copy the 20 most
	 * significant bits of the EA from the DAR to MD_EPN before we
	 * start walking the page tables.  We also need to copy the CASID
	 * value from the M_CASID register.
	 * Addendum:  The EA of a data TLB error is _supposed_ to be stored
	 * in DAR, but it seems that this doesn't happen in some cases, such
	 * as when the error is due to a dcbi instruction to a page with a
	 * TLB that doesn't have the changed bit set.  In such cases, there
	 * does not appear to be any way  to recover the EA of the error
	 * since it is neither in DAR nor MD_EPN.  As a workaround, the
	 * _PAGE_HWWRITE bit is set for all kernel data pages when the PTEs
	 * are initialized in mapin_ram().  This will avoid the problem,
	 * assuming we only use the dcbi instruction on kernel addresses.
	 */
	/* DAR is in r20 already */
	rlwinm	r21, r20, 0, 0, 19
	ori	r21, r21, MD_EVALID
	beq-	2f		/* Not a store op (cr0 from andis. above): bail */
	mfspr	r20, M_CASID
	rlwimi	r21, r20, 0, 28, 31
#ifdef CONFIG_8xx_CPU6
	li	r3, 0x3780
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MD_EPN, r21

	mfspr	r20, M_TWB	/* Get level 1 table entry address */

	/* If we are faulting a kernel address, we have to use the
	 * kernel page tables.
	 */
	andi.	r21, r20, 0x0800
	beq+	3f
	lis	r21, swapper_pg_dir@h
	ori	r21, r21, swapper_pg_dir@l
	rlwimi	r20, r21, 0, 2, 19
3:
	lwz	r21, 0(r20)	/* Get the level 1 entry */
	rlwinm.	r20, r21,0,0,19	/* Extract page descriptor page address */

	/* We have a pte table, so fetch the pte from the table.
	 * (The zero test from the rlwinm. above is taken after the next
	 * two instructions.)
	 */
	tophys(r21, r21)
	ori	r21, r21, 1		/* Set valid bit in physical L2 page */
	beq-	2f		/* If zero, bail */
#ifdef CONFIG_8xx_CPU6
	li	r3, 0x3b80
	stw	r3, 12(r0)
	lwz	r3, 12(r0)
#endif
	mtspr	MD_TWC, r21		/* Load pte table base address */
	mfspr	r21, MD_TWC		/* ....and get the pte address */
	lwz	r20, 0(r21)		/* Get the pte */

	andi.	r21, r20, _PAGE_RW	/* Is it writeable? */

	/* Update 'changed', among others.
	*/
	ori	r20, r20, _PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_HWWRITE
	beq-	2f			/* Bail out if not */

	/* Share the tail of DataStoreTLBMiss: store the pte, load MD_RPN,
	 * re-tag DAR, restore registers and rfi.  The copy of that tail
	 * that used to follow this unconditional branch could never be
	 * reached and has been removed.
	 */
	b	999b

	/* Bail-out path: let DataAccess / do_page_fault sort it out */
2:
	mfspr	r20, M_TW	/* Restore registers */
	lwz	r21, 0(r0)
	mtcr	r21
	lwz	r21, 4(r0)
#ifdef CONFIG_8xx_CPU6
	lwz	r3, 8(r0)
#endif
	b	DataAccess

	STD_EXCEPTION(0x1500, Trap_15, UnknownException)
	STD_EXCEPTION(0x1600, Trap_16, UnknownException)
	/* Vector 0x1700 is the only one in this group with its own handler */
	STD_EXCEPTION(0x1700, Trap_17, TAUException)
	STD_EXCEPTION(0x1800, Trap_18, UnknownException)
	STD_EXCEPTION(0x1900, Trap_19, UnknownException)
	STD_EXCEPTION(0x1a00, Trap_1a, UnknownException)
	STD_EXCEPTION(0x1b00, Trap_1b, UnknownException)

/* On the MPC8xx, these next four traps are used for development
 * support of breakpoints and such.  Someday I will get around to
 * using them.
 */
	STD_EXCEPTION(0x1c00, Trap_1c, UnknownException)
	STD_EXCEPTION(0x1d00, Trap_1d, UnknownException)
	STD_EXCEPTION(0x1e00, Trap_1e, UnknownException)
	STD_EXCEPTION(0x1f00, Trap_1f, UnknownException)

	/* Everything that follows is assembled from offset 0x2000 on */
	. = 0x2000

#ifdef CONFIG_8xx_DCBxFIXED
/* This is the workaround procedure to calculate the data EA for buggy dcbx,dcbi instructions
 * by decoding the registers used by the dcbx instruction and adding them.
 * DAR is set to the calculated address and r20 also holds the EA on exit.
 *
 * Entry: label 100, reached from DataTLBError when DAR still holds
 *        TAG_VAL.  At that point r20 is parked in M_TW, r21 at address 4,
 *        CR at address 0 and (CONFIG_8xx_CPU6) r3 at address 8.
 * Exit:  branch back to label 101 in DataTLBError with r20 = EA and
 *        DAR = EA.  cr0 is clobbered; CTR is preserved.
 *
 * The faulting instruction is fetched through its physical address
 * (taken from SRR0), and its RA and RB fields are summed, treating an
 * RA field of 0 as the value zero.  Two implementations are provided:
 *   - default: the instruction is rewritten into "add r20,RA,RB",
 *     stored at modified_instr and executed in place;
 *   - NO_SELF_MODIFYING_CODE: a jump table with one 8-byte slot per GPR
 *     is entered once for RB and once for RA.
 */
//#define INSTR_CHECK /* define to verify if it is a dcbx instr. Should not be needed. */
//#define NO_SELF_MODIFYING_CODE /* define if you don't want to use self modifying code */
//#define DEBUG_DCBX_INSTRUCTIONS /* for debugging only. Needs INSTR_CHECK defined as well. */
//#define KERNEL_SPACE_ONLY /* define if user space do NOT contain dcbx instructions. */

#ifndef KERNEL_SPACE_ONLY
	nop	/* A few nops to make the modified_instr: space below cache line aligned */
	nop
139:	/* fetch instruction from userspace memory */
	/* Walk the page tables for the address in r20 (SRR0) to find the
	 * physical address of the faulting instruction.  r3 is used as
	 * scratch by DO_8xx_CPU6 and reloaded from address 8 further down.
	 */
	DO_8xx_CPU6(0x3780, r3)
	mtspr	MD_EPN, r20
	mfspr	r21, M_TWB	/* Get level 1 table entry address */
	lwz	r21, 0(r21)	/* Get the level 1 entry */
	tophys  (r21, r21)
	DO_8xx_CPU6(0x3b80, r3)
	mtspr	MD_TWC, r21	/* Load pte table base address */
	mfspr	r21, MD_TWC	/* ....and get the pte address */
	lwz	r21, 0(r21)	/* Get the pte */
	/* concat physical page address(r21) and page offset(r20) */
	rlwimi	r21, r20, 0, 20, 31
	b	140f
#endif
100:	/* Entry point for dcbx workaround. */
	/* fetch instruction from memory. */
	mfspr	r20,SRR0
#ifndef KERNEL_SPACE_ONLY
	andis.	r21, r20, 0x8000	/* top address bit clear => user space */
	tophys  (r21, r20)
	beq-	139b		/* Branch if user space address */
#else
	tophys  (r21, r20)
#endif
140:	lwz	r21,0(r21)	/* r21 = the faulting instruction word */
#ifdef INSTR_CHECK
/* Check if it really is a dcbx instruction. This is not needed as far as I can tell */
/* dcbt and dcbtst does not generate DTLB Misses/Errors, no need to include them here */
	rlwinm	r20, r21, 0, 21, 30	/* isolate instruction bits 21-30 */
	cmpwi	cr0, r20, 2028	/* Is dcbz? */
	beq+	142f
	cmpwi	cr0, r20, 940	/* Is dcbi? */
	beq+	142f
	cmpwi	cr0, r20, 108	/* Is dcbst? */
	beq+	142f
	cmpwi	cr0, r20, 172	/* Is dcbf? */
	beq+	142f
	cmpwi	cr0, r20, 1964	/* Is icbi? */
	beq+	142f
#ifdef DEBUG_DCBX_INSTRUCTIONS
141:	b 141b /* Stop here if no dcbx instruction */
#endif
	mfspr	r20, DAR	/* r20 must hold DAR at exit */
	b 101b			/* None of the above, go back to normal TLB processing */
142:	/* continue, it was a dcbx instruction. */
#endif
#ifdef CONFIG_8xx_CPU6
	lwz	r3, 8(r0)		/* restore r3 from memory */
#endif
#ifndef NO_SELF_MODIFYING_CODE
	andis.	r20,r21,0x1f	/* test if reg RA is r0 */
	li	r20,modified_instr@l
	dcbtst	r0,r20		/* touch for store */
	rlwinm	r21,r21,0,0,20	/* Zero lower 11 bits (21-31) */
	oris	r21,r21,640	/* Transform instr. to a "add r20,RA,RB" */
	ori	r21,r21,532
	stw	r21,0(r20)	/* store add/and instruction */
	dcbf	0,r20		/* flush new instr. to memory. */
	icbi	0,r20		/* invalidate instr. cache line */
	/* Put back the interrupted context's r21 and r20 so that the
	 * generated add sees the original values if RA or RB is one of them.
	 */
	lwz	r21, 4(r0)	/* restore r21 from memory */
	mfspr	r20, M_TW	/* restore r20 from M_TW */
	isync			/* Wait until new instr is loaded from memory */
modified_instr:
	.space	4		/* this is where the add/and instr. is stored */
#ifdef DEBUG_DCBX_INSTRUCTIONS
	/* fill with some garbage */
	li	r21,0xffff
	stw	r21,0(r21)
#endif
	/* cr0 still holds the RA-is-zero test from the andis. above */
	bne+	143f
	subf	r20,r0,r20		/* r20=r20-r0, only if reg RA is r0 */
143:	mtdar	r20			/* store faulting EA in DAR */
	b	101b			/* Go back to normal TLB handling */
#else
	/* Jump-table variant.  CTR is needed for the computed branches, so
	 * its value is kept in DAR until label 152.
	 */
	mfctr	r20
	mtdar	r20			/* save ctr reg in DAR */
	rlwinm	r20, r21, 24, 24, 28	/* offset into jump table for reg RB */
	addi	r20, r20, 150f@l	/* add start of table */
	mtctr	r20			/* load ctr with jump address */
	xor	r20, r20, r20		/* sum starts at zero */
	bctr				/* jump into table */
	/* Table: slot N (two instructions, 8 bytes) adds GPR N to r20 and
	 * continues at 151.  The slots for r20 and r21 go through 154/153
	 * because those registers are in use here; the last slot (r31)
	 * falls straight through into 151.
	 */
150:
	add	r20, r20, r0
	b	151f
	add	r20, r20, r1
	b	151f
	add	r20, r20, r2
	b	151f
	add	r20, r20, r3
	b	151f
	add	r20, r20, r4
	b	151f
	add	r20, r20, r5
	b	151f
	add	r20, r20, r6
	b	151f
	add	r20, r20, r7
	b	151f
	add	r20, r20, r8
	b	151f
	add	r20, r20, r9
	b	151f
	add	r20, r20, r10
	b	151f
	add	r20, r20, r11
	b	151f
	add	r20, r20, r12
	b	151f
	add	r20, r20, r13
	b	151f
	add	r20, r20, r14
	b	151f
	add	r20, r20, r15
	b	151f
	add	r20, r20, r16
	b	151f
	add	r20, r20, r17
	b	151f
	add	r20, r20, r18
	b	151f
	add	r20, r20, r19
	b	151f
	mtctr	r21	/* reg 20 needs special handling */
	b	154f
	mtctr	r21	/* reg 21 needs special handling */
	b	153f
	add	r20, r20, r22
	b	151f
	add	r20, r20, r23
	b	151f
	add	r20, r20, r24
	b	151f
	add	r20, r20, r25
	b	151f
	add	r20, r20, r26	/* slot 26: was a duplicate of the r25 slot */
	b	151f
	add	r20, r20, r27
	b	151f
	add	r20, r20, r28
	b	151f
	add	r20, r20, r29
	b	151f
	add	r20, r20, r30
	b	151f
	add	r20, r20, r31
151:
	rlwinm. r21,r21,19,24,28	/* offset into jump table for reg RA */
	beq	152f			/* if reg RA is zero, don't add it */
	addi	r21, r21, 150b@l	/* add start of table */
	mtctr	r21			/* load ctr with jump address */
	/* Clear bits 11-15 of r21 so that the rlwinm. at 151 yields zero
	 * when the table slot returns there.
	 */
	rlwinm	r21,r21,0,16,10		/* make sure we don't execute this more than once */
	bctr				/* jump into table */
152:
	mfdar	r21
	mtctr	r21			/* restore ctr reg from DAR */
	mtdar	r20			/* save fault EA to DAR */
	b	101b			/* Go back to normal TLB handling */

	/* special handling for r20,r21 since these are modified already */
	/* On entry here the table slot has stashed the working r21 in CTR */
153:	lwz	r21, 4(r0)	/* load r21 from memory */
	b	155f
154:	mfspr	r21, M_TW	/* load r20 from M_TW */
155:	add	r20, r20, r21	/* add it */
	mfctr	r21		/* restore r21 */
	b	151b
#endif
#endif
/*
 * This code finishes saving the registers to the exception frame
 * and jumps to the appropriate handler for the exception, turning
 * on address translation.
 *
 * As used below: r21 is the base for the frame stores (_NIP, _MSR,
 * TRAP, RESULT and the GPR saves), r22 is stored as NIP, r23 is stored
 * as MSR (with MSR_POW cleared), r20 is loaded into SRR1, and LR points
 * at two words: the handler address and the value to place in LR
 * ("where to go when done").
 */
	.globl	transfer_to_handler
transfer_to_handler:
	stw	r22,_NIP(r21)
	lis	r22,MSR_POW@h
	andc	r23,r23,r22		/* clear MSR_POW in the MSR value being saved */
	stw	r23,_MSR(r21)
	SAVE_4GPRS(8, r21)
	SAVE_8GPRS(12, r21)
	SAVE_8GPRS(24, r21)
	andi.	r23,r23,MSR_PR		/* CR0.eq set when MSR_PR was clear */
	mfspr	r23,SPRG3		/* if from user, fix up THREAD.regs */
	beq	2f
	addi	r24,r1,STACK_FRAME_OVERHEAD
	stw	r24,PT_REGS(r23)
2:	addi	r2,r23,-THREAD		/* set r2 to current */
	tovirt(r2,r2)
	mflr	r23			/* r23 = pointer to the two words after the caller's branch */
	andi.	r24,r23,0x3f00		/* get vector offset */
	stw	r24,TRAP(r21)
	li	r22,0
	stw	r22,RESULT(r21)
	mtspr	SPRG2,r22		/* r1 is now kernel sp */
	addi	r24,r2,TASK_STRUCT_SIZE	/* check for kernel stack overflow */
	cmplw	0,r1,r2			/* cr0: r1 vs r2 */
	cmplw	1,r1,r24		/* cr1: r1 vs r2+TASK_STRUCT_SIZE */
	crand	1,1,4			/* cr0.gt = cr0.gt AND cr1.lt */
	bgt-	stack_ovf		/* if r2 < r1 < r2+TASK_STRUCT_SIZE */
	lwz	r24,0(r23)		/* virtual address of handler */
	lwz	r23,4(r23)		/* where to go when done */
	mtspr	SRR0,r24
	mtspr	SRR1,r20
	mtlr	r23
	SYNC
	rfi				/* jump to handler, enable MMU */

/*
 * On kernel stack overflow, load up an initial stack pointer
 * and call StackOverflow(regs), which should not return.
 *
 * r3 (the regs argument) is the old r1 plus STACK_FRAME_OVERHEAD; the
 * new r1 is placed STACK_FRAME_OVERHEAD below the end of
 * init_task_union.  Control is transferred with rfi, using MSR_KERNEL
 * as the new MSR.
 */
stack_ovf:
	addi	r3,r1,STACK_FRAME_OVERHEAD	/* r3 = regs on the overflowed stack */
	lis	r1,init_task_union@ha
	addi	r1,r1,init_task_union@l
	addi	r1,r1,TASK_UNION_SIZE-STACK_FRAME_OVERHEAD
	lis	r24,StackOverflow@ha
	addi	r24,r24,StackOverflow@l
	li	r20,MSR_KERNEL
	mtspr	SRR0,r24		/* rfi target: StackOverflow */
	mtspr	SRR1,r20		/* MSR after rfi: MSR_KERNEL */
	SYNC
	rfi

	/*
	 * giveup_fpu: does nothing and returns to the caller immediately.
	 * NOTE(review): presumably a stub because this CPU family has no
	 * FPU state to give up -- confirm.
	 */
	.globl	giveup_fpu
giveup_fpu:
	blr

/* Maybe someday.......
 * For now this CPU setup hook is empty: it returns immediately.
*/
_GLOBAL(__setup_cpu_8xx)
	blr

/*
 * This is where the main kernel code starts.
 *
 * Sequence: set up r2 (current), SPRG3/SPRG2 and the initial stack,
 * call early_init, machine_init and MMU_init, drop to untranslated
 * mode to flush the TLB and load M_TWB, then rfi into start_kernel
 * with MSR_KERNEL.
 */
start_here:

	/* ptr to current */
	lis	r2,init_task_union@h
	ori	r2,r2,init_task_union@l

	/* ptr to phys current thread */
	tophys(r4,r2)
	addi	r4,r4,THREAD	/* init task's THREAD */
	mtspr	SPRG3,r4
	li	r3,0
	mtspr	SPRG2,r3	/* 0 => r1 has kernel sp */

	/* stack */
	addi	r1,r2,TASK_UNION_SIZE
	li	r0,0
	stwu	r0,-STACK_FRAME_OVERHEAD(r1)	/* push an initial frame whose first word is 0 */

	bl	early_init	/* We have to do this with MMU on */

/*
 * Decide what sort of machine this is and initialize the MMU.
 */
	/* r31..r27 are passed to machine_init as its five arguments (r3..r7);
	 * NOTE(review): presumably the boot parameters saved at entry, which
	 * is not visible in this part of the file -- confirm.
	 */
	mr	r3,r31
	mr	r4,r30
	mr	r5,r29
	mr	r6,r28
	mr	r7,r27
	bl	machine_init
	bl	MMU_init

/*
 * Go back to running unmapped so we can load up new values
 * and change to using our exception vectors.
 * On the 8xx, all we have to do is invalidate the TLB to clear
 * the old 8M byte TLB mappings and load the page table base register.
 */
	/* The right way to do this would be to track it down through
	 * init's THREAD like the context switch code does, but this is
	 * easier......until someone changes init's static structures.
	 */
	lis	r6, swapper_pg_dir@h
	ori	r6, r6, swapper_pg_dir@l
	tophys(r6,r6)
#ifdef CONFIG_8xx_CPU6
	/* Store/load of 0x3980 to cpu6_errata_word+12 ahead of the M_TWB
	 * write; set_context uses the same sequence before its M_TWB write.
	 */
	lis	r4, cpu6_errata_word@h
	ori	r4, r4, cpu6_errata_word@l
	li	r3, 0x3980
	stw	r3, 12(r4)
	lwz	r3, 12(r4)
#endif
	mtspr	M_TWB, r6	/* physical address of swapper_pg_dir */
	lis	r4,2f@h
	ori	r4,r4,2f@l
	tophys(r4,r4)
	li	r3,MSR_KERNEL & ~(MSR_IR|MSR_DR)
	mtspr	SRR0,r4		/* continue at the physical address of 2: ... */
	mtspr	SRR1,r3		/* ... with instruction and data translation off */
	rfi
/* Load up the kernel context */
2:
	SYNC			/* Force all PTE updates to finish */
	tlbia			/* Clear all TLB entries */
	sync			/* wait for tlbia/tlbie to finish */
	TLBSYNC			/* ... on all CPUs */

#ifdef CONFIG_BDI_SWITCH
	/* Add helper information for the Abatron bdiGDB debugger.
	 * We do this here because we know the mmu is disabled, and
	 * will be enabled for real in just a few instructions.
	 *
	 * The virtual address of abatron_pteptrs is stored at 0xf0, and
	 * the virtual address of swapper_pg_dir in its first slot.
	 */
	tovirt(r6,r6)
	lis	r5, abatron_pteptrs@h
	ori	r5, r5, abatron_pteptrs@l
	stw	r5, 0xf0(r0)	/* Must match your Abatron config file */
	tophys(r5,r5)
	stw	r6, 0(r5)
#endif

/* Now turn on the MMU for real! */
	li	r4,MSR_KERNEL
	lis	r3,start_kernel@h
	ori	r3,r3,start_kernel@l
	mtspr	SRR0,r3
	mtspr	SRR1,r4
	rfi			/* enable MMU and jump to start_kernel */

/* Set up the initial MMU state so we can do the first level of
 * kernel initialization.  This maps the first 8 MBytes of memory 1:1
 * virtual to physical.  Also, set the cache mode since that is defined
 * by TLB entries and perform any additional mapping (like of the IMMR).
 * If configured to pin some TLBs, we pin the first 8 Mbytes of kernel,
 * 24 Mbytes of data, and the 8M IMMR space.  Anything not covered by
 * these mappings is mapped by page tables.
 *
 * Uses r8-r11 as scratch and returns with blr.
 */
initial_mmu:
	tlbia			/* Invalidate all TLB entries */
#ifdef CONFIG_PIN_TLB
	lis	r8, MI_RSV4I@h
	ori	r8, r8, 0x1c00
#else
	li	r8, 0
#endif
	mtspr	MI_CTR, r8	/* Set instruction MMU control */

#ifdef CONFIG_PIN_TLB
	lis	r10, (MD_RSV4I | MD_RESETVAL)@h
	ori	r10, r10, 0x1c00
	mr	r8, r10
#else
	lis	r10, MD_RESETVAL@h
#endif
#ifndef CONFIG_8xx_COPYBACK
	oris	r10, r10, MD_WTDEF@h
#endif
	mtspr	MD_CTR, r10	/* Set data TLB control */

	/* Now map the lower 8 Meg into the TLBs.  For this quick hack,
	 * we can load the instruction and data TLB registers with the
	 * same values.
	 */
	lis	r8, KERNELBASE@h	/* Create vaddr for TLB */
	ori	r8, r8, MI_EVALID	/* Mark it valid */
	mtspr	MI_EPN, r8
	mtspr	MD_EPN, r8
	li	r8, MI_PS8MEG		/* Set 8M byte page */
	ori	r8, r8, MI_SVALID	/* Make it valid */
	mtspr	MI_TWC, r8
	mtspr	MD_TWC, r8
	li	r8, MI_BOOTINIT		/* Create RPN for address 0 */
	mtspr	MI_RPN, r8		/* Store TLB entry */
	mtspr	MD_RPN, r8
	lis	r8, MI_Kp@h		/* Set the protection mode */
	mtspr	MI_AP, r8
	mtspr	MD_AP, r8

	/* Map another 8 MByte at the IMMR to get the processor
	 * internal registers (among other things).
	 */
#ifdef CONFIG_PIN_TLB
	addi	r10, r10, 0x0100	/* bump the MD_CTR value kept in r10 */
	mtspr	MD_CTR, r10
#endif
	mfspr	r9, 638			/* Get current IMMR */
	andis.	r9, r9, 0xff80		/* Get 8Mbyte boundary */

	mr	r8, r9			/* Create vaddr for TLB */
	ori	r8, r8, MD_EVALID	/* Mark it valid */
	mtspr	MD_EPN, r8
	li	r8, MD_PS8MEG		/* Set 8M byte page */
	ori	r8, r8, MD_SVALID	/* Make it valid */
	mtspr	MD_TWC, r8
	mr	r8, r9			/* Create paddr for TLB */
	ori	r8, r8, MI_BOOTINIT|0x2 /* Inhibit cache -- Cort */
	mtspr	MD_RPN, r8

#ifdef CONFIG_PIN_TLB
	/* Map two more 8M kernel data pages.
	 * r8 holds the effective page (EPN), r9 the page-size/valid word
	 * and r11 the real page (RPN); MD_RPN must be written from r11.
	 *
	 * NOTE(review): MD_CTR is written once for both pages here; confirm
	 * that the second MD_RPN write lands in the intended DTLB entry.
	 */
	addi	r10, r10, 0x0100
	mtspr	MD_CTR, r10

	lis	r8, KERNELBASE@h	/* Create vaddr for TLB */
	addis	r8, r8, 0x0080		/* Add 8M */
	ori	r8, r8, MI_EVALID	/* Mark it valid */
	mtspr	MD_EPN, r8
	li	r9, MI_PS8MEG		/* Set 8M byte page */
	ori	r9, r9, MI_SVALID	/* Make it valid */
	mtspr	MD_TWC, r9
	li	r11, MI_BOOTINIT	/* Create RPN for address 0 */
	addis	r11, r11, 0x0080	/* Add 8M */
	mtspr	MD_RPN, r11		/* real page number, not the EPN in r8 */

	addis	r8, r8, 0x0080		/* Add 8M */
	mtspr	MD_EPN, r8
	mtspr	MD_TWC, r9
	addis	r11, r11, 0x0080	/* Add 8M */
	mtspr	MD_RPN, r11		/* real page number, not the EPN in r8 */
#endif

	/* Since the cache is enabled according to the information we
	 * just loaded into the TLB, invalidate and enable the caches here.
	 * We should probably check/set other modes....later.
	 */
	lis	r8, IDC_INVALL@h
	mtspr	IC_CST, r8
	mtspr	DC_CST, r8
	lis	r8, IDC_ENABLE@h
	mtspr	IC_CST, r8
#ifdef CONFIG_8xx_COPYBACK
	mtspr	DC_CST, r8
#else
	/* For a debug option, I left this here to easily enable
	 * the write through cache mode
	 */
	lis	r8, DC_SFWT@h
	mtspr	DC_CST, r8
	lis	r8, IDC_ENABLE@h
	mtspr	DC_CST, r8
#endif
	blr


/*
 * Set up to use a given MMU context.
 * r3 is context number, r4 is PGD pointer.
 *
 * We load the physical address of the new task's page directory into
 * the MMU base register (M_TWB) and set the ASID compare register
 * (M_CASID) to the new "context."
 *
 * r4 is converted to a physical address in place; r5-r7 are used as
 * scratch in the configuration-dependent paths.
 */
_GLOBAL(set_context)

#ifdef CONFIG_BDI_SWITCH
	/* Context switch the PTE pointer for the Abatron BDI2000.
	 * The PGDIR is passed as second argument.
	 * The pointer stored at KERNELBASE+0xf0 is loaded and the PGDIR is
	 * written to the second word it points at.
	 */
	lis	r5, KERNELBASE@h
	lwz	r5, 0xf0(r5)
	stw	r4, 0x4(r5)
#endif

#ifdef CONFIG_8xx_CPU6
	/* Each mtspr below is preceded by a store/load of a per-register
	 * value to cpu6_errata_word+12.
	 */
	lis	r6, cpu6_errata_word@h
	ori	r6, r6, cpu6_errata_word@l
	tophys	(r4, r4)
	li	r7, 0x3980
	stw	r7, 12(r6)
	lwz	r7, 12(r6)
        mtspr   M_TWB, r4               /* Update MMU base address */
	li	r7, 0x3380
	stw	r7, 12(r6)
	lwz	r7, 12(r6)
        mtspr   M_CASID, r3             /* Update context */
#else
        mtspr   M_CASID,r3		/* Update context */
	tophys	(r4, r4)
	mtspr	M_TWB, r4		/* and pgd */
#endif
	SYNC
	blr

#ifdef CONFIG_8xx_CPU6
/* It's here because it is unique to the 8xx.
 * It is important we get called with interrupts disabled.  I used to
 * do that, but it appears that all code that calls this already had
 * interrupt disabled.
 *
 * r3 is the new decrementer value (written to SPR 22); r4 and r7 are
 * used as scratch.
 */
	.globl	set_dec_cpu6
set_dec_cpu6:
	lis	r7, cpu6_errata_word@h
	ori	r7, r7, cpu6_errata_word@l
	li	r4, 0x2c00
	stw	r4, 8(r7)	/* store/load to cpu6_errata_word+8 ahead of the mtspr */
	lwz	r4, 8(r7)
        mtspr   22, r3		/* Update Decrementer */
	SYNC
	blr
#endif

/*
 * We put a few things here that have to be page-aligned.
 * This stuff goes at the beginning of the data segment,
 * which is page-aligned.
 */
	.data
	.globl	sdata
sdata:
	.globl	empty_zero_page
empty_zero_page:
	.space	4096		/* one 4096-byte page */

	.globl	swapper_pg_dir
swapper_pg_dir:
	.space	4096		/* one 4096-byte page */

/*
 * This space gets a copy of optional info passed to us by the bootstrap
 * Used to pass parameters into the kernel like root=/dev/sda1, etc.
 */
	.globl	cmd_line
cmd_line:
	.space	512

#ifdef CONFIG_BDI_SWITCH
/* Room for two PTE table pointers, usually the kernel and current user
 * pointer to their respective root page table (pgdir).
 */
abatron_pteptrs:
	.space	8
#endif

#ifdef CONFIG_8xx_CPU6
/* 16 bytes of scratch; the CPU6 store/load sequences in this file use
 * offsets 8 and 12.
 */
	.globl	cpu6_errata_word
cpu6_errata_word:
	.space	16
#endif

^ permalink raw reply

* e300 (MPC5121) dlmzb
From: Fortini Matteo @ 2009-09-25 11:07 UTC (permalink / raw)
  To: linux-ppc list

I was trying to insert an optimized strlen() function using the 
following code taken from the ibm site on an MPC5121, but it crashes the 
kernel.
Is it because it's an unsupported op, or because I'm missing some needed 
steps?

Thank you,
Matteo

_GLOBAL(strlen)
    addi   r4,0,8    // Load byte count of 8
    mtxer  r4        // Set byte count for load string
    xor    r4,r4,r4  // r4 = 0, r4 == accumulator
1:
    lswx   r5,r3,r4  // load string into r5 & r6
    dlmzb. r12,r5,r6 // find NULL byte and record in r12.
    add    r4,r4,r12 // Update accumulator.
    beq    1b        // Loop if NULL not found.
    addi   r3,r4,-1  // Subtract 1 for NULL byte.
    blr              // Return length

^ permalink raw reply

* Re: lite5200b kernel not booting
From: Jon Smirl @ 2009-09-25 12:15 UTC (permalink / raw)
  To: Asier Llano Palacios; +Cc: linuxppc-dev, Aitor Arzuaga
In-Reply-To: <1253803279.4632.91.camel@allano>

On Thu, Sep 24, 2009 at 10:41 AM, Asier Llano Palacios
<asierllano@gmail.com> wrote:
> Hi Grant,
>
> We've been working with a lite5200b for a while, we have been working
> with the ppc platform in linux 2.6.x for 5 years and it worked properly
> until 2.6.25 included. We want to switch to the powerpc platform but it
> doesn't seem to work.
>
> After the bootloader (tested with the uboot 1.2.0 and 2009.08) starts
> the cuImage.lite5200 it doesn't show anything in the console.
>
> I'd like to know if the lite5200b is still supported and which version
> is known to work with it and what is the default configuration. I want
> to test it like the developers do, until I configure it myself.
>
> I've managed to do some debugging in assembler, to know that it works
> properly until DCACHE is enabled in setup_common_caches of
> arch/powerpc/kernel/cpu_setup_6xx.S. If I skip enabling the DCACHE it
> continues properly until the MMU is enabled.
>
> I'm only debugging it writing to the serial port registers in assembler,
> so I'm not very sure if it continues properly or if I am not able to
> debug it after the DCACHE is enabled or the MMU is enabled. I want to
> debug it with a JTAG debugger, but I still don't have one (do you
> recommend me anyone?).

No one has tried the Macraigor USB wiggler on the mpc5200 and reported
back if it works.

Chart says it is supported...
http://www.macraigor.com/cpus.htm

It's a $250 device so it would be good to know if it works.
http://www.macraigor.com/usbWiggler.htm

uboot 1.2 is very old. That may be the cause of your problems. For
example old u-boots don't initialize the PCI hardware correctly on
systems that don't have PCI implemented.

In general the current powerpc kernel works fine on the mpc5200b. We
are running it on four different CPU boards but I don't have a
lite5200b.

We use the Phytec dev boards. We've never had any trouble with them.
http://www.phytec.de/de/produkte/rapid-development-kits/linux-kits/produktdetails.html?tx_ttproducts_pi1[backPID]=270&tx_ttproducts_pi1[product]=62&cHash=6fbc8dcdb2

http://www.phytec.com/products/rdk/PowerPC/phyCORE-MPC5200B-tinyRDK.html

>
> Kind regards and thank you for the help,
> Asier
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>

-- 
Jon Smirl
jonsmirl@gmail.com

^ permalink raw reply

* linux-next: 20090925 - hvc driver build breaks with !HVC_CONSOLE
From: Kamalesh Babulal @ 2009-09-25 13:31 UTC (permalink / raw)
  To: Stephen Rothwell; +Cc: linuxppc-dev, linux-next, LKML
In-Reply-To: <20090925133830.1ba29584.sfr@canb.auug.org.au>

Hi Stephen,

	next-20090925 randconfig build breaks on hvcs driver on powerpc,
with HVC_CONSOLE=n.

ERROR: ".hvc_put_chars" [drivers/char/hvcs.ko] undefined!
ERROR: ".hvc_get_chars" [drivers/char/hvcs.ko] undefined!

adding the dependency of HVC_CONSOLE helped

Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
--
 drivers/char/Kconfig |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index a2a0e67..2583231 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -682,7 +682,7 @@ config VIRTIO_CONSOLE
 
 config HVCS
 	tristate "IBM Hypervisor Virtual Console Server support"
-	depends on PPC_PSERIES
+	depends on PPC_PSERIES && HVC_CONSOLE
 	help
 	  Partitionable IBM Power5 ppc64 machines allow hosting of
 	  firmware virtual consoles from one Linux partition by
			
			Kamalesh

^ permalink raw reply related

* Re: [PATCH v3 0/3] cpu: pseries: Cpu offline states framework
From: Peter Zijlstra @ 2009-09-25 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: Gautham R Shenoy, linux-kernel, Venkatesh Pallipadi,
	Arun R Bharadwaj, linuxppc-dev, Darrick J. Wong
In-Reply-To: <1253753501.7103.358.camel@pasglop>

On Thu, 2009-09-24 at 10:51 +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2009-09-15 at 14:11 +0200, Peter Zijlstra wrote:
> > I still think it's a layering violation... it's the hypervisor manager
> > that should be bothered in what state an off-lined cpu is in.
> > 
> That's not how our hypervisor works.

Then fix it?

> If you ask through the management interface, to remove a CPU from a
> partition, the HV will communicate with a daemon inside the partition
> that will then unplug the CPU via the right call.
> 
> I don't really understand your objections to be honest. And I fail to
> see why it would be a layering violation to have the ability for the OS
> to indicate in what state it wishes to relinquish a CPU to the
> hypervisor, which more or less defines what is the expected latency for
> getting it back later on.

OK, so the main objection is the abuse of CPU hotplug as resource
management feature.

CPU hotplug is terribly invasive and expensive to the kernel, doing
hotplug on a minute basis is just plain crazy.

If you want a CPU in a "keep it near and don't hand it back to the HV"
state, why not use cpusets to isolate it and simply not run tasks on it?

cpusets don't use stopmachine and are much nicer to the rest of the
kernel over-all.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox