LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 10/11] powerpc/perf: add kconfig option for hypervisor provided counters
From: Cody P Schafer @ 2014-02-27 21:05 UTC (permalink / raw)
  To: Linux PPC, Anshuman Khandual, Benjamin Herrenschmidt,
	Cody P Schafer, Deepthi Dharwar, Gavin Shan, Lijun Pan, Li Zhong,
	Michael Ellerman, Paul Bolle, Priyanka Jain, Srivatsa S. Bhat
  Cc: Peter Zijlstra, LKML, Ingo Molnar, Paul Mackerras,
	Arnaldo Carvalho de Melo, scottwood
In-Reply-To: <1393535105-7528-1-git-send-email-cody@linux.vnet.ibm.com>

Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com>
---
 arch/powerpc/perf/Makefile             |  2 ++
 arch/powerpc/platforms/pseries/Kconfig | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index 60d71ee..f9c083a 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -11,5 +11,7 @@ obj32-$(CONFIG_PPC_PERF_CTRS)	+= mpc7450-pmu.o
 obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o
 obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o
 
+obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o
+
 obj-$(CONFIG_PPC64)		+= $(obj64-y)
 obj-$(CONFIG_PPC32)		+= $(obj32-y)
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 80b1d57..2cb8b77 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -111,6 +111,18 @@ config CMM
 	  will be reused for other LPARs. The interface allows firmware to
 	  balance memory across many LPARs.
 
+config HV_PERF_CTRS
+       bool "Hypervisor supplied PMU events (24x7 & GPCI)"
+       default y
+       depends on PERF_EVENTS && PPC_PSERIES
+       help
+	  Enable access to hypervisor supplied counters in perf. Currently,
+	  this enables code that uses the hcall GetPerfCounterInfo and 24x7
+	  interfaces to retrieve counters. GPCI exists on Power 6 and later
+	  systems. 24x7 is available on Power 8 systems.
+
+          If unsure, select Y.
+
 config DTL
 	bool "Dispatch Trace Log"
 	depends on PPC_SPLPAR && DEBUG_FS
-- 
1.9.0

^ permalink raw reply related

* [PATCH v3 11/11] powerpc/perf/hv_{gpci, 24x7}: add documentation of device attributes
From: Cody P Schafer @ 2014-02-27 21:05 UTC (permalink / raw)
  To: Linux PPC, Cody P Schafer
  Cc: Rob Landley, linux-doc, Peter Zijlstra, LKML, Michael Ellerman,
	Ingo Molnar, Paul Mackerras, Arnaldo Carvalho de Melo, scottwood
In-Reply-To: <1393535105-7528-1-git-send-email-cody@linux.vnet.ibm.com>

gpci and 24x7 expose some device specific attributes. Add some
documentation for them.

Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com>
---
 .../testing/sysfs-bus-event_source-devices-hv_24x7 | 23 ++++++++++++
 .../testing/sysfs-bus-event_source-devices-hv_gpci | 43 ++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
 create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_gpci

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7 b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
new file mode 100644
index 0000000..e78ee79
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
@@ -0,0 +1,23 @@
+What:		/sys/bus/event_source/devices/hv_24x7/interface/catalog
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		Provides access to the binary "24x7 catalog" provided by the
+		hypervisor on POWER7 and 8 systems. This catalog lists events
+		available from the powerpc "hv_24x7" pmu. Its format is
+		documented here:
+		https://raw.githubusercontent.com/jmesmon/catalog-24x7/master/hv-24x7-catalog.h
+
+What:		/sys/bus/event_source/devices/hv_24x7/interface/catalog_length
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		A number equal to the length in bytes of the catalog. This is
+		also extractable from the provided binary "catalog" sysfs entry.
+
+What:		/sys/bus/event_source/devices/hv_24x7/interface/catalog_version
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		Exposes the "version" field of the 24x7 catalog. This is also
+		extractable from the provided binary "catalog" sysfs entry.
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_gpci b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_gpci
new file mode 100644
index 0000000..3fa58c2
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_gpci
@@ -0,0 +1,43 @@
+What:		/sys/bus/event_source/devices/hv_gpci/interface/collect_privileged
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		'0' if the hypervisor is configured to forbid access to event
+		counters being accumulated by other guests and to physical
+		domain event counters.
+		'1' if that access is allowed.
+
+What:		/sys/bus/event_source/devices/hv_gpci/interface/ga
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		0 or 1. Indicates whether we have access to "GA" events (listed
+		in arch/powerpc/perf/hv-gpci.h).
+
+What:		/sys/bus/event_source/devices/hv_gpci/interface/expanded
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		0 or 1. Indicates whether we have access to "EXPANDED" events (listed
+		in arch/powerpc/perf/hv-gpci.h).
+
+What:		/sys/bus/event_source/devices/hv_gpci/interface/lab
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		0 or 1. Indicates whether we have access to "LAB" events (listed
+		in arch/powerpc/perf/hv-gpci.h).
+
+What:		/sys/bus/event_source/devices/hv_gpci/interface/version
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		A number indicating the version of the gpci interface that the
+		hypervisor reports supporting.
+
+What:		/sys/bus/event_source/devices/hv_gpci/interface/kernel_version
+Date:		February 2014
+Contact:	Cody P Schafer <cody@linux.vnet.ibm.com>
+Description:
+		A number indicating the latest version of the gpci interface
+		that the kernel is aware of.
-- 
1.9.0

^ permalink raw reply related

* Re: [PATCH] mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS
From: Andrew Morton @ 2014-02-27 23:41 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Peter Zijlstra, Liu Ping Fan, linux-mm, Paul Mackerras,
	linuxppc-dev
In-Reply-To: <87k3cifgzz.fsf@linux.vnet.ibm.com>

On Wed, 26 Feb 2014 13:22:16 +0530 "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> Andrew Morton <akpm@linux-foundation.org> writes:
> 
> > On Wed,  5 Feb 2014 09:25:46 +0800 Liu Ping Fan <qemulist@gmail.com> wrote:
> >
> >> When doing some numa tests on powerpc, I triggered an oops bug. I find
> >> it is caused by using page->_last_cpupid.  It should be initialized as
> >> "-1 & LAST_CPUPID_MASK", but not "-1". Otherwise, in task_numa_fault(),
> >> we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)).
> >> And finally cause an oops bug in task_numa_group(), since the online cpu is
> >> less than possible cpu.
> >
> > I grabbed this.  I added this to the changelog:
> >
> > : PPC needs the LAST_CPUPID_NOT_IN_PAGE_FLAGS case because ppc needs to
> > : support a large physical address region, up to 2^46 but small section size
> > : (2^24).  So when NR_CPUS grows up, it is easily to cause
> > : not-in-page-flags.
> >
> > to hopefully address Peter's observation.
> >
> > How should we proceed with this?  I'm getting the impression that numa
> > balancing on ppc is a dead duck in 3.14, so perhaps this and 
> >
> > powerpc-mm-add-new-set-flag-argument-to-pte-pmd-update-function.patch
> > mm-dirty-accountable-change-only-apply-to-non-prot-numa-case.patch
> > mm-use-ptep-pmdp_set_numa-for-updating-_page_numa-bit.patch
> >
> 
> All these are already in 3.14  ?

Yes.

> > are 3.15-rc1 material?
> >
> 
> We should push the first hunk to 3.14. I will wait for Liu to redo the
> patch. BTW this should happen only when SPARSE_VMEMMAP is not
> specified. Srikar had reported the issue here
> 
> http://mid.gmane.org/20140219180200.GA29257@linux.vnet.ibm.com
> 
> #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
> #define SECTIONS_WIDTH		SECTIONS_SHIFT
> #else
> #define SECTIONS_WIDTH		0
> #endif
> 

I'm lost.  What patch are you talking about?  The first hunk of what?

I assume we're talking about
mm-numa-bugfix-for-last_cpupid_not_in_page_flags.patch, which I had
queued for 3.14.  I'll put it on hold until there's some clarity here.

^ permalink raw reply

* [PATCH] powerpc/powernv: Read OPAL error log and export it through sysfs
From: Stewart Smith @ 2014-02-28  0:58 UTC (permalink / raw)
  To: Mahesh J Salgaonkar, benh, linuxppc-dev; +Cc: Stewart Smith
In-Reply-To: <87k3cin5kd.fsf@river.au.ibm.com>

Based on a patch by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

This patch adds support to read error logs from OPAL and export
them to userspace through a sysfs interface.

We export each log entry as a directory in /sys/firmware/opal/elog/

Currently, OPAL will buffer up to 128 error log records. We don't
need to have any knowledge of this limit on the Linux side, as that
is actually largely transparent to us.

Each error log entry has the following files: id, type, acknowledge, raw.
Currently we just export the raw binary error log in the 'raw' attribute.
In a future patch, we may parse more of the error log to make it a bit
easier for userspace (e.g. to be able to display a brief summary in
petitboot without having to have a full parser).

If we have >128 logs from OPAL, we'll only be notified of 128 until
userspace starts acknowledging them. This limitation may be lifted in
the future and with this patch, that should "just work" from the linux side.

A userspace daemon should:
- wait for error log entries using normal mechanisms (we announce creation)
- read error log entry
- save error log entry safely to disk
- acknowledge the error log entry
- rinse, repeat.

On the Linux side, we read the error log when we're notified of it. This
possibly isn't ideal as it would be better to only read them on-demand.
However, this doesn't really work with current OPAL interface, so we
read the error log immediately when notified at the moment.

I've tested this pretty extensively and am rather confident that the
linux side of things works rather well. There is currently an issue with
the service processor side of things for >128 error logs though.

Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
---
 Documentation/ABI/stable/sysfs-firmware-opal-elog |   60 ++++
 arch/powerpc/include/asm/opal.h                   |   13 +
 arch/powerpc/platforms/powernv/Makefile           |    2 +-
 arch/powerpc/platforms/powernv/opal-elog.c        |  312 +++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal-wrappers.S    |    5 +
 arch/powerpc/platforms/powernv/opal.c             |    2 +
 6 files changed, 393 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ABI/stable/sysfs-firmware-opal-elog
 create mode 100644 arch/powerpc/platforms/powernv/opal-elog.c

diff --git a/Documentation/ABI/stable/sysfs-firmware-opal-elog b/Documentation/ABI/stable/sysfs-firmware-opal-elog
new file mode 100644
index 0000000..e1f3058
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-firmware-opal-elog
@@ -0,0 +1,60 @@
+What:		/sys/firmware/opal/elog
+Date:		Feb 2014
+Contact:	Stewart Smith <stewart@linux.vnet.ibm.com>
+Description:
+		This directory exposes error log entries retrieved
+		through the OPAL firmware interface.
+
+		Each error log is identified by a unique ID and will
+		exist until explicitly acknowledged to firmware.
+
+		Each log entry has a directory in /sys/firmware/opal/elog.
+
+		Log entries may be purged by the service processor
+		before being retrieved by firmware or retrieved/acknowledged by
+		Linux if there is no room for more log entries.
+
+		In the event that Linux has retrieved the log entries
+		but not explicitly acknowledged them to firmware and
+		the service processor needs more room for log entries,
+		the only remaining copy of a log message may be in
+		Linux.
+
+		Typically, a user space daemon will monitor for new
+		entries, read them out and acknowledge them.
+
+		The service processor may be able to store more log
+		entries than firmware can, so after you acknowledge
+		an event from Linux you may instantly get another one
+		from the queue that was generated some time in the past.
+
+		The raw log format is a binary format. We currently
+		do not parse this at all in kernel, leaving it up to
+		user space to solve the problem. In future, we may
+		do more parsing in kernel and add more files to make
+		it easier for simple user space processes to extract
+		more information.
+
+		For each log entry (directory), there are the following
+		files:
+
+		id:		An ASCII representation of the ID of the
+				error log, in hex - e.g. "0x01".
+
+		type:		An ASCII representation of the type id and
+				description of the type of error log.
+				Currently just "0x00 PEL" - platform error log.
+				In the future there may be additional types.
+
+		raw:		A read-only binary file that can be read
+				to get the raw log entry. These are
+				<16kb, often just hundreds of bytes and
+				"average" 2kb.
+
+		acknowledge:	Writing 'ack' to this file will acknowledge
+				the error log to firmware (and in turn
+				the service processor, if applicable).
+				Shortly after acknowledging it, the log
+				entry will be removed from sysfs.
+				Reading this file will list the supported
+				operations (currently just acknowledge).
\ No newline at end of file
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 40157e2..b404545 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -151,6 +151,11 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_LPC_READ				67
 #define OPAL_LPC_WRITE				68
 #define OPAL_RETURN_CPU				69
+#define OPAL_ELOG_READ				71
+#define OPAL_ELOG_WRITE				72
+#define OPAL_ELOG_ACK				73
+#define OPAL_ELOG_RESEND			74
+#define OPAL_ELOG_SIZE				75
 #define OPAL_FLASH_VALIDATE			76
 #define OPAL_FLASH_MANAGE			77
 #define OPAL_FLASH_UPDATE			78
@@ -823,6 +828,13 @@ int64_t opal_lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 		       uint32_t addr, uint32_t data, uint32_t sz);
 int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 		      uint32_t addr, __be32 *data, uint32_t sz);
+
+int64_t opal_read_elog(uint64_t buffer, size_t size, uint64_t log_id);
+int64_t opal_get_elog_size(uint64_t *log_id, size_t *size, uint64_t *elog_type);
+int64_t opal_write_elog(uint64_t buffer, uint64_t size, uint64_t offset);
+int64_t opal_send_ack_elog(uint64_t log_id);
+void opal_resend_pending_logs(void);
+
 int64_t opal_validate_flash(uint64_t buffer, uint32_t *size, uint32_t *result);
 int64_t opal_manage_flash(uint8_t op);
 int64_t opal_update_flash(uint64_t blk_list);
@@ -861,6 +873,7 @@ extern void opal_get_rtc_time(struct rtc_time *tm);
 extern unsigned long opal_get_boot_time(void);
 extern void opal_nvram_init(void);
 extern void opal_flash_init(void);
+extern int opal_elog_init(void);
 
 extern int opal_machine_check(struct pt_regs *regs);
 
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 8d767fd..189fd45 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,6 +1,6 @@
 obj-y			+= setup.o opal-takeover.o opal-wrappers.o opal.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
-obj-y			+= rng.o
+obj-y			+= rng.o opal-elog.o
 
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
new file mode 100644
index 0000000..61e2ef3
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -0,0 +1,312 @@
+/*
+ * Error log support on PowerNV.
+ *
+ * Copyright 2013,2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/fcntl.h>
+#include <asm/uaccess.h>
+#include <asm/opal.h>
+
+struct elog_obj {
+	struct kobject kobj;
+	struct bin_attribute raw_attr;
+	uint64_t id;
+	uint64_t type;
+	size_t size;
+	char *buffer;
+};
+#define to_elog_obj(x) container_of(x, struct elog_obj, kobj)
+
+struct elog_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr,
+			 const char *buf, size_t count);
+};
+#define to_elog_attr(x) container_of(x, struct elog_attribute, attr)
+
+static ssize_t elog_id_show(struct elog_obj *elog_obj,
+			    struct elog_attribute *attr,
+			    char *buf)
+{
+	return sprintf(buf, "0x%llx\n", elog_obj->id);
+}
+
+static const char *elog_type_to_string(uint64_t type)
+{
+	switch (type) {
+	case 0: return "PEL";
+	default: return "unknown";
+	}
+}
+
+static ssize_t elog_type_show(struct elog_obj *elog_obj,
+			      struct elog_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "0x%llx %s\n",
+		       elog_obj->type,
+		       elog_type_to_string(elog_obj->type));
+}
+
+static ssize_t elog_ack_show(struct elog_obj *elog_obj,
+			     struct elog_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "ack - acknowledge log message\n");
+}
+
+static void delay_release_kobj(void *kobj)
+{
+	kobject_put((struct kobject *)kobj);
+}
+
+static ssize_t elog_ack_store(struct elog_obj *elog_obj,
+			      struct elog_attribute *attr,
+			      const char *buf,
+			      size_t count)
+{
+	opal_send_ack_elog(elog_obj->id);
+	sysfs_schedule_callback(&elog_obj->kobj, delay_release_kobj,
+				&elog_obj->kobj, THIS_MODULE);
+	return count;
+}
+
+static struct elog_attribute id_attribute =
+	__ATTR(id, 0666, elog_id_show, NULL);
+static struct elog_attribute type_attribute =
+	__ATTR(type, 0666, elog_type_show, NULL);
+static struct elog_attribute ack_attribute =
+	__ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store);
+
+static struct kset *elog_kset;
+
+static ssize_t elog_attr_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buf)
+{
+	struct elog_attribute *attribute;
+	struct elog_obj *elog;
+
+	attribute = to_elog_attr(attr);
+	elog = to_elog_obj(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(elog, attribute, buf);
+}
+
+static ssize_t elog_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct elog_attribute *attribute;
+	struct elog_obj *elog;
+
+	attribute = to_elog_attr(attr);
+	elog = to_elog_obj(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(elog, attribute, buf, len);
+}
+
+static const struct sysfs_ops elog_sysfs_ops = {
+	.show = elog_attr_show,
+	.store = elog_attr_store,
+};
+
+static void elog_release(struct kobject *kobj)
+{
+	struct elog_obj *elog;
+
+	elog = to_elog_obj(kobj);
+	kfree(elog->buffer);
+	kfree(elog);
+}
+
+static struct attribute *elog_default_attrs[] = {
+	&id_attribute.attr,
+	&type_attribute.attr,
+	&ack_attribute.attr,
+	NULL,
+};
+
+static struct kobj_type elog_ktype = {
+	.sysfs_ops = &elog_sysfs_ops,
+	.release = &elog_release,
+	.default_attrs = elog_default_attrs,
+};
+
+/* Maximum size of a single log on FSP is 16KB */
+#define OPAL_MAX_ERRLOG_SIZE	16384
+
+static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
+			     struct bin_attribute *bin_attr,
+			     char *buffer, loff_t pos, size_t count)
+{
+	int opal_rc;
+
+	struct elog_obj *elog = to_elog_obj(kobj);
+
+	/* We may have had an error reading before, so let's retry */
+	if (!elog->buffer) {
+		elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+		if (!elog->buffer)
+			return -EIO;
+
+		opal_rc = opal_read_elog(__pa(elog->buffer),
+					 elog->size, elog->id);
+		if (opal_rc != OPAL_SUCCESS) {
+			pr_err("ELOG: log read failed for log-id=%llx\n",
+			       elog->id);
+			kfree(elog->buffer);
+			elog->buffer = NULL;
+			return -EIO;
+		}
+	}
+
+	memcpy(buffer, elog->buffer + pos, count);
+
+	return count;
+}
+
+static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
+{
+	struct elog_obj *elog;
+	int rc;
+
+	elog = kzalloc(sizeof(*elog), GFP_KERNEL);
+	if (!elog)
+		return NULL;
+
+	elog->kobj.kset = elog_kset;
+
+	kobject_init(&elog->kobj, &elog_ktype);
+
+	sysfs_bin_attr_init(&elog->raw_attr);
+
+	elog->raw_attr.attr.name = "raw";
+	elog->raw_attr.attr.mode = 0400;
+	elog->raw_attr.size = size;
+	elog->raw_attr.read = raw_attr_read;
+
+	elog->id = id;
+	elog->size = size;
+	elog->type = type;
+
+	elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+
+	if (elog->buffer) {
+		rc = opal_read_elog(__pa(elog->buffer),
+					 elog->size, elog->id);
+		if (rc != OPAL_SUCCESS) {
+			pr_err("ELOG: log read failed for log-id=%llx\n",
+			       elog->id);
+			kfree(elog->buffer);
+			elog->buffer = NULL;
+		}
+	}
+
+	rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
+	if (rc) {
+		kobject_put(&elog->kobj);
+		return NULL;
+	}
+
+	rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
+	if (rc) {
+		kobject_put(&elog->kobj);
+		return NULL;
+	}
+
+	kobject_uevent(&elog->kobj, KOBJ_ADD);
+
+	return elog;
+}
+
+static void elog_work_fn(struct work_struct *work)
+{
+	size_t elog_size;
+	uint64_t log_id;
+	uint64_t elog_type;
+	int rc;
+	char name[2+16+1];
+
+	rc = opal_get_elog_size(&log_id, &elog_size, &elog_type);
+	if (rc != OPAL_SUCCESS) {
+		pr_err("ELOG: Opal log read failed\n");
+		return;
+	}
+
+	BUG_ON(elog_size > OPAL_MAX_ERRLOG_SIZE);
+
+	if (elog_size >= OPAL_MAX_ERRLOG_SIZE)
+		elog_size  =  OPAL_MAX_ERRLOG_SIZE;
+
+	sprintf(name, "0x%llx", log_id);
+
+	/* we may get notified twice, let's handle
+	 * that gracefully and not create two conflicting
+	 * entries.
+	 */
+	if (kset_find_obj(elog_kset, name))
+		return;
+
+	create_elog_obj(log_id, elog_size, elog_type);
+}
+
+static DECLARE_WORK(elog_work, elog_work_fn);
+
+static int elog_event(struct notifier_block *nb,
+				unsigned long events, void *change)
+{
+	/* check for error log event */
+	if (events & OPAL_EVENT_ERROR_LOG_AVAIL)
+		schedule_work(&elog_work);
+	return 0;
+}
+
+static struct notifier_block elog_nb = {
+	.notifier_call  = elog_event,
+	.next           = NULL,
+	.priority       = 0
+};
+
+int __init opal_elog_init(void)
+{
+	int rc = 0;
+
+	elog_kset = kset_create_and_add("elog", NULL, opal_kobj);
+	if (!elog_kset) {
+		pr_warn("%s: failed to create elog kset\n", __func__);
+		return -1;
+	}
+
+	rc = opal_notifier_register(&elog_nb);
+	if (rc) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+		__func__, rc);
+		return rc;
+	}
+
+	/* We are now ready to pull error logs from opal. */
+	opal_resend_pending_logs();
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3e8829c..5fcbf25 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -123,6 +123,11 @@ OPAL_CALL(opal_xscom_write,			OPAL_XSCOM_WRITE);
 OPAL_CALL(opal_lpc_read,			OPAL_LPC_READ);
 OPAL_CALL(opal_lpc_write,			OPAL_LPC_WRITE);
 OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
+OPAL_CALL(opal_read_elog,			OPAL_ELOG_READ);
+OPAL_CALL(opal_send_ack_elog,			OPAL_ELOG_ACK);
+OPAL_CALL(opal_get_elog_size,			OPAL_ELOG_SIZE);
+OPAL_CALL(opal_resend_pending_logs,		OPAL_ELOG_RESEND);
+OPAL_CALL(opal_write_elog,			OPAL_ELOG_WRITE);
 OPAL_CALL(opal_validate_flash,			OPAL_FLASH_VALIDATE);
 OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 65499ad..fb77302 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -472,6 +472,8 @@ static int __init opal_init(void)
 	/* Create "opal" kobject under /sys/firmware */
 	rc = opal_sysfs_init();
 	if (rc == 0) {
+		/* Setup error log interface */
+		rc = opal_elog_init();
 		/* Setup code update interface */
 		opal_flash_init();
 	}
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH] mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS
From: liu ping fan @ 2014-02-28  2:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, linux-mm, Paul Mackerras, Aneesh Kumar K.V,
	linuxppc-dev
In-Reply-To: <20140227154104.4e3572f1d9e2692d431d1a4e@linux-foundation.org>

On Fri, Feb 28, 2014 at 7:41 AM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Wed, 26 Feb 2014 13:22:16 +0530 "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:
>
>> Andrew Morton <akpm@linux-foundation.org> writes:
>>
>> > On Wed,  5 Feb 2014 09:25:46 +0800 Liu Ping Fan <qemulist@gmail.com> wrote:
>> >
>> >> When doing some numa tests on powerpc, I triggered an oops bug. I find
>> >> it is caused by using page->_last_cpupid.  It should be initialized as
>> >> "-1 & LAST_CPUPID_MASK", but not "-1". Otherwise, in task_numa_fault(),
>> >> we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)).
>> >> And finally cause an oops bug in task_numa_group(), since the online cpu is
>> >> less than possible cpu.
>> >
>> > I grabbed this.  I added this to the changelog:
>> >
>> > : PPC needs the LAST_CPUPID_NOT_IN_PAGE_FLAGS case because ppc needs to
>> > : support a large physical address region, up to 2^46 but small section size
>> > : (2^24).  So when NR_CPUS grows up, it is easily to cause
>> > : not-in-page-flags.
>> >
>> > to hopefully address Peter's observation.
>> >
>> > How should we proceed with this?  I'm getting the impression that numa
>> > balancing on ppc is a dead duck in 3.14, so perhaps this and
>> >
>> > powerpc-mm-add-new-set-flag-argument-to-pte-pmd-update-function.patch
>> > mm-dirty-accountable-change-only-apply-to-non-prot-numa-case.patch
>> > mm-use-ptep-pmdp_set_numa-for-updating-_page_numa-bit.patch
>> >
>>
>> All these are already in 3.14  ?
>
> Yes.
>
>> > are 3.15-rc1 material?
>> >
>>
>> We should push the first hunk to 3.14. I will wait for Liu to redo the
>> patch. BTW this should happen only when SPARSE_VMEMMAP is not
>> specified. Srikar had reported the issue here
>>
>> http://mid.gmane.org/20140219180200.GA29257@linux.vnet.ibm.com
>>
>> #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
>> #define SECTIONS_WIDTH                SECTIONS_SHIFT
>> #else
>> #define SECTIONS_WIDTH                0
>> #endif
>>
>
> I'm lost.  What patch are you talking about?  The first hunk of what?
>
I think Aneesh was talking about the hunk of the patch that modified
the file "page-flags-layout.h".
I tried to collapse and simplify the logic, but that makes
LAST_CPUPID_WIDTH depend on CONFIG_NUMA_BALANCING.
That is an error, since we need LAST_CPUPID_WIDTH even without
CONFIG_NUMA_BALANCING. (Sorry, I compiled and ran the kernel, but did
not find this.)

Thanks and best regards,
Fan

> I assume we're talking about
> mm-numa-bugfix-for-last_cpupid_not_in_page_flags.patch, which I had
> queued for 3.14.  I'll put it on hold until there's some clarity here.
>

^ permalink raw reply

* Re: [RFC PATCH] powerpc: allow allyesconfig to build more
From: Michael Neuling @ 2014-02-28  4:06 UTC (permalink / raw)
  To: Stephen Rothwell; +Cc: ppc-dev, paulus, Mahesh Salgaonkar
In-Reply-To: <20140227171719.01d7de4be01559dd02968f7c@canb.auug.org.au>

Stephen Rothwell <sfr@canb.auug.org.au> wrote:

> Fixes this build error:
> 
> arch/powerpc/kernel/exceptions-64s.S: Assembler messages:
> arch/powerpc/kernel/exceptions-64s.S:1312: Error: attempt to move .org backwards
> 
> Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>

Builds and boots for me

Acked-by: Michael Neuling <mikey@neuling.org>


> ---
>  arch/powerpc/kernel/exceptions-64s.S | 20 ++++++++++----------
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> This builds allyesconfig better (we still have RELOC failures in the
> link) and hopefully fixes the allmodconfig build, but I don't know if
> it is semantically OK.
> 
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 38d507306a11..b87859ffc8e7 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -1294,16 +1294,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>  	.globl	__end_handlers
>  __end_handlers:
>  
> -	/* Equivalents to the above handlers for relocation-on interrupt vectors */
> -	STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
> -	MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
> -
> -	STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
> -	STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
> -	STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
> -	STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
> -	STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
> -
>  #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
>  /*
>   * Data area reserved for FWNMI option.
> @@ -1325,6 +1315,16 @@ fwnmi_data_area:
>  initial_stab:
>  	.space	4096
>  
> +	/* Equivalents to the above handlers for relocation-on interrupt vectors */
> +	STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
> +	MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
> +
> +	STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
> +	STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
> +	STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
> +	STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
> +	STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
> +
>  #ifdef CONFIG_PPC_POWERNV
>  _GLOBAL(opal_mc_secondary_handler)
>  	HMT_MEDIUM_PPR_DISCARD
> -- 
> 1.9.0
> 
> -- 
> Cheers,
> Stephen Rothwell                    sfr@canb.auug.org.au

^ permalink raw reply

* Re: [PATCH] mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS
From: Aneesh Kumar K.V @ 2014-02-28  4:47 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, Liu Ping Fan, linux-mm, Paul Mackerras,
	linuxppc-dev
In-Reply-To: <20140227154104.4e3572f1d9e2692d431d1a4e@linux-foundation.org>

Andrew Morton <akpm@linux-foundation.org> writes:

> On Wed, 26 Feb 2014 13:22:16 +0530 "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:
>
>> Andrew Morton <akpm@linux-foundation.org> writes:
>> 
>> > On Wed,  5 Feb 2014 09:25:46 +0800 Liu Ping Fan <qemulist@gmail.com> wrote:
>> >
>> >> When doing some numa tests on powerpc, I triggered an oops bug. I find
>> >> it is caused by using page->_last_cpupid.  It should be initialized as
>> >> "-1 & LAST_CPUPID_MASK", but not "-1". Otherwise, in task_numa_fault(),
>> >> we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)).
>> >> And finally cause an oops bug in task_numa_group(), since the online cpu is
>> >> less than possible cpu.
>> >
>> > I grabbed this.  I added this to the changelog:
>> >
>> > : PPC needs the LAST_CPUPID_NOT_IN_PAGE_FLAGS case because ppc needs to
>> > : support a large physical address region, up to 2^46 but small section size
>> > : (2^24).  So when NR_CPUS grows up, it is easily to cause
>> > : not-in-page-flags.
>> >
>> > to hopefully address Peter's observation.
>> >
>> > How should we proceed with this?  I'm getting the impression that numa
>> > balancing on ppc is a dead duck in 3.14, so perhaps this and 
>> >
>> > powerpc-mm-add-new-set-flag-argument-to-pte-pmd-update-function.patch
>> > mm-dirty-accountable-change-only-apply-to-non-prot-numa-case.patch
>> > mm-use-ptep-pmdp_set_numa-for-updating-_page_numa-bit.patch
>> >
>> 
>> All these are already in 3.14  ?
>
> Yes.
>
>> > are 3.15-rc1 material?
>> >
>> 
>> We should push the first hunk to 3.14. I will wait for Liu to redo the
>> patch. BTW this should happen only when SPARSE_VMEMMAP is not
>> specified. Srikar had reported the issue here
>> 
>> http://mid.gmane.org/20140219180200.GA29257@linux.vnet.ibm.com
>> 
>> #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
>> #define SECTIONS_WIDTH		SECTIONS_SHIFT
>> #else
>> #define SECTIONS_WIDTH		0
>> #endif
>> 
>
> I'm lost.  What patch are you talking about?  The first hunk of what?

The patch in this thread.

>
> I assume we're talking about
> mm-numa-bugfix-for-last_cpupid_not_in_page_flags.patch, which I had
> queued for 3.14.  I'll put it on hold until there's some clarity here.

We don't need the complete patch; it is just the first hunk that we need
to fix the crash, i.e. we only need

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7b4e31..ddc66df4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -727,7 +727,7 @@ static inline int page_cpupid_last(struct page *page)
 }
 static inline void page_cpupid_reset_last(struct page *page)
 {
-	page->_last_cpupid = -1;
+	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
 }
 #else
 static inline int page_cpupid_last(struct page *page)

Also the issue will only happen when SPARSE_VMEMMAP is not enabled. I
will send a proper patch with updated changelog. I was hoping Liu will
get to that quickly


-aneesh

^ permalink raw reply related

* [PATCH 1/2] powerpc/powernv: Fix opal_xscom_{read,write} prototype
From: Benjamin Herrenschmidt @ 2014-02-28  5:20 UTC (permalink / raw)
  To: linuxppc-dev list

The OPAL firmware functions opal_xscom_read and opal_xscom_write
take a 64-bit argument for the XSCOM (PCB) address in order to
support the indirect mode on P8.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: <stable@vger.kernel.org> [v3.13]
---
 arch/powerpc/include/asm/opal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 40157e2..ed82142 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -816,8 +816,8 @@ int64_t opal_pci_next_error(uint64_t phb_id, uint64_t *first_frozen_pe,
 int64_t opal_pci_poll(uint64_t phb_id);
 int64_t opal_return_cpu(void);
 
-int64_t opal_xscom_read(uint32_t gcid, uint32_t pcb_addr, __be64 *val);
-int64_t opal_xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val);
+int64_t opal_xscom_read(uint32_t gcid, uint64_t pcb_addr, __be64 *val);
+int64_t opal_xscom_write(uint32_t gcid, uint64_t pcb_addr, uint64_t val);
 
 int64_t opal_lpc_write(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 		       uint32_t addr, uint32_t data, uint32_t sz);

^ permalink raw reply related

* [PATCH 2/2] powerpc/powernv: Fix indirect XSCOM unmangling
From: Benjamin Herrenschmidt @ 2014-02-28  5:20 UTC (permalink / raw)
  To: linuxppc-dev list

We need to unmangle the full address, not just the register
number, and we also need to support the real indirect bit
being set for in-kernel uses.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: <stable@vger.kernel.org> [v3.13]
---
 arch/powerpc/platforms/powernv/opal-xscom.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 4fbf276..4cd2ea6 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -71,11 +71,11 @@ static int opal_xscom_err_xlate(int64_t rc)
 	}
 }
 
-static u64 opal_scom_unmangle(u64 reg)
+static u64 opal_scom_unmangle(u64 addr)
 {
 	/*
 	 * XSCOM indirect addresses have the top bit set. Additionally
-	 * the reset of the top 3 nibbles is always 0.
+	 * the rest of the top 3 nibbles is always 0.
 	 *
 	 * Because the debugfs interface uses signed offsets and shifts
 	 * the address left by 3, we basically cannot use the top 4 bits
@@ -86,10 +86,13 @@ static u64 opal_scom_unmangle(u64 reg)
 	 * conversion here. To leave room for further xscom address
 	 * expansion, we only clear out the top byte
 	 *
+	 * For in-kernel use, we also support the real indirect bit, so
+	 * we test for any of the top 5 bits
+	 *
 	 */
-	if (reg & (1ull << 59))
-		reg = (reg & ~(0xffull << 56)) | (1ull << 63);
-	return reg;
+	if (addr & (0x1full << 59))
+		addr = (addr & ~(0xffull << 56)) | (1ull << 63);
+	return addr;
 }
 
 static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
@@ -98,8 +101,8 @@ static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
 	int64_t rc;
 	__be64 v;
 
-	reg = opal_scom_unmangle(reg);
-	rc = opal_xscom_read(m->chip, m->addr + reg, (__be64 *)__pa(&v));
+	reg = opal_scom_unmangle(m->addr + reg);
+	rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
 	*value = be64_to_cpu(v);
 	return opal_xscom_err_xlate(rc);
 }
@@ -109,8 +112,8 @@ static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
 	struct opal_scom_map *m = map;
 	int64_t rc;
 
-	reg = opal_scom_unmangle(reg);
-	rc = opal_xscom_write(m->chip, m->addr + reg, value);
+	reg = opal_scom_unmangle(m->addr + reg);
+	rc = opal_xscom_write(m->chip, reg, value);
 	return opal_xscom_err_xlate(rc);
 }
 

^ permalink raw reply related

* [PATCH V2] mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS
From: Aneesh Kumar K.V @ 2014-02-28  5:28 UTC (permalink / raw)
  To: benh, akpm, Peter Zijlstra
  Cc: Liu Ping Fan, linux-kernel, Liu Ping Fan, linux-mm,
	Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <877g8fn8qw.fsf@linux.vnet.ibm.com>

From: Liu Ping Fan <qemulist@gmail.com>

When doing some numa tests on powerpc, I triggered an oops bug. I find
it is caused by using page->_last_cpupid.  It should be initialized as
"-1 & LAST_CPUPID_MASK", but not "-1". Otherwise, in task_numa_fault(),
we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)).
And finally cause an oops bug in task_numa_group(), since the online cpu is
less than possible cpu. This happens with CONFIG_SPARSEMEM_VMEMMAP disabled.

Call trace:
[   55.978091] SMP NR_CPUS=64 NUMA PowerNV
[   55.978118] Modules linked in:
[   55.978145] CPU: 24 PID: 804 Comm: systemd-udevd Not tainted 3.13.0-rc1+ #32
[   55.978183] task: c000001e2746aa80 ti: c000001e32c50000 task.ti: c000001e32c50000
[   55.978219] NIP: c0000000000f5ad0 LR: c0000000000f5ac8 CTR: c000000000913cf0
[   55.978256] REGS: c000001e32c53510 TRAP: 0300   Not tainted (3.13.0-rc1+)
[   55.978286] MSR: 9000000000009032 <SF,HV,EE,ME,IR,DR,RI>  CR: 28024424  XER: 20000000
[   55.978380] CFAR: c000000000009324 DAR: 7265717569726857 DSISR: 40000000 SOFTE: 1
GPR00: c0000000000f5ac8 c000001e32c53790 c000000001f34338 0000000000000021
GPR04: 0000000000000000 0000000000000031 c000000001f74338 0000ffffffffffff
GPR08: 0000000000000001 7265717569726573 0000000000000000 0000000000000000
GPR12: 0000000028024422 c00000000ffdd800 00000000296b2e64 0000000000000020
GPR16: 0000000000000002 0000000000000003 c000001e2f8e4658 c000001e25c1c1d8
GPR20: c000001e2f8e4000 c000000001f7a858 0000000000000658 0000000040000392
GPR24: 00000000000000a8 c000001e33c1a400 00000000000001d8 c000001e25c1c000
GPR28: c000001e33c37ff0 0007837840000392 000000000000003f c000001e32c53790
[   55.978903] NIP [c0000000000f5ad0] .task_numa_fault+0x1470/0x2370
[   55.978934] LR [c0000000000f5ac8] .task_numa_fault+0x1468/0x2370
[   55.978964] Call Trace:
[   55.978978] [c000001e32c53790] [c0000000000f5ac8] .task_numa_fault+0x1468/0x2370 (unreliable)
[   55.979036] [c000001e32c539e0] [c00000000020a820] .do_numa_page+0x480/0x4a0
[   55.979072] [c000001e32c53b10] [c00000000020bfec] .handle_mm_fault+0x4ec/0xc90
[   55.979123] [c000001e32c53c00] [c000000000e88c98] .do_page_fault+0x3a8/0x890
[   55.979161] [c000001e32c53e30] [c000000000009568] handle_page_fault+0x10/0x30
[   55.979197] Instruction dump:
[   55.979216] 3c82fefb 3884b138 48d9cff1 60000000 48000574 3c62fefb 3863af78 3c82fefb
[   55.979277] 3884b138 48d9cfd5 60000000 e93f0100 <812902e4> 7d2907b4 5529063e 7d2a07b4
[   55.979354] ---[ end trace 15f2510da5ae07cf ]---

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28f46eade6a..1a0ea24ff972 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -766,7 +766,7 @@ static inline int page_cpupid_last(struct page *page)
 }
 static inline void page_cpupid_reset_last(struct page *page)
 {
-	page->_last_cpupid = -1;
+	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
 }
 #else
 static inline int page_cpupid_last(struct page *page)
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 1/2] powerpc: Add a cpu feature CPU_FTR_PMAO_BUG
From: Michael Ellerman @ 2014-02-28  6:29 UTC (permalink / raw)
  To: linuxppc-dev

Some power8 revisions have a hardware bug where we can lose a
Performance Monitor (PMU) exception under certain circumstances.

We will be adding a workaround for this case, see the next commit for
details. The observed behaviour is that writing PMAO doesn't cause an
exception as we would expect, hence the name of the feature.

This commit just adds a CPU feature bit. We set it on all power8 cpus,
and then clear it at cpu setup time if we are on a revision where the
bug is fixed.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cputable.h   |  3 ++-
 arch/powerpc/kernel/cpu_setup_power.S | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 617cc76..56d3166 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -189,6 +189,7 @@ extern const char *powerpc_base_platform;
 #define	CPU_FTR_HAS_PPR			LONG_ASM_CONST(0x0200000000000000)
 #define CPU_FTR_DAWR			LONG_ASM_CONST(0x0400000000000000)
 #define CPU_FTR_DABRX			LONG_ASM_CONST(0x0800000000000000)
+#define CPU_FTR_PMAO_BUG		LONG_ASM_CONST(0x1000000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -444,7 +445,7 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
 	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
-	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP)
+	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_PMAO_BUG)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 37d1bb0..cb2da74 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -179,4 +179,19 @@ __init_PMU:
 	mtspr	SPRN_MMCR0,r5
 	mtspr	SPRN_MMCR1,r5
 	mtspr	SPRN_MMCR2,r5
+
+	/* Compare our PVR to POWER8 DD2 (0x4d0200) */
+	mfpvr	r5
+	lis	r6,0x4d
+	ori	r6,r6,0x0200
+	cmplw	r5,r6
+
+	/* If we are older then return */
+	bltlr
+
+	/* Otherwise clear CPU_FTR_PMAO_BUG */
+	ld	r5,CPU_SPEC_FEATURES(r4)
+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_PMAO_BUG)
+	xor	r5,r5,r6
+	std	r5,CPU_SPEC_FEATURES(r4)
 	blr
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 2/2] powerpc/perf: Add lost exception workaround
From: Michael Ellerman @ 2014-02-28  6:29 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <1393568946-4136-1-git-send-email-mpe@ellerman.id.au>

Some power8 revisions have a hardware bug where we can lose a PMU
exception, this commit adds a workaround to detect the bad condition and
rectify the situation.

See the comment in the commit for a full description.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/reg.h  |   2 +
 arch/powerpc/perf/core-book3s.c | 100 +++++++++++++++++++++++++++++++++++++++-
 arch/powerpc/perf/power8-pmu.c  |   5 ++
 3 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 90c06ec..3003472 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -670,6 +670,7 @@
 #define   MMCR0_PMC1CE	0x00008000UL /* PMC1 count enable*/
 #define   MMCR0_PMCjCE	0x00004000UL /* PMCj count enable*/
 #define   MMCR0_TRIGGER	0x00002000UL /* TRIGGER enable */
+#define   MMCR0_PMAO_SYNC 0x00000800UL /* PMU interrupt is synchronous */
 #define   MMCR0_PMAO	0x00000080UL /* performance monitor alert has occurred, set to 0 after handling exception */
 #define   MMCR0_SHRFC	0x00000040UL /* SHRre freeze conditions between threads */
 #define   MMCR0_FC56	0x00000010UL /* freeze counters 5 and 6 */
@@ -703,6 +704,7 @@
 #define SPRN_EBBHR	804	/* Event based branch handler register */
 #define SPRN_EBBRR	805	/* Event based branch return register */
 #define SPRN_BESCR	806	/* Branch event status and control register */
+#define   BESCR_GE	0x8000000000000000ULL /* Global Enable */
 #define SPRN_WORT	895	/* Workload optimization register - thread */
 
 #define SPRN_PMC1	787
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 67cf220..72cdd05 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -120,6 +120,7 @@ static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
 static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
 void power_pmu_flush_branch_stack(void) {}
 static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
+static void pmao_restore_workaround(bool ebb) { }
 #endif /* CONFIG_PPC32 */
 
 static bool regs_use_siar(struct pt_regs *regs)
@@ -545,10 +546,18 @@ static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0)
 	/* Enable EBB and read/write to all 6 PMCs for userspace */
 	mmcr0 |= MMCR0_EBE | MMCR0_PMCC_U6;
 
-	/* Add any bits from the user reg, FC or PMAO */
+	/*
+	 * Add any bits from the user MMCR0, FC or PMAO. This is compatible
+	 * with pmao_restore_workaround() because we may add PMAO but we never
+	 * clear it here.
+	 */
 	mmcr0 |= current->thread.mmcr0;
 
-	/* Be careful not to set PMXE if userspace had it cleared */
+	/*
+	 * Be careful not to set PMXE if userspace had it cleared. This is also
+	 * compatible with pmao_restore_workaround() because it has already
+	 * cleared PMXE and we leave PMAO alone.
+	 */
 	if (!(current->thread.mmcr0 & MMCR0_PMXE))
 		mmcr0 &= ~MMCR0_PMXE;
 
@@ -559,6 +568,91 @@ static unsigned long ebb_switch_in(bool ebb, unsigned long mmcr0)
 out:
 	return mmcr0;
 }
+
+static void pmao_restore_workaround(bool ebb)
+{
+	unsigned pmcs[6];
+
+	if (!cpu_has_feature(CPU_FTR_PMAO_BUG))
+		return;
+
+	/*
+	 * On POWER8 there is a hardware defect which affects the PMU context
+	 * switch logic, ie. power_pmu_disable/enable().
+	 *
+	 * When a counter overflows PMXE is cleared and FC/PMAO is set in MMCR0
+	 * by the hardware. Sometime later the actual PMU exception is
+	 * delivered.
+	 *
+	 * If we context switch, or simply disable/enable, the PMU prior to the
+	 * exception arriving, the exception will be lost when we clear PMAO.
+	 *
+	 * When we reenable the PMU, we will write the saved MMCR0 with PMAO
+	 * set, and this _should_ generate an exception. However because of the
+	 * defect no exception is generated when we write PMAO, and we get
+	 * stuck with no counters counting but no exception delivered.
+	 *
+	 * The workaround is to detect this case and tweak the hardware to
+	 * create another pending PMU exception.
+	 *
+	 * We do that by setting up PMC6 (cycles) for an imminent overflow and
+	 * enabling the PMU. That causes a new exception to be generated in the
+	 * chip, but we don't take it yet because we have interrupts hard
+	 * disabled. We then write back the PMU state as we want it to be seen
+	 * by the exception handler. When we reenable interrupts the exception
+	 * handler will be called and see the correct state.
+	 *
+	 * The logic is the same for EBB, except that the exception is gated by
+	 * us having interrupts hard disabled as well as the fact that we are
+	 * not in userspace. The exception is finally delivered when we return
+	 * to userspace.
+	 */
+
+	/* Only if PMAO is set and PMAO_SYNC is clear */
+	if ((current->thread.mmcr0 & (MMCR0_PMAO | MMCR0_PMAO_SYNC)) != MMCR0_PMAO)
+		return;
+
+	/* If we're doing EBB, only if BESCR[GE] is set */
+	if (ebb && !(current->thread.bescr & BESCR_GE))
+		return;
+
+	/*
+	 * We are already soft-disabled in power_pmu_enable(). We need to hard
+	 * disable to actually prevent the PMU exception from firing.
+	 */
+	hard_irq_disable();
+
+	/*
+	 * This is a bit gross, but we know we're on POWER8 and have 6 PMCs.
+	 * Using read/write_pmc() in a for loop adds 12 function calls and
+	 * almost doubles our code size.
+	 */
+	pmcs[0] = mfspr(SPRN_PMC1);
+	pmcs[1] = mfspr(SPRN_PMC2);
+	pmcs[2] = mfspr(SPRN_PMC3);
+	pmcs[3] = mfspr(SPRN_PMC4);
+	pmcs[4] = mfspr(SPRN_PMC5);
+	pmcs[5] = mfspr(SPRN_PMC6);
+
+	/* Ensure all freeze bits are unset */
+	mtspr(SPRN_MMCR2, 0);
+
+	/* Set up PMC6 to overflow in one cycle */
+	mtspr(SPRN_PMC6, 0x7FFFFFFE);
+
+	/* Enable exceptions and unfreeze PMC6 */
+	mtspr(SPRN_MMCR0, MMCR0_PMXE | MMCR0_PMCjCE | MMCR0_PMAO);
+
+	/* Now we need to refreeze and restore the PMCs */
+	mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMAO);
+
+	mtspr(SPRN_PMC1, pmcs[0]);
+	mtspr(SPRN_PMC2, pmcs[1]);
+	mtspr(SPRN_PMC3, pmcs[2]);
+	mtspr(SPRN_PMC4, pmcs[3]);
+	mtspr(SPRN_PMC5, pmcs[4]);
+	mtspr(SPRN_PMC6, pmcs[5]);
+}
 #endif /* CONFIG_PPC64 */
 
 static void perf_event_interrupt(struct pt_regs *regs);
@@ -1144,6 +1238,8 @@ static void power_pmu_enable(struct pmu *pmu)
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
 
  out_enable:
+	pmao_restore_workaround(ebb);
+
 	mmcr0 = ebb_switch_in(ebb, cpuhw->mmcr[0]);
 
 	mb();
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index 96cee20..64f04cf 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -10,6 +10,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#define pr_fmt(fmt)	"power8-pmu: " fmt
+
 #include <linux/kernel.h>
 #include <linux/perf_event.h>
 #include <asm/firmware.h>
@@ -774,6 +776,9 @@ static int __init init_power8_pmu(void)
 	/* Tell userspace that EBB is supported */
 	cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_EBB;
 
+	if (cpu_has_feature(CPU_FTR_PMAO_BUG))
+		pr_info("PMAO restore workaround active.\n");
+
 	return 0;
 }
 early_initcall(init_power8_pmu);
-- 
1.8.3.2

^ permalink raw reply related

* Re: [PATCH] mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS
From: liu ping fan @ 2014-02-28  6:36 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Peter Zijlstra, linux-mm, Paul Mackerras, Andrew Morton,
	linuxppc-dev
In-Reply-To: <877g8fn8qw.fsf@linux.vnet.ibm.com>

On Fri, Feb 28, 2014 at 12:47 PM, Aneesh Kumar K.V
<aneesh.kumar@linux.vnet.ibm.com> wrote:
> Andrew Morton <akpm@linux-foundation.org> writes:
>
>> On Wed, 26 Feb 2014 13:22:16 +0530 "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> wrote:
>>
>>> Andrew Morton <akpm@linux-foundation.org> writes:
>>>
>>> > On Wed,  5 Feb 2014 09:25:46 +0800 Liu Ping Fan <qemulist@gmail.com> wrote:
>>> >
>>> >> When doing some numa tests on powerpc, I triggered an oops bug. I find
>>> >> it is caused by using page->_last_cpupid.  It should be initialized as
>>> >> "-1 & LAST_CPUPID_MASK", but not "-1". Otherwise, in task_numa_fault(),
>>> >> we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)).
>>> >> And finally cause an oops bug in task_numa_group(), since the online cpu is
>>> >> less than possible cpu.
>>> >
>>> > I grabbed this.  I added this to the changelog:
>>> >
>>> > : PPC needs the LAST_CPUPID_NOT_IN_PAGE_FLAGS case because ppc needs to
>>> > : support a large physical address region, up to 2^46 but small section size
>>> > : (2^24).  So when NR_CPUS grows, it can easily cause the
>>> > : not-in-page-flags case.
>>> >
>>> > to hopefully address Peter's observation.
>>> >
>>> > How should we proceed with this?  I'm getting the impression that numa
>>> > balancing on ppc is a dead duck in 3.14, so perhaps this and
>>> >
>>> > powerpc-mm-add-new-set-flag-argument-to-pte-pmd-update-function.patch
>>> > mm-dirty-accountable-change-only-apply-to-non-prot-numa-case.patch
>>> > mm-use-ptep-pmdp_set_numa-for-updating-_page_numa-bit.patch
>>> >
>>>
>>> All these are already in 3.14  ?
>>
>> Yes.
>>
>>> > are 3.15-rc1 material?
>>> >
>>>
>>> We should push the first hunk to 3.14. I will wait for Liu to redo the
>>> patch. BTW this should happen only when SPARSE_VMEMMAP is not
>>> specified. Srikar had reported the issue here
>>>
>>> http://mid.gmane.org/20140219180200.GA29257@linux.vnet.ibm.com
>>>
>>> #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
>>> #define SECTIONS_WIDTH               SECTIONS_SHIFT
>>> #else
>>> #define SECTIONS_WIDTH               0
>>> #endif
>>>
>>
>> I'm lost.  What patch are you talking about?  The first hunk of what?
>
> The patch in this thread.
>
>>
>> I assume we're talking about
>> mm-numa-bugfix-for-last_cpupid_not_in_page_flags.patch, which I had
>> queued for 3.14.  I'll put it on hold until there's some clarity here.
>
> We don't need the complete patch, it is just the first hunk that we need
> to fix the crash ie. we only need
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a7b4e31..ddc66df4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -727,7 +727,7 @@ static inline int page_cpupid_last(struct page *page)
>  }
>  static inline void page_cpupid_reset_last(struct page *page)
>  {
> -       page->_last_cpupid = -1;
> +       page->_last_cpupid = -1 & LAST_CPUPID_MASK;
>  }
>  #else
>  static inline int page_cpupid_last(struct page *page)
>
> Also the issue will only happen when SPARSE_VMEMMAP is not enabled. I
> will send a proper patch with updated changelog. I was hoping Liu will
> get to that quickly
>
Thanks for sending V2.  Since the ppc machine environment has been changed
by others, I am blocked on setting it up again to re-test this patch,
so I could not send it out quickly.

Best regards,
Fan
>
> -aneesh
>

^ permalink raw reply

* Re: [RFC PATCH] powerpc: allow allyesconfig to build more
From: Benjamin Herrenschmidt @ 2014-02-28  7:46 UTC (permalink / raw)
  To: Michael Neuling; +Cc: Stephen Rothwell, ppc-dev, paulus, Mahesh Salgaonkar
In-Reply-To: <5087.1393560369@ale.ozlabs.ibm.com>

On Fri, 2014-02-28 at 15:06 +1100, Michael Neuling wrote:
> Stephen Rothwell <sfr@canb.auug.org.au> wrote:
> 
> > Fixes this build error:
> > 
> > arch/powerpc/kernel/exceptions-64s.S: Assembler messages:
> > arch/powerpc/kernel/exceptions-64s.S:1312: Error: attempt to move .org backwards
> > 
> > Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
> 
> Builds and boots for me
> 
> Acked-by: Michael Neuling <mikey@neuling.org>
> 

And breaks at least one of my configs (the g5 one) with a relocation
problem in head_64.o (which is why I had modified the original patch to
be less aggressive iirc).

We end up moving things too far away from a conditional branch, I
think the masked_*interrupt stuff. We need to shuffle things a bit
more to get that to work.

Cheers,
Ben.

^ permalink raw reply

* [git pull] Please pull powerpc.git merge branch
From: Benjamin Herrenschmidt @ 2014-02-28  8:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linuxppc-dev list, Linux Kernel list

Hi Linus !

Here are a few more powerpc fixes for 3.14. Most of these are also
CC'ed to stable and fix bugs in new functionality introduced in
the last 2 or 3 versions.

The following changes since commit 66f9af83e56bfa12964d251df9d60fb571579913:

  powerpc/eeh: Disable EEH on reboot (2014-02-17 11:19:39 +1100)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git merge

for you to fetch changes up to e0cf957614976896111e676e5134ac98ee227d3d:

  powerpc/powernv: Fix indirect XSCOM unmangling (2014-02-28 19:15:49 +1100)

----------------------------------------------------------------
Benjamin Herrenschmidt (2):
      powerpc/powernv: Fix opal_xscom_{read,write} prototype
      powerpc/powernv: Fix indirect XSCOM unmangling

Gavin Shan (2):
      powerpc/powernv: Dump PHB diag-data immediately
      powerpc/powernv: Refactor PHB diag-data dump

Laurent Dufour (1):
      powerpc/crashdump : Fix page frame number check in copy_oldmem_page

Liu Ping Fan (1):
      powerpc/ftrace: bugfix for test_24bit_addr

Paul Mackerras (1):
      powerpc: Increase stack redzone for 64-bit userspace to 512 bytes

Tony Breeds (1):
      powerpc/le: Ensure that the 'stop-self' RTAS token is handled correctly

 arch/powerpc/include/asm/compat.h            |   5 +-
 arch/powerpc/include/asm/opal.h              |   4 +-
 arch/powerpc/include/asm/ptrace.h            |  16 +-
 arch/powerpc/kernel/crash_dump.c             |   8 +-
 arch/powerpc/kernel/ftrace.c                 |   1 +
 arch/powerpc/kernel/signal_64.c              |   4 +-
 arch/powerpc/platforms/powernv/eeh-ioda.c    |  96 ++++++------
 arch/powerpc/platforms/powernv/opal-xscom.c  |  21 +--
 arch/powerpc/platforms/powernv/pci.c         | 220 +++++++++++++++------------
 arch/powerpc/platforms/pseries/hotplug-cpu.c |  22 +--
 10 files changed, 219 insertions(+), 178 deletions(-)

^ permalink raw reply

* [PATCH] Corenet: Add QE platform support for Corenet
From: Zhao Qiang @ 2014-02-28  8:48 UTC (permalink / raw)
  To: linuxppc-dev, B07421; +Cc: Zhao Qiang, R63061

There is QE on platform T104x, add support.
Call funcs qe_ic_init and qe_init if CONFIG_QUICC_ENGINE is defined.

Signed-off-by: Zhao Qiang <B45475@freescale.com>
---
 arch/powerpc/platforms/85xx/corenet_generic.c | 32 +++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index fbd871e..f8c8e0c 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -26,6 +26,8 @@
 #include <asm/udbg.h>
 #include <asm/mpic.h>
 #include <asm/ehv_pic.h>
+#include <asm/qe.h>
+#include <asm/qe_ic.h>
 
 #include <linux/of_platform.h>
 #include <sysdev/fsl_soc.h>
@@ -38,6 +40,10 @@ void __init corenet_gen_pic_init(void)
 	unsigned int flags = MPIC_BIG_ENDIAN | MPIC_SINGLE_DEST_CPU |
 		MPIC_NO_RESET;
 
+#ifdef CONFIG_QUICC_ENGINE
+	struct device_node *np;
+#endif
+
 	if (ppc_md.get_irq == mpic_get_coreint_irq)
 		flags |= MPIC_ENABLE_COREINT;
 
@@ -45,6 +51,16 @@ void __init corenet_gen_pic_init(void)
 	BUG_ON(mpic == NULL);
 
 	mpic_init(mpic);
+
+#ifdef CONFIG_QUICC_ENGINE
+	np = of_find_compatible_node(NULL, NULL, "fsl,qe-ic");
+	if (np) {
+		qe_ic_init(np, 0, qe_ic_cascade_low_mpic,
+				qe_ic_cascade_high_mpic);
+		of_node_put(np);
+	}
+#endif
+
 }
 
 /*
@@ -52,11 +68,24 @@ void __init corenet_gen_pic_init(void)
  */
 void __init corenet_gen_setup_arch(void)
 {
+#ifdef CONFIG_QUICC_ENGINE
+	struct device_node *np;
+#endif
 	mpc85xx_smp_init();
 
 	swiotlb_detect_4g();
 
 	pr_info("%s board from Freescale Semiconductor\n", ppc_md.name);
+
+#ifdef CONFIG_QUICC_ENGINE
+	np = of_find_compatible_node(NULL, NULL, "fsl,qe");
+	if (!np) {
+		pr_err("%s: Could not find Quicc Engine node\n", __func__);
+		return;
+	}
+	qe_reset();
+	of_node_put(np);
+#endif
 }
 
 static const struct of_device_id of_device_ids[] = {
@@ -81,6 +110,9 @@ static const struct of_device_id of_device_ids[] = {
 	{
 		.compatible	= "fsl,qoriq-pcie-v3.0",
 	},
+	{
+		.compatible	= "fsl,qe",
+	},
 	/* The following two are for the Freescale hypervisor */
 	{
 		.name		= "hypervisor",
-- 
1.8.5

^ permalink raw reply related

* [PATCH V3] mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS
From: Aneesh Kumar K.V @ 2014-02-28  9:02 UTC (permalink / raw)
  To: benh, akpm, Peter Zijlstra
  Cc: linux-mm, Liu Ping Fan, linuxppc-dev, linux-kernel,
	Aneesh Kumar K.V

From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

When doing some numa tests on powerpc, I triggered an oops bug. I find
it is caused by using page->_last_cpupid.  It should be initialized as
"-1 & LAST_CPUPID_MASK", but not "-1". Otherwise, in task_numa_fault(),
we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)).
And finally cause an oops bug in task_numa_group(), since the online cpu is
less than possible cpu. This happens with CONFIG_SPARSEMEM_VMEMMAP disabled.

Call trace:
[   55.978091] SMP NR_CPUS=64 NUMA PowerNV
[   55.978118] Modules linked in:
[   55.978145] CPU: 24 PID: 804 Comm: systemd-udevd Not tainted 3.13.0-rc1+ #32
[   55.978183] task: c000001e2746aa80 ti: c000001e32c50000 task.ti: c000001e32c50000
[   55.978219] NIP: c0000000000f5ad0 LR: c0000000000f5ac8 CTR: c000000000913cf0
[   55.978256] REGS: c000001e32c53510 TRAP: 0300   Not tainted (3.13.0-rc1+)
[   55.978286] MSR: 9000000000009032 <SF,HV,EE,ME,IR,DR,RI>  CR: 28024424  XER: 20000000
[   55.978380] CFAR: c000000000009324 DAR: 7265717569726857 DSISR: 40000000 SOFTE: 1
GPR00: c0000000000f5ac8 c000001e32c53790 c000000001f34338 0000000000000021
GPR04: 0000000000000000 0000000000000031 c000000001f74338 0000ffffffffffff
GPR08: 0000000000000001 7265717569726573 0000000000000000 0000000000000000
GPR12: 0000000028024422 c00000000ffdd800 00000000296b2e64 0000000000000020
GPR16: 0000000000000002 0000000000000003 c000001e2f8e4658 c000001e25c1c1d8
GPR20: c000001e2f8e4000 c000000001f7a858 0000000000000658 0000000040000392
GPR24: 00000000000000a8 c000001e33c1a400 00000000000001d8 c000001e25c1c000
GPR28: c000001e33c37ff0 0007837840000392 000000000000003f c000001e32c53790
[   55.978903] NIP [c0000000000f5ad0] .task_numa_fault+0x1470/0x2370
[   55.978934] LR [c0000000000f5ac8] .task_numa_fault+0x1468/0x2370
[   55.978964] Call Trace:
[   55.978978] [c000001e32c53790] [c0000000000f5ac8] .task_numa_fault+0x1468/0x2370 (unreliable)
[   55.979036] [c000001e32c539e0] [c00000000020a820] .do_numa_page+0x480/0x4a0
[   55.979072] [c000001e32c53b10] [c00000000020bfec] .handle_mm_fault+0x4ec/0xc90
[   55.979123] [c000001e32c53c00] [c000000000e88c98] .do_page_fault+0x3a8/0x890
[   55.979161] [c000001e32c53e30] [c000000000009568] handle_page_fault+0x10/0x30
[   55.979197] Instruction dump:
[   55.979216] 3c82fefb 3884b138 48d9cff1 60000000 48000574 3c62fefb 3863af78 3c82fefb
[   55.979277] 3884b138 48d9cfd5 60000000 e93f0100 <812902e4> 7d2907b4 5529063e 7d2a07b4
[   55.979354] ---[ end trace 15f2510da5ae07cf ]---

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
Changes from V2:
* Also updated page_cpupid_xchg_last to use LAST_CPUPID_MASK. We use
  that function with value -1
* Update Author information after talking to Liu
  
 include/linux/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f28f46eade6a..86245839c9fa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -757,7 +757,7 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
 {
-	return xchg(&page->_last_cpupid, cpupid);
+	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
 
 static inline int page_cpupid_last(struct page *page)
@@ -766,7 +766,7 @@ static inline int page_cpupid_last(struct page *page)
 }
 static inline void page_cpupid_reset_last(struct page *page)
 {
-	page->_last_cpupid = -1;
+	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
 }
 #else
 static inline int page_cpupid_last(struct page *page)
-- 
1.8.3.2

^ permalink raw reply related

* Re: [PATCH] mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS
From: Aneesh Kumar K.V @ 2014-02-28  9:04 UTC (permalink / raw)
  To: liu ping fan
  Cc: Peter Zijlstra, linux-mm, Paul Mackerras, Andrew Morton,
	linuxppc-dev
In-Reply-To: <CAJnKYQkVziWMmCL=rTakSA4955VMvFnaFtFdmexQAKUfTuVv_Q@mail.gmail.com>

liu ping fan <qemulist@gmail.com> writes:

> On Fri, Feb 28, 2014 at 12:47 PM, Aneesh Kumar K.V
> <aneesh.kumar@linux.vnet.ibm.com> wrote:
>> Andrew Morton <akpm@linux-foundation.org> writes:
>>
> Thanks for sending V2.  Since the ppc machine env is changed by
> others, I am blocking on setting up the env for re-test this patch.
> And not send out it quickly.

I sent an updated v3 also taking care of xchg

http://article.gmane.org/gmane.linux.kernel/1657613

-aneesh

^ permalink raw reply

* Re: [PATCH V3] mm: numa: bugfix for LAST_CPUPID_NOT_IN_PAGE_FLAGS
From: Peter Zijlstra @ 2014-02-28 11:31 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Liu Ping Fan, linux-kernel, linux-mm, akpm, linuxppc-dev
In-Reply-To: <1393578122-6500-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Fri, Feb 28, 2014 at 02:32:02PM +0530, Aneesh Kumar K.V wrote:
> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> 
> When doing some numa tests on powerpc, I triggered an oops bug. I find
> it is caused by using page->_last_cpupid.  It should be initialized as
> "-1 & LAST_CPUPID_MASK", but not "-1". Otherwise, in task_numa_fault(),
> we will miss the checking (last_cpupid == (-1 & LAST_CPUPID_MASK)).
> And finally cause an oops bug in task_numa_group(), since the online cpu is
> less than possible cpu. This happen with CONFIG_SPARSE_VMEMMAP disabled
> 
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>


Acked-by: Peter Zijlstra <peterz@infradead.org>

> ---
>   
>  include/linux/mm.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f28f46eade6a..86245839c9fa 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -757,7 +757,7 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
>  #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
>  static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
>  {
> -	return xchg(&page->_last_cpupid, cpupid);
> +	return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
>  }
>  
>  static inline int page_cpupid_last(struct page *page)
> @@ -766,7 +766,7 @@ static inline int page_cpupid_last(struct page *page)
>  }
>  static inline void page_cpupid_reset_last(struct page *page)
>  {
> -	page->_last_cpupid = -1;
> +	page->_last_cpupid = -1 & LAST_CPUPID_MASK;
>  }
>  #else
>  static inline int page_cpupid_last(struct page *page)
> -- 
> 1.8.3.2
> 

^ permalink raw reply

* Re: [PATCH RFC v8 2/5] dma: mpc512x: add support for peripheral transfers
From: Alexander Popov @ 2014-03-01  9:19 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Lars-Peter Clausen, Arnd Bergmann, Vinod Koul, Gerhard Sittig,
	Alexander Popov, dmaengine, Dan Williams, Anatolij Gustschin,
	linuxppc-dev
In-Reply-To: <1393247012.28803.7.camel@smile.fi.intel.com>

Hello Andy.

2014-02-24 17:03 GMT+04:00 Andy Shevchenko <andriy.shevchenko@linux.intel.com>:
> On Mon, 2014-02-24 at 15:09 +0400, Alexander Popov wrote:
>> Introduce support for slave s/g transfer preparation and the associated
>> device control callback in the MPC512x DMA controller driver, which adds
>> support for data transfers between memory and peripheral I/O to the
>> previously supported mem-to-mem transfers.
>>
>
> Few comments below.

Thanks for your feedback. I agree with your points and will fix my code.

Best regards,
Alexander

^ permalink raw reply

* Sound on PowerBook and iBook (snd_powermac/snd_aoa)
From: Adam Smith @ 2014-03-02 13:44 UTC (permalink / raw)
  To: linuxppc-dev@lists.ozlabs.org

[-- Attachment #1: Type: text/plain, Size: 3395 bytes --]

Hi list,

I was hoping to pick your brains regarding the sound on Apple PowerPC machines. For a long time now, sound has been broken on a number of these machines. This, I think, is due to i2c changes that started in April 2012. A number of bug reports have been raised by users at a distribution level (e.g. https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=714345 ) but as of yet no progress has been made on these.

Ubuntu has been hit by a "Fixing recursive fault, but reboot is needed!" bug that is caused by snd_powermac ( https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1066435 ). I tried the latest daily build of Lubuntu yesterday and the problem appears to be gone, but since the bug is a bit temperamental, I'm wondering if it was a fluke? Has any work been done on this, and can we genuinely mark it as solved? However, I could not get sound working on my G4 iBook.

Ubuntu (and I think Debian) add snd_powermac to /etc/modules. Is this still necessary or does the module now auto-load? The installer has a complicated routine (/bin/discover-mac-io) to load certain sound modules and blacklist others. Can we remove this now?

I seem to remember reading somewhere that certain machines will be migrating from snd_powermac to snd_aoa, although I can't seem to find the reference to that now. If this is the case, I think a certain number of machines have slipped through the net. As users, what information do we need to provide to get this fixed? Attached is a patch that was sent by Stefan Gartner to the Debian mailing list ( https://lists.debian.org/debian-powerpc/2013/09/msg00031.html ). Is this patch okay? I'm afraid I don't know how to check the device id/codec used by my own machine, as at a kernel level I'm a bit clueless. Would it be possible to provide instructions on how to do this? I could then circulate the instructions to other users who also have no sound. That way we could hopefully fix all broken machines.

If you need any more info then I will be happy to provide it, although you may have to give me a few days to do it (no internet connection!...)

Many thanks

Adam



--- a/sound/aoa/fabrics/layout.c	2013-09-18 21:09:41.910672970 +0200
+++ b/sound/aoa/fabrics/layout.c	2013-09-18 23:23:58.230511273 +0200
@@ -113,6 +113,8 @@
 MODULE_ALIAS("aoa-device-id-14");
 MODULE_ALIAS("aoa-device-id-22");
 MODULE_ALIAS("aoa-device-id-35");
+MODULE_ALIAS("aoa-device-id-38");
+MODULE_ALIAS("aoa-device-id-40");
 MODULE_ALIAS("aoa-device-id-44");
 
 /* onyx with all but microphone connected */
@@ -362,7 +364,20 @@
 		.connections = tas_connections_nolineout,
 	  },
 	},
+	/* PowerBook6,4 */
+	{ .device_id = 40,
+	  .codecs[0] = {
+		.name = "tas",
+		.connections = tas_connections_all,
+	  },
+	},
 	/* PowerBook6,5 */
+	{ .device_id = 38,
+	  .codecs[0] = {
+		.name = "tas",
+		.connections = tas_connections_all,
+	  },
+	},
 	{ .device_id = 44,
 	  .codecs[0] = {
 		.name = "tas",
--- a/sound/aoa/soundbus/i2sbus/core.c	2013-09-18 20:24:03.962348741 +0200
+++ b/sound/aoa/soundbus/i2sbus/core.c	2013-09-18 23:34:41.934550116 +0200
@@ -201,7 +201,7 @@
 			 * so restrict to those we do handle for now.
 			 */
 			if (id && (*id == 22 || *id == 14 || *id == 35 ||
-				   *id == 44)) {
+				   *id == 44 || *id == 40 || *id == 38)) {
 				snprintf(dev->sound.modalias, 32,
 					 "aoa-device-id-%d", *id);
 				ok = 1;
 		 	   		  

[-- Attachment #2: Type: text/html, Size: 3672 bytes --]

^ permalink raw reply

* [PATCH v3] powerpc/powernv Platform dump interface
From: Stewart Smith @ 2014-03-02 23:25 UTC (permalink / raw)
  To: benh, linuxppc-dev, Vasant Hegde; +Cc: Stewart Smith
In-Reply-To: <1393393321-9902-1-git-send-email-stewart@linux.vnet.ibm.com>

This enables support for userspace to fetch and initiate FSP and
Platform dumps from the service processor (via firmware) through sysfs.

Based on original patch from Vasant Hegde <hegdevasant@linux.vnet.ibm.com>

Flow:
  - We register for OPAL notification events.
  - OPAL sends new dump available notification.
  - We make information on dump available via sysfs
  - Userspace requests dump contents
  - We retrieve the dump via OPAL interface
  - User copies the dump data
  - userspace sends ack for dump
  - We send ACK to OPAL.

sysfs files:
  - We add the /sys/firmware/opal/dump directory
  - echoing 1 (well, anything, but in future we may support
    different dump types) to /sys/firmware/opal/dump/initiate_dump
    will initiate a dump.
  - Each dump that we've been notified of gets a directory
    in /sys/firmware/opal/dump/ with a name of the dump type and ID (in hex,
    as this is what's used elsewhere to identify the dump).
  - Each dump has files: id, type, dump and acknowledge
    dump is binary and is the dump itself.
    echoing 'ack' to acknowledge (currently any string will do) will
    acknowledge the dump and it will soon after disappear from sysfs.

OPAL APIs:
  - opal_dump_init()
  - opal_dump_info()
  - opal_dump_read()
  - opal_dump_ack()
  - opal_dump_resend_notification()

Currently we are only ever notified for one dump at a time (until
the user explicitly acks the current dump, then we get a notification
of the next dump), but this kernel code should "just work" when OPAL
starts notifying us of all the dumps present.

Changes since v2:
 - fix bug where we would free the dump buffer after userspace read it,
   refetching if needed. Refetching doesn't currently work, so we must
   keep the dump around for subsequent reads.

Changes since v1:
 - Add support for getting dump type from OPAL through new OPAL call
   (falling back to old OPAL_DUMP_INFO call if OPAL_DUMP_INFO2 isn't
    supported)
 - use dump type in directory name for dump

Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
---
 Documentation/ABI/stable/sysfs-firmware-opal-dump |   41 ++
 arch/powerpc/include/asm/opal.h                   |   14 +
 arch/powerpc/platforms/powernv/Makefile           |    2 +-
 arch/powerpc/platforms/powernv/opal-dump.c        |  525 +++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal-wrappers.S    |    6 +
 arch/powerpc/platforms/powernv/opal.c             |    2 +
 6 files changed, 589 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ABI/stable/sysfs-firmware-opal-dump
 create mode 100644 arch/powerpc/platforms/powernv/opal-dump.c

diff --git a/Documentation/ABI/stable/sysfs-firmware-opal-dump b/Documentation/ABI/stable/sysfs-firmware-opal-dump
new file mode 100644
index 0000000..32fe7f5
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-firmware-opal-dump
@@ -0,0 +1,41 @@
+What:		/sys/firmware/opal/dump
+Date:		Feb 2014
+Contact:	Stewart Smith <stewart@linux.vnet.ibm.com>
+Description:
+		This directory exposes interfaces for interacting with
+		the FSP and platform dumps through OPAL firmware interface.
+
+		This is only for the powerpc/powernv platform.
+
+		initiate_dump:	When '1' is written to it,
+				we will initiate a dump.
+				Read this file for supported commands.
+
+		0xXX-0xYYYY:	A directory for dump of type 0xXX and
+				id 0xYYYY (in hex). The name of this
+				directory should not be relied upon to
+				be in this format, only that it's unique
+				among all dumps. For determining the type
+				and ID of the dump, use the id and type files.
+				Do not rely on any particular size of dump
+				type or dump id.
+
+		Each dump has the following files:
+		id:		An ASCII representation of the dump ID
+				in hex (e.g. '0x01')
+		type:		An ASCII representation of the type of
+				dump in the format "0x%x %s" with the ID
+				in hex and a description of the dump type
+				(or 'unknown').
+				Type '0xffffffff unknown' is used when
+				we could not get the type from firmware.
+				e.g. '0x02 System/Platform Dump'
+		dump:		A binary file containing the dump.
+				The size of the dump is the size of this file.
+		acknowledge:	When 'ack' is written to this, we will
+				acknowledge that we've retrieved the
+				dump to the service processor. It will
+				then remove it, making the dump
+				inaccessible.
+				Reading this file will get a list of
+				supported actions.
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 40157e2..89c840c 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -154,9 +154,15 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_FLASH_VALIDATE			76
 #define OPAL_FLASH_MANAGE			77
 #define OPAL_FLASH_UPDATE			78
+#define OPAL_DUMP_INIT				81
+#define OPAL_DUMP_INFO				82
+#define OPAL_DUMP_READ				83
+#define OPAL_DUMP_ACK				84
 #define OPAL_GET_MSG				85
 #define OPAL_CHECK_ASYNC_COMPLETION		86
+#define OPAL_DUMP_RESEND			91
 #define OPAL_SYNC_HOST_REBOOT			87
+#define OPAL_DUMP_INFO2				94
 
 #ifndef __ASSEMBLY__
 
@@ -237,6 +243,7 @@ enum OpalPendingState {
 	OPAL_EVENT_EPOW			= 0x80,
 	OPAL_EVENT_LED_STATUS		= 0x100,
 	OPAL_EVENT_PCI_ERROR		= 0x200,
+	OPAL_EVENT_DUMP_AVAIL		= 0x400,
 	OPAL_EVENT_MSG_PENDING		= 0x800,
 };
 
@@ -826,6 +833,12 @@ int64_t opal_lpc_read(uint32_t chip_id, enum OpalLPCAddressType addr_type,
 int64_t opal_validate_flash(uint64_t buffer, uint32_t *size, uint32_t *result);
 int64_t opal_manage_flash(uint8_t op);
 int64_t opal_update_flash(uint64_t blk_list);
+int64_t opal_dump_init(uint8_t dump_type);
+int64_t opal_dump_info(uint32_t *dump_id, uint32_t *dump_size);
+int64_t opal_dump_info2(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type);
+int64_t opal_dump_read(uint32_t dump_id, uint64_t buffer);
+int64_t opal_dump_ack(uint32_t dump_id);
+int64_t opal_dump_resend_notification(void);
 
 int64_t opal_get_msg(uint64_t buffer, size_t size);
 int64_t opal_check_completion(uint64_t buffer, size_t size, uint64_t token);
@@ -861,6 +874,7 @@ extern void opal_get_rtc_time(struct rtc_time *tm);
 extern unsigned long opal_get_boot_time(void);
 extern void opal_nvram_init(void);
 extern void opal_flash_init(void);
+extern void opal_platform_dump_init(void);
 
 extern int opal_machine_check(struct pt_regs *regs);
 
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 8d767fd..3528c11 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,6 +1,6 @@
 obj-y			+= setup.o opal-takeover.o opal-wrappers.o opal.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
-obj-y			+= rng.o
+obj-y			+= rng.o opal-dump.o
 
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
new file mode 100644
index 0000000..0c767c5
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -0,0 +1,525 @@
+/*
+ * PowerNV OPAL Dump Interface
+ *
+ * Copyright 2013,2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+
+#define DUMP_TYPE_FSP	0x01
+
+struct dump_obj {
+	struct kobject  kobj;
+	struct bin_attribute dump_attr;
+	uint32_t	id;  /* becomes object name */
+	uint32_t	type;
+	uint32_t	size;
+	char		*buffer;
+};
+#define to_dump_obj(x) container_of(x, struct dump_obj, kobj)
+
+struct dump_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr,
+			 const char *buf, size_t count);
+};
+#define to_dump_attr(x) container_of(x, struct dump_attribute, attr)
+
+static ssize_t dump_id_show(struct dump_obj *dump_obj,
+			    struct dump_attribute *attr,
+			    char *buf)
+{
+	return sprintf(buf, "0x%x\n", dump_obj->id);
+}
+
+static const char* dump_type_to_string(uint32_t type)
+{
+	switch (type) {
+	case 0x01: return "SP Dump";
+	case 0x02: return "System/Platform Dump";
+	case 0x03: return "SMA Dump";
+	default: return "unknown";
+	}
+}
+
+static ssize_t dump_type_show(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      char *buf)
+{
+	
+	return sprintf(buf, "0x%x %s\n", dump_obj->type,
+		       dump_type_to_string(dump_obj->type));
+}
+
+static ssize_t dump_ack_show(struct dump_obj *dump_obj,
+			     struct dump_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "ack - acknowledge dump\n");
+}
+
+/*
+ * Send acknowledgement to OPAL
+ */
+static int64_t dump_send_ack(uint32_t dump_id)
+{
+	int rc;
+
+	rc = opal_dump_ack(dump_id);
+	if (rc)
+		pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n",
+			__func__, dump_id, rc);
+	return rc;
+}
+
+static void delay_release_kobj(void *kobj)
+{
+	kobject_put((struct kobject *)kobj);
+}
+
+static ssize_t dump_ack_store(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      const char *buf,
+			      size_t count)
+{
+	dump_send_ack(dump_obj->id);
+	sysfs_schedule_callback(&dump_obj->kobj, delay_release_kobj,
+				&dump_obj->kobj, THIS_MODULE);
+	return count;
+}
+
+/* Attributes of a dump
+ * The binary attribute of the dump itself is dynamic
+ * due to the dynamic size of the dump
+ */
+static struct dump_attribute id_attribute =
+	__ATTR(id, 0666, dump_id_show, NULL);
+static struct dump_attribute type_attribute =
+	__ATTR(type, 0666, dump_type_show, NULL);
+static struct dump_attribute ack_attribute =
+	__ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store);
+
+static ssize_t init_dump_show(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "1 - initiate dump\n");
+}
+
+static int64_t dump_fips_init(uint8_t type)
+{
+	int rc;
+
+	rc = opal_dump_init(type);
+	if (rc)
+		pr_warn("%s: Failed to initiate FipS dump (%d)\n",
+			__func__, rc);
+	return rc;
+}
+
+static ssize_t init_dump_store(struct dump_obj *dump_obj,
+			       struct dump_attribute *attr,
+			       const char *buf,
+			       size_t count)
+{
+	dump_fips_init(DUMP_TYPE_FSP);
+	pr_info("%s: Initiated FSP dump\n", __func__);
+	return count;
+}
+
+static struct dump_attribute initiate_attribute =
+	__ATTR(initiate_dump, 0600, init_dump_show, init_dump_store);
+
+static struct attribute *initiate_attrs[] = {
+	&initiate_attribute.attr,
+	NULL,
+};
+
+static struct attribute_group initiate_attr_group = {
+	.attrs = initiate_attrs,
+};
+
+static struct kset *dump_kset;
+
+static ssize_t dump_attr_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buf)
+{
+	struct dump_attribute *attribute;
+	struct dump_obj *dump;
+
+	attribute = to_dump_attr(attr);
+	dump = to_dump_obj(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(dump, attribute, buf);
+}
+
+static ssize_t dump_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct dump_attribute *attribute;
+	struct dump_obj *dump;
+
+	attribute = to_dump_attr(attr);
+	dump = to_dump_obj(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(dump, attribute, buf, len);
+}
+
+static const struct sysfs_ops dump_sysfs_ops = {
+	.show = dump_attr_show,
+	.store = dump_attr_store,
+};
+
+static void dump_release(struct kobject *kobj)
+{
+	struct dump_obj *dump;
+
+	dump = to_dump_obj(kobj);
+	vfree(dump->buffer);
+	kfree(dump);
+}
+
+static struct attribute *dump_default_attrs[] = {
+	&id_attribute.attr,
+	&type_attribute.attr,
+	&ack_attribute.attr,
+	NULL,
+};
+
+static struct kobj_type dump_ktype = {
+	.sysfs_ops = &dump_sysfs_ops,
+	.release = &dump_release,
+	.default_attrs = dump_default_attrs,
+};
+
+static void free_dump_sg_list(struct opal_sg_list *list)
+{
+	struct opal_sg_list *sg1;
+	while (list) {
+		sg1 = list->next;
+		kfree(list);
+		list = sg1;
+	}
+	list = NULL;
+}
+
+static struct opal_sg_list *dump_data_to_sglist(struct dump_obj *dump)
+{
+	struct opal_sg_list *sg1, *list = NULL;
+	void *addr;
+	int64_t size;
+
+	addr = dump->buffer;
+	size = dump->size;
+
+	sg1 = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!sg1)
+		goto nomem;
+
+	list = sg1;
+	sg1->num_entries = 0;
+	while (size > 0) {
+		/* Translate virtual address to physical address */
+		sg1->entry[sg1->num_entries].data =
+			(void *)(vmalloc_to_pfn(addr) << PAGE_SHIFT);
+
+		if (size > PAGE_SIZE)
+			sg1->entry[sg1->num_entries].length = PAGE_SIZE;
+		else
+			sg1->entry[sg1->num_entries].length = size;
+
+		sg1->num_entries++;
+		if (sg1->num_entries >= SG_ENTRIES_PER_NODE) {
+			sg1->next = kzalloc(PAGE_SIZE, GFP_KERNEL);
+			if (!sg1->next)
+				goto nomem;
+
+			sg1 = sg1->next;
+			sg1->num_entries = 0;
+		}
+		addr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+	return list;
+
+nomem:
+	pr_err("%s : Failed to allocate memory\n", __func__);
+	free_dump_sg_list(list);
+	return NULL;
+}
+
+static void sglist_to_phy_addr(struct opal_sg_list *list)
+{
+	struct opal_sg_list *sg, *next;
+
+	for (sg = list; sg; sg = next) {
+		next = sg->next;
+		/* Don't translate NULL pointer for last entry */
+		if (sg->next)
+			sg->next = (struct opal_sg_list *)__pa(sg->next);
+		else
+			sg->next = NULL;
+
+		/* Convert num_entries to length */
+		sg->num_entries =
+			sg->num_entries * sizeof(struct opal_sg_entry) + 16;
+	}
+}
+
+static int64_t dump_read_info(uint32_t *id, uint32_t *size, uint32_t *type)
+{
+	int rc;
+	*type = 0xffffffff;
+
+	rc = opal_dump_info2(id, size, type);
+
+	if (rc == OPAL_PARAMETER)
+		rc = opal_dump_info(id, size);
+
+	if (rc)
+		pr_warn("%s: Failed to get dump info (%d)\n",
+			__func__, rc);
+	return rc;
+}
+
+static int64_t dump_read_data(struct dump_obj *dump)
+{
+	struct opal_sg_list *list;
+	uint64_t addr;
+	int64_t rc;
+
+	/* Allocate memory */
+	dump->buffer = vzalloc(PAGE_ALIGN(dump->size));
+	if (!dump->buffer) {
+		pr_err("%s : Failed to allocate memory\n", __func__);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Generate SG list */
+	list = dump_data_to_sglist(dump);
+	if (!list) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Translate sg list addr to real address */
+	sglist_to_phy_addr(list);
+
+	/* First entry address */
+	addr = __pa(list);
+
+	/* Fetch data */
+	rc = OPAL_BUSY_EVENT;
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_dump_read(dump->id, addr);
+		if (rc == OPAL_BUSY_EVENT) {
+			opal_poll_events(NULL);
+			msleep(20);
+		}
+	}
+
+	if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL)
+		pr_warn("%s: Extract dump failed for ID 0x%x\n",
+			__func__, dump->id);
+
+	/* Free SG list */
+	free_dump_sg_list(list);
+
+out:
+	return rc;
+}
+
+static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buffer, loff_t pos, size_t count)
+{
+	ssize_t rc;
+
+	struct dump_obj *dump = to_dump_obj(kobj);
+
+	if (!dump->buffer) {
+		rc = dump_read_data(dump);
+
+		if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) {
+			vfree(dump->buffer);
+			dump->buffer = NULL;
+
+			return -EIO;
+		}
+		if (rc == OPAL_PARTIAL) {
+			/* On a partial read, we just return EIO
+			 * and rely on userspace to ask us to try
+			 * again.
+			 */
+			pr_info("%s: Platform dump partially read.ID = 0x%x\n",
+				__func__, dump->id);
+			return -EIO;
+		}
+	}
+
+	memcpy(buffer, dump->buffer + pos, count);
+
+	/* You may think we could free the dump buffer now and retrieve
+	 * it again later if needed, but due to current firmware limitation,
+	 * that's not the case. So, once read into userspace once,
+	 * we keep the dump around until it's acknowledged by userspace.
+	 */
+
+	return count;
+}
+
+static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
+					uint32_t type)
+{
+	struct dump_obj *dump;
+	int rc;
+
+	dump = kzalloc(sizeof(*dump), GFP_KERNEL);
+	if (!dump)
+		return NULL;
+
+	dump->kobj.kset = dump_kset;
+
+	kobject_init(&dump->kobj, &dump_ktype);
+
+	sysfs_bin_attr_init(&dump->dump_attr);
+
+	dump->dump_attr.attr.name = "dump";
+	dump->dump_attr.attr.mode = 0400;
+	dump->dump_attr.size = size;
+	dump->dump_attr.read = dump_attr_read;
+
+	dump->id = id;
+	dump->size = size;
+	dump->type = type;
+
+	rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
+	if (rc) {
+		kobject_put(&dump->kobj);
+		return NULL;
+	}
+
+	rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
+	if (rc) {
+		kobject_put(&dump->kobj);
+		return NULL;
+	}
+
+	pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
+		__func__, dump->id, dump->size);
+
+	kobject_uevent(&dump->kobj, KOBJ_ADD);
+
+	return dump;
+}
+
+static int process_dump(void)
+{
+	int rc;
+	uint32_t dump_id, dump_size, dump_type;
+	struct dump_obj *dump;
+	char name[22];
+
+	rc = dump_read_info(&dump_id, &dump_size, &dump_type);
+	if (rc != OPAL_SUCCESS)
+		return rc;
+
+	sprintf(name, "0x%x-0x%x", dump_type, dump_id);
+
+	/* we may get notified twice, let's handle
+	 * that gracefully and not create two conflicting
+	 * entries.
+	 */
+	if (kset_find_obj(dump_kset, name))
+		return 0;
+
+	dump = create_dump_obj(dump_id, dump_size, dump_type);
+	if (!dump)
+		return -1;
+
+	return 0;
+}
+
+static void dump_work_fn(struct work_struct *work)
+{
+	process_dump();
+}
+
+static DECLARE_WORK(dump_work, dump_work_fn);
+
+static void schedule_process_dump(void)
+{
+	schedule_work(&dump_work);
+}
+
+/*
+ * New dump available notification
+ *
+ * Once we get notification, we add sysfs entries for it.
+ * We only fetch the dump on demand, and create sysfs asynchronously.
+ */
+static int dump_event(struct notifier_block *nb,
+		      unsigned long events, void *change)
+{
+	if (events & OPAL_EVENT_DUMP_AVAIL)
+		schedule_process_dump();
+
+	return 0;
+}
+
+static struct notifier_block dump_nb = {
+	.notifier_call  = dump_event,
+	.next           = NULL,
+	.priority       = 0
+};
+
+void __init opal_platform_dump_init(void)
+{
+	int rc;
+
+	dump_kset = kset_create_and_add("dump", NULL, opal_kobj);
+	if (!dump_kset) {
+		pr_warn("%s: Failed to create dump kset\n", __func__);
+		return;
+	}
+
+	rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group);
+	if (rc) {
+		pr_warn("%s: Failed to create initiate dump attr group\n",
+			__func__);
+		kobject_put(&dump_kset->kobj);
+		return;
+	}
+
+	rc = opal_notifier_register(&dump_nb);
+	if (rc) {
+		pr_warn("%s: Can't register OPAL event notifier (%d)\n",
+			__func__, rc);
+		return;
+	}
+
+	opal_dump_resend_notification();
+}
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3e8829c..3e02783 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -126,6 +126,12 @@ OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
 OPAL_CALL(opal_validate_flash,			OPAL_FLASH_VALIDATE);
 OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_dump_init,			OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info,			OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_info2,			OPAL_DUMP_INFO2);
+OPAL_CALL(opal_dump_read,			OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack,			OPAL_DUMP_ACK);
 OPAL_CALL(opal_get_msg,				OPAL_GET_MSG);
 OPAL_CALL(opal_check_completion,		OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_dump_resend_notification,	OPAL_DUMP_RESEND);
 OPAL_CALL(opal_sync_host_reboot,		OPAL_SYNC_HOST_REBOOT);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 65499ad..262cd1a 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -474,6 +474,8 @@ static int __init opal_init(void)
 	if (rc == 0) {
 		/* Setup code update interface */
 		opal_flash_init();
+		/* Setup platform dump extract interface */
+		opal_platform_dump_init();
 	}
 
 	return 0;
-- 
1.7.10.4

^ permalink raw reply related

* powerpc/tm: Fix crash when forking inside a transaction
From: Michael Neuling @ 2014-03-03  3:21 UTC (permalink / raw)
  To: benh; +Cc: Linux PPC dev, Adhemerval Zanella Neto, Paul Mackerras, matt

When we fork/clone we currently don't copy any of the TM state to the new
thread.  This results in a TM bad thing (program check) when the new process is
switched in as the kernel does a tmrechkpt with TEXASR FS not set.  Also, since
R1 is from userspace, we trigger the bad kernel stack pointer detection.  So we
end up with something like this:

   Bad kernel stack pointer 0 at c0000000000404fc
   cpu 0x2: Vector: 700 (Program Check) at [c00000003ffefd40]
       pc: c0000000000404fc: restore_gprs+0xc0/0x148
       lr: 0000000000000000
       sp: 0
      msr: 9000000100201030
     current = 0xc000001dd1417c30
     paca    = 0xc00000000fe00800   softe: 0        irq_happened: 0x01
       pid   = 0, comm = swapper/2
   WARNING: exception is not recoverable, can't continue

The below fixes this by flushing the TM state before we copy the task_struct to
the clone.  To do this we go through the tmreclaim path, which removes the
checkpointed registers from the CPU and transitions the CPU out of TM suspend
mode.  Hence we need to call tmrechkpt after to restore the checkpointed state
and the TM mode for the current task.

To make this fail from userspace is simply:
	tbegin
	li	r0, 2
	sc
	<boom>

Kudos to Adhemerval Zanella Neto for finding this.

Signed-off-by: Michael Neuling <mikey@neuling.org>
cc: Adhemerval Zanella Neto <azanella@br.ibm.com>
cc: stable@vger.kernel.org

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 8d4c247f1..af064d2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1048,6 +1048,15 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	flush_altivec_to_thread(src);
 	flush_vsx_to_thread(src);
 	flush_spe_to_thread(src);
+	/*
+	 * Flush TM state out so we can copy it.  __switch_to_tm() does this
+	 * flush but it removes the checkpointed state from the current CPU and
+	 * transitions the CPU out of TM mode.  Hence we need to call
+	 * tm_recheckpoint_new_task() (on the same task) to restore the
+	 * checkpointed state back and the TM mode.
+	 */
+	__switch_to_tm(src);
+	tm_recheckpoint_new_task(src);
 
 	*dst = *src;
 

^ permalink raw reply related

* [PATCH 1/2] powerpc/eeh: More reliability of PCI dev reset
From: Gavin Shan @ 2014-03-03  3:26 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan, stable

The PCI core has function pci_reset_function() to do reset on the
specified PCI device. Before the reset starts, the state of the PCI
device is saved and it is restored after reset. The real reset work
could be routed to pcibios_set_pcie_reset_state() by quirks. However,
the PCI bus or PCI device isn't settled down fully for restore (PCI
config and MMIO for MSIx table) after reset and it would introduce
unnecessary frozen PE. One of the observed cases is failure of passing
IPR adapter from host to KVM-based guest because of this.

The patch adds delay in pcibios_set_pcie_reset_state() so that the
PCI bus/device can settle down fully before restoring PCI device
states. The patch also does cleanup on the names of those macros for
PE reset hold and settle time.

CC: <stable@vger.kernel.org>
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh.c |   19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index e7b76a6..251e370 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -84,8 +84,10 @@
  */
 #define EEH_MAX_FAILS	2100000
 
-/* Time to wait for a PCI slot to report status, in milliseconds */
-#define PCI_BUS_RESET_WAIT_MSEC (5*60*1000)
+/* All in milliseconds */
+#define EEH_PE_STATUS_WAIT_TIME		(5 * 60 * 1000)
+#define EEH_PE_RESET_HOLD_TIME		250
+#define EEH_PE_RESET_SETTLE_TIME	1800
 
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
@@ -522,7 +524,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
 		pr_warning("%s: Unexpected state change %d on PHB#%d-PE#%x, err=%d\n",
 			__func__, function, pe->phb->global_number, pe->addr, rc);
 
-	rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+	rc = eeh_ops->wait_state(pe, EEH_PE_STATUS_WAIT_TIME);
 	if (rc > 0 && (rc & EEH_STATE_MMIO_ENABLED) &&
 	   (function == EEH_OPT_THAW_MMIO))
 		return 0;
@@ -552,12 +554,15 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 	switch (state) {
 	case pcie_deassert_reset:
 		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+		msleep(EEH_PE_RESET_HOLD_TIME);
 		break;
 	case pcie_hot_reset:
 		eeh_ops->reset(pe, EEH_RESET_HOT);
+		msleep(EEH_PE_RESET_HOLD_TIME);
 		break;
 	case pcie_warm_reset:
 		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+		msleep(EEH_PE_RESET_SETTLE_TIME);
 		break;
 	default:
 		return -EINVAL;
@@ -615,8 +620,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
 	/* The PCI bus requires that the reset be held high for at least
 	 * a 100 milliseconds. We wait a bit longer 'just in case'.
 	 */
-#define PCI_BUS_RST_HOLD_TIME_MSEC 250
-	msleep(PCI_BUS_RST_HOLD_TIME_MSEC);
+	msleep(EEH_PE_RESET_HOLD_TIME);
 
 	/* We might get hit with another EEH freeze as soon as the
 	 * pci slot reset line is dropped. Make sure we don't miss
@@ -630,8 +634,7 @@ static void eeh_reset_pe_once(struct eeh_pe *pe)
 	 * a 1.5 second idle time for the bus to stabilize, before starting
 	 * up traffic.
 	 */
-#define PCI_BUS_SETTLE_TIME_MSEC 1800
-	msleep(PCI_BUS_SETTLE_TIME_MSEC);
+	msleep(EEH_PE_RESET_SETTLE_TIME);
 }
 
 /**
@@ -651,7 +654,7 @@ int eeh_reset_pe(struct eeh_pe *pe)
 	for (i=0; i<3; i++) {
 		eeh_reset_pe_once(pe);
 
-		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+		rc = eeh_ops->wait_state(pe, EEH_PE_STATUS_WAIT_TIME);
 		if ((rc & flags) == flags)
 			return 0;
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 2/2] powerpc/eeh: Check PCIe link in pcibios_set_pcie_reset_state()
From: Gavin Shan @ 2014-03-03  3:26 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1393817192-14271-1-git-send-email-shangw@linux.vnet.ibm.com>

After PE reset in pcibios_set_pcie_reset_state(), the PCIe link
might not be ready after the settle time of the PE primary bus. The
subsequent access to PCI config and MMIO of the affected domain
would cause more problems (e.g. unexpected frozen PE).

The patch checks the PCIe link in pcibios_set_pcie_reset_state()
to make sure all PCIe links are up after PE reset, so as to
avoid unexpected problems.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h |    1 +
 arch/powerpc/kernel/eeh.c      |   29 +++++++++++++++++++++++++++++
 arch/powerpc/kernel/eeh_pe.c   |   21 +++++++++++++--------
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index d4dd41f..e96ed32 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -231,6 +231,7 @@ void *eeh_pe_traverse(struct eeh_pe *root,
 		eeh_traverse_func fn, void *flag);
 void *eeh_pe_dev_traverse(struct eeh_pe *root,
 		eeh_traverse_func fn, void *flag);
+void eeh_bridge_check_link(struct eeh_dev *edev);
 void eeh_pe_restore_bars(struct eeh_pe *pe);
 struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 251e370..ba2dd2d 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -532,6 +532,14 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
 	return rc;
 }
 
+static void *eeh_dev_check_link(void *data, void *flag)
+{
+	struct eeh_dev *edev = data;
+
+	eeh_bridge_check_link(edev);
+	return NULL;
+}
+
 /**
  * pcibios_set_pcie_slot_reset - Set PCI-E reset state
  * @dev: pci device struct
@@ -544,6 +552,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 {
 	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
 	struct eeh_pe *pe = edev->pe;
+	struct pci_bus *bus;
 
 	if (!pe) {
 		pr_err("%s: No PE found on PCI device %s\n",
@@ -551,10 +560,30 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 		return -EINVAL;
 	}
 
+	bus = eeh_pe_bus_get(pe);
+	if (!bus) {
+		pr_err("%s: No PE primary bus found for PCI dev %s\n",
+			__func__, pci_name(dev));
+		return -EINVAL;
+	}
+
 	switch (state) {
 	case pcie_deassert_reset:
 		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
 		msleep(EEH_PE_RESET_HOLD_TIME);
+
+		/*
+		 * After PE reset, the PCIe link is probably
+		 * not ready after settle period. We're checking
+		 * all PCIe downstream port of the affected PE
+		 * ensure that.
+		 */
+		if (bus->self) {
+			edev = pci_dev_to_eeh_dev(bus->self);
+			eeh_bridge_check_link(edev);
+		}
+		eeh_pe_dev_traverse(pe, eeh_dev_check_link, NULL);
+
 		break;
 	case pcie_hot_reset:
 		eeh_ops->reset(pe, EEH_RESET_HOT);
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index f0c353f..a49f9dc 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -567,6 +567,9 @@ void eeh_pe_state_clear(struct eeh_pe *pe, int state)
 }
 
 /*
+ * eeh_bridge_check_link - Check PCI link is up or down
+ * @edev: EEH device
+ *
  * Some PCI bridges (e.g. PLX bridges) have primary/secondary
  * buses assigned explicitly by firmware, and we probably have
  * lost that after reset. So we have to delay the check until
@@ -577,18 +580,20 @@ void eeh_pe_state_clear(struct eeh_pe *pe, int state)
  * blocked on normal path during the stage. So we need utilize
  * eeh operations, which is always permitted.
  */
-static void eeh_bridge_check_link(struct eeh_dev *edev,
-				  struct device_node *dn)
+void eeh_bridge_check_link(struct eeh_dev *edev)
 {
+	struct device_node *dn;
 	int cap;
 	uint32_t val;
 	int timeout = 0;
 
-	/*
-	 * We only check root port and downstream ports of
-	 * PCIe switches
-	 */
-	if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
+	/* Only for root port and downstream ports */
+	if (!edev || !(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT)))
+		return;
+
+	/* Device node */
+	dn = eeh_dev_to_of_node(edev);
+	if (!dn)
 		return;
 
 	pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n",
@@ -678,7 +683,7 @@ static void eeh_restore_bridge_bars(struct eeh_dev *edev,
 	eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]);
 
 	/* Check the PCIe link is ready */
-	eeh_bridge_check_link(edev, dn);
+	eeh_bridge_check_link(edev);
 }
 
 static void eeh_restore_device_bars(struct eeh_dev *edev,
-- 
1.7.10.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox