[PATCH] EDAC: Add AMD Seattle SoC EDAC

linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
@ 2015-10-19 19:23 Brijesh Singh
  2015-10-19 20:14 ` Borislav Petkov
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Brijesh Singh @ 2015-10-19 19:23 UTC (permalink / raw)
  To: linux-arm-kernel

Add support for the AMD Seattle SoC EDAC driver.

Signed-off-by: Brijesh Singh <brijeshkumar.singh@amd.com>
---
 .../devicetree/bindings/edac/amd-seattle-edac.txt  |  15 +
 drivers/edac/Kconfig                               |   6 +
 drivers/edac/Makefile                              |   1 +
 drivers/edac/seattle_edac.c                        | 306 +++++++++++++++++++++
 4 files changed, 328 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
 create mode 100644 drivers/edac/seattle_edac.c

diff --git a/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
new file mode 100644
index 0000000..a0354e8
--- /dev/null
+++ b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
@@ -0,0 +1,15 @@
+* AMD Seattle SoC EDAC node
+
+EDAC node is defined to describe on-chip error detection and reporting.
+
+Required properties:
+- compatible: Should be "amd,arm-seattle-edac"
+- poll-delay-msec: Indicates how often the edac check callback should be called.
+  Time in msec.
+
+Example:
+	edac {
+		compatible = "amd,arm-seattle-edac";
+		poll-delay-msec = <100>;
+	};
+
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index ef25000..d342335 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -390,4 +390,10 @@ config EDAC_XGENE
 	  Support for error detection and correction on the
 	  APM X-Gene family of SOCs.
 
+config EDAC_SEATTLE
+	tristate "AMD Seattle EDAC"
+	depends on EDAC_MM_EDAC && ARCH_SEATTLE
+	help
+	  Support for error detection and correction on the
+	  AMD Seattle SOC.
 endif # EDAC
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index ae3c5f3..9e4f3ef 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI)		+= octeon_edac-pci.o
 obj-$(CONFIG_EDAC_ALTERA_MC)		+= altera_edac.o
 obj-$(CONFIG_EDAC_SYNOPSYS)		+= synopsys_edac.o
 obj-$(CONFIG_EDAC_XGENE)		+= xgene_edac.o
+obj-$(CONFIG_EDAC_SEATTLE)		+= seattle_edac.o
diff --git a/drivers/edac/seattle_edac.c b/drivers/edac/seattle_edac.c
new file mode 100644
index 0000000..78101aa
--- /dev/null
+++ b/drivers/edac/seattle_edac.c
@@ -0,0 +1,306 @@
+/*
+ * AMD Seattle EDAC
+ *
+ * Copyright (c) 2015, Advanced Micro Devices
+ * Author: Brijesh Singh <brijeshkumar.singh@amd.com>
+ *
+ * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the 
+ * non-fatal errors. Whereas the single bit and double bit ECC erros are 
+ * handled by firmware.
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+
+#include "edac_core.h"
+
+#define EDAC_MOD_STR             "seattle_edac"
+
+#define CPUMERRSR_EL1_INDEX(x)   ((x) & 0x1ffff)
+#define CPUMERRSR_EL1_BANK(x)    (((x) >> 18) & 0x1f)
+#define CPUMERRSR_EL1_RAMID(x)   (((x) >> 24) & 0x7f)
+#define CPUMERRSR_EL1_VALID(x)   ((x) & (1 << 31))
+#define CPUMERRSR_EL1_REPEAT(x)  (((x) >> 32) & 0x7f)
+#define CPUMERRSR_EL1_OTHER(x)   (((x) >> 40) & 0xff)
+#define CPUMERRSR_EL1_FATAL(x)   ((x) & (1UL << 63))
+
+#define L2MERRSR_EL1_INDEX(x)    ((x) & 0x1ffff)
+#define L2MERRSR_EL1_CPUID(x)    (((x) >> 18) & 0xf)
+#define L2MERRSR_EL1_RAMID(x)    (((x) >> 24) & 0x7f)
+#define L2MERRSR_EL1_VALID(x)    ((x) & (1 << 31))
+#define L2MERRSR_EL1_REPEAT(x)   (((x) >> 32) & 0xff)
+#define L2MERRSR_EL1_OTHER(x)    (((x) >> 40) & 0xff)
+#define L2MERRSR_EL1_FATAL(x)    ((x) & (1UL << 63))
+
+struct seattle_edac {
+	struct edac_device_ctl_info *edac_ctl;
+};
+
+static inline u64 read_cpumerrsr_el1(void)
+{
+	u64 val;
+
+	asm volatile("mrs %0, s3_1_c15_c2_2" : "=r" (val));
+	return val;
+}
+
+static inline void write_cpumerrsr_el1(u64 val)
+{
+	asm volatile("msr s3_1_c15_c2_2, %0" :: "r" (val));
+}
+
+static inline u64 read_l2merrsr_el1(void)
+{
+	u64 val;
+
+	asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (val));
+	return val;
+}
+
+static inline void write_l2merrsr_el1(u64 val)
+{
+	asm volatile("msr s3_1_c15_c2_3, %0" :: "r" (val));
+}
+
+static void check_l2merrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
+{
+	int fatal;
+	int cpuid;
+	u64 val = read_l2merrsr_el1();
+
+	if (!L2MERRSR_EL1_VALID(val))
+		return;
+
+	fatal = L2MERRSR_EL1_FATAL(val);
+	cpuid = L2MERRSR_EL1_CPUID(val);
+	edac_printk(KERN_CRIT, EDAC_MOD_STR,
+		    "CPU%d detected %s error on L2 (L2MERRSR=%#llx)!\n",
+		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
+
+	switch (L2MERRSR_EL1_RAMID(val)) {
+	case 0x10:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L2 Tag RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
+		break;
+	case 0x11:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L2 Data RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
+		break;
+	case 0x12:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L2 Snoop tag RAM cpu %d way %d\n",
+			    cpuid / 2, cpuid % 2);
+		break;
+	case 0x14:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L2 Dirty RAM cpu %d way %d\n",
+			    cpuid / 2, cpuid % 2);
+		break;
+	case 0x18:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L2 inclusion RAM cpu %d way %d\n",
+			    cpuid / 2, cpuid % 2);
+		break;
+	default:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "unknown RAMID cpuid %d\n", cpuid);
+		break;
+	}
+
+	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
+		    (int)L2MERRSR_EL1_REPEAT(val));
+	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
+		    (int)L2MERRSR_EL1_OTHER(val));
+	if (fatal)
+		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
+				      edac_ctl->name);
+	else
+		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
+				      edac_ctl->name);
+	write_l2merrsr_el1(0);
+}
+
+static void check_cpumerrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
+{
+	int fatal;
+	int bank;
+	u64 val = read_cpumerrsr_el1();
+
+	if (!CPUMERRSR_EL1_VALID(val))
+		return;
+
+	bank = CPUMERRSR_EL1_BANK(val);
+	fatal = CPUMERRSR_EL1_FATAL(val);
+	edac_printk(KERN_CRIT, EDAC_MOD_STR,
+		    "CPU%d detected %s error on L1 (CPUMERRSR=%#llx)!\n",
+		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
+
+	switch (CPUMERRSR_EL1_RAMID(val)) {
+	case 0x0:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L1-I Tag RAM bank %d\n", bank);
+		break;
+	case 0x1:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L1-I Data RAM bank %d\n", bank);
+		break;
+	case 0x8:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L1-D Tag RAM bank %d\n", bank);
+		break;
+	case 0x9:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L1-D Data RAM bank %d\n", bank);
+		break;
+	case 0x18:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "L2 TLB RAM bank %d\n", bank);
+		break;
+	default:
+		edac_printk(KERN_CRIT, EDAC_MOD_STR,
+			    "unknown ramid %d bank %d\n",
+			    (int)CPUMERRSR_EL1_RAMID(val), bank);
+		break;
+	}
+
+	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
+		    (int)CPUMERRSR_EL1_REPEAT(val));
+	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
+		    (int)CPUMERRSR_EL1_OTHER(val));
+	if (fatal)
+		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
+				      edac_ctl->name);
+	else
+		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
+				      edac_ctl->name);
+	write_cpumerrsr_el1(0);
+}
+
+static void cpu_check_errors(void *args)
+{
+	struct edac_device_ctl_info *edev_ctl = args;
+
+	check_cpumerrsr_el1_error(edev_ctl);
+	check_l2merrsr_el1_error(edev_ctl);
+}
+
+static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
+{
+	int cpu;
+
+	/* read L1 and L2 memory error syndrome register on possible CPU's */
+	for_each_possible_cpu(cpu)
+		smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0);
+}
+
+static int seattle_edac_probe(struct platform_device *pdev)
+{
+	int rc;
+	u32 poll_msec;
+	struct seattle_edac *drv;
+	struct device *dev = &pdev->dev;
+
+	rc = of_property_read_u32(pdev->dev.of_node, "poll-delay-msec",
+				  &poll_msec);
+	if (rc < 0) {
+		edac_printk(KERN_ERR, EDAC_MOD_STR,
+			    "failed to get poll interval\n");
+		return rc;
+	}
+
+	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	drv->edac_ctl = edac_device_alloc_ctl_info(0, "cpu",
+						   num_possible_cpus(), "L", 2,
+						   1, NULL, 0,
+						   edac_device_alloc_index());
+
+	drv->edac_ctl->poll_msec = poll_msec;
+	drv->edac_ctl->edac_check = edac_check_errors;
+	drv->edac_ctl->dev = dev;
+	drv->edac_ctl->mod_name = dev_name(dev);
+	drv->edac_ctl->dev_name = dev_name(dev);
+	drv->edac_ctl->ctl_name = "cpu_err";
+	drv->edac_ctl->panic_on_ue = 1;
+	platform_set_drvdata(pdev, drv);
+
+	rc = edac_device_add_device(drv->edac_ctl);
+	if (rc)
+		goto edac_alloc_failed;
+
+	return 0;
+
+edac_alloc_failed:
+	edac_device_free_ctl_info(drv->edac_ctl);
+	return rc;
+}
+
+static int seattle_edac_remove(struct platform_device *pdev)
+{
+	struct seattle_edac *drv = dev_get_drvdata(&pdev->dev);
+	struct edac_device_ctl_info *edac_ctl = drv->edac_ctl;
+
+	edac_device_del_device(edac_ctl->dev);
+	edac_device_free_ctl_info(edac_ctl);
+
+	return 0;
+}
+
+static const struct of_device_id seattle_edac_of_match[] = {
+	{ .compatible = "amd,arm-seattle-edac" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, seattle_edac_of_match);
+
+static struct platform_driver seattle_edac_driver = {
+	.probe = seattle_edac_probe,
+	.remove = seattle_edac_remove,
+	.driver = {
+		.name = "seattle-edac",
+		.of_match_table = seattle_edac_of_match,
+	},
+};
+
+static int __init seattle_edac_init(void)
+{
+	int rc;
+
+	/* we support poll method */
+	edac_op_state = EDAC_OPSTATE_POLL;
+
+	rc = platform_driver_register(&seattle_edac_driver);
+	if (rc) {
+		edac_printk(KERN_ERR, EDAC_MOD_STR,
+			    "EDAC fails to register\n");
+		return rc;
+	}
+
+	return 0;
+}
+module_init(seattle_edac_init);
+
+static void __exit seattle_edac_exit(void)
+{
+	platform_driver_unregister(&seattle_edac_driver);
+}
+module_exit(seattle_edac_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Brijesh Singh <brijeshkumar.singh@amd.com>");
+MODULE_DESCRIPTION("AMD Seattle EDAC driver");
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-19 19:23 [PATCH] EDAC: Add AMD Seattle SoC EDAC Brijesh Singh
@ 2015-10-19 20:14 ` Borislav Petkov
  2015-10-19 20:52 ` Mark Rutland
  2015-10-20  2:21 ` Hanjun Guo
  2 siblings, 0 replies; 19+ messages in thread
From: Borislav Petkov @ 2015-10-19 20:14 UTC (permalink / raw)
  To: linux-arm-kernel

+ Arnd for the DT bindings.

On Mon, Oct 19, 2015 at 02:23:17PM -0500, Brijesh Singh wrote:
> Add support for the AMD Seattle SoC EDAC driver.
> 
> Signed-off-by: Brijesh Singh <brijeshkumar.singh@amd.com>
> ---
>  .../devicetree/bindings/edac/amd-seattle-edac.txt  |  15 +
>  drivers/edac/Kconfig                               |   6 +
>  drivers/edac/Makefile                              |   1 +
>  drivers/edac/seattle_edac.c                        | 306 +++++++++++++++++++++
>  4 files changed, 328 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>  create mode 100644 drivers/edac/seattle_edac.c
> 
> diff --git a/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
> new file mode 100644
> index 0000000..a0354e8
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
> @@ -0,0 +1,15 @@
> +* AMD Seattle SoC EDAC node
> +
> +EDAC node is defined to describe on-chip error detection and reporting.
> +
> +Required properties:
> +- compatible: Should be "amd,arm-seattle-edac"
> +- poll-delay-msec: Indicates how often the edac check callback should be called.
> +  Time in msec.
> +
> +Example:
> +	edac {
> +		compatible = "amd,arm-seattle-edac";
> +		poll-delay-msec = <100>;
> +	};
> +
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index ef25000..d342335 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -390,4 +390,10 @@ config EDAC_XGENE
>  	  Support for error detection and correction on the
>  	  APM X-Gene family of SOCs.
>  
> +config EDAC_SEATTLE
> +	tristate "AMD Seattle EDAC"
> +	depends on EDAC_MM_EDAC && ARCH_SEATTLE
> +	help
> +	  Support for error detection and correction on the
> +	  AMD Seattle SOC.
>  endif # EDAC
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index ae3c5f3..9e4f3ef 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI)		+= octeon_edac-pci.o
>  obj-$(CONFIG_EDAC_ALTERA_MC)		+= altera_edac.o
>  obj-$(CONFIG_EDAC_SYNOPSYS)		+= synopsys_edac.o
>  obj-$(CONFIG_EDAC_XGENE)		+= xgene_edac.o
> +obj-$(CONFIG_EDAC_SEATTLE)		+= seattle_edac.o
> diff --git a/drivers/edac/seattle_edac.c b/drivers/edac/seattle_edac.c
> new file mode 100644
> index 0000000..78101aa
> --- /dev/null
> +++ b/drivers/edac/seattle_edac.c
> @@ -0,0 +1,306 @@
> +/*
> + * AMD Seattle EDAC
> + *
> + * Copyright (c) 2015, Advanced Micro Devices
> + * Author: Brijesh Singh <brijeshkumar.singh@amd.com>
> + *
> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the 

								to log the ...

> + * non-fatal errors. Whereas the single bit and double bit ECC erros are 
> + * handled by firmware.

"Single and double bit ECC errors are handled by firmware." - no need
"for Whereas".

Please run this through a spell checker before your next submission.

Question about the content itself: non-fatal errors are polled but
single and double-bit errors are handled by fw. Single bit errors are
non-fatal, methinks. So what is this sentence actually saying?

> + *
> + * This program is free software; you can redistribute  it and/or modify it
> + * under  the terms of  the GNU General  Public License as published by the
> + * Free Software Foundation;  either version 2 of the  License, or (at your
> + * option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.

Can you get rid of that copyright boilerplate and use a single sentence like:

"Licensed under GPL v2."

Ask your legal people and/or your manager - they should know.

> + */
> +
> +#include <linux/module.h>
> +#include <linux/of_device.h>
> +#include <linux/platform_device.h>
> +
> +#include "edac_core.h"
> +
> +#define EDAC_MOD_STR             "seattle_edac"
> +
> +#define CPUMERRSR_EL1_INDEX(x)   ((x) & 0x1ffff)
> +#define CPUMERRSR_EL1_BANK(x)    (((x) >> 18) & 0x1f)
> +#define CPUMERRSR_EL1_RAMID(x)   (((x) >> 24) & 0x7f)
> +#define CPUMERRSR_EL1_VALID(x)   ((x) & (1 << 31))
> +#define CPUMERRSR_EL1_REPEAT(x)  (((x) >> 32) & 0x7f)
> +#define CPUMERRSR_EL1_OTHER(x)   (((x) >> 40) & 0xff)
> +#define CPUMERRSR_EL1_FATAL(x)   ((x) & (1UL << 63))
> +
> +#define L2MERRSR_EL1_INDEX(x)    ((x) & 0x1ffff)
> +#define L2MERRSR_EL1_CPUID(x)    (((x) >> 18) & 0xf)
> +#define L2MERRSR_EL1_RAMID(x)    (((x) >> 24) & 0x7f)
> +#define L2MERRSR_EL1_VALID(x)    ((x) & (1 << 31))
> +#define L2MERRSR_EL1_REPEAT(x)   (((x) >> 32) & 0xff)
> +#define L2MERRSR_EL1_OTHER(x)    (((x) >> 40) & 0xff)
> +#define L2MERRSR_EL1_FATAL(x)    ((x) & (1UL << 63))
> +
> +struct seattle_edac {
> +	struct edac_device_ctl_info *edac_ctl;
> +};
> +
> +static inline u64 read_cpumerrsr_el1(void)
> +{
> +	u64 val;
> +
> +	asm volatile("mrs %0, s3_1_c15_c2_2" : "=r" (val));
> +	return val;
> +}
> +
> +static inline void write_cpumerrsr_el1(u64 val)
> +{
> +	asm volatile("msr s3_1_c15_c2_2, %0" :: "r" (val));
> +}
> +
> +static inline u64 read_l2merrsr_el1(void)
> +{
> +	u64 val;
> +
> +	asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (val));
> +	return val;
> +}
> +
> +static inline void write_l2merrsr_el1(u64 val)
> +{
> +	asm volatile("msr s3_1_c15_c2_3, %0" :: "r" (val));
> +}
> +
> +static void check_l2merrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
> +{

All those function names are very cryptic. Can you make them more
human-readable and -understandable? Don't be afraid to use comments too.

> +	int fatal;
> +	int cpuid;
> +	u64 val = read_l2merrsr_el1();
> +
> +	if (!L2MERRSR_EL1_VALID(val))
> +		return;
> +
> +	fatal = L2MERRSR_EL1_FATAL(val);
> +	cpuid = L2MERRSR_EL1_CPUID(val);
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +		    "CPU%d detected %s error on L2 (L2MERRSR=%#llx)!\n",

		    "CPU%d detected %s L2 error ..."

> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
> +
> +	switch (L2MERRSR_EL1_RAMID(val)) {
> +	case 0x10:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Tag RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);

What is a "L2 Tag RAM cpu <num> way <num>"? Can you make those more
humanly understandable please. Like

	" Error in the L2 Tag array.\n"

Also, you're dumping smp_processor_id() above and here you have cpuid
/ 2 (btw, I'm guessing you wanna do "cpuid >> 1" for the division and
"cpuid & 1" for the modulo for additional speed since division is
costly) and you're calling this "cpu" too. What is that "cpu" supposed
to mean?

Ditto for the remaining printk messages below.

> +		break;
> +	case 0x11:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Data RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x12:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Snoop tag RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x14:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Dirty RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x18:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 inclusion RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	default:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "unknown RAMID cpuid %d\n", cpuid);
> +		break;
> +	}
> +
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
> +		    (int)L2MERRSR_EL1_REPEAT(val));

Why the cast? You can simply print %lu.

> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
> +		    (int)L2MERRSR_EL1_OTHER(val));

Ditto.

> +	if (fatal)
> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	else
> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	write_l2merrsr_el1(0);
> +}
> +
> +static void check_cpumerrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
> +{

Also a cryptic name.

> +	int fatal;
> +	int bank;
> +	u64 val = read_cpumerrsr_el1();
> +
> +	if (!CPUMERRSR_EL1_VALID(val))
> +		return;
> +
> +	bank = CPUMERRSR_EL1_BANK(val);
> +	fatal = CPUMERRSR_EL1_FATAL(val);
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +		    "CPU%d detected %s error on L1 (CPUMERRSR=%#llx)!\n",

As above.

> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
> +
> +	switch (CPUMERRSR_EL1_RAMID(val)) {
> +	case 0x0:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-I Tag RAM bank %d\n", bank);
> +		break;
> +	case 0x1:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-I Data RAM bank %d\n", bank);
> +		break;
> +	case 0x8:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-D Tag RAM bank %d\n", bank);
> +		break;
> +	case 0x9:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-D Data RAM bank %d\n", bank);
> +		break;
> +	case 0x18:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 TLB RAM bank %d\n", bank);

Are those to say *where* the errors were? If so, please say so in the
error message:

	"CPU%d: L1 %s error detected in the L1-I tag RAM bank"

you can use pr_cont() for the continuation messages.

> +		break;
> +	default:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "unknown ramid %d bank %d\n",
> +			    (int)CPUMERRSR_EL1_RAMID(val), bank);
> +		break;
> +	}
> +
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
> +		    (int)CPUMERRSR_EL1_REPEAT(val));
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
> +		    (int)CPUMERRSR_EL1_OTHER(val));

No need to cast.

> +	if (fatal)
> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	else
> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	write_cpumerrsr_el1(0);
> +}
> +
> +static void cpu_check_errors(void *args)
> +{
> +	struct edac_device_ctl_info *edev_ctl = args;
> +
> +	check_cpumerrsr_el1_error(edev_ctl);
> +	check_l2merrsr_el1_error(edev_ctl);
> +}
> +
> +static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
> +{
> +	int cpu;
> +
> +	/* read L1 and L2 memory error syndrome register on possible CPU's */
> +	for_each_possible_cpu(cpu)

This is not cpu hotplug-safe. What if I offline a CPU while this loop
runs?

> +		smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0);

Hint:
	get_online_cpus();
	on_each_cpu();
	put_online_cpus();

> +}
> +
> +static int seattle_edac_probe(struct platform_device *pdev)
> +{
> +	int rc;
> +	u32 poll_msec;
> +	struct seattle_edac *drv;
> +	struct device *dev = &pdev->dev;
> +
> +	rc = of_property_read_u32(pdev->dev.of_node, "poll-delay-msec",
> +				  &poll_msec);
> +	if (rc < 0) {
> +		edac_printk(KERN_ERR, EDAC_MOD_STR,
> +			    "failed to get poll interval\n");
> +		return rc;
> +	}
> +
> +	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
> +	if (!drv)
> +		return -ENOMEM;
> +
> +	drv->edac_ctl = edac_device_alloc_ctl_info(0, "cpu",
> +						   num_possible_cpus(), "L", 2,

Same problematic if someone offlines a core and then loads this module.

> +						   1, NULL, 0,
> +						   edac_device_alloc_index());

Why aren't you checking the return value of this function?

> +	drv->edac_ctl->poll_msec = poll_msec;
> +	drv->edac_ctl->edac_check = edac_check_errors;
> +	drv->edac_ctl->dev = dev;
> +	drv->edac_ctl->mod_name = dev_name(dev);
> +	drv->edac_ctl->dev_name = dev_name(dev);
> +	drv->edac_ctl->ctl_name = "cpu_err";
> +	drv->edac_ctl->panic_on_ue = 1;
> +	platform_set_drvdata(pdev, drv);
> +
> +	rc = edac_device_add_device(drv->edac_ctl);
> +	if (rc)
> +		goto edac_alloc_failed;
> +
> +	return 0;
> +
> +edac_alloc_failed:
> +	edac_device_free_ctl_info(drv->edac_ctl);
> +	return rc;
> +}
-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-19 19:23 [PATCH] EDAC: Add AMD Seattle SoC EDAC Brijesh Singh
  2015-10-19 20:14 ` Borislav Petkov
@ 2015-10-19 20:52 ` Mark Rutland
  2015-10-20 16:44   ` Brijesh Singh
  2015-10-20  2:21 ` Hanjun Guo
  2 siblings, 1 reply; 19+ messages in thread
From: Mark Rutland @ 2015-10-19 20:52 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

Please Cc the devicetree list (devicetree at vger.kernel.org) when sending
binding patches. I see you've added the people from the MAINTAINERS
entry; the list should also be Cc'd.

On Mon, Oct 19, 2015 at 02:23:17PM -0500, Brijesh Singh wrote:
> Add support for the AMD Seattle SoC EDAC driver.
> 
> Signed-off-by: Brijesh Singh <brijeshkumar.singh@amd.com>
> ---
>  .../devicetree/bindings/edac/amd-seattle-edac.txt  |  15 +
>  drivers/edac/Kconfig                               |   6 +
>  drivers/edac/Makefile                              |   1 +
>  drivers/edac/seattle_edac.c                        | 306 +++++++++++++++++++++
>  4 files changed, 328 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>  create mode 100644 drivers/edac/seattle_edac.c
> 
> diff --git a/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
> new file mode 100644
> index 0000000..a0354e8
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
> @@ -0,0 +1,15 @@
> +* AMD Seattle SoC EDAC node
> +
> +EDAC node is defined to describe on-chip error detection and reporting.
> +
> +Required properties:
> +- compatible: Should be "amd,arm-seattle-edac"
> +- poll-delay-msec: Indicates how often the edac check callback should be called.
> +  Time in msec.

This second property doesn't describe the hardware in any way. It should
be runtime-configurable and dpesn't belong in the DT.

Regardless, the binding is wrong. This is in no way specific to AMD
Seattle, and per the code is actually used to imply the presence of a
Cortex-A57 feature. No reference to AMD Seattle belongs in the DT
binding (with the exception of the example, perhaps), nor in the driver.

NAK while this pretends to be something that it isn't. At minimum, you
need to correctly describe the feature you are trying to add support
for.

> +
> +Example:
> +	edac {
> +		compatible = "amd,arm-seattle-edac";
> +		poll-delay-msec = <100>;
> +	};
> +
> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
> index ef25000..d342335 100644
> --- a/drivers/edac/Kconfig
> +++ b/drivers/edac/Kconfig
> @@ -390,4 +390,10 @@ config EDAC_XGENE
>  	  Support for error detection and correction on the
>  	  APM X-Gene family of SOCs.
>  
> +config EDAC_SEATTLE
> +	tristate "AMD Seattle EDAC"
> +	depends on EDAC_MM_EDAC && ARCH_SEATTLE
> +	help
> +	  Support for error detection and correction on the
> +	  AMD Seattle SOC.
>  endif # EDAC
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index ae3c5f3..9e4f3ef 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI)		+= octeon_edac-pci.o
>  obj-$(CONFIG_EDAC_ALTERA_MC)		+= altera_edac.o
>  obj-$(CONFIG_EDAC_SYNOPSYS)		+= synopsys_edac.o
>  obj-$(CONFIG_EDAC_XGENE)		+= xgene_edac.o
> +obj-$(CONFIG_EDAC_SEATTLE)		+= seattle_edac.o
> diff --git a/drivers/edac/seattle_edac.c b/drivers/edac/seattle_edac.c
> new file mode 100644
> index 0000000..78101aa
> --- /dev/null
> +++ b/drivers/edac/seattle_edac.c
> @@ -0,0 +1,306 @@
> +/*
> + * AMD Seattle EDAC
> + *
> + * Copyright (c) 2015, Advanced Micro Devices
> + * Author: Brijesh Singh <brijeshkumar.singh@amd.com>
> + *
> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the 

These are IMPLEMENTATION DEFINED registers which are specific to
Cortex-A57, and I note that L2MERRSR_EL1 changed in r1p0.

Which revisions of Cortex-A57 does this work with?

Generally we avoid touching IMPLEMENTATION DEFINED registers as they may
not exist in some configurations or revisions, and could trap or undef.
Is it always safe to access these registers (in current revisions of
Cortex-A57)?

> + * non-fatal errors. Whereas the single bit and double bit ECC erros are 
> + * handled by firmware.

I had expected this would be all be left for firmware, given that this
feature may change in any revision of the CPU.

What precisely does the AMD Seattle firmware do for double-bit ECC
errors, and how is it triggered?

> + * This program is free software; you can redistribute  it and/or modify it
> + * under  the terms of  the GNU General  Public License as published by the
> + * Free Software Foundation;  either version 2 of the  License, or (at your
> + * option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/of_device.h>
> +#include <linux/platform_device.h>
> +
> +#include "edac_core.h"
> +
> +#define EDAC_MOD_STR             "seattle_edac"
> +
> +#define CPUMERRSR_EL1_INDEX(x)   ((x) & 0x1ffff)
> +#define CPUMERRSR_EL1_BANK(x)    (((x) >> 18) & 0x1f)
> +#define CPUMERRSR_EL1_RAMID(x)   (((x) >> 24) & 0x7f)
> +#define CPUMERRSR_EL1_VALID(x)   ((x) & (1 << 31))
> +#define CPUMERRSR_EL1_REPEAT(x)  (((x) >> 32) & 0x7f)
> +#define CPUMERRSR_EL1_OTHER(x)   (((x) >> 40) & 0xff)
> +#define CPUMERRSR_EL1_FATAL(x)   ((x) & (1UL << 63))
> +
> +#define L2MERRSR_EL1_INDEX(x)    ((x) & 0x1ffff)
> +#define L2MERRSR_EL1_CPUID(x)    (((x) >> 18) & 0xf)
> +#define L2MERRSR_EL1_RAMID(x)    (((x) >> 24) & 0x7f)
> +#define L2MERRSR_EL1_VALID(x)    ((x) & (1 << 31))
> +#define L2MERRSR_EL1_REPEAT(x)   (((x) >> 32) & 0xff)
> +#define L2MERRSR_EL1_OTHER(x)    (((x) >> 40) & 0xff)
> +#define L2MERRSR_EL1_FATAL(x)    ((x) & (1UL << 63))
> +
> +struct seattle_edac {
> +	struct edac_device_ctl_info *edac_ctl;
> +};
> +
> +static inline u64 read_cpumerrsr_el1(void)
> +{
> +	u64 val;
> +
> +	asm volatile("mrs %0, s3_1_c15_c2_2" : "=r" (val));
> +	return val;
> +}
> +
> +static inline void write_cpumerrsr_el1(u64 val)
> +{
> +	asm volatile("msr s3_1_c15_c2_2, %0" :: "r" (val));
> +}
> +
> +static inline u64 read_l2merrsr_el1(void)
> +{
> +	u64 val;
> +
> +	asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (val));
> +	return val;
> +}
> +
> +static inline void write_l2merrsr_el1(u64 val)
> +{
> +	asm volatile("msr s3_1_c15_c2_3, %0" :: "r" (val));
> +}
> +
> +static void check_l2merrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
> +{
> +	int fatal;
> +	int cpuid;
> +	u64 val = read_l2merrsr_el1();
> +
> +	if (!L2MERRSR_EL1_VALID(val))
> +		return;
> +
> +	fatal = L2MERRSR_EL1_FATAL(val);
> +	cpuid = L2MERRSR_EL1_CPUID(val);

Per the spec this appears to be the physical index of the CPU within a
cluster. So the logged messages aren't all that useful in a
multi-cluster system.

Additionally this isn't just the CPUID, which is why all the prints
below look weird. Please split the field here rather than propagating it
with a misleading name.

> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +		    "CPU%d detected %s error on L2 (L2MERRSR=%#llx)!\n",
> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
> +
> +	switch (L2MERRSR_EL1_RAMID(val)) {
> +	case 0x10:

Define some macro mnemonics for these.

> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Tag RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x11:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Data RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x12:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Snoop tag RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x14:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Dirty RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x18:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 inclusion RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	default:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "unknown RAMID cpuid %d\n", cpuid);
> +		break;
> +	}
> +
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
> +		    (int)L2MERRSR_EL1_REPEAT(val));
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
> +		    (int)L2MERRSR_EL1_OTHER(val));
> +	if (fatal)
> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	else
> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	write_l2merrsr_el1(0);
> +}
> +
> +static void check_cpumerrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
> +{
> +	int fatal;
> +	int bank;
> +	u64 val = read_cpumerrsr_el1();
> +
> +	if (!CPUMERRSR_EL1_VALID(val))
> +		return;
> +
> +	bank = CPUMERRSR_EL1_BANK(val);
> +	fatal = CPUMERRSR_EL1_FATAL(val);
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +		    "CPU%d detected %s error on L1 (CPUMERRSR=%#llx)!\n",
> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
> +
> +	switch (CPUMERRSR_EL1_RAMID(val)) {
> +	case 0x0:

Mnemonics please.

> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-I Tag RAM bank %d\n", bank);
> +		break;
> +	case 0x1:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-I Data RAM bank %d\n", bank);
> +		break;
> +	case 0x8:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-D Tag RAM bank %d\n", bank);
> +		break;
> +	case 0x9:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-D Data RAM bank %d\n", bank);
> +		break;
> +	case 0x18:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 TLB RAM bank %d\n", bank);
> +		break;
> +	default:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "unknown ramid %d bank %d\n",
> +			    (int)CPUMERRSR_EL1_RAMID(val), bank);
> +		break;
> +	}
> +
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
> +		    (int)CPUMERRSR_EL1_REPEAT(val));
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
> +		    (int)CPUMERRSR_EL1_OTHER(val));
> +	if (fatal)
> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	else
> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	write_cpumerrsr_el1(0);
> +}
> +
> +static void cpu_check_errors(void *args)
> +{
> +	struct edac_device_ctl_info *edev_ctl = args;
> +
> +	check_cpumerrsr_el1_error(edev_ctl);
> +	check_l2merrsr_el1_error(edev_ctl);
> +}
> +
> +static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
> +{
> +	int cpu;
> +
> +	/* read L1 and L2 memory error syndrome register on possible CPU's */

Nit: get rid of the apostrophe

> +	for_each_possible_cpu(cpu)
> +		smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0);

This doesn't look right. Why are we cross-calling to other CPUs?

As far as I can see, this is a per-cluster thing, so surely we should
have a device per-cluster?

> +}
> +
> +static int seattle_edac_probe(struct platform_device *pdev)
> +{
> +	int rc;
> +	u32 poll_msec;
> +	struct seattle_edac *drv;
> +	struct device *dev = &pdev->dev;
> +
> +	rc = of_property_read_u32(pdev->dev.of_node, "poll-delay-msec",
> +				  &poll_msec);

As stated above, this property does not belong in the DT.

I see that this is a module; this can easily be a module parameter.

Thanks,
Mark.

> +	if (rc < 0) {
> +		edac_printk(KERN_ERR, EDAC_MOD_STR,
> +			    "failed to get poll interval\n");
> +		return rc;
> +	}
> +
> +	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
> +	if (!drv)
> +		return -ENOMEM;
> +
> +	drv->edac_ctl = edac_device_alloc_ctl_info(0, "cpu",
> +						   num_possible_cpus(), "L", 2,
> +						   1, NULL, 0,
> +						   edac_device_alloc_index());
> +
> +	drv->edac_ctl->poll_msec = poll_msec;
> +	drv->edac_ctl->edac_check = edac_check_errors;
> +	drv->edac_ctl->dev = dev;
> +	drv->edac_ctl->mod_name = dev_name(dev);
> +	drv->edac_ctl->dev_name = dev_name(dev);
> +	drv->edac_ctl->ctl_name = "cpu_err";
> +	drv->edac_ctl->panic_on_ue = 1;
> +	platform_set_drvdata(pdev, drv);
> +
> +	rc = edac_device_add_device(drv->edac_ctl);
> +	if (rc)
> +		goto edac_alloc_failed;
> +
> +	return 0;
> +
> +edac_alloc_failed:
> +	edac_device_free_ctl_info(drv->edac_ctl);
> +	return rc;
> +}
> +
> +static int seattle_edac_remove(struct platform_device *pdev)
> +{
> +	struct seattle_edac *drv = dev_get_drvdata(&pdev->dev);
> +	struct edac_device_ctl_info *edac_ctl = drv->edac_ctl;
> +
> +	edac_device_del_device(edac_ctl->dev);
> +	edac_device_free_ctl_info(edac_ctl);
> +
> +	return 0;
> +}
> +
> +static const struct of_device_id seattle_edac_of_match[] = {
> +	{ .compatible = "amd,arm-seattle-edac" },
> +	{},
> +};
> +MODULE_DEVICE_TABLE(of, seattle_edac_of_match);
> +
> +static struct platform_driver seattle_edac_driver = {
> +	.probe = seattle_edac_probe,
> +	.remove = seattle_edac_remove,
> +	.driver = {
> +		.name = "seattle-edac",
> +		.of_match_table = seattle_edac_of_match,
> +	},
> +};
> +
> +static int __init seattle_edac_init(void)
> +{
> +	int rc;
> +
> +	/* we support poll method */
> +	edac_op_state = EDAC_OPSTATE_POLL;
> +
> +	rc = platform_driver_register(&seattle_edac_driver);
> +	if (rc) {
> +		edac_printk(KERN_ERR, EDAC_MOD_STR,
> +			    "EDAC fails to register\n");
> +		return rc;
> +	}
> +
> +	return 0;
> +}
> +module_init(seattle_edac_init);
> +
> +static void __exit seattle_edac_exit(void)
> +{
> +	platform_driver_unregister(&seattle_edac_driver);
> +}
> +module_exit(seattle_edac_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Brijesh Singh <brijeshkumar.singh@amd.com>");
> +MODULE_DESCRIPTION("AMD Seattle EDAC driver");
> -- 
> 1.9.1
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-19 19:23 [PATCH] EDAC: Add AMD Seattle SoC EDAC Brijesh Singh
  2015-10-19 20:14 ` Borislav Petkov
  2015-10-19 20:52 ` Mark Rutland
@ 2015-10-20  2:21 ` Hanjun Guo
  2015-10-20 21:26   ` Brijesh Singh
  2 siblings, 1 reply; 19+ messages in thread
From: Hanjun Guo @ 2015-10-20  2:21 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Brijesh,

On 2015/10/20 3:23, Brijesh Singh wrote:
> Add support for the AMD Seattle SoC EDAC driver.
>
> Signed-off-by: Brijesh Singh <brijeshkumar.singh@amd.com>
> ---
>  .../devicetree/bindings/edac/amd-seattle-edac.txt  |  15 +
>  drivers/edac/Kconfig                               |   6 +
>  drivers/edac/Makefile                              |   1 +
>  drivers/edac/seattle_edac.c                        | 306 +++++++++++++++++++++
>  4 files changed, 328 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>  create mode 100644 drivers/edac/seattle_edac.c
>
>
[...]
> +config EDAC_SEATTLE
> +	tristate "AMD Seattle EDAC"
> +	depends on EDAC_MM_EDAC && ARCH_SEATTLE
> +	help
> +	  Support for error detection and correction on the
> +	  AMD Seattle SOC.
>  endif # EDAC
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index ae3c5f3..9e4f3ef 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI)		+= octeon_edac-pci.o
>  obj-$(CONFIG_EDAC_ALTERA_MC)		+= altera_edac.o
>  obj-$(CONFIG_EDAC_SYNOPSYS)		+= synopsys_edac.o
>  obj-$(CONFIG_EDAC_XGENE)		+= xgene_edac.o
> +obj-$(CONFIG_EDAC_SEATTLE)		+= seattle_edac.o
> diff --git a/drivers/edac/seattle_edac.c b/drivers/edac/seattle_edac.c
> new file mode 100644
> index 0000000..78101aa
> --- /dev/null
> +++ b/drivers/edac/seattle_edac.c
> @@ -0,0 +1,306 @@
> +/*
> + * AMD Seattle EDAC
> + *
> + * Copyright (c) 2015, Advanced Micro Devices
> + * Author: Brijesh Singh <brijeshkumar.singh@amd.com>
> + *
> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the 
> + * non-fatal errors. Whereas the single bit and double bit ECC erros are 
> + * handled by firmware.
> + *
> + * This program is free software; you can redistribute  it and/or modify it
> + * under  the terms of  the GNU General  Public License as published by the
> + * Free Software Foundation;  either version 2 of the  License, or (at your
> + * option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/of_device.h>
> +#include <linux/platform_device.h>
> +
> +#include "edac_core.h"
> +
> +#define EDAC_MOD_STR             "seattle_edac"
> +
> +#define CPUMERRSR_EL1_INDEX(x)   ((x) & 0x1ffff)
> +#define CPUMERRSR_EL1_BANK(x)    (((x) >> 18) & 0x1f)
> +#define CPUMERRSR_EL1_RAMID(x)   (((x) >> 24) & 0x7f)
> +#define CPUMERRSR_EL1_VALID(x)   ((x) & (1 << 31))
> +#define CPUMERRSR_EL1_REPEAT(x)  (((x) >> 32) & 0x7f)
> +#define CPUMERRSR_EL1_OTHER(x)   (((x) >> 40) & 0xff)
> +#define CPUMERRSR_EL1_FATAL(x)   ((x) & (1UL << 63))
> +
> +#define L2MERRSR_EL1_INDEX(x)    ((x) & 0x1ffff)
> +#define L2MERRSR_EL1_CPUID(x)    (((x) >> 18) & 0xf)
> +#define L2MERRSR_EL1_RAMID(x)    (((x) >> 24) & 0x7f)
> +#define L2MERRSR_EL1_VALID(x)    ((x) & (1 << 31))
> +#define L2MERRSR_EL1_REPEAT(x)   (((x) >> 32) & 0xff)
> +#define L2MERRSR_EL1_OTHER(x)    (((x) >> 40) & 0xff)
> +#define L2MERRSR_EL1_FATAL(x)    ((x) & (1UL << 63))
> +
> +struct seattle_edac {
> +	struct edac_device_ctl_info *edac_ctl;
> +};
> +
> +static inline u64 read_cpumerrsr_el1(void)
> +{
> +	u64 val;
> +
> +	asm volatile("mrs %0, s3_1_c15_c2_2" : "=r" (val));
> +	return val;
> +}
> +
> +static inline void write_cpumerrsr_el1(u64 val)
> +{
> +	asm volatile("msr s3_1_c15_c2_2, %0" :: "r" (val));
> +}
> +
> +static inline u64 read_l2merrsr_el1(void)
> +{
> +	u64 val;
> +
> +	asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (val));
> +	return val;
> +}
> +
> +static inline void write_l2merrsr_el1(u64 val)
> +{
> +	asm volatile("msr s3_1_c15_c2_3, %0" :: "r" (val));
> +}
> +
> +static void check_l2merrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
> +{
> +	int fatal;
> +	int cpuid;
> +	u64 val = read_l2merrsr_el1();
> +
> +	if (!L2MERRSR_EL1_VALID(val))
> +		return;
> +
> +	fatal = L2MERRSR_EL1_FATAL(val);
> +	cpuid = L2MERRSR_EL1_CPUID(val);
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +		    "CPU%d detected %s error on L2 (L2MERRSR=%#llx)!\n",
> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
> +
> +	switch (L2MERRSR_EL1_RAMID(val)) {
> +	case 0x10:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Tag RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x11:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Data RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x12:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Snoop tag RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x14:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 Dirty RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	case 0x18:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 inclusion RAM cpu %d way %d\n",
> +			    cpuid / 2, cpuid % 2);
> +		break;
> +	default:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "unknown RAMID cpuid %d\n", cpuid);
> +		break;
> +	}
> +
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
> +		    (int)L2MERRSR_EL1_REPEAT(val));
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
> +		    (int)L2MERRSR_EL1_OTHER(val));
> +	if (fatal)
> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	else
> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	write_l2merrsr_el1(0);
> +}
> +
> +static void check_cpumerrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
> +{
> +	int fatal;
> +	int bank;
> +	u64 val = read_cpumerrsr_el1();
> +
> +	if (!CPUMERRSR_EL1_VALID(val))
> +		return;
> +
> +	bank = CPUMERRSR_EL1_BANK(val);
> +	fatal = CPUMERRSR_EL1_FATAL(val);
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +		    "CPU%d detected %s error on L1 (CPUMERRSR=%#llx)!\n",
> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
> +
> +	switch (CPUMERRSR_EL1_RAMID(val)) {
> +	case 0x0:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-I Tag RAM bank %d\n", bank);
> +		break;
> +	case 0x1:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-I Data RAM bank %d\n", bank);
> +		break;
> +	case 0x8:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-D Tag RAM bank %d\n", bank);
> +		break;
> +	case 0x9:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L1-D Data RAM bank %d\n", bank);
> +		break;
> +	case 0x18:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "L2 TLB RAM bank %d\n", bank);
> +		break;
> +	default:
> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
> +			    "unknown ramid %d bank %d\n",
> +			    (int)CPUMERRSR_EL1_RAMID(val), bank);
> +		break;
> +	}
> +
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
> +		    (int)CPUMERRSR_EL1_REPEAT(val));
> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
> +		    (int)CPUMERRSR_EL1_OTHER(val));
> +	if (fatal)
> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	else
> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
> +				      edac_ctl->name);
> +	write_cpumerrsr_el1(0);
> +}

The codes above are common for all A57 architectures, other A57 SoCs will use the same
code for L1/L2 caches error report, can we put those codes in common place and reused
for all A57 architectures?

> +
> +static void cpu_check_errors(void *args)
> +{
> +	struct edac_device_ctl_info *edev_ctl = args;
> +
> +	check_cpumerrsr_el1_error(edev_ctl);
> +	check_l2merrsr_el1_error(edev_ctl);
> +}
> +
> +static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
> +{
> +	int cpu;
> +
> +	/* read L1 and L2 memory error syndrome register on possible CPU's */
> +	for_each_possible_cpu(cpu)
> +		smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0);

Seems that error syndrome registers for L2 cache are cluster lever (each cluster share the
L2 cache, you can refer to ARM doc: DDI0488D, Cortex-A57 Technical Reference Manual),
so for L2 cache, we need to check the error at cluster lever not the cpu core lever.

Thanks
Hanjun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-19 20:52 ` Mark Rutland
@ 2015-10-20 16:44   ` Brijesh Singh
  2015-10-20 16:57     ` Borislav Petkov
  2015-10-20 17:25     ` Mark Rutland
  0 siblings, 2 replies; 19+ messages in thread
From: Brijesh Singh @ 2015-10-20 16:44 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Mark,

Thanks for review. 

-Brijesh

On 10/19/2015 03:52 PM, Mark Rutland wrote:
> Hi,
> 
> Please Cc the devicetree list (devicetree at vger.kernel.org) when sending
> binding patches. I see you've added the people from the MAINTAINERS
> entry; the list should also be Cc'd.
> 
Noted.
> On Mon, Oct 19, 2015 at 02:23:17PM -0500, Brijesh Singh wrote:
>> Add support for the AMD Seattle SoC EDAC driver.
>>
>> Signed-off-by: Brijesh Singh <brijeshkumar.singh@amd.com>
>> ---
>>  .../devicetree/bindings/edac/amd-seattle-edac.txt  |  15 +
>>  drivers/edac/Kconfig                               |   6 +
>>  drivers/edac/Makefile                              |   1 +
>>  drivers/edac/seattle_edac.c                        | 306 +++++++++++++++++++++
>>  4 files changed, 328 insertions(+)
>>  create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>>  create mode 100644 drivers/edac/seattle_edac.c
>>
>> diff --git a/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>> new file mode 100644
>> index 0000000..a0354e8
>> --- /dev/null
>> +++ b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>> @@ -0,0 +1,15 @@
>> +* AMD Seattle SoC EDAC node
>> +
>> +EDAC node is defined to describe on-chip error detection and reporting.
>> +
>> +Required properties:
>> +- compatible: Should be "amd,arm-seattle-edac"
>> +- poll-delay-msec: Indicates how often the edac check callback should be called.
>> +  Time in msec.
> 
> This second property doesn't describe the hardware in any way. It should
> be runtime-configurable and dpesn't belong in the DT.
> 
> Regardless, the binding is wrong. This is in no way specific to AMD
> Seattle, and per the code is actually used to imply the presence of a
> Cortex-A57 feature. No reference to AMD Seattle belongs in the DT
> binding (with the exception of the example, perhaps), nor in the driver.
> 
> NAK while this pretends to be something that it isn't. At minimum, you
> need to correctly describe the feature you are trying to add support
> for.
> 
I will remove AMD specific string in compatibility field and make the poll-delay-msec optional. Will also expose this as module parameter as you suggested below.

>> +
>> +Example:
>> +	edac {
>> +		compatible = "amd,arm-seattle-edac";
>> +		poll-delay-msec = <100>;
>> +	};
>> +
>> diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
>> index ef25000..d342335 100644
>> --- a/drivers/edac/Kconfig
>> +++ b/drivers/edac/Kconfig
>> @@ -390,4 +390,10 @@ config EDAC_XGENE
>>  	  Support for error detection and correction on the
>>  	  APM X-Gene family of SOCs.
>>  
>> +config EDAC_SEATTLE
>> +	tristate "AMD Seattle EDAC"
>> +	depends on EDAC_MM_EDAC && ARCH_SEATTLE
>> +	help
>> +	  Support for error detection and correction on the
>> +	  AMD Seattle SOC.
>>  endif # EDAC
>> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
>> index ae3c5f3..9e4f3ef 100644
>> --- a/drivers/edac/Makefile
>> +++ b/drivers/edac/Makefile
>> @@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI)		+= octeon_edac-pci.o
>>  obj-$(CONFIG_EDAC_ALTERA_MC)		+= altera_edac.o
>>  obj-$(CONFIG_EDAC_SYNOPSYS)		+= synopsys_edac.o
>>  obj-$(CONFIG_EDAC_XGENE)		+= xgene_edac.o
>> +obj-$(CONFIG_EDAC_SEATTLE)		+= seattle_edac.o
>> diff --git a/drivers/edac/seattle_edac.c b/drivers/edac/seattle_edac.c
>> new file mode 100644
>> index 0000000..78101aa
>> --- /dev/null
>> +++ b/drivers/edac/seattle_edac.c
>> @@ -0,0 +1,306 @@
>> +/*
>> + * AMD Seattle EDAC
>> + *
>> + * Copyright (c) 2015, Advanced Micro Devices
>> + * Author: Brijesh Singh <brijeshkumar.singh@amd.com>
>> + *
>> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the 
> 
> These are IMPLEMENTATION DEFINED registers which are specific to
> Cortex-A57, and I note that L2MERRSR_EL1 changed in r1p0.
> 
> Which revisions of Cortex-A57 does this work with?
> 
I have tested my code on r1p2.

> Generally we avoid touching IMPLEMENTATION DEFINED registers as they may
> not exist in some configurations or revisions, and could trap or undef.
> Is it always safe to access these registers (in current revisions of
> Cortex-A57)?
> 
So far I have not ran into any trap error, was able to read/write registers from EL1 all the times. I looked at TRM but could not find reference that it would fail. As per doc the register should be available at all EL's except EL0.

>> + * non-fatal errors. Whereas the single bit and double bit ECC erros are 
>> + * handled by firmware.
> 
> I had expected this would be all be left for firmware, given that this
> feature may change in any revision of the CPU.
> 
> What precisely does the AMD Seattle firmware do for double-bit ECC
> errors, and how is it triggered?
> 
The error handling firmware is work in progress. I believe the approach is: Configure the platform to trigger a firmware handler when the error occurs, trusted firmware will receive the fatal error interrupt and take the action and will generate APEI objects; if error requires a SoC warm reset then it will communicate with SCP to warm reset the SoC. The SCP firmware will then need to provide the ACPI BERT error logging information back when the A57 restarts. 
 
>> + * This program is free software; you can redistribute  it and/or modify it
>> + * under  the terms of  the GNU General  Public License as published by the
>> + * Free Software Foundation;  either version 2 of the  License, or (at your
>> + * option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/of_device.h>
>> +#include <linux/platform_device.h>
>> +
>> +#include "edac_core.h"
>> +
>> +#define EDAC_MOD_STR             "seattle_edac"
>> +
>> +#define CPUMERRSR_EL1_INDEX(x)   ((x) & 0x1ffff)
>> +#define CPUMERRSR_EL1_BANK(x)    (((x) >> 18) & 0x1f)
>> +#define CPUMERRSR_EL1_RAMID(x)   (((x) >> 24) & 0x7f)
>> +#define CPUMERRSR_EL1_VALID(x)   ((x) & (1 << 31))
>> +#define CPUMERRSR_EL1_REPEAT(x)  (((x) >> 32) & 0x7f)
>> +#define CPUMERRSR_EL1_OTHER(x)   (((x) >> 40) & 0xff)
>> +#define CPUMERRSR_EL1_FATAL(x)   ((x) & (1UL << 63))
>> +
>> +#define L2MERRSR_EL1_INDEX(x)    ((x) & 0x1ffff)
>> +#define L2MERRSR_EL1_CPUID(x)    (((x) >> 18) & 0xf)
>> +#define L2MERRSR_EL1_RAMID(x)    (((x) >> 24) & 0x7f)
>> +#define L2MERRSR_EL1_VALID(x)    ((x) & (1 << 31))
>> +#define L2MERRSR_EL1_REPEAT(x)   (((x) >> 32) & 0xff)
>> +#define L2MERRSR_EL1_OTHER(x)    (((x) >> 40) & 0xff)
>> +#define L2MERRSR_EL1_FATAL(x)    ((x) & (1UL << 63))
>> +
>> +struct seattle_edac {
>> +	struct edac_device_ctl_info *edac_ctl;
>> +};
>> +
>> +static inline u64 read_cpumerrsr_el1(void)
>> +{
>> +	u64 val;
>> +
>> +	asm volatile("mrs %0, s3_1_c15_c2_2" : "=r" (val));
>> +	return val;
>> +}
>> +
>> +static inline void write_cpumerrsr_el1(u64 val)
>> +{
>> +	asm volatile("msr s3_1_c15_c2_2, %0" :: "r" (val));
>> +}
>> +
>> +static inline u64 read_l2merrsr_el1(void)
>> +{
>> +	u64 val;
>> +
>> +	asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (val));
>> +	return val;
>> +}
>> +
>> +static inline void write_l2merrsr_el1(u64 val)
>> +{
>> +	asm volatile("msr s3_1_c15_c2_3, %0" :: "r" (val));
>> +}
>> +
>> +static void check_l2merrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
>> +{
>> +	int fatal;
>> +	int cpuid;
>> +	u64 val = read_l2merrsr_el1();
>> +
>> +	if (!L2MERRSR_EL1_VALID(val))
>> +		return;
>> +
>> +	fatal = L2MERRSR_EL1_FATAL(val);
>> +	cpuid = L2MERRSR_EL1_CPUID(val);
> 
> Per the spec this appears to be the physical index of the CPU within a
> cluster. So the logged messages aren't all that useful in a
> multi-cluster system.
> 
> Additionally this isn't just the CPUID, which is why all the prints
> below look weird. Please split the field here rather than propagating it
> with a misleading name.
> 
Noted.
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +		    "CPU%d detected %s error on L2 (L2MERRSR=%#llx)!\n",
>> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
>> +
>> +	switch (L2MERRSR_EL1_RAMID(val)) {
>> +	case 0x10:
> 
> Define some macro mnemonics for these.
> 
Noted.
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 Tag RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
>> +		break;
>> +	case 0x11:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 Data RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
>> +		break;
>> +	case 0x12:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 Snoop tag RAM cpu %d way %d\n",
>> +			    cpuid / 2, cpuid % 2);
>> +		break;
>> +	case 0x14:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 Dirty RAM cpu %d way %d\n",
>> +			    cpuid / 2, cpuid % 2);
>> +		break;
>> +	case 0x18:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 inclusion RAM cpu %d way %d\n",
>> +			    cpuid / 2, cpuid % 2);
>> +		break;
>> +	default:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "unknown RAMID cpuid %d\n", cpuid);
>> +		break;
>> +	}
>> +
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
>> +		    (int)L2MERRSR_EL1_REPEAT(val));
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
>> +		    (int)L2MERRSR_EL1_OTHER(val));
>> +	if (fatal)
>> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
>> +				      edac_ctl->name);
>> +	else
>> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
>> +				      edac_ctl->name);
>> +	write_l2merrsr_el1(0);
>> +}
>> +
>> +static void check_cpumerrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
>> +{
>> +	int fatal;
>> +	int bank;
>> +	u64 val = read_cpumerrsr_el1();
>> +
>> +	if (!CPUMERRSR_EL1_VALID(val))
>> +		return;
>> +
>> +	bank = CPUMERRSR_EL1_BANK(val);
>> +	fatal = CPUMERRSR_EL1_FATAL(val);
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +		    "CPU%d detected %s error on L1 (CPUMERRSR=%#llx)!\n",
>> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
>> +
>> +	switch (CPUMERRSR_EL1_RAMID(val)) {
>> +	case 0x0:
> 
> Mnemonics please.
> 
Noted.
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L1-I Tag RAM bank %d\n", bank);
>> +		break;
>> +	case 0x1:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L1-I Data RAM bank %d\n", bank);
>> +		break;
>> +	case 0x8:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L1-D Tag RAM bank %d\n", bank);
>> +		break;
>> +	case 0x9:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L1-D Data RAM bank %d\n", bank);
>> +		break;
>> +	case 0x18:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 TLB RAM bank %d\n", bank);
>> +		break;
>> +	default:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "unknown ramid %d bank %d\n",
>> +			    (int)CPUMERRSR_EL1_RAMID(val), bank);
>> +		break;
>> +	}
>> +
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
>> +		    (int)CPUMERRSR_EL1_REPEAT(val));
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
>> +		    (int)CPUMERRSR_EL1_OTHER(val));
>> +	if (fatal)
>> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
>> +				      edac_ctl->name);
>> +	else
>> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
>> +				      edac_ctl->name);
>> +	write_cpumerrsr_el1(0);
>> +}
>> +
>> +static void cpu_check_errors(void *args)
>> +{
>> +	struct edac_device_ctl_info *edev_ctl = args;
>> +
>> +	check_cpumerrsr_el1_error(edev_ctl);
>> +	check_l2merrsr_el1_error(edev_ctl);
>> +}
>> +
>> +static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
>> +{
>> +	int cpu;
>> +
>> +	/* read L1 and L2 memory error syndrome register on possible CPU's */
> 
> Nit: get rid of the apostrophe
> 
Noted.
>> +	for_each_possible_cpu(cpu)
>> +		smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0);
> 
> This doesn't look right. Why are we cross-calling to other CPUs?
> 
> As far as I can see, this is a per-cluster thing, so surely we should
> have a device per-cluster?
> 
>> +}
>> +
>> +static int seattle_edac_probe(struct platform_device *pdev)
>> +{
>> +	int rc;
>> +	u32 poll_msec;
>> +	struct seattle_edac *drv;
>> +	struct device *dev = &pdev->dev;
>> +
>> +	rc = of_property_read_u32(pdev->dev.of_node, "poll-delay-msec",
>> +				  &poll_msec);
> 
> As stated above, this property does not belong in the DT.
> 
> I see that this is a module; this can easily be a module parameter.
> 
noted.

> Thanks,
> Mark.
> 
>> +	if (rc < 0) {
>> +		edac_printk(KERN_ERR, EDAC_MOD_STR,
>> +			    "failed to get poll interval\n");
>> +		return rc;
>> +	}
>> +
>> +	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
>> +	if (!drv)
>> +		return -ENOMEM;
>> +
>> +	drv->edac_ctl = edac_device_alloc_ctl_info(0, "cpu",
>> +						   num_possible_cpus(), "L", 2,
>> +						   1, NULL, 0,
>> +						   edac_device_alloc_index());
>> +
>> +	drv->edac_ctl->poll_msec = poll_msec;
>> +	drv->edac_ctl->edac_check = edac_check_errors;
>> +	drv->edac_ctl->dev = dev;
>> +	drv->edac_ctl->mod_name = dev_name(dev);
>> +	drv->edac_ctl->dev_name = dev_name(dev);
>> +	drv->edac_ctl->ctl_name = "cpu_err";
>> +	drv->edac_ctl->panic_on_ue = 1;
>> +	platform_set_drvdata(pdev, drv);
>> +
>> +	rc = edac_device_add_device(drv->edac_ctl);
>> +	if (rc)
>> +		goto edac_alloc_failed;
>> +
>> +	return 0;
>> +
>> +edac_alloc_failed:
>> +	edac_device_free_ctl_info(drv->edac_ctl);
>> +	return rc;
>> +}
>> +
>> +static int seattle_edac_remove(struct platform_device *pdev)
>> +{
>> +	struct seattle_edac *drv = dev_get_drvdata(&pdev->dev);
>> +	struct edac_device_ctl_info *edac_ctl = drv->edac_ctl;
>> +
>> +	edac_device_del_device(edac_ctl->dev);
>> +	edac_device_free_ctl_info(edac_ctl);
>> +
>> +	return 0;
>> +}
>> +
>> +static const struct of_device_id seattle_edac_of_match[] = {
>> +	{ .compatible = "amd,arm-seattle-edac" },
>> +	{},
>> +};
>> +MODULE_DEVICE_TABLE(of, seattle_edac_of_match);
>> +
>> +static struct platform_driver seattle_edac_driver = {
>> +	.probe = seattle_edac_probe,
>> +	.remove = seattle_edac_remove,
>> +	.driver = {
>> +		.name = "seattle-edac",
>> +		.of_match_table = seattle_edac_of_match,
>> +	},
>> +};
>> +
>> +static int __init seattle_edac_init(void)
>> +{
>> +	int rc;
>> +
>> +	/* we support poll method */
>> +	edac_op_state = EDAC_OPSTATE_POLL;
>> +
>> +	rc = platform_driver_register(&seattle_edac_driver);
>> +	if (rc) {
>> +		edac_printk(KERN_ERR, EDAC_MOD_STR,
>> +			    "EDAC fails to register\n");
>> +		return rc;
>> +	}
>> +
>> +	return 0;
>> +}
>> +module_init(seattle_edac_init);
>> +
>> +static void __exit seattle_edac_exit(void)
>> +{
>> +	platform_driver_unregister(&seattle_edac_driver);
>> +}
>> +module_exit(seattle_edac_exit);
>> +
>> +MODULE_LICENSE("GPL");
>> +MODULE_AUTHOR("Brijesh Singh <brijeshkumar.singh@amd.com>");
>> +MODULE_DESCRIPTION("AMD Seattle EDAC driver");
>> -- 
>> 1.9.1
>>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 16:44   ` Brijesh Singh
@ 2015-10-20 16:57     ` Borislav Petkov
  2015-10-20 17:26       ` Mark Rutland
  2015-10-20 17:25     ` Mark Rutland
  1 sibling, 1 reply; 19+ messages in thread
From: Borislav Petkov @ 2015-10-20 16:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 20, 2015 at 11:44:46AM -0500, Brijesh Singh wrote:
> > This second property doesn't describe the hardware in any way. It should
> > be runtime-configurable and dpesn't belong in the DT.
> > 
> > Regardless, the binding is wrong. This is in no way specific to AMD
> > Seattle, and per the code is actually used to imply the presence of a
> > Cortex-A57 feature. No reference to AMD Seattle belongs in the DT
> > binding (with the exception of the example, perhaps), nor in the driver.
> > 
> > NAK while this pretends to be something that it isn't. At minimum, you
> > need to correctly describe the feature you are trying to add support
> > for.
> > 
> I will remove AMD specific string in compatibility field and make the poll-delay-msec optional. Will also expose this as module parameter as you suggested below.

Btw, how much of this is implementing generic A57 functionality?

If a lot, can we make this a generic a57_edac driver so that multiple
vendors can use it?

How fast and how ugly can something like that become?

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 16:44   ` Brijesh Singh
  2015-10-20 16:57     ` Borislav Petkov
@ 2015-10-20 17:25     ` Mark Rutland
  2015-10-21  1:45       ` Hanjun Guo
  1 sibling, 1 reply; 19+ messages in thread
From: Mark Rutland @ 2015-10-20 17:25 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 20, 2015 at 11:44:46AM -0500, Brijesh Singh wrote:
> Hi Mark,
> 
> Thanks for review. 
> 
> -Brijesh
> 
> On 10/19/2015 03:52 PM, Mark Rutland wrote:
> > Hi,
> > 
> > Please Cc the devicetree list (devicetree at vger.kernel.org) when sending
> > binding patches. I see you've added the people from the MAINTAINERS
> > entry; the list should also be Cc'd.
> > 
> Noted.
> > On Mon, Oct 19, 2015 at 02:23:17PM -0500, Brijesh Singh wrote:
> >> Add support for the AMD Seattle SoC EDAC driver.
> >>
> >> Signed-off-by: Brijesh Singh <brijeshkumar.singh@amd.com>
> >> ---
> >>  .../devicetree/bindings/edac/amd-seattle-edac.txt  |  15 +
> >>  drivers/edac/Kconfig                               |   6 +
> >>  drivers/edac/Makefile                              |   1 +
> >>  drivers/edac/seattle_edac.c                        | 306 +++++++++++++++++++++
> >>  4 files changed, 328 insertions(+)
> >>  create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
> >>  create mode 100644 drivers/edac/seattle_edac.c
> >>
> >> diff --git a/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
> >> new file mode 100644
> >> index 0000000..a0354e8
> >> --- /dev/null
> >> +++ b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
> >> @@ -0,0 +1,15 @@
> >> +* AMD Seattle SoC EDAC node
> >> +
> >> +EDAC node is defined to describe on-chip error detection and reporting.
> >> +
> >> +Required properties:
> >> +- compatible: Should be "amd,arm-seattle-edac"
> >> +- poll-delay-msec: Indicates how often the edac check callback should be called.
> >> +  Time in msec.
> > 
> > This second property doesn't describe the hardware in any way. It should
> > be runtime-configurable and dpesn't belong in the DT.
> > 
> > Regardless, the binding is wrong. This is in no way specific to AMD
> > Seattle, and per the code is actually used to imply the presence of a
> > Cortex-A57 feature. No reference to AMD Seattle belongs in the DT
> > binding (with the exception of the example, perhaps), nor in the driver.
> > 
> > NAK while this pretends to be something that it isn't. At minimum, you
> > need to correctly describe the feature you are trying to add support
> > for.
> > 
> I will remove AMD specific string in compatibility field and make the
> poll-delay-msec optional. Will also expose this as module parameter as
> you suggested below.

I don't think it should be optional; I don't think it should be there at
all.

I'm not sure if we even need a DT binding if we can derive the presence
of the feature from the MIDR.

> >> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the 
> > 
> > These are IMPLEMENTATION DEFINED registers which are specific to
> > Cortex-A57, and I note that L2MERRSR_EL1 changed in r1p0.
> > 
> > Which revisions of Cortex-A57 does this work with?
> > 
> I have tested my code on r1p2.
> 
> > Generally we avoid touching IMPLEMENTATION DEFINED registers as they may
> > not exist in some configurations or revisions, and could trap or undef.
> > Is it always safe to access these registers (in current revisions of
> > Cortex-A57)?
> > 
> So far I have not ran into any trap error, was able to read/write
> registers from EL1 all the times. I looked at TRM but could not find
> reference that it would fail. As per doc the register should be
> available at all EL's except EL0.

Ok.

> >> + * non-fatal errors. Whereas the single bit and double bit ECC erros are 
> >> + * handled by firmware.
> > 
> > I had expected this would be all be left for firmware, given that this
> > feature may change in any revision of the CPU.
> > 
> > What precisely does the AMD Seattle firmware do for double-bit ECC
> > errors, and how is it triggered?
> > 
> The error handling firmware is work in progress. I believe the
> approach is: Configure the platform to trigger a firmware handler when
> the error occurs, trusted firmware will receive the fatal error
> interrupt and take the action and will generate APEI objects; if error
> requires a SoC warm reset then it will communicate with SCP to warm
> reset the SoC. The SCP firmware will then need to provide the ACPI
> BERT error logging information back when the A57 restarts. 

Can signalling of single-bit errors not happen in the same way via APEI?
Or is APEI handled fatally?

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 16:57     ` Borislav Petkov
@ 2015-10-20 17:26       ` Mark Rutland
  2015-10-20 17:36         ` Borislav Petkov
  0 siblings, 1 reply; 19+ messages in thread
From: Mark Rutland @ 2015-10-20 17:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 20, 2015 at 06:57:44PM +0200, Borislav Petkov wrote:
> On Tue, Oct 20, 2015 at 11:44:46AM -0500, Brijesh Singh wrote:
> > > This second property doesn't describe the hardware in any way. It should
> > > be runtime-configurable and dpesn't belong in the DT.
> > > 
> > > Regardless, the binding is wrong. This is in no way specific to AMD
> > > Seattle, and per the code is actually used to imply the presence of a
> > > Cortex-A57 feature. No reference to AMD Seattle belongs in the DT
> > > binding (with the exception of the example, perhaps), nor in the driver.
> > > 
> > > NAK while this pretends to be something that it isn't. At minimum, you
> > > need to correctly describe the feature you are trying to add support
> > > for.
> > > 
> > I will remove AMD specific string in compatibility field and make
> > the poll-delay-msec optional. Will also expose this as module
> > parameter as you suggested below.
> 
> Btw, how much of this is implementing generic A57 functionality?

The driver is entirely A57 generic.

> If a lot, can we make this a generic a57_edac driver so that multiple
> vendors can use it?

Yes.

> How fast and how ugly can something like that become?

Not sure I follow.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 17:26       ` Mark Rutland
@ 2015-10-20 17:36         ` Borislav Petkov
  2015-10-20 17:41           ` Mark Rutland
  2015-10-21  1:55           ` Hanjun Guo
  0 siblings, 2 replies; 19+ messages in thread
From: Borislav Petkov @ 2015-10-20 17:36 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 20, 2015 at 06:26:55PM +0100, Mark Rutland wrote:
> > Btw, how much of this is implementing generic A57 functionality?
> 
> The driver is entirely A57 generic.
> 
> > If a lot, can we make this a generic a57_edac driver so that multiple
> > vendors can use it?
> 
> Yes.

Ok, cool.

> > How fast and how ugly can something like that become?
> 
> Not sure I follow.

In the sense that some vendor might require just a little bit different
handling or maybe wants to read some vendor-specific registers in
addition to the architectural ones.

Then we'll start adding vendor-specific hacks to that generic driver.
And therefore the question how fast and how ugly such hacks would
become.

I guess we'll worry about that when we get there...

So Brijesh, if you only need generic, architectural functionality,
please call it arm64_edac or so and let's add it so that other arm64
vendors can use it too.

Thanks.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 17:36         ` Borislav Petkov
@ 2015-10-20 17:41           ` Mark Rutland
  2015-10-20 19:16             ` Brijesh Singh
  2015-10-21  1:55           ` Hanjun Guo
  1 sibling, 1 reply; 19+ messages in thread
From: Mark Rutland @ 2015-10-20 17:41 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 20, 2015 at 07:36:39PM +0200, Borislav Petkov wrote:
> On Tue, Oct 20, 2015 at 06:26:55PM +0100, Mark Rutland wrote:
> > > Btw, how much of this is implementing generic A57 functionality?
> > 
> > The driver is entirely A57 generic.
> > 
> > > If a lot, can we make this a generic a57_edac driver so that multiple
> > > vendors can use it?
> > 
> > Yes.
> 
> Ok, cool.
> 
> > > How fast and how ugly can something like that become?
> > 
> > Not sure I follow.
> 
> In the sense that some vendor might require just a little bit different
> handling or maybe wants to read some vendor-specific registers in
> addition to the architectural ones.
> 
> Then we'll start adding vendor-specific hacks to that generic driver.
> And therefore the question how fast and how ugly such hacks would
> become.
> 
> I guess we'll worry about that when we get there...
> 
> So Brijesh, if you only need generic, architectural functionality,
> please call it arm64_edac or so and let's add it so that other arm64
> vendors can use it too.

Please note that this is specific to Cortex-A57, not ARMv8 or aarch64.

It is an IMPLEMENTATION DEFINED feature as implemented by Cortex-A57,
which by definition is not implemented by other CPUs. It is not provided
by the ARM architecture.

So this cannot be arm64_edac, but could potentially be cortex_a57_edac.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 17:41           ` Mark Rutland
@ 2015-10-20 19:16             ` Brijesh Singh
  0 siblings, 0 replies; 19+ messages in thread
From: Brijesh Singh @ 2015-10-20 19:16 UTC (permalink / raw)
  To: linux-arm-kernel



On 10/20/2015 12:41 PM, Mark Rutland wrote:
> On Tue, Oct 20, 2015 at 07:36:39PM +0200, Borislav Petkov wrote:
>> On Tue, Oct 20, 2015 at 06:26:55PM +0100, Mark Rutland wrote:
>>>> Btw, how much of this is implementing generic A57 functionality?
>>>
>>> The driver is entirely A57 generic.
>>>
>>>> If a lot, can we make this a generic a57_edac driver so that multiple
>>>> vendors can use it?
>>>
>>> Yes.
>>
>> Ok, cool.
>>
>>>> How fast and how ugly can something like that become?
>>>
>>> Not sure I follow.
>>
>> In the sense that some vendor might require just a little bit different
>> handling or maybe wants to read some vendor-specific registers in
>> addition to the architectural ones.
>>
>> Then we'll start adding vendor-specific hacks to that generic driver.
>> And therefore the question how fast and how ugly such hacks would
>> become.
>>
>> I guess we'll worry about that when we get there...
>>
>> So Brijesh, if you only need generic, architectural functionality,
>> please call it arm64_edac or so and let's add it so that other arm64
>> vendors can use it too.
> 
> Please note that this is specific to Cortex-A57, not ARMv8 or aarch64.
> 
> It is an IMPLEMENTATION DEFINED feature as implemented by Cortex-A57,
> which by definition is not implemented by other CPUs. It is not provided
> by the ARM architecture.
> 
> So this cannot be arm64_edac, but could potentially be cortex_a57_edac.
> 

Yes code is generic to Cortex A57 and naming it cortex_a57_edac sounds good.

Also I will follow your suggestion and remove DT binding and use MIDR. 

> Thanks,
> Mark.
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20  2:21 ` Hanjun Guo
@ 2015-10-20 21:26   ` Brijesh Singh
  2015-10-21  1:35     ` Hanjun Guo
  0 siblings, 1 reply; 19+ messages in thread
From: Brijesh Singh @ 2015-10-20 21:26 UTC (permalink / raw)
  To: linux-arm-kernel


Hi Hanjun,

Thanks for review.

-Brijesh 
On 10/19/2015 09:21 PM, Hanjun Guo wrote:
> Hi Brijesh,
> 
> On 2015/10/20 3:23, Brijesh Singh wrote:
>> Add support for the AMD Seattle SoC EDAC driver.
>>
>> Signed-off-by: Brijesh Singh <brijeshkumar.singh@amd.com>
>> ---
>>  .../devicetree/bindings/edac/amd-seattle-edac.txt  |  15 +
>>  drivers/edac/Kconfig                               |   6 +
>>  drivers/edac/Makefile                              |   1 +
>>  drivers/edac/seattle_edac.c                        | 306 +++++++++++++++++++++
>>  4 files changed, 328 insertions(+)
>>  create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>>  create mode 100644 drivers/edac/seattle_edac.c
>>
>>
> [...]
>> +config EDAC_SEATTLE
>> +	tristate "AMD Seattle EDAC"
>> +	depends on EDAC_MM_EDAC && ARCH_SEATTLE
>> +	help
>> +	  Support for error detection and correction on the
>> +	  AMD Seattle SOC.
>>  endif # EDAC
>> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
>> index ae3c5f3..9e4f3ef 100644
>> --- a/drivers/edac/Makefile
>> +++ b/drivers/edac/Makefile
>> @@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI)		+= octeon_edac-pci.o
>>  obj-$(CONFIG_EDAC_ALTERA_MC)		+= altera_edac.o
>>  obj-$(CONFIG_EDAC_SYNOPSYS)		+= synopsys_edac.o
>>  obj-$(CONFIG_EDAC_XGENE)		+= xgene_edac.o
>> +obj-$(CONFIG_EDAC_SEATTLE)		+= seattle_edac.o
>> diff --git a/drivers/edac/seattle_edac.c b/drivers/edac/seattle_edac.c
>> new file mode 100644
>> index 0000000..78101aa
>> --- /dev/null
>> +++ b/drivers/edac/seattle_edac.c
>> @@ -0,0 +1,306 @@
>> +/*
>> + * AMD Seattle EDAC
>> + *
>> + * Copyright (c) 2015, Advanced Micro Devices
>> + * Author: Brijesh Singh <brijeshkumar.singh@amd.com>
>> + *
>> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the 
>> + * non-fatal errors. Whereas the single bit and double bit ECC erros are 
>> + * handled by firmware.
>> + *
>> + * This program is free software; you can redistribute  it and/or modify it
>> + * under  the terms of  the GNU General  Public License as published by the
>> + * Free Software Foundation;  either version 2 of the  License, or (at your
>> + * option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/of_device.h>
>> +#include <linux/platform_device.h>
>> +
>> +#include "edac_core.h"
>> +
>> +#define EDAC_MOD_STR             "seattle_edac"
>> +
>> +#define CPUMERRSR_EL1_INDEX(x)   ((x) & 0x1ffff)
>> +#define CPUMERRSR_EL1_BANK(x)    (((x) >> 18) & 0x1f)
>> +#define CPUMERRSR_EL1_RAMID(x)   (((x) >> 24) & 0x7f)
>> +#define CPUMERRSR_EL1_VALID(x)   ((x) & (1 << 31))
>> +#define CPUMERRSR_EL1_REPEAT(x)  (((x) >> 32) & 0x7f)
>> +#define CPUMERRSR_EL1_OTHER(x)   (((x) >> 40) & 0xff)
>> +#define CPUMERRSR_EL1_FATAL(x)   ((x) & (1UL << 63))
>> +
>> +#define L2MERRSR_EL1_INDEX(x)    ((x) & 0x1ffff)
>> +#define L2MERRSR_EL1_CPUID(x)    (((x) >> 18) & 0xf)
>> +#define L2MERRSR_EL1_RAMID(x)    (((x) >> 24) & 0x7f)
>> +#define L2MERRSR_EL1_VALID(x)    ((x) & (1 << 31))
>> +#define L2MERRSR_EL1_REPEAT(x)   (((x) >> 32) & 0xff)
>> +#define L2MERRSR_EL1_OTHER(x)    (((x) >> 40) & 0xff)
>> +#define L2MERRSR_EL1_FATAL(x)    ((x) & (1UL << 63))
>> +
>> +struct seattle_edac {
>> +	struct edac_device_ctl_info *edac_ctl;
>> +};
>> +
>> +static inline u64 read_cpumerrsr_el1(void)
>> +{
>> +	u64 val;
>> +
>> +	asm volatile("mrs %0, s3_1_c15_c2_2" : "=r" (val));
>> +	return val;
>> +}
>> +
>> +static inline void write_cpumerrsr_el1(u64 val)
>> +{
>> +	asm volatile("msr s3_1_c15_c2_2, %0" :: "r" (val));
>> +}
>> +
>> +static inline u64 read_l2merrsr_el1(void)
>> +{
>> +	u64 val;
>> +
>> +	asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (val));
>> +	return val;
>> +}
>> +
>> +static inline void write_l2merrsr_el1(u64 val)
>> +{
>> +	asm volatile("msr s3_1_c15_c2_3, %0" :: "r" (val));
>> +}
>> +
>> +static void check_l2merrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
>> +{
>> +	int fatal;
>> +	int cpuid;
>> +	u64 val = read_l2merrsr_el1();
>> +
>> +	if (!L2MERRSR_EL1_VALID(val))
>> +		return;
>> +
>> +	fatal = L2MERRSR_EL1_FATAL(val);
>> +	cpuid = L2MERRSR_EL1_CPUID(val);
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +		    "CPU%d detected %s error on L2 (L2MERRSR=%#llx)!\n",
>> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
>> +
>> +	switch (L2MERRSR_EL1_RAMID(val)) {
>> +	case 0x10:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 Tag RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
>> +		break;
>> +	case 0x11:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 Data RAM cpu %d way %d\n", cpuid / 2, cpuid % 2);
>> +		break;
>> +	case 0x12:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 Snoop tag RAM cpu %d way %d\n",
>> +			    cpuid / 2, cpuid % 2);
>> +		break;
>> +	case 0x14:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 Dirty RAM cpu %d way %d\n",
>> +			    cpuid / 2, cpuid % 2);
>> +		break;
>> +	case 0x18:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 inclusion RAM cpu %d way %d\n",
>> +			    cpuid / 2, cpuid % 2);
>> +		break;
>> +	default:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "unknown RAMID cpuid %d\n", cpuid);
>> +		break;
>> +	}
>> +
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
>> +		    (int)L2MERRSR_EL1_REPEAT(val));
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
>> +		    (int)L2MERRSR_EL1_OTHER(val));
>> +	if (fatal)
>> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
>> +				      edac_ctl->name);
>> +	else
>> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
>> +				      edac_ctl->name);
>> +	write_l2merrsr_el1(0);
>> +}
>> +
>> +static void check_cpumerrsr_el1_error(struct edac_device_ctl_info *edac_ctl)
>> +{
>> +	int fatal;
>> +	int bank;
>> +	u64 val = read_cpumerrsr_el1();
>> +
>> +	if (!CPUMERRSR_EL1_VALID(val))
>> +		return;
>> +
>> +	bank = CPUMERRSR_EL1_BANK(val);
>> +	fatal = CPUMERRSR_EL1_FATAL(val);
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +		    "CPU%d detected %s error on L1 (CPUMERRSR=%#llx)!\n",
>> +		    smp_processor_id(), fatal ? "fatal" : "non-fatal", val);
>> +
>> +	switch (CPUMERRSR_EL1_RAMID(val)) {
>> +	case 0x0:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L1-I Tag RAM bank %d\n", bank);
>> +		break;
>> +	case 0x1:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L1-I Data RAM bank %d\n", bank);
>> +		break;
>> +	case 0x8:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L1-D Tag RAM bank %d\n", bank);
>> +		break;
>> +	case 0x9:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L1-D Data RAM bank %d\n", bank);
>> +		break;
>> +	case 0x18:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "L2 TLB RAM bank %d\n", bank);
>> +		break;
>> +	default:
>> +		edac_printk(KERN_CRIT, EDAC_MOD_STR,
>> +			    "unknown ramid %d bank %d\n",
>> +			    (int)CPUMERRSR_EL1_RAMID(val), bank);
>> +		break;
>> +	}
>> +
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n",
>> +		    (int)CPUMERRSR_EL1_REPEAT(val));
>> +	edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n",
>> +		    (int)CPUMERRSR_EL1_OTHER(val));
>> +	if (fatal)
>> +		edac_device_handle_ue(edac_ctl, smp_processor_id(), 1,
>> +				      edac_ctl->name);
>> +	else
>> +		edac_device_handle_ce(edac_ctl, smp_processor_id(), 1,
>> +				      edac_ctl->name);
>> +	write_cpumerrsr_el1(0);
>> +}
> 
> The codes above are common for all A57 architectures, other A57 SoCs will use the same
> code for L1/L2 caches error report, can we put those codes in common place and reused
> for all A57 architectures?
> 
Code is generic to A57 and I will follow Mark Rutland suggestion to make it cortex_a57_edac. If you have something else in mind then please let me know.

>> +
>> +static void cpu_check_errors(void *args)
>> +{
>> +	struct edac_device_ctl_info *edev_ctl = args;
>> +
>> +	check_cpumerrsr_el1_error(edev_ctl);
>> +	check_l2merrsr_el1_error(edev_ctl);
>> +}
>> +
>> +static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
>> +{
>> +	int cpu;
>> +
>> +	/* read L1 and L2 memory error syndrome register on possible CPU's */
>> +	for_each_possible_cpu(cpu)
>> +		smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0);
> 
> Seems that error syndrome registers for L2 cache are cluster lever (each cluster share the
> L2 cache, you can refer to ARM doc: DDI0488D, Cortex-A57 Technical Reference Manual),
> so for L2 cache, we need to check the error at cluster lever not the cpu core lever.
> 
Yes L1 seems to be CPU specific and L2 is shared in a cluster. So I am thinking of making the following changes in this function.

static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
{
        int cpu;
        struct cpumask cluster_mask, old_mask;

        cpumask_clear(&cluster_mask);
        cpumask_clear(&old_mask);

        for_each_possible_cpu(cpu) {
                smp_call_function_single(cpu, check_cpumerrsr_el1_error, 
                                         edev_ctl, 0); 
                cpumask_copy(&cluster_mask, topology_core_cpumask(cpu));
                if (cpumask_equal(&cluster_mask, &old_mask))
                        continue;
                cpumask_copy(&old_mask, &cluster_mask);
                smp_call_function_any(&cluster_mask, check_l2merrsr_el1_error,
                                      edev_ctl, 0); 
        }   
}

Read L1 on each CPU and L2 once in a cluster. Does this address your feedback ?

> Thanks
> Hanjun
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 21:26   ` Brijesh Singh
@ 2015-10-21  1:35     ` Hanjun Guo
  0 siblings, 0 replies; 19+ messages in thread
From: Hanjun Guo @ 2015-10-21  1:35 UTC (permalink / raw)
  To: linux-arm-kernel

On 2015/10/21 5:26, Brijesh Singh wrote:
> Hi Hanjun,
>
> Thanks for review.
>
> -Brijesh 
> On 10/19/2015 09:21 PM, Hanjun Guo wrote:
>> Hi Brijesh,
>>
>> On 2015/10/20 3:23, Brijesh Singh wrote:
[...]
>> The codes above are common for all A57 architectures, other A57 SoCs will use the same
>> code for L1/L2 caches error report, can we put those codes in common place and reused
>> for all A57 architectures?
>>
> Code is generic to A57 and I will follow Mark Rutland suggestion to make it cortex_a57_edac. If you have something else in mind then please let me know.

Sorry, I missed Mark's comments before I sent my email, I'm fine with
the file name suggested.

>
>>> +
>>> +static void cpu_check_errors(void *args)
>>> +{
>>> +	struct edac_device_ctl_info *edev_ctl = args;
>>> +
>>> +	check_cpumerrsr_el1_error(edev_ctl);
>>> +	check_l2merrsr_el1_error(edev_ctl);
>>> +}
>>> +
>>> +static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
>>> +{
>>> +	int cpu;
>>> +
>>> +	/* read L1 and L2 memory error syndrome register on possible CPU's */
>>> +	for_each_possible_cpu(cpu)
>>> +		smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0);
>> Seems that error syndrome registers for L2 cache are cluster lever (each cluster share the
>> L2 cache, you can refer to ARM doc: DDI0488D, Cortex-A57 Technical Reference Manual),
>> so for L2 cache, we need to check the error at cluster lever not the cpu core lever.
>>
> Yes L1 seems to be CPU specific and L2 is shared in a cluster. So I am thinking of making the following changes in this function.
>
> static void edac_check_errors(struct edac_device_ctl_info *edev_ctl)
> {
>         int cpu;
>         struct cpumask cluster_mask, old_mask;
>
>         cpumask_clear(&cluster_mask);
>         cpumask_clear(&old_mask);
>
>         for_each_possible_cpu(cpu) {
>                 smp_call_function_single(cpu, check_cpumerrsr_el1_error, 
>                                          edev_ctl, 0); 
>                 cpumask_copy(&cluster_mask, topology_core_cpumask(cpu));
>                 if (cpumask_equal(&cluster_mask, &old_mask))
>                         continue;
>                 cpumask_copy(&old_mask, &cluster_mask);
>                 smp_call_function_any(&cluster_mask, check_l2merrsr_el1_error,
>                                       edev_ctl, 0); 
>         }   
> }
>
> Read L1 on each CPU and L2 once in a cluster. Does this address your feedback ?

Yes, at least it will work as expected :)

Thanks
Hanjun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 17:25     ` Mark Rutland
@ 2015-10-21  1:45       ` Hanjun Guo
  0 siblings, 0 replies; 19+ messages in thread
From: Hanjun Guo @ 2015-10-21  1:45 UTC (permalink / raw)
  To: linux-arm-kernel

On 2015/10/21 1:25, Mark Rutland wrote:
> On Tue, Oct 20, 2015 at 11:44:46AM -0500, Brijesh Singh wrote:
>> Hi Mark,
>>
>> Thanks for review. 
>>
>> -Brijesh
>>
>> On 10/19/2015 03:52 PM, Mark Rutland wrote:
>>> Hi,
>>>
>>> Please Cc the devicetree list (devicetree at vger.kernel.org) when sending
>>> binding patches. I see you've added the people from the MAINTAINERS
>>> entry; the list should also be Cc'd.
>>>
>> Noted.
>>> On Mon, Oct 19, 2015 at 02:23:17PM -0500, Brijesh Singh wrote:
>>>> Add support for the AMD Seattle SoC EDAC driver.
>>>>
>>>> Signed-off-by: Brijesh Singh <brijeshkumar.singh@amd.com>
>>>> ---
>>>>  .../devicetree/bindings/edac/amd-seattle-edac.txt  |  15 +
>>>>  drivers/edac/Kconfig                               |   6 +
>>>>  drivers/edac/Makefile                              |   1 +
>>>>  drivers/edac/seattle_edac.c                        | 306 +++++++++++++++++++++
>>>>  4 files changed, 328 insertions(+)
>>>>  create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>>>>  create mode 100644 drivers/edac/seattle_edac.c
>>>>
>>>> diff --git a/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>>>> new file mode 100644
>>>> index 0000000..a0354e8
>>>> --- /dev/null
>>>> +++ b/Documentation/devicetree/bindings/edac/amd-seattle-edac.txt
>>>> @@ -0,0 +1,15 @@
>>>> +* AMD Seattle SoC EDAC node
>>>> +
>>>> +EDAC node is defined to describe on-chip error detection and reporting.
>>>> +
>>>> +Required properties:
>>>> +- compatible: Should be "amd,arm-seattle-edac"
>>>> +- poll-delay-msec: Indicates how often the edac check callback should be called.
>>>> +  Time in msec.
>>> This second property doesn't describe the hardware in any way. It should
>>> be runtime-configurable and dpesn't belong in the DT.
>>>
>>> Regardless, the binding is wrong. This is in no way specific to AMD
>>> Seattle, and per the code is actually used to imply the presence of a
>>> Cortex-A57 feature. No reference to AMD Seattle belongs in the DT
>>> binding (with the exception of the example, perhaps), nor in the driver.
>>>
>>> NAK while this pretends to be something that it isn't. At minimum, you
>>> need to correctly describe the feature you are trying to add support
>>> for.
>>>
>> I will remove AMD specific string in compatibility field and make the
>> poll-delay-msec optional. Will also expose this as module parameter as
>> you suggested below.
> I don't think it should be optional; I don't think it should be there at
> all.
>
> I'm not sure if we even need a DT binding if we can derive the presence
> of the feature from the MIDR.
>
>>>> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the 
>>> These are IMPLEMENTATION DEFINED registers which are specific to
>>> Cortex-A57, and I note that L2MERRSR_EL1 changed in r1p0.
>>>
>>> Which revisions of Cortex-A57 does this work with?
>>>
>> I have tested my code on r1p2.
>>
>>> Generally we avoid touching IMPLEMENTATION DEFINED registers as they may
>>> not exist in some configurations or revisions, and could trap or undef.
>>> Is it always safe to access these registers (in current revisions of
>>> Cortex-A57)?
>>>
>> So far I have not ran into any trap error, was able to read/write
>> registers from EL1 all the times. I looked at TRM but could not find
>> reference that it would fail. As per doc the register should be
>> available at all EL's except EL0.
> Ok.

This also works on Hisilicon D02 board. but the mechanism to handle the error
is a little bit different, on D02, it use poll mechanism to scan the single bit ECC error
just as Seattle, but D02 will trigger SEI when double bit ECC error happened.

Thanks
Hanjun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-20 17:36         ` Borislav Petkov
  2015-10-20 17:41           ` Mark Rutland
@ 2015-10-21  1:55           ` Hanjun Guo
  2015-10-21  9:35             ` Borislav Petkov
  1 sibling, 1 reply; 19+ messages in thread
From: Hanjun Guo @ 2015-10-21  1:55 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Boris, Mark,

On 2015/10/21 1:36, Borislav Petkov wrote:
> On Tue, Oct 20, 2015 at 06:26:55PM +0100, Mark Rutland wrote:
>>> Btw, how much of this is implementing generic A57 functionality?
>> The driver is entirely A57 generic.
>>
>>> If a lot, can we make this a generic a57_edac driver so that multiple
>>> vendors can use it?
>> Yes.
> Ok, cool.
>
>>> How fast and how ugly can something like that become?
>> Not sure I follow.
> In the sense that some vendor might require just a little bit different
> handling or maybe wants to read some vendor-specific registers in
> addition to the architectural ones.

Yes, you are right and foresight :)

>
> Then we'll start adding vendor-specific hacks to that generic driver.
> And therefore the question how fast and how ugly such hacks would
> become.
>
> I guess we'll worry about that when we get there...

So I think the meaning of those error register is the same, but the way
of handle it may different from SoCs, for single bit error:

 - SoC may trigger a interrupt;
 - SoC may just keep silent so we need to scan the registers using poll
   mechanism.

For Double bit error:
  - SoC may also keep silent
  - Trigger a interrupt
  - Trigger a SEI (system error)

Any suggestion to cover those cases?

Thanks
Hanjun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-21  1:55           ` Hanjun Guo
@ 2015-10-21  9:35             ` Borislav Petkov
  2015-10-21 10:01               ` Andre Przywara
  2015-10-23  1:38               ` Hanjun Guo
  0 siblings, 2 replies; 19+ messages in thread
From: Borislav Petkov @ 2015-10-21  9:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Oct 21, 2015 at 09:55:43AM +0800, Hanjun Guo wrote:
> So I think the meaning of those error register is the same, but the way
> of handle it may different from SoCs, for single bit error:
> 
>  - SoC may trigger a interrupt;
>  - SoC may just keep silent so we need to scan the registers using poll
>    mechanism.
> 
> For Double bit error:
>   - SoC may also keep silent
>   - Trigger a interrupt
>   - Trigger a SEI (system error)
> 
> Any suggestion to cover those cases?

Well, I guess we can implement all those and have them configurable
in the sense that a single driver loads, it has all functionality and
dependent on the vendor detection, it does only what the vendor wants
like trigger an interrupt or remain silent or ...

Btw, in talking about this with Andre last night, he had the suggestion
that this functionality is also in other implementations besides A57 so
maybe the driver should be called arm_cortex_edac...

Just putting it out there and adding Andre to CC.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-21  9:35             ` Borislav Petkov
@ 2015-10-21 10:01               ` Andre Przywara
  2015-10-21 16:22                 ` Brijesh Singh
  2015-10-23  1:38               ` Hanjun Guo
  1 sibling, 1 reply; 19+ messages in thread
From: Andre Przywara @ 2015-10-21 10:01 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

On 21/10/15 10:35, Borislav Petkov wrote:
> On Wed, Oct 21, 2015 at 09:55:43AM +0800, Hanjun Guo wrote:
>> So I think the meaning of those error register is the same, but the way
>> of handle it may different from SoCs, for single bit error:
>>
>>  - SoC may trigger a interrupt;
>>  - SoC may just keep silent so we need to scan the registers using poll
>>    mechanism.
>>
>> For Double bit error:
>>   - SoC may also keep silent
>>   - Trigger a interrupt
>>   - Trigger a SEI (system error)
>>
>> Any suggestion to cover those cases?
> 
> Well, I guess we can implement all those and have them configurable
> in the sense that a single driver loads, it has all functionality and
> dependent on the vendor detection, it does only what the vendor wants
> like trigger an interrupt or remain silent or ...

I guess the firmware (running in EL3) will take precedence over this
driver anyway, so we could just optimistically implement all errors, as
the driver will just never see errors that are handled in firmware (?)
In case of a critical error for instance I expect the firmware to never
return to EL1.

> 
> Btw, in talking about this with Andre last night, he had the suggestion
> that this functionality is also in other implementations besides A57 so
> maybe the driver should be called arm_cortex_edac...

Yeah, so looking at the A-72 and the A-53 TRM I see those registers to
be there as well. The A-72 and the A-57 versions look identical to me,
the A-53 version is only slightly different, but apparently still
compatible.
So I'd suggest to let this driver load on detecting all three MIDRs.
Should later revisions of any of those parts change the register
meaning, we could add a blacklist or specific MIDR detection.

But let's just not assume the worst in the first place ;-)

Cheers,
Andre.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-21 10:01               ` Andre Przywara
@ 2015-10-21 16:22                 ` Brijesh Singh
  0 siblings, 0 replies; 19+ messages in thread
From: Brijesh Singh @ 2015-10-21 16:22 UTC (permalink / raw)
  To: linux-arm-kernel



On 10/21/2015 05:01 AM, Andre Przywara wrote:
> Hi,
> 
> On 21/10/15 10:35, Borislav Petkov wrote:
>> On Wed, Oct 21, 2015 at 09:55:43AM +0800, Hanjun Guo wrote:
>>> So I think the meaning of those error register is the same, but the way
>>> of handle it may different from SoCs, for single bit error:
>>>
>>>  - SoC may trigger a interrupt;
>>>  - SoC may just keep silent so we need to scan the registers using poll
>>>    mechanism.
>>>
>>> For Double bit error:
>>>   - SoC may also keep silent
>>>   - Trigger a interrupt
>>>   - Trigger a SEI (system error)
>>>
>>> Any suggestion to cover those cases?
>>
>> Well, I guess we can implement all those and have them configurable
>> in the sense that a single driver loads, it has all functionality and
>> dependent on the vendor detection, it does only what the vendor wants
>> like trigger an interrupt or remain silent or ...
> 
> I guess the firmware (running in EL3) will take precedence over this
> driver anyway, so we could just optimistically implement all errors, as
> the driver will just never see errors that are handled in firmware (?)
> In case of a critical error for instance I expect the firmware to never
> return to EL1.
> 
>>
>> Btw, in talking about this with Andre last night, he had the suggestion
>> that this functionality is also in other implementations besides A57 so
>> maybe the driver should be called arm_cortex_edac...
> 
> Yeah, so looking at the A-72 and the A-53 TRM I see those registers to
> be there as well. The A-72 and the A-57 versions look identical to me,
> the A-53 version is only slightly different, but apparently still
> compatible.
> So I'd suggest to let this driver load on detecting all three MIDRs.
> Should later revisions of any of those parts change the register
> meaning, we could add a blacklist or specific MIDR detection.
> 
> But let's just not assume the worst in the first place ;-)
> 
Ok. Will make it generic cortex_arm64_edac. Will check MIDR and call appropriate CPUMERRSR_EL1 and L2MERRSR_EL1. Since I don't have A53 and A72 hence my testing will be limited to Cortex A57.
> Cheers,
> Andre.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] EDAC: Add AMD Seattle SoC EDAC
  2015-10-21  9:35             ` Borislav Petkov
  2015-10-21 10:01               ` Andre Przywara
@ 2015-10-23  1:38               ` Hanjun Guo
  1 sibling, 0 replies; 19+ messages in thread
From: Hanjun Guo @ 2015-10-23  1:38 UTC (permalink / raw)
  To: linux-arm-kernel

On 2015/10/21 17:35, Borislav Petkov wrote:
> On Wed, Oct 21, 2015 at 09:55:43AM +0800, Hanjun Guo wrote:
>> So I think the meaning of those error register is the same, but the way
>> of handle it may different from SoCs, for single bit error:
>>
>>  - SoC may trigger a interrupt;
>>  - SoC may just keep silent so we need to scan the registers using poll
>>    mechanism.
>>
>> For Double bit error:
>>   - SoC may also keep silent
>>   - Trigger a interrupt
>>   - Trigger a SEI (system error)
>>
>> Any suggestion to cover those cases?
> Well, I guess we can implement all those and have them configurable
> in the sense that a single driver loads, it has all functionality and
> dependent on the vendor detection, it does only what the vendor wants
> like trigger an interrupt or remain silent or ...

Hmm, so we need to keep the DT bindings for different SoCs which
have different ways of handling the errors.

Thanks
Hanjun

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2015-10-23  1:38 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-10-19 19:23 [PATCH] EDAC: Add AMD Seattle SoC EDAC Brijesh Singh
2015-10-19 20:14 ` Borislav Petkov
2015-10-19 20:52 ` Mark Rutland
2015-10-20 16:44   ` Brijesh Singh
2015-10-20 16:57     ` Borislav Petkov
2015-10-20 17:26       ` Mark Rutland
2015-10-20 17:36         ` Borislav Petkov
2015-10-20 17:41           ` Mark Rutland
2015-10-20 19:16             ` Brijesh Singh
2015-10-21  1:55           ` Hanjun Guo
2015-10-21  9:35             ` Borislav Petkov
2015-10-21 10:01               ` Andre Przywara
2015-10-21 16:22                 ` Brijesh Singh
2015-10-23  1:38               ` Hanjun Guo
2015-10-20 17:25     ` Mark Rutland
2015-10-21  1:45       ` Hanjun Guo
2015-10-20  2:21 ` Hanjun Guo
2015-10-20 21:26   ` Brijesh Singh
2015-10-21  1:35     ` Hanjun Guo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).