From mboxrd@z Thu Jan 1 00:00:00 1970 From: brijeshkumar.singh@amd.com (Brijesh Singh) Date: Tue, 20 Oct 2015 16:26:49 -0500 Subject: [PATCH] EDAC: Add AMD Seattle SoC EDAC In-Reply-To: <5625A528.1040803@huawei.com> References: <1445282597-18999-1-git-send-email-brijeshkumar.singh@amd.com> <5625A528.1040803@huawei.com> Message-ID: <5626B199.4050209@amd.com> To: linux-arm-kernel@lists.infradead.org List-Id: linux-arm-kernel.lists.infradead.org Hi Hanjun, Thanks for review. -Brijesh On 10/19/2015 09:21 PM, Hanjun Guo wrote: > Hi Brijesh, > > On 2015/10/20 3:23, Brijesh Singh wrote: >> Add support for the AMD Seattle SoC EDAC driver. >> >> Signed-off-by: Brijesh Singh >> --- >> .../devicetree/bindings/edac/amd-seattle-edac.txt | 15 + >> drivers/edac/Kconfig | 6 + >> drivers/edac/Makefile | 1 + >> drivers/edac/seattle_edac.c | 306 +++++++++++++++++++++ >> 4 files changed, 328 insertions(+) >> create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt >> create mode 100644 drivers/edac/seattle_edac.c >> >> > [...] >> +config EDAC_SEATTLE >> + tristate "AMD Seattle EDAC" >> + depends on EDAC_MM_EDAC && ARCH_SEATTLE >> + help >> + Support for error detection and correction on the >> + AMD Seattle SOC. 
>> endif # EDAC >> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile >> index ae3c5f3..9e4f3ef 100644 >> --- a/drivers/edac/Makefile >> +++ b/drivers/edac/Makefile >> @@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI) += octeon_edac-pci.o >> obj-$(CONFIG_EDAC_ALTERA_MC) += altera_edac.o >> obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o >> obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o >> +obj-$(CONFIG_EDAC_SEATTLE) += seattle_edac.o >> diff --git a/drivers/edac/seattle_edac.c b/drivers/edac/seattle_edac.c >> new file mode 100644 >> index 0000000..78101aa >> --- /dev/null >> +++ b/drivers/edac/seattle_edac.c >> @@ -0,0 +1,306 @@ >> +/* >> + * AMD Seattle EDAC >> + * >> + * Copyright (c) 2015, Advanced Micro Devices >> + * Author: Brijesh Singh >> + * >> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the >> + * non-fatal errors. Whereas the single bit and double bit ECC erros are >> + * handled by firmware. >> + * >> + * This program is free software; you can redistribute it and/or modify it >> + * under the terms of the GNU General Public License as published by the >> + * Free Software Foundation; either version 2 of the License, or (at your >> + * option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program. If not, see . 
>> + */ >> + >> +#include >> +#include >> +#include >> + >> +#include "edac_core.h" >> + >> +#define EDAC_MOD_STR "seattle_edac" >> + >> +#define CPUMERRSR_EL1_INDEX(x) ((x) & 0x1ffff) >> +#define CPUMERRSR_EL1_BANK(x) (((x) >> 18) & 0x1f) >> +#define CPUMERRSR_EL1_RAMID(x) (((x) >> 24) & 0x7f) >> +#define CPUMERRSR_EL1_VALID(x) ((x) & (1 << 31)) >> +#define CPUMERRSR_EL1_REPEAT(x) (((x) >> 32) & 0x7f) >> +#define CPUMERRSR_EL1_OTHER(x) (((x) >> 40) & 0xff) >> +#define CPUMERRSR_EL1_FATAL(x) ((x) & (1UL << 63)) >> + >> +#define L2MERRSR_EL1_INDEX(x) ((x) & 0x1ffff) >> +#define L2MERRSR_EL1_CPUID(x) (((x) >> 18) & 0xf) >> +#define L2MERRSR_EL1_RAMID(x) (((x) >> 24) & 0x7f) >> +#define L2MERRSR_EL1_VALID(x) ((x) & (1 << 31)) >> +#define L2MERRSR_EL1_REPEAT(x) (((x) >> 32) & 0xff) >> +#define L2MERRSR_EL1_OTHER(x) (((x) >> 40) & 0xff) >> +#define L2MERRSR_EL1_FATAL(x) ((x) & (1UL << 63)) >> + >> +struct seattle_edac { >> + struct edac_device_ctl_info *edac_ctl; >> +}; >> + >> +static inline u64 read_cpumerrsr_el1(void) >> +{ >> + u64 val; >> + >> + asm volatile("mrs %0, s3_1_c15_c2_2" : "=r" (val)); >> + return val; >> +} >> + >> +static inline void write_cpumerrsr_el1(u64 val) >> +{ >> + asm volatile("msr s3_1_c15_c2_2, %0" :: "r" (val)); >> +} >> + >> +static inline u64 read_l2merrsr_el1(void) >> +{ >> + u64 val; >> + >> + asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (val)); >> + return val; >> +} >> + >> +static inline void write_l2merrsr_el1(u64 val) >> +{ >> + asm volatile("msr s3_1_c15_c2_3, %0" :: "r" (val)); >> +} >> + >> +static void check_l2merrsr_el1_error(struct edac_device_ctl_info *edac_ctl) >> +{ >> + int fatal; >> + int cpuid; >> + u64 val = read_l2merrsr_el1(); >> + >> + if (!L2MERRSR_EL1_VALID(val)) >> + return; >> + >> + fatal = L2MERRSR_EL1_FATAL(val); >> + cpuid = L2MERRSR_EL1_CPUID(val); >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "CPU%d detected %s error on L2 (L2MERRSR=%#llx)!\n", >> + smp_processor_id(), fatal ? 
"fatal" : "non-fatal", val); >> + >> + switch (L2MERRSR_EL1_RAMID(val)) { >> + case 0x10: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 Tag RAM cpu %d way %d\n", cpuid / 2, cpuid % 2); >> + break; >> + case 0x11: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 Data RAM cpu %d way %d\n", cpuid / 2, cpuid % 2); >> + break; >> + case 0x12: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 Snoop tag RAM cpu %d way %d\n", >> + cpuid / 2, cpuid % 2); >> + break; >> + case 0x14: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 Dirty RAM cpu %d way %d\n", >> + cpuid / 2, cpuid % 2); >> + break; >> + case 0x18: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 inclusion RAM cpu %d way %d\n", >> + cpuid / 2, cpuid % 2); >> + break; >> + default: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "unknown RAMID cpuid %d\n", cpuid); >> + break; >> + } >> + >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n", >> + (int)L2MERRSR_EL1_REPEAT(val)); >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n", >> + (int)L2MERRSR_EL1_OTHER(val)); >> + if (fatal) >> + edac_device_handle_ue(edac_ctl, smp_processor_id(), 1, >> + edac_ctl->name); >> + else >> + edac_device_handle_ce(edac_ctl, smp_processor_id(), 1, >> + edac_ctl->name); >> + write_l2merrsr_el1(0); >> +} >> + >> +static void check_cpumerrsr_el1_error(struct edac_device_ctl_info *edac_ctl) >> +{ >> + int fatal; >> + int bank; >> + u64 val = read_cpumerrsr_el1(); >> + >> + if (!CPUMERRSR_EL1_VALID(val)) >> + return; >> + >> + bank = CPUMERRSR_EL1_BANK(val); >> + fatal = CPUMERRSR_EL1_FATAL(val); >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "CPU%d detected %s error on L1 (CPUMERRSR=%#llx)!\n", >> + smp_processor_id(), fatal ? 
"fatal" : "non-fatal", val); >> + >> + switch (CPUMERRSR_EL1_RAMID(val)) { >> + case 0x0: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L1-I Tag RAM bank %d\n", bank); >> + break; >> + case 0x1: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L1-I Data RAM bank %d\n", bank); >> + break; >> + case 0x8: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L1-D Tag RAM bank %d\n", bank); >> + break; >> + case 0x9: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L1-D Data RAM bank %d\n", bank); >> + break; >> + case 0x18: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 TLB RAM bank %d\n", bank); >> + break; >> + default: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "unknown ramid %d bank %d\n", >> + (int)CPUMERRSR_EL1_RAMID(val), bank); >> + break; >> + } >> + >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n", >> + (int)CPUMERRSR_EL1_REPEAT(val)); >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n", >> + (int)CPUMERRSR_EL1_OTHER(val)); >> + if (fatal) >> + edac_device_handle_ue(edac_ctl, smp_processor_id(), 1, >> + edac_ctl->name); >> + else >> + edac_device_handle_ce(edac_ctl, smp_processor_id(), 1, >> + edac_ctl->name); >> + write_cpumerrsr_el1(0); >> +} > > The codes above are common for all A57 architectures, other A57 SoCs will use the same > code for L1/L2 caches error report, can we put those codes in common place and reused > for all A57 architectures? > Code is generic to A57 and I will follow Mark Rutland suggestion to make it cortex_a57_edac. If you have something else in mind then please let me know. 
>> + >> +static void cpu_check_errors(void *args) >> +{ >> + struct edac_device_ctl_info *edev_ctl = args; >> + >> + check_cpumerrsr_el1_error(edev_ctl); >> + check_l2merrsr_el1_error(edev_ctl); >> +} >> + >> +static void edac_check_errors(struct edac_device_ctl_info *edev_ctl) >> +{ >> + int cpu; >> + >> + /* read L1 and L2 memory error syndrome register on possible CPU's */ >> + for_each_possible_cpu(cpu) >> + smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0); > > Seems that error syndrome registers for L2 cache are cluster level (each cluster share the > L2 cache, you can refer to ARM doc: DDI0488D, Cortex-A57 Technical Reference Manual), > so for L2 cache, we need to check the error at cluster level not the cpu core level. > Yes, L1 seems to be CPU-specific and L2 is shared within a cluster. So I am thinking of making the following changes in this function. static void edac_check_errors(struct edac_device_ctl_info *edev_ctl) { int cpu; struct cpumask cluster_mask, old_mask; cpumask_clear(&cluster_mask); cpumask_clear(&old_mask); for_each_possible_cpu(cpu) { smp_call_function_single(cpu, check_cpumerrsr_el1_error, edev_ctl, 0); cpumask_copy(&cluster_mask, topology_core_cpumask(cpu)); if (cpumask_equal(&cluster_mask, &old_mask)) continue; cpumask_copy(&old_mask, &cluster_mask); smp_call_function_any(&cluster_mask, check_l2merrsr_el1_error, edev_ctl, 0); } } Read L1 on each CPU and L2 once per cluster. Does this address your feedback?
> Thanks > Hanjun > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo at vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752970AbbJTV14 (ORCPT ); Tue, 20 Oct 2015 17:27:56 -0400 Received: from mail-bl2on0088.outbound.protection.outlook.com ([65.55.169.88]:51328 "EHLO na01-bl2-obe.outbound.protection.outlook.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1751309AbbJTV1w (ORCPT ); Tue, 20 Oct 2015 17:27:52 -0400 X-Greylist: delayed 7843 seconds by postgrey-1.27 at vger.kernel.org; Tue, 20 Oct 2015 17:27:52 EDT Authentication-Results: spf=none (sender IP is 165.204.84.221) smtp.mailfrom=amd.com; alien8.de; dkim=none (message not signed) header.d=none;alien8.de; dmarc=permerror action=none header.from=amd.com; X-WSS-ID: 0NWJFM9-07-0LZ-02 X-M-MSG: Subject: Re: [PATCH] EDAC: Add AMD Seattle SoC EDAC To: Hanjun Guo , , References: <1445282597-18999-1-git-send-email-brijeshkumar.singh@amd.com> <5625A528.1040803@huawei.com> CC: , , , , , , , , , , dingtinahong , Hanjun Guo From: Brijesh Singh Message-ID: <5626B199.4050209@amd.com> Date: Tue, 20 Oct 2015 16:26:49 -0500 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.2.0 MIME-Version: 1.0 In-Reply-To: <5625A528.1040803@huawei.com> Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 7bit X-Originating-IP: [10.180.168.240] X-EOPAttributedMessage: 0 X-Forefront-Antispam-Report: 
CIP:165.204.84.221;CTRY:US;IPV:NLI;EFV:NLI;SFV:NSPM;SFS:(10009020)(6009001)(2980300002)(428002)(377454003)(24454002)(479174004)(189002)(199003)(50986999)(19580395003)(46102003)(5001770100001)(11100500001)(5008740100001)(83506001)(23676002)(189998001)(97736004)(80316001)(19580405001)(4001350100001)(87936001)(65956001)(2201001)(64126003)(105586002)(65816999)(47776003)(101416001)(5007970100001)(64706001)(92566002)(106466001)(65806001)(33656002)(86362001)(87266999)(77096005)(2950100001)(15975445007)(5004730100002)(36756003)(54356999)(59896002)(50466002)(76176999)(2004002);DIR:OUT;SFP:1101;SCL:1;SRVR:BY2PR12MB0710;H:atltwp01.amd.com;FPR:;SPF:None;PTR:InfoDomainNonexistent;MX:1;A:1;LANG:en; X-Microsoft-Exchange-Diagnostics: 1;BY2PR12MB0710;2:DbwyPJAPXQ7ZEKhruXxNusf/O0PE5ELH+MeEg6AGQc4UIwuYmcdd9nBNgp8xfWfXIzkQfy9lAVkVKc1MfuIKrtarlrZDIOsp7BPYi7Tm9soe9aUyoGEeQ6+mcqq8nPzlZRX+ftPZzcmOEHISuhtaPJCJ47Vwp476aB5cc6yIkpU=;3:HTYJ5Y/9b8xJWLJM1X3kloYQ4YitFkOU6/PSIjiK60yY3f3jGYSzHl9P+g17UvSxbRBJRCIBGMrHe+QUv3pGldIwN+20ylBfeY40Dt9SjidoIGl6Abv4rSTWT7HXdzx6loMeDVJwB8KuEJ/hlY3jfjQEmD6L/0qRXjIB/3U2b1VAwdSXSxVPNrsxVZLZIVZ//kaEUsq4dJom6jacsZz9odB154xewhGmFuO06oj9T7SSJPdnwOyxjKcMJ48WUG421IcMAPjgHBxFFgJxb3KFOA==;25:bsyNYpcr6oko61wY007GhvF/WZBgoyZT5gq5LwUgE/CZKah8Xj8vR2PmUiiNoBa0tDcyDF6i6r92SbP6D+IQUbESBZiyRZR5H0WiCGp+pAVkL1lE20N14P7jE7P/JxZXPP520tyefu1gTLMuBsrDnC+8wWIHiodL876703g9K4bb7vSNd99g0Rxi0MWNBWrCxRSti/PGbIAYAjVWFNFX3Y+NHYHlu8bAIBmlMRD5d4zWNI2S9sfJ9uTfNrp7+4lYYFxk5b5aXRdfoYVzchUx3A== X-Microsoft-Antispam: UriScan:;BCL:0;PCL:0;RULEID:(42134001)(42139001);SRVR:BY2PR12MB0710; X-Microsoft-Exchange-Diagnostics: 
1;BY2PR12MB0710;20:IpOAsTptVj8k1TWAH+vZHb9BfgbM4yeIGTO6nx6/xgOZA8galUVex7fua1BP3NGQqxDrI3EmQEswubAdeT6lnz11khc1f4RwnOeY0o2ka0MTBWslWPLDzarJdY1VNp8dn/7badDQURaAoaSC8jy2dE7YIGj3XILAdUY4xCvj7QEACRu84jm5i58HrSppYkGxf9G57OQ5eMsx7vK+N9Dtae/h1/8JzHAH1US6pm+ZVykNCwJPSLqisIfP0fDn6+2E0JQuda/qHO1q13pfyi2/SUq2pwDzfBZTHrY/oQsUBi0zJCx2MXu9YK5xrgqZL5vzMFx+RtPfI3JxjhTN4wX9FXZ+i4ulYXKS4fzBxDFg6u8G7SKiGhAFH2enPu0sqmyopoH1f4T4Tsmg7gsUTd+VvpBkcm7JJaqDy5mcc7vi9iERGPWZ0ZQ2ZpNlrclUfjB0NvtlMYtIoSllNsQtfCNF/f0JCfP5PoPh07j5JSuszUWi5KywmPyoauPUoWQqEjw7;4:WUMppBRBg+K1Lo4KYWg9iNFaTatlGarR4++GUsE7uOLYEEU+IfnRobwJXd8/UMgedRwUqtzzj2G7MMg0LHqhZ3lMOzTNoa9hKsvsXdO/0mOCj79JKp/2MtcQmBq4bpZ6/ZQaoOjbgNuLD3FbyKnYkjcQ0E0ALnYT3Boe5dbhTseD+lLEwl5imob3uMonzEYLPCHKCOnX0zWYKo6fEQx9zKsudQeq4+B8A2XQ+nLK0mHzjCbuwMqGXYgqAUR8rRDfO9d1Oo2SM4HwKrOXrHtsRZb7tf2BruurRnj9b0yQAZbQV9waR1pGRU3E6uG35g5s5wW1dZP77w6XQCVVk2LnFNyuOxyjA2xxoBWkqPvGviQ= X-Microsoft-Antispam-PRVS: X-Exchange-Antispam-Report-Test: UriScan:(767451399110); X-Exchange-Antispam-Report-CFA-Test: BCL:0;PCL:0;RULEID:(601004)(2401047)(520078)(5005006)(8121501046)(3002001);SRVR:BY2PR12MB0710;BCL:0;PCL:0;RULEID:;SRVR:BY2PR12MB0710; X-Forefront-PRVS: 073515755F X-Microsoft-Exchange-Diagnostics: =?utf-8?B?MTtCWTJQUjEyTUIwNzEwOzIzOjdnd3VmcmIrQS93bzFSNzQrNThFUkJjN3ZE?= =?utf-8?B?OEx6ZzJYT0xSd1ROc3RDeFdIOVBFakFlMFhoZUNucFBoU05reTlrbE4rNFBm?= =?utf-8?B?VkovdEZYWEVWUVNwTmU0TTBmQzc0YzAzQWpqNnZpQlhxUVVhcC9kOXBqVlZV?= =?utf-8?B?VmZzUkxDcHVhcWI0ckloN3JNcjZCQXdPV2lQeVU4dGFGR0lyeHhXN2x2Zk1s?= =?utf-8?B?azRaTURYbGNUaFplT0wySkg4MldVZ2lndDAzUzQxTmE2Y1ZDWlAvZmFaZDli?= =?utf-8?B?R2dybzN3d2NYWGhENVNMV05xdkZEYjUyN1JtRWRFZk5CVVRmMU1YbVFzMG9j?= =?utf-8?B?SWRiSFY4bzBPNGpxa3VQdzNLVy9UTy9JaTR6TEpsRzhkQ0N2cmlQcjZETE44?= =?utf-8?B?TFN4cEVNVXl2ZWNxV04zdi8rVVI5d0RTTE5EVklzZlZjRWp6SmdxWHgvUVdk?= =?utf-8?B?dWhGbWtvMmkzM0xCRGl1Q3VaM2NjYzVXNEdzMUFHcE9SR3lwMVYrUzFCSDdT?= =?utf-8?B?bHFTOHo1ODlBU0djUGZxVkdMaitNVHhQTXNybjMyOXNRVEJ0UDBwMVk2SCtY?= 
=?utf-8?B?RjBiUUxBSGJRdmF2K3NtNG9OL0FPM3ZvV21wWHlCVkFXM2tlbmRMYWtwU0w0?= =?utf-8?B?b0x6Y1V1bi90ZExlN2dIQ0V6djhpUzJvcjAzLzBITS9MdTdBNUl6UWEyM3NM?= =?utf-8?B?NEtrMGoydnVSWmVoY29LczM2bVo1M3RlcFpBNlBnRU0wMG40ZkxYWFUzd0d4?= =?utf-8?B?MHJiWXQ3L2NTWVc1MVg4Qk02RUJyRVBsMGticlV6QkVvRUVDZEc3dlBhTmtR?= =?utf-8?B?U0ZVdXV2Vi9PcnFwSlBWWUE0ZGM4ODdjVTArcDZHOEk2ZndMOXdUSGJSMVlG?= =?utf-8?B?b0d3b0FLay9BbEdJRjZmK1RlSytoWHV0ZWpqb3ZtbUdXTm5NOE95TlFQUS9N?= =?utf-8?B?dFozaWZtMHNwOVpIZmxsNjZhQ0s1SzJqeVFvQ1hmME1ONHhSN0wzNXFmeFFr?= =?utf-8?B?V3U3RVdQRGJ5WTBWR1BaODIvbjlJdE1GTTFoQkN4VWl4VmFjbXVTQituTWk3?= =?utf-8?B?OEsvTU5aQVJIczNIeHdTVW11amdGTGRLNlduSE1QQm5hRVdYMm5hWVBSUXpH?= =?utf-8?B?KzZRZHN1ZFlnMVdhTDVBUXUxTW1IOW9URXlRQmpXZVFPM2t0eUlaVmR0c1h5?= =?utf-8?B?R2l2UnVjRXpVbFFqditTeS84cnh4SDhsWXVuMVQ2aGJPbHpoV054OGV2Z21i?= =?utf-8?B?Rzc2QVdQdFFMd2t6enJPYlhIeHpRUEtzZFAwYlc4THc3U09zKy82SjVWTGtv?= =?utf-8?B?QTU2b2JhR3lkeHNRSWlhTlpiQ0xVc05rUDR6VTNHTURtT0s2SEsyZExwQ0th?= =?utf-8?B?VnFSakNKNUlaRXRHVUFHMkhEN0JuWGVaNFpVVk1rSW9pVFZTamZ1eWRjQ3Iv?= =?utf-8?B?N3JHdkRlYys0L3YrcFMrMVBNanF0VEsxUGdCZVJDclZvaVFCK01KV2VhTjRD?= =?utf-8?B?eUtkd0ZqQTdiRG1HaGc5bkZtSkhFZVdORGY3Tm1PQkQzM0dIQkJDaXV3TVgz?= =?utf-8?B?RmtpNHJhSlpsTGovWDRSOHFLV0VoTXQ3UkFuK2IxWjJEZjl4ZGJISmdUZUFR?= =?utf-8?B?SDE0bTFQdmVzVitNTWorb2I0bEpaOWNMK3hhWkRwRjlJOGx2dnBNZWRkUmRC?= =?utf-8?Q?+ysDdaJFNONPzofm1U=3D?= X-Microsoft-Exchange-Diagnostics: 1;BY2PR12MB0710;5:T3TfCLhapmutV8X3rv/yWdhjY9aOwrNDoTpng4/B1rGNRvDmYpi2c72TWJJ5dOolka00amV37ZadLA1z8ayWQjgf4EIxjzW6wZPSwB0xHGccZ1AFH2nqlCJCcHjmCo8l7fHzfGknC43s/84kI9WQ8g==;24:1JNvqqcYKq0TpYFYML3WLUQpsR+VxqVh34NJ6xbx6bMLQSxt4hPa0qfSxS6n/nxqdtB+jLmii5dvvQpU9HifZbDUZnn/eLHhwQYdjLTC9Yo=;20:x+UxcN+DlwUCJpy4tBkTb7Vq/RkEPDBIb/GZFE1qW+d4ArVG+3LEl6RWPoWGGFTdai9fLQLnBBTGtrZx/1ZpeCNX5WZ9ZnZonmyvyUO5MsxUX41rhIDWIGwOwGycgAI9LzZ+OYSl6OXsghamZvJoUTlMAxhQKTVRYh01pKUD/LEKAptaLMVfULMKhCb3eln/+/md93QjUdcscex7NoAm27a6t+h3r747tRHrHlfOQRUvWBpHjq7Zt4nNX953XO5Y SpamDiagnosticOutput: 1:23 SpamDiagnosticMetadata: NSPM X-OriginatorOrg: 
amd.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 20 Oct 2015 21:27:46.8968 (UTC) X-MS-Exchange-CrossTenant-Id: 3dd8961f-e488-4e60-8e11-a82d994e183d X-MS-Exchange-CrossTenant-OriginalAttributedTenantConnectingIp: TenantId=3dd8961f-e488-4e60-8e11-a82d994e183d;Ip=[165.204.84.221];Helo=[atltwp01.amd.com] X-MS-Exchange-CrossTenant-FromEntityHeader: HybridOnPrem X-MS-Exchange-Transport-CrossTenantHeadersStamped: BY2PR12MB0710 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Hi Hanjun, Thanks for review. -Brijesh On 10/19/2015 09:21 PM, Hanjun Guo wrote: > Hi Brijesh, > > On 2015/10/20 3:23, Brijesh Singh wrote: >> Add support for the AMD Seattle SoC EDAC driver. >> >> Signed-off-by: Brijesh Singh >> --- >> .../devicetree/bindings/edac/amd-seattle-edac.txt | 15 + >> drivers/edac/Kconfig | 6 + >> drivers/edac/Makefile | 1 + >> drivers/edac/seattle_edac.c | 306 +++++++++++++++++++++ >> 4 files changed, 328 insertions(+) >> create mode 100644 Documentation/devicetree/bindings/edac/amd-seattle-edac.txt >> create mode 100644 drivers/edac/seattle_edac.c >> >> > [...] >> +config EDAC_SEATTLE >> + tristate "AMD Seattle EDAC" >> + depends on EDAC_MM_EDAC && ARCH_SEATTLE >> + help >> + Support for error detection and correction on the >> + AMD Seattle SOC. 
>> endif # EDAC >> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile >> index ae3c5f3..9e4f3ef 100644 >> --- a/drivers/edac/Makefile >> +++ b/drivers/edac/Makefile >> @@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI) += octeon_edac-pci.o >> obj-$(CONFIG_EDAC_ALTERA_MC) += altera_edac.o >> obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o >> obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o >> +obj-$(CONFIG_EDAC_SEATTLE) += seattle_edac.o >> diff --git a/drivers/edac/seattle_edac.c b/drivers/edac/seattle_edac.c >> new file mode 100644 >> index 0000000..78101aa >> --- /dev/null >> +++ b/drivers/edac/seattle_edac.c >> @@ -0,0 +1,306 @@ >> +/* >> + * AMD Seattle EDAC >> + * >> + * Copyright (c) 2015, Advanced Micro Devices >> + * Author: Brijesh Singh >> + * >> + * The driver polls CPUMERRSR_EL1 and L2MERRSR_EL1 registers to logs the >> + * non-fatal errors. Whereas the single bit and double bit ECC erros are >> + * handled by firmware. >> + * >> + * This program is free software; you can redistribute it and/or modify it >> + * under the terms of the GNU General Public License as published by the >> + * Free Software Foundation; either version 2 of the License, or (at your >> + * option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program. If not, see . 
>> + */ >> + >> +#include >> +#include >> +#include >> + >> +#include "edac_core.h" >> + >> +#define EDAC_MOD_STR "seattle_edac" >> + >> +#define CPUMERRSR_EL1_INDEX(x) ((x) & 0x1ffff) >> +#define CPUMERRSR_EL1_BANK(x) (((x) >> 18) & 0x1f) >> +#define CPUMERRSR_EL1_RAMID(x) (((x) >> 24) & 0x7f) >> +#define CPUMERRSR_EL1_VALID(x) ((x) & (1 << 31)) >> +#define CPUMERRSR_EL1_REPEAT(x) (((x) >> 32) & 0x7f) >> +#define CPUMERRSR_EL1_OTHER(x) (((x) >> 40) & 0xff) >> +#define CPUMERRSR_EL1_FATAL(x) ((x) & (1UL << 63)) >> + >> +#define L2MERRSR_EL1_INDEX(x) ((x) & 0x1ffff) >> +#define L2MERRSR_EL1_CPUID(x) (((x) >> 18) & 0xf) >> +#define L2MERRSR_EL1_RAMID(x) (((x) >> 24) & 0x7f) >> +#define L2MERRSR_EL1_VALID(x) ((x) & (1 << 31)) >> +#define L2MERRSR_EL1_REPEAT(x) (((x) >> 32) & 0xff) >> +#define L2MERRSR_EL1_OTHER(x) (((x) >> 40) & 0xff) >> +#define L2MERRSR_EL1_FATAL(x) ((x) & (1UL << 63)) >> + >> +struct seattle_edac { >> + struct edac_device_ctl_info *edac_ctl; >> +}; >> + >> +static inline u64 read_cpumerrsr_el1(void) >> +{ >> + u64 val; >> + >> + asm volatile("mrs %0, s3_1_c15_c2_2" : "=r" (val)); >> + return val; >> +} >> + >> +static inline void write_cpumerrsr_el1(u64 val) >> +{ >> + asm volatile("msr s3_1_c15_c2_2, %0" :: "r" (val)); >> +} >> + >> +static inline u64 read_l2merrsr_el1(void) >> +{ >> + u64 val; >> + >> + asm volatile("mrs %0, s3_1_c15_c2_3" : "=r" (val)); >> + return val; >> +} >> + >> +static inline void write_l2merrsr_el1(u64 val) >> +{ >> + asm volatile("msr s3_1_c15_c2_3, %0" :: "r" (val)); >> +} >> + >> +static void check_l2merrsr_el1_error(struct edac_device_ctl_info *edac_ctl) >> +{ >> + int fatal; >> + int cpuid; >> + u64 val = read_l2merrsr_el1(); >> + >> + if (!L2MERRSR_EL1_VALID(val)) >> + return; >> + >> + fatal = L2MERRSR_EL1_FATAL(val); >> + cpuid = L2MERRSR_EL1_CPUID(val); >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "CPU%d detected %s error on L2 (L2MERRSR=%#llx)!\n", >> + smp_processor_id(), fatal ? 
"fatal" : "non-fatal", val); >> + >> + switch (L2MERRSR_EL1_RAMID(val)) { >> + case 0x10: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 Tag RAM cpu %d way %d\n", cpuid / 2, cpuid % 2); >> + break; >> + case 0x11: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 Data RAM cpu %d way %d\n", cpuid / 2, cpuid % 2); >> + break; >> + case 0x12: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 Snoop tag RAM cpu %d way %d\n", >> + cpuid / 2, cpuid % 2); >> + break; >> + case 0x14: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 Dirty RAM cpu %d way %d\n", >> + cpuid / 2, cpuid % 2); >> + break; >> + case 0x18: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 inclusion RAM cpu %d way %d\n", >> + cpuid / 2, cpuid % 2); >> + break; >> + default: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "unknown RAMID cpuid %d\n", cpuid); >> + break; >> + } >> + >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n", >> + (int)L2MERRSR_EL1_REPEAT(val)); >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n", >> + (int)L2MERRSR_EL1_OTHER(val)); >> + if (fatal) >> + edac_device_handle_ue(edac_ctl, smp_processor_id(), 1, >> + edac_ctl->name); >> + else >> + edac_device_handle_ce(edac_ctl, smp_processor_id(), 1, >> + edac_ctl->name); >> + write_l2merrsr_el1(0); >> +} >> + >> +static void check_cpumerrsr_el1_error(struct edac_device_ctl_info *edac_ctl) >> +{ >> + int fatal; >> + int bank; >> + u64 val = read_cpumerrsr_el1(); >> + >> + if (!CPUMERRSR_EL1_VALID(val)) >> + return; >> + >> + bank = CPUMERRSR_EL1_BANK(val); >> + fatal = CPUMERRSR_EL1_FATAL(val); >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "CPU%d detected %s error on L1 (CPUMERRSR=%#llx)!\n", >> + smp_processor_id(), fatal ? 
"fatal" : "non-fatal", val); >> + >> + switch (CPUMERRSR_EL1_RAMID(val)) { >> + case 0x0: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L1-I Tag RAM bank %d\n", bank); >> + break; >> + case 0x1: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L1-I Data RAM bank %d\n", bank); >> + break; >> + case 0x8: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L1-D Tag RAM bank %d\n", bank); >> + break; >> + case 0x9: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L1-D Data RAM bank %d\n", bank); >> + break; >> + case 0x18: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "L2 TLB RAM bank %d\n", bank); >> + break; >> + default: >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, >> + "unknown ramid %d bank %d\n", >> + (int)CPUMERRSR_EL1_RAMID(val), bank); >> + break; >> + } >> + >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, "Repeated error count: %d\n", >> + (int)CPUMERRSR_EL1_REPEAT(val)); >> + edac_printk(KERN_CRIT, EDAC_MOD_STR, "Other error count: %d\n", >> + (int)CPUMERRSR_EL1_OTHER(val)); >> + if (fatal) >> + edac_device_handle_ue(edac_ctl, smp_processor_id(), 1, >> + edac_ctl->name); >> + else >> + edac_device_handle_ce(edac_ctl, smp_processor_id(), 1, >> + edac_ctl->name); >> + write_cpumerrsr_el1(0); >> +} > > The codes above are common for all A57 architectures, other A57 SoCs will use the same > code for L1/L2 caches error report, can we put those codes in common place and reused > for all A57 architectures? > Code is generic to A57 and I will follow Mark Rutland suggestion to make it cortex_a57_edac. If you have something else in mind then please let me know. 
>> + >> +static void cpu_check_errors(void *args) >> +{ >> + struct edac_device_ctl_info *edev_ctl = args; >> + >> + check_cpumerrsr_el1_error(edev_ctl); >> + check_l2merrsr_el1_error(edev_ctl); >> +} >> + >> +static void edac_check_errors(struct edac_device_ctl_info *edev_ctl) >> +{ >> + int cpu; >> + >> + /* read L1 and L2 memory error syndrome register on possible CPU's */ >> + for_each_possible_cpu(cpu) >> + smp_call_function_single(cpu, cpu_check_errors, edev_ctl, 0); > > Seems that error syndrome registers for L2 cache are cluster level (each cluster share the > L2 cache, you can refer to ARM doc: DDI0488D, Cortex-A57 Technical Reference Manual), > so for L2 cache, we need to check the error at cluster level not the cpu core level. > Yes, L1 seems to be CPU-specific and L2 is shared within a cluster. So I am thinking of making the following changes in this function. static void edac_check_errors(struct edac_device_ctl_info *edev_ctl) { int cpu; struct cpumask cluster_mask, old_mask; cpumask_clear(&cluster_mask); cpumask_clear(&old_mask); for_each_possible_cpu(cpu) { smp_call_function_single(cpu, check_cpumerrsr_el1_error, edev_ctl, 0); cpumask_copy(&cluster_mask, topology_core_cpumask(cpu)); if (cpumask_equal(&cluster_mask, &old_mask)) continue; cpumask_copy(&old_mask, &cluster_mask); smp_call_function_any(&cluster_mask, check_l2merrsr_el1_error, edev_ctl, 0); } } Read L1 on each CPU and L2 once per cluster. Does this address your feedback? > Thanks > Hanjun > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ >