From: "Luck, Tony" <tony.luck@intel.com>
To: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>,
Tony Luck <tony.luck@intel.com>, Borislav Petkov <bp@alien8.de>,
Aristeu Rozanski <aris@redhat.com>,
Mauro Carvalho Chehab <mchehab@s-opensource.com>,
linux-edac@vger.kernel.org, linux-acpi@vger.kernel.org
Subject: [v2,3/3] EDAC, skx_edac: Add address translation for non-volatile DIMMs
Date: Thu, 11 Oct 2018 14:12:36 -0700 [thread overview]
Message-ID: <20181011211236.23525-4-tony.luck@intel.com> (raw)
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Current skx_edac driver doesn't support address translation for
non-volatile DIMMs.
The ACPI ADXL DSM method support address translation for both
volatile DIMMs and non-volatile DIMMs. So switch skx_edac to use
the wrapped ACPI DSM methods, if they are supported and there are
non-volatile DIMMs populated on the system.
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
drivers/edac/Kconfig | 1 +
drivers/edac/skx_edac.c | 191 +++++++++++++++++++++++++++++++++++++---
2 files changed, 179 insertions(+), 13 deletions(-)
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 57304b2e989f..ffd349c12479 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -234,6 +234,7 @@ config EDAC_SKX
depends on PCI && X86_64 && X86_MCE_INTEL && PCI_MMCONFIG
depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_SKX can't be y
select DMI
+ select ACPI_ADXL
help
Support for error detection and correction the Intel
Skylake server Integrated Memory Controllers. If your
diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c
index a710169abdbc..598564f9c4b2 100644
--- a/drivers/edac/skx_edac.c
+++ b/drivers/edac/skx_edac.c
@@ -26,6 +26,7 @@
#include <linux/bitmap.h>
#include <linux/math64.h>
#include <linux/mod_devicetable.h>
+#include <linux/adxl.h>
#include <acpi/nfit.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
@@ -35,6 +36,7 @@
#include "edac_module.h"
#define EDAC_MOD_STR "skx_edac"
+#define MSG_SIZE 1024
/*
* Debug macros
@@ -54,6 +56,26 @@
static LIST_HEAD(skx_edac_list);
static u64 skx_tolm, skx_tohm;
+static int nvdimm_count;
+static u64 *adxl_values;
+
+enum {
+ INDEX_SOCKET,
+ INDEX_MEMCTRL,
+ INDEX_CHANNEL,
+ INDEX_DIMM,
+ INDEX_MAX
+};
+static char component_names[][32] = {
+ [INDEX_SOCKET] = "ProcessorSocketId",
+ [INDEX_MEMCTRL] = "MemoryControllerId",
+ [INDEX_CHANNEL] = "ChannelId",
+ [INDEX_DIMM] = "DimmSlotId",
+};
+
+static int component_indices[ARRAY_SIZE(component_names)];
+static int adxl_component_count;
+static const char * const *adxl_component_names;
#define NUM_IMC 2 /* memory controllers per socket */
#define NUM_CHANNELS 3 /* channels per memory controller */
@@ -102,11 +124,11 @@ struct decoded_addr {
u64 addr;
int socket;
int imc;
- int channel;
+ u64 channel;
u64 chan_addr;
int sktways;
int chanways;
- int dimm;
+ u64 dimm;
int rank;
int channel_rank;
u64 rank_address;
@@ -393,6 +415,8 @@ static int get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc,
u16 flags;
u64 size = 0;
+ nvdimm_count++;
+
dev_handle = ACPI_NFIT_BUILD_DEVICE_HANDLE(dimmno, chan, imc->lmc,
imc->src_id, 0);
@@ -682,7 +706,7 @@ static bool skx_sad_decode(struct decoded_addr *res)
res->imc = GET_BITFIELD(d->mcroute, lchan * 3, lchan * 3 + 2);
res->channel = GET_BITFIELD(d->mcroute, lchan * 2 + 18, lchan * 2 + 19);
- edac_dbg(2, "%llx: socket=%d imc=%d channel=%d\n",
+ edac_dbg(2, "%llx: socket=%d imc=%d channel=%llu\n",
res->addr, res->socket, res->imc, res->channel);
return true;
}
@@ -818,7 +842,7 @@ static bool skx_rir_decode(struct decoded_addr *res)
res->dimm = chan_rank / 4;
res->rank = chan_rank % 4;
- edac_dbg(2, "%llx: dimm=%d rank=%d chan_rank=%d rank_addr=%llx\n",
+ edac_dbg(2, "%llx: dimm=%llu rank=%d chan_rank=%d rank_addr=%llx\n",
res->addr, res->dimm, res->rank,
res->channel_rank, res->rank_address);
return true;
@@ -894,12 +918,48 @@ static bool skx_decode(struct decoded_addr *res)
skx_rir_decode(res) && skx_mad_decode(res);
}
+static bool skx_dsm_decode(u64 addr, char *msg, int msglen,
+ u64 *sock, u64 *imc,
+ u64 *chan, u64 *dimm)
+
+{
+ int i, len = 0;
+
+ if (addr >= skx_tohm || (addr >= skx_tolm && addr < BIT_ULL(32))) {
+ edac_dbg(0, "Address %llx out of range\n", addr);
+ return false;
+ }
+
+ if (adxl_decode(addr, adxl_values)) {
+ edac_dbg(0, "Failed to decode 0x%llx\n", addr);
+ return false;
+ }
+
+ *sock = adxl_values[component_indices[INDEX_SOCKET]];
+ *imc = adxl_values[component_indices[INDEX_MEMCTRL]];
+ *chan = adxl_values[component_indices[INDEX_CHANNEL]];
+ *dimm = adxl_values[component_indices[INDEX_DIMM]];
+
+ for (i = 0; i < adxl_component_count; i++) {
+ if (adxl_values[i] == ~0x0ull)
+ continue;
+ len += snprintf(msg + len, msglen, " %s:0x%llx",
+ adxl_component_names[i], adxl_values[i]);
+
+ if (msglen - len <= 0)
+ break;
+ }
+
+ return true;
+}
+
static void skx_mce_output_error(struct mem_ctl_info *mci,
const struct mce *m,
- struct decoded_addr *res)
+ struct decoded_addr *res,
+ char *msg)
{
enum hw_event_mc_err_type tp_event;
- char *type, *optype, msg[256];
+ char *type, *optype;
bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
bool overflow = GET_BITFIELD(m->status, 62, 62);
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
@@ -960,13 +1020,11 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
}
}
- snprintf(msg, sizeof(msg),
- "%s%s err_code:%04x:%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:%x col:%x",
+ snprintf(msg, MSG_SIZE,
+ "%s%s err_code:%04x:%04x %s",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
- mscod, errcode,
- res->socket, res->imc, res->rank,
- res->bank_group, res->bank_address, res->row, res->column);
+ mscod, errcode, msg + MSG_SIZE);
edac_dbg(0, "%s\n", msg);
@@ -977,13 +1035,33 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
optype, msg);
}
+static struct mem_ctl_info *get_mci(u64 src_id, u64 lmc)
+{
+ struct skx_dev *d;
+
+ if (lmc > NUM_IMC - 1) {
+ skx_printk(KERN_ERR, "Bad mc# 0x%llx\n", lmc);
+ return NULL;
+ }
+
+ list_for_each_entry(d, &skx_edac_list, list) {
+ if (d->imc[0].src_id == src_id)
+ return d->imc[lmc].mci;
+ }
+
+ skx_printk(KERN_ERR, "No mci for src_id %llx lmc %llx\n", src_id, lmc);
+
+ return NULL;
+}
+
static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
struct decoded_addr res;
struct mem_ctl_info *mci;
- char *type;
+ u64 socket, memctrl;
+ char *type, *msg;
if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;
@@ -992,11 +1070,35 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
return NOTIFY_DONE;
+ msg = kzalloc(MSG_SIZE * 2, GFP_KERNEL);
+ if (!msg)
+ return NOTIFY_DONE;
+
+ if (adxl_component_count)
+ goto dsm_decoding;
+
res.addr = mce->addr;
if (!skx_decode(&res))
return NOTIFY_DONE;
mci = res.dev->imc[res.imc].mci;
+ snprintf(msg + MSG_SIZE, MSG_SIZE,
+ "socket:%d imc:%d chan:%llu dimm:%llu rank:%d bg:%d ba:%d row:%x col:%x",
+ res.socket, res.imc, res.channel, res.dimm, res.rank,
+ res.bank_group, res.bank_address, res.row, res.column);
+
+ goto decoded;
+
+dsm_decoding:
+ if (!skx_dsm_decode(mce->addr, msg + MSG_SIZE, MSG_SIZE,
+ &socket, &memctrl, &res.channel, &res.dimm))
+ goto out;
+
+ mci = get_mci(socket, memctrl);
+ if (!mci)
+ goto out;
+
+decoded:
if (mce->mcgstatus & MCG_STATUS_MCIP)
type = "Exception";
else
@@ -1015,8 +1117,10 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
"%u APIC %x\n", mce->cpuvendor, mce->cpuid,
mce->time, mce->socketid, mce->apicid);
- skx_mce_output_error(mci, mce, &res);
+ skx_mce_output_error(mci, mce, &res, msg);
+out:
+ kfree(msg);
return NOTIFY_DONE;
}
@@ -1090,6 +1194,54 @@ static void skx_remove(void)
}
}
+static void __init skx_dsm_get(void)
+{
+ const char * const *names;
+ int i, j;
+
+ names = adxl_get_component_names();
+ if (!names) {
+ skx_printk(KERN_NOTICE, "No firmware support for address translation.");
+ skx_printk(KERN_CONT, " Only decoding DDR4 address!\n");
+ return;
+ }
+
+ for (i = 0; i < INDEX_MAX; i++) {
+ for (j = 0; names[j]; j++) {
+ if (!strcmp(component_names[i], names[j])) {
+ component_indices[i] = j;
+ break;
+ }
+ }
+
+ if (!names[j])
+ goto err;
+ }
+
+ adxl_component_names = names;
+ while (*names++)
+ adxl_component_count++;
+
+ adxl_values = kcalloc(adxl_component_count, sizeof(*adxl_values), GFP_KERNEL);
+ if (!adxl_values) {
+ adxl_component_count = 0;
+ edac_dbg(0, "No memory for adxl_decode()\n");
+ }
+
+ return;
+err:
+ skx_printk(KERN_ERR, "'%s' is not matched from DSM parameters: ",
+ component_names[i]);
+ for (j = 0; names[j]; j++)
+ skx_printk(KERN_CONT, "%s ", names[j]);
+ skx_printk(KERN_CONT, "\n");
+}
+
+static void __exit skx_dsm_put(void)
+{
+ kfree(adxl_values);
+}
+
/*
* skx_init:
* make sure we are running on the correct cpu model
@@ -1154,6 +1306,9 @@ static int __init skx_init(void)
}
}
+ if (nvdimm_count)
+ skx_dsm_get();
+
/* Ensure that the OPSTATE is set correctly for POLL or NMI */
opstate_init();
@@ -1170,6 +1325,8 @@ static int __init skx_init(void)
static void __exit skx_exit(void)
{
edac_dbg(2, "\n");
+ if (nvdimm_count)
+ skx_dsm_put();
mce_unregister_decode_chain(&skx_mce_dec);
skx_remove();
teardown_skx_debug();
@@ -1180,6 +1337,14 @@ module_exit(skx_exit);
module_param(edac_op_state, int, 0444);
MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
+module_param_string(sock, component_names[0], sizeof(component_names[0]), 0644);
+MODULE_PARM_DESC(sock, "String to get socket ID");
+module_param_string(imc, component_names[1], sizeof(component_names[1]), 0644);
+MODULE_PARM_DESC(imc, "String to get IMC ID");
+module_param_string(chan, component_names[2], sizeof(component_names[2]), 0644);
+MODULE_PARM_DESC(chan, "String to get channel ID");
+module_param_string(dimm, component_names[3], sizeof(component_names[3]), 0644);
+MODULE_PARM_DESC(dimm, "String to get DIMM ID");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Tony Luck");
next reply other threads:[~2018-10-11 21:12 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-10-11 21:12 Luck, Tony [this message]
-- strict thread matches above, loose matches on Subject: below --
2018-10-16 8:58 [v2,3/3] EDAC, skx_edac: Add address translation for non-volatile DIMMs Borislav Petkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20181011211236.23525-4-tony.luck@intel.com \
--to=tony.luck@intel.com \
--cc=aris@redhat.com \
--cc=bp@alien8.de \
--cc=linux-acpi@vger.kernel.org \
--cc=linux-edac@vger.kernel.org \
--cc=mchehab@s-opensource.com \
--cc=qiuxu.zhuo@intel.com \
--cc=rafael.j.wysocki@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox