From: "Luck, Tony" <tony.luck@intel.com>
To: Borislav Petkov <bp@suse.de>
Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>,
Tony Luck <tony.luck@intel.com>, Borislav Petkov <bp@alien8.de>,
Mauro Carvalho Chehab <mchehab@kernel.org>,
Aristeu Rozanski <aris@redhat.com>,
linux-edac@vger.kernel.org
Subject: [3/3] EDAC, sb_edac: Fix reporting wrong DIMM when patrol scrubber finds error
Date: Tue, 4 Sep 2018 14:07:59 -0700 [thread overview]
Message-ID: <20180904210759.3814-4-tony.luck@intel.com> (raw)
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
EDAC driver sometimes reports the wrong DIMM for a memory
error found by the patrol scrubber. It's rooted in h/w that
only provides a 4KB page aligned address for the error case.
This means that the EDAC driver will point at the DIMM matching
offset 0x0 in the 4KB page, but because of interleaving across
channels and ranks the actual DIMM involved may be different
if the error is on some other cache line within the page.
For this error case, we could pass the socket/iMC/channel
information from the "mce" structure passed the EDAC driver
and "dimm=-1" to the EDAC core. So it will report all the
DIMMs on that channel may be affected.
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
drivers/edac/sb_edac.c | 95 +++++++++++++++++++++++++++++++++++++++---
1 file changed, 89 insertions(+), 6 deletions(-)
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index f3678cdada83..f6009c7d452b 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -326,6 +326,7 @@ struct sbridge_info {
const struct interleave_pkg *interleave_pkg;
u8 max_sad;
u8 (*get_node_id)(struct sbridge_pvt *pvt);
+ u8 (*get_ha)(u8 bank);
enum mem_type (*get_memory_type)(struct sbridge_pvt *pvt);
enum dev_type (*get_width)(struct sbridge_pvt *pvt, u32 mtr);
struct pci_dev *pci_vtd;
@@ -1002,6 +1003,22 @@ static u8 knl_get_node_id(struct sbridge_pvt *pvt)
return GET_BITFIELD(reg, 0, 2);
}
+static u8 sbridge_get_ha(u8 bank)
+{
+ return 0;
+}
+
+static u8 ibridge_get_ha(u8 bank)
+{
+ switch (bank) {
+ case 7 ... 8:
+ return bank - 7;
+ case 9 ... 16:
+ return (bank - 9) / 4;
+ default:
+ return -EINVAL;
+ }
+}
static u64 haswell_get_tolm(struct sbridge_pvt *pvt)
{
@@ -2207,6 +2224,56 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
return 0;
}
+static int get_memory_error_data_from_mce(struct mem_ctl_info *mci,
+ const struct mce *m, u8 *socket,
+ u8 *ha, long *channel_mask,
+ char *msg)
+{
+ u32 reg, channel = GET_BITFIELD(m->status, 0, 3);
+ struct mem_ctl_info *new_mci;
+ struct sbridge_pvt *pvt;
+ struct pci_dev *pci_ha;
+ bool tad0;
+
+ if (channel >= NUM_CHANNELS) {
+ sprintf(msg, "Invalid channel 0x%x", channel);
+ return -EINVAL;
+ }
+
+ pvt = mci->pvt_info;
+ *ha = pvt->info.get_ha(m->bank);
+ if (*ha != 0 && *ha != 1) {
+ sprintf(msg, "Invalid ha 0x%x", *ha);
+ return -EINVAL;
+ }
+
+ *socket = m->socketid;
+ new_mci = get_mci_for_node_id(*socket, *ha);
+ if (!new_mci) {
+ strcpy(msg, "mci socket got corrupted!");
+ return -EINVAL;
+ }
+
+ pvt = new_mci->pvt_info;
+ pci_ha = pvt->pci_ha;
+ pci_read_config_dword(pci_ha, tad_dram_rule[0], ®);
+ tad0 = m->addr <= TAD_LIMIT(reg);
+
+ *channel_mask = 1 << channel;
+ if (pvt->mirror_mode == FULL_MIRRORING ||
+ (pvt->mirror_mode == ADDR_RANGE_MIRRORING && tad0)) {
+ *channel_mask |= 1 << ((channel + 2) % 4);
+ pvt->is_cur_addr_mirrored = true;
+ } else {
+ pvt->is_cur_addr_mirrored = false;
+ }
+
+ if (pvt->is_lockstep)
+ *channel_mask |= 1 << ((channel + 1) % 4);
+
+ return 0;
+}
+
/****************************************************************************
Device initialization routines: put/get, init/exit
****************************************************************************/
@@ -2877,10 +2944,16 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
u32 errcode = GET_BITFIELD(m->status, 0, 15);
u32 channel = GET_BITFIELD(m->status, 0, 3);
u32 optypenum = GET_BITFIELD(m->status, 4, 6);
+ /*
+ * Bits 5-0 of MCi_MISC give the least significant bit that is valid.
+ * A value 6 is for cache line aligned address, a value 12 is for page
+ * aligned address reported by patrol scrubber.
+ */
+ u32 lsb = GET_BITFIELD(m->misc, 0, 5);
long channel_mask, first_channel;
- u8 rank, socket, ha;
+ u8 rank = 0xff, socket, ha;
int rc, dimm;
- char *area_type = NULL;
+ char *area_type = "DRAM";
if (pvt->info.type != SANDY_BRIDGE)
recoverable = true;
@@ -2964,9 +3037,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
optype, msg);
}
return;
- } else {
+ } else if (lsb < 12) {
rc = get_memory_error_data(mci, m->addr, &socket, &ha,
- &channel_mask, &rank, &area_type, msg);
+ &channel_mask, &rank,
+ &area_type, msg);
+ } else {
+ rc = get_memory_error_data_from_mce(mci, m, &socket, &ha,
+ &channel_mask, msg);
}
if (rc < 0)
@@ -2981,14 +3058,15 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
first_channel = find_first_bit(&channel_mask, NUM_CHANNELS);
- if (rank < 4)
+ if (rank == 0xff)
+ dimm = -1;
+ else if (rank < 4)
dimm = 0;
else if (rank < 8)
dimm = 1;
else
dimm = 2;
-
/*
* FIXME: On some memory configurations (mirror, lockstep), the
* Memory Controller can't point the error to a single DIMM. The
@@ -3175,6 +3253,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = get_memory_type;
pvt->info.get_node_id = get_node_id;
+ pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@@ -3199,6 +3278,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = sbridge_dram_rule;
pvt->info.get_memory_type = get_memory_type;
pvt->info.get_node_id = get_node_id;
+ pvt->info.get_ha = sbridge_get_ha;
pvt->info.rir_limit = rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@@ -3223,6 +3303,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = haswell_get_memory_type;
pvt->info.get_node_id = haswell_get_node_id;
+ pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = haswell_rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@@ -3247,6 +3328,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = haswell_get_memory_type;
pvt->info.get_node_id = haswell_get_node_id;
+ pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = haswell_rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@@ -3271,6 +3353,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = knl_dram_rule;
pvt->info.get_memory_type = knl_get_memory_type;
pvt->info.get_node_id = knl_get_node_id;
+ pvt->info.get_ha = NULL;
pvt->info.rir_limit = NULL;
pvt->info.sad_limit = knl_sad_limit;
pvt->info.interleave_mode = knl_interleave_mode;
next reply other threads:[~2018-09-04 21:07 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-09-04 21:07 Luck, Tony [this message]
-- strict thread matches above, loose matches on Subject: below --
2018-09-06 10:53 [3/3] EDAC, sb_edac: Fix reporting wrong DIMM when patrol scrubber finds error Borislav Petkov
2018-09-06 11:48 Qiuxu Zhuo
2018-09-06 12:34 Borislav Petkov
2018-09-06 13:12 Qiuxu Zhuo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180904210759.3814-4-tony.luck@intel.com \
--to=tony.luck@intel.com \
--cc=aris@redhat.com \
--cc=bp@alien8.de \
--cc=bp@suse.de \
--cc=linux-edac@vger.kernel.org \
--cc=mchehab@kernel.org \
--cc=qiuxu.zhuo@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox