public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Muralidhara M K <muralimk@amd.com>
To: <linux-edac@vger.kernel.org>, <x86@kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
	<mingo@redhat.com>, <mchehab@kernel.org>, <nchatrad@amd.com>,
	<yazen.ghannam@amd.com>, Muralidhara M K <muralidhara.mk@amd.com>,
	Naveen Krishna Chatradhi <naveenkrishna.chatradhi@amd.com>
Subject: [PATCH 6/7] EDAC/amd64: Add error instance get_err_info() to pvt->ops
Date: Thu, 20 Jul 2023 12:54:24 +0000	[thread overview]
Message-ID: <20230720125425.3735538-7-muralimk@amd.com> (raw)
In-Reply-To: <20230720125425.3735538-1-muralimk@amd.com>

From: Muralidhara M K <muralidhara.mk@amd.com>

On CPUs, the data fabric ID of an instance is equal to the UMC number.
Since the UMC number and channel are equal in CPU nodes, the channel
can be used as the data fabric ID of the instance.

A GPU node has 'X' number of PHYs and 'Y' number of channels.
This results in 'X*Y' number of instances in the data fabric.
Therefore, the data fabric ID of an instance in a GPU is computed as below:
  df_inst_id = 'X' * number of channels per PHY + 'Y'

Co-developed-by: Naveen Krishna Chatradhi <naveenkrishna.chatradhi@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <naveenkrishna.chatradhi@amd.com>
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
---
 drivers/edac/amd64_edac.c | 36 +++++++++++++++++++++++++++++++++++-
 drivers/edac/amd64_edac.h |  2 ++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 45d8093c117a..74b2b47cc22a 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3047,6 +3047,17 @@ static inline void decode_bus_error(int node_id, struct mce *m)
 	__log_ecc_error(mci, &err, ecc_type);
 }
 
+/*
+ * On CPUs, the data fabric ID of an instance is equal to the UMC number,
+ * and since the UMC number and channel are equal in CPU nodes, the channel can be
+ * used as the data fabric ID of the instance.
+ */
+static int umc_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+		       struct err_info *err)
+{
+	return err->channel;
+}
+
 /*
  * To find the UMC channel represented by this bank we need to match on its
  * instance_id. The instance_id of a bank is held in the lower 32 bits of its
@@ -3071,6 +3082,7 @@ static void decode_umc_error(int node_id, struct mce *m)
 	struct mem_ctl_info *mci;
 	struct amd64_pvt *pvt;
 	struct err_info err;
+	u8 df_inst_id;
 	u64 sys_addr;
 
 	node_id = fixup_node_id(node_id, m);
@@ -3101,8 +3113,9 @@ static void decode_umc_error(int node_id, struct mce *m)
 	}
 
 	pvt->ops->get_err_info(m, &err);
+	df_inst_id = pvt->ops->get_inst_id(mci, pvt, &err);
 
-	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
+	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) {
 		err.err_code = ERR_NORM_ADDR;
 		goto log_error;
 	}
@@ -3758,6 +3771,25 @@ static int umc_hw_info_get(struct amd64_pvt *pvt)
 	return 0;
 }
 
+/*
+ * A GPU node has 'X' number of PHYs and 'Y' number of channels.
+ * This results in 'X*Y' number of instances in the data fabric.
+ * Therefore the data fabric ID of an instance can be found with the following formula:
+ * df_inst_id = 'X' * number of channels per PHY + 'Y'
+ *
+ */
+static int gpu_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+		       struct err_info *err)
+{
+	int i, channels = 0;
+
+	/* The memory channels in case of GPUs are fully populated */
+	for_each_umc(i)
+		channels += pvt->csels[i].b_cnt;
+
+	return (err->csrow * channels / mci->nr_csrows) + err->channel;
+}
+
 /*
  * The CPUs have one channel per UMC, so UMC number is equivalent to a
  * channel number. The GPUs have 8 channels per UMC, so the UMC number no
@@ -4015,6 +4047,7 @@ static struct low_ops umc_ops = {
 	.setup_mci_misc_attrs		= umc_setup_mci_misc_attrs,
 	.dump_misc_regs			= umc_dump_misc_regs,
 	.get_err_info			= umc_get_err_info,
+	.get_inst_id			= umc_inst_id,
 };
 
 static struct low_ops gpu_ops = {
@@ -4023,6 +4056,7 @@ static struct low_ops gpu_ops = {
 	.setup_mci_misc_attrs		= gpu_setup_mci_misc_attrs,
 	.dump_misc_regs			= gpu_dump_misc_regs,
 	.get_err_info			= gpu_get_err_info,
+	.get_inst_id			= gpu_inst_id,
 };
 
 /* Use Family 16h versions for defaults and adjust as needed below. */
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 5a4e4a59682b..d9e9e62dd4b1 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -471,6 +471,8 @@ struct low_ops {
 	void (*setup_mci_misc_attrs)(struct mem_ctl_info *mci);
 	void (*dump_misc_regs)(struct amd64_pvt *pvt);
 	void (*get_err_info)(struct mce *m, struct err_info *err);
+	int  (*get_inst_id)(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+			    struct err_info *err);
 };
 
 int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
-- 
2.25.1


  parent reply	other threads:[~2023-07-20 12:55 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-20 12:54 [PATCH 0/7] AMD Family 19h Models 90h-9fh EDAC Support Muralidhara M K
2023-07-20 12:54 ` [PATCH 1/7] x86/amd_nb: Add AMD Family 19h Models(80h-80fh) and (90h-9fh) PCI IDs Muralidhara M K
2023-07-21 14:44   ` Yazen Ghannam
2023-07-20 12:54 ` [PATCH 2/7] EDAC/mce_amd: Remove SMCA Extended Error code descriptions Muralidhara M K
2023-07-20 13:59   ` Borislav Petkov
2023-07-20 15:25     ` M K, Muralidhara
2023-07-20 15:55       ` Borislav Petkov
2023-07-21 14:45         ` Yazen Ghannam
2023-10-24  6:18           ` M K, Muralidhara
2023-07-20 12:54 ` [PATCH 3/7] x86/MCE/AMD: Add new MA_LLC, USR_DP, and USR_CP bank types Muralidhara M K
2023-07-22  8:20   ` Borislav Petkov
2023-07-20 12:54 ` [PATCH 4/7] EDAC/mc: Add new HBM3 memory type Muralidhara M K
2023-08-03 10:27   ` Borislav Petkov
2023-07-20 12:54 ` [PATCH 5/7] EDAC/amd64: Add Fam19h Model 90h ~ 9fh enumeration support Muralidhara M K
2023-08-05 10:10   ` Borislav Petkov
2023-07-20 12:54 ` Muralidhara M K [this message]
2023-07-21 14:47   ` [PATCH 6/7] EDAC/amd64: Add error instance get_err_info() to pvt->ops Yazen Ghannam
2023-07-20 12:54 ` [PATCH 7/7] EDAC/amd64: Add Error address conversion for UMC Muralidhara M K
2023-07-21 14:49   ` Yazen Ghannam

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230720125425.3735538-7-muralimk@amd.com \
    --to=muralimk@amd.com \
    --cc=bp@alien8.de \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mchehab@kernel.org \
    --cc=mingo@redhat.com \
    --cc=muralidhara.mk@amd.com \
    --cc=naveenkrishna.chatradhi@amd.com \
    --cc=nchatrad@amd.com \
    --cc=x86@kernel.org \
    --cc=yazen.ghannam@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox