From: Oded Gabbay <ogabbay@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Ofir Bitton <obitton@habana.ai>
Subject: [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event
Date: Mon, 4 Jul 2022 12:29:30 +0300 [thread overview]
Message-ID: <20220704092941.2237683-1-ogabbay@kernel.org> (raw)
From: Ofir Bitton <obitton@habana.ai>
Correctable ECC events are not fatal, but as they accumulate, the f/w
can decide that a hard-reset is required. This indication is
propagated to the host using the existing ECC event interface.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/gaudi2/gaudi2.c | 25 +++++++++++--------
.../misc/habanalabs/include/common/cpucp_if.h | 2 +-
2 files changed, 16 insertions(+), 11 deletions(-)
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index edcf23b314a7..dbbd08600a56 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -6637,7 +6637,7 @@ static void gaudi2_print_irq_info(struct hl_device *hdev, u16 event_type)
event_type, desc);
}
-static void gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
+static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
struct hl_eq_ecc_data *ecc_data)
{
u64 ecc_address = 0, ecc_syndrom = 0;
@@ -6647,8 +6647,11 @@ static void gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom);
memory_wrapper_idx = ecc_data->memory_wrapper_idx;
- dev_err(hdev->dev, "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u\n",
- ecc_address, ecc_syndrom, memory_wrapper_idx);
+ dev_err(hdev->dev,
+ "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u. critical %u.\n",
+ ecc_address, ecc_syndrom, memory_wrapper_idx, ecc_data->is_critical);
+
+ return !!ecc_data->is_critical;
}
/*
@@ -7991,9 +7994,9 @@ static bool gaudi2_handle_hbm_mc_sei_err(struct hl_device *hdev, u16 event_type,
}
dev_err_ratelimited(hdev->dev,
- "System Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Error cause: %s\n",
- hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel,
- hbm_mc_sei_cause[cause_idx]);
+ "System Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Critical(%u). Error cause: %s\n",
+ hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel,
+ sei_data->hdr.is_critical, hbm_mc_sei_cause[cause_idx]);
/* Print error-specific info */
switch (cause_idx) {
@@ -8032,6 +8035,8 @@ static bool gaudi2_handle_hbm_mc_sei_err(struct hl_device *hdev, u16 event_type,
break;
};
+ require_hard_reset |= !!sei_data->hdr.is_critical;
+
return require_hard_reset;
}
@@ -8199,7 +8204,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
{
u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY;
struct gaudi2_device *gaudi2 = hdev->asic_specific;
- bool hbm_require_reset = false, skip_reset = false;
+ bool reset_required = false, skip_reset = false;
int index, sbte_index;
u16 event_type;
@@ -8222,7 +8227,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
fallthrough;
case GAUDI2_EVENT_ROTATOR0_SERR ... GAUDI2_EVENT_ROTATOR1_DERR:
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
- gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+ reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
break;
case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_PDMA1_QM:
@@ -8387,7 +8392,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
case GAUDI2_EVENT_HBM0_MC0_SEI_SEVERE ... GAUDI2_EVENT_HBM5_MC1_SEI_NON_SEVERE:
if (gaudi2_handle_hbm_mc_sei_err(hdev, event_type, &eq_entry->sei_data)) {
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
- hbm_require_reset = true;
+ reset_required = true;
}
break;
@@ -8539,7 +8544,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
event_type);
}
- if ((gaudi2_irq_map_table[event_type].reset || hbm_require_reset) && !skip_reset)
+ if ((gaudi2_irq_map_table[event_type].reset || reset_required) && !skip_reset)
goto reset_device;
/* Send unmask irq only for interrupts not classified as MSG */
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 719b2ff80985..abf40e1c4965 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -192,7 +192,7 @@ struct hl_hbm_sei_header {
__u8 sei_cause; /* enum hl_hbm_sei_cause */
__u8 mc_channel; /* range: 0-3 */
__u8 mc_pseudo_channel; /* range: 0-7 */
- __u8 pad[1];
+ __u8 is_critical;
};
#define HBM_RD_ADDR_SID_SHIFT 0
--
2.25.1
next reply other threads:[~2022-07-04 9:29 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-07-04 9:29 Oded Gabbay [this message]
2022-07-04 9:29 ` [PATCH 02/12] habanalabs: wait for preboot ready after hard reset Oded Gabbay
2022-07-04 9:29 ` [PATCH 03/12] habanalabs: naming refactor of user interrupt flow Oded Gabbay
2022-07-04 9:29 ` [PATCH 04/12] habanalabs: add support for common decoder interrupts Oded Gabbay
2022-07-04 9:29 ` [PATCH 05/12] habanalabs: save f/w preboot minor version Oded Gabbay
2022-07-04 9:29 ` [PATCH 06/12] habanalabs: allow detection of unsupported f/w packets Oded Gabbay
2022-07-04 9:29 ` [PATCH 07/12] habanalabs/gaudi2: remove unused variable Oded Gabbay
2022-07-04 9:29 ` [PATCH 08/12] habanalabs/gaudi2: SM mask can only be 8-bit Oded Gabbay
2022-07-04 9:29 ` [PATCH 09/12] habanalabs: do not set max power on a secured device Oded Gabbay
2022-07-04 9:29 ` [PATCH 10/12] habanalabs: don't declare tmp twice in same function Oded Gabbay
2022-07-04 9:29 ` [PATCH 11/12] habanalabs: make sure variable is set before used Oded Gabbay
2022-07-04 9:29 ` [PATCH 12/12] habanalabs/gaudi2: remove unused defines Oded Gabbay
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220704092941.2237683-1-ogabbay@kernel.org \
--to=ogabbay@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=obitton@habana.ai \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox