* [PATCH 2/6] habanalabs: add device id to all threads names
2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
@ 2023-02-27 11:13 ` Oded Gabbay
2023-03-01 13:21 ` Stanislaw Gruszka
2023-02-27 11:13 ` [PATCH 3/6] habanalabs/gaudi2: break is_idle function into per-engine sub-routines Oded Gabbay
` (4 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
To: dri-devel; +Cc: Sagiv Ozeri
From: Sagiv Ozeri <sozeri@habana.ai>
Compute driver threads names will start with hlX-*, when X is the
device id.
This will help distinguish them from the NIC thread names.
Signed-off-by: Sagiv Ozeri <sozeri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/accel/habanalabs/common/device.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index e544d00fe376..7ade32487138 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -840,7 +840,7 @@ static int device_early_init(struct hl_device *hdev)
}
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
- snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
+ snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
if (hdev->cq_wq[i] == NULL) {
dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
@@ -849,14 +849,16 @@ static int device_early_init(struct hl_device *hdev)
}
}
- hdev->eq_wq = create_singlethread_workqueue("hl-events");
+ snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx);
+ hdev->eq_wq = create_singlethread_workqueue(workq_name);
if (hdev->eq_wq == NULL) {
dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
rc = -ENOMEM;
goto free_cq_wq;
}
- hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0);
+ snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx);
+ hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->cs_cmplt_wq) {
dev_err(hdev->dev,
"Failed to allocate CS completions workqueue\n");
@@ -864,7 +866,8 @@ static int device_early_init(struct hl_device *hdev)
goto free_eq_wq;
}
- hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0);
+ snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx);
+ hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->ts_free_obj_wq) {
dev_err(hdev->dev,
"Failed to allocate Timestamp registration free workqueue\n");
@@ -872,15 +875,15 @@ static int device_early_init(struct hl_device *hdev)
goto free_cs_cmplt_wq;
}
- hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
+ snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx);
+ hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
if (!hdev->prefetch_wq) {
dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
rc = -ENOMEM;
goto free_ts_free_wq;
}
- hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
- GFP_KERNEL);
+ hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
if (!hdev->hl_chip_info) {
rc = -ENOMEM;
goto free_prefetch_wq;
@@ -892,7 +895,8 @@ static int device_early_init(struct hl_device *hdev)
hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
- hdev->reset_wq = create_singlethread_workqueue("hl_device_reset");
+ snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx);
+ hdev->reset_wq = create_singlethread_workqueue(workq_name);
if (!hdev->reset_wq) {
rc = -ENOMEM;
dev_err(hdev->dev, "Failed to create device reset WQ\n");
--
2.39.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 3/6] habanalabs/gaudi2: break is_idle function into per-engine sub-routines
2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
2023-02-27 11:13 ` [PATCH 2/6] habanalabs: add device id to all threads names Oded Gabbay
@ 2023-02-27 11:13 ` Oded Gabbay
2023-02-27 11:13 ` [PATCH 4/6] habanalabs: assert return value of hw_fini Oded Gabbay
` (3 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
To: dri-devel; +Cc: Koby Elbaz
From: Koby Elbaz <kelbaz@habana.ai>
is_idle() was too long, so break it up for readability.
In addition, we can now use the new sub-routines from other places.
Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/accel/habanalabs/gaudi2/gaudi2.c | 212 ++++++++++++++++-------
1 file changed, 146 insertions(+), 66 deletions(-)
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 2021ef9d4702..82448edfdfa0 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -6651,70 +6651,17 @@ static int gaudi2_compute_reset_late_init(struct hl_device *hdev)
return hl_fw_unmask_irq_arr(hdev, gaudi2->hw_events, irq_arr_size);
}
-static void gaudi2_is_tpc_engine_idle(struct hl_device *hdev, int dcore, int inst, u32 offset,
- struct iterate_module_ctx *ctx)
-{
- struct gaudi2_tpc_idle_data *idle_data = ctx->data;
- u32 tpc_cfg_sts, qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts;
- bool is_eng_idle;
- int engine_idx;
-
- if ((dcore == 0) && (inst == (NUM_DCORE0_TPC - 1)))
- engine_idx = GAUDI2_DCORE0_ENGINE_ID_TPC_6;
- else
- engine_idx = GAUDI2_DCORE0_ENGINE_ID_TPC_0 +
- dcore * GAUDI2_ENGINE_ID_DCORE_OFFSET + inst;
-
- tpc_cfg_sts = RREG32(mmDCORE0_TPC0_CFG_STATUS + offset);
- qm_glbl_sts0 = RREG32(mmDCORE0_TPC0_QM_GLBL_STS0 + offset);
- qm_glbl_sts1 = RREG32(mmDCORE0_TPC0_QM_GLBL_STS1 + offset);
- qm_cgm_sts = RREG32(mmDCORE0_TPC0_QM_CGM_STS + offset);
-
- is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts) &&
- IS_TPC_IDLE(tpc_cfg_sts);
- *(idle_data->is_idle) &= is_eng_idle;
-
- if (idle_data->mask && !is_eng_idle)
- set_bit(engine_idx, idle_data->mask);
-
- if (idle_data->e)
- hl_engine_data_sprintf(idle_data->e,
- idle_data->tpc_fmt, dcore, inst,
- is_eng_idle ? "Y" : "N",
- qm_glbl_sts0, qm_cgm_sts, tpc_cfg_sts);
-}
-
-static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
- struct engines_data *e)
+static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+ struct engines_data *e)
{
- u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts, dma_core_idle_ind_mask,
- mme_arch_sts, dec_swreg15, dec_enabled_bit;
+ u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts, dma_core_idle_ind_mask;
struct asic_fixed_properties *prop = &hdev->asic_prop;
- const char *rot_fmt = "%-6d%-5d%-9s%#-14x%#-12x%s\n";
unsigned long *mask = (unsigned long *) mask_arr;
const char *edma_fmt = "%-6d%-6d%-9s%#-14x%#x\n";
- const char *mme_fmt = "%-5d%-6s%-9s%#-14x%#x\n";
- const char *nic_fmt = "%-5d%-9s%#-14x%#-12x\n";
- const char *pdma_fmt = "%-6d%-9s%#-14x%#x\n";
- const char *pcie_dec_fmt = "%-10d%-9s%#x\n";
- const char *dec_fmt = "%-6d%-5d%-9s%#x\n";
bool is_idle = true, is_eng_idle;
- u64 offset;
-
- struct gaudi2_tpc_idle_data tpc_idle_data = {
- .tpc_fmt = "%-6d%-5d%-9s%#-14x%#-12x%#x\n",
- .e = e,
- .mask = mask,
- .is_idle = &is_idle,
- };
- struct iterate_module_ctx tpc_iter = {
- .fn = &gaudi2_is_tpc_engine_idle,
- .data = &tpc_idle_data,
- };
-
int engine_idx, i, j;
+ u64 offset;
- /* EDMA, Two engines per Dcore */
if (e)
hl_engine_data_sprintf(e,
"\nCORE EDMA is_idle QM_GLBL_STS0 DMA_CORE_IDLE_IND_MASK\n"
@@ -6753,7 +6700,19 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
}
}
- /* PDMA, Two engines in Full chip */
+ return is_idle;
+}
+
+static bool gaudi2_get_pdma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+ struct engines_data *e)
+{
+ u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts, dma_core_idle_ind_mask;
+ unsigned long *mask = (unsigned long *) mask_arr;
+ const char *pdma_fmt = "%-6d%-9s%#-14x%#x\n";
+ bool is_idle = true, is_eng_idle;
+ int engine_idx, i;
+ u64 offset;
+
if (e)
hl_engine_data_sprintf(e,
"\nPDMA is_idle QM_GLBL_STS0 DMA_CORE_IDLE_IND_MASK\n"
@@ -6780,6 +6739,19 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
qm_glbl_sts0, dma_core_idle_ind_mask);
}
+ return is_idle;
+}
+
+static bool gaudi2_get_nic_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+ struct engines_data *e)
+{
+ unsigned long *mask = (unsigned long *) mask_arr;
+ const char *nic_fmt = "%-5d%-9s%#-14x%#-12x\n";
+ u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts;
+ bool is_idle = true, is_eng_idle;
+ int engine_idx, i;
+ u64 offset;
+
/* NIC, twelve macros in Full chip */
if (e && hdev->nic_ports_mask)
hl_engine_data_sprintf(e,
@@ -6813,6 +6785,19 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
qm_glbl_sts0, qm_cgm_sts);
}
+ return is_idle;
+}
+
+static bool gaudi2_get_mme_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+ struct engines_data *e)
+{
+ u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts, mme_arch_sts;
+ unsigned long *mask = (unsigned long *) mask_arr;
+ const char *mme_fmt = "%-5d%-6s%-9s%#-14x%#x\n";
+ bool is_idle = true, is_eng_idle;
+ int engine_idx, i;
+ u64 offset;
+
if (e)
hl_engine_data_sprintf(e,
"\nMME Stub is_idle QM_GLBL_STS0 MME_ARCH_STATUS\n"
@@ -6843,16 +6828,82 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
set_bit(engine_idx, mask);
}
- /*
- * TPC
- */
+ return is_idle;
+}
+
+static void gaudi2_is_tpc_engine_idle(struct hl_device *hdev, int dcore, int inst, u32 offset,
+ struct iterate_module_ctx *ctx)
+{
+ struct gaudi2_tpc_idle_data *idle_data = ctx->data;
+ u32 tpc_cfg_sts, qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts;
+ bool is_eng_idle;
+ int engine_idx;
+
+ if ((dcore == 0) && (inst == (NUM_DCORE0_TPC - 1)))
+ engine_idx = GAUDI2_DCORE0_ENGINE_ID_TPC_6;
+ else
+ engine_idx = GAUDI2_DCORE0_ENGINE_ID_TPC_0 +
+ dcore * GAUDI2_ENGINE_ID_DCORE_OFFSET + inst;
+
+ tpc_cfg_sts = RREG32(mmDCORE0_TPC0_CFG_STATUS + offset);
+ qm_glbl_sts0 = RREG32(mmDCORE0_TPC0_QM_GLBL_STS0 + offset);
+ qm_glbl_sts1 = RREG32(mmDCORE0_TPC0_QM_GLBL_STS1 + offset);
+ qm_cgm_sts = RREG32(mmDCORE0_TPC0_QM_CGM_STS + offset);
+
+ is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts) &&
+ IS_TPC_IDLE(tpc_cfg_sts);
+ *(idle_data->is_idle) &= is_eng_idle;
+
+ if (idle_data->mask && !is_eng_idle)
+ set_bit(engine_idx, idle_data->mask);
+
+ if (idle_data->e)
+ hl_engine_data_sprintf(idle_data->e,
+ idle_data->tpc_fmt, dcore, inst,
+ is_eng_idle ? "Y" : "N",
+ qm_glbl_sts0, qm_cgm_sts, tpc_cfg_sts);
+}
+
+static bool gaudi2_get_tpc_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+ struct engines_data *e)
+{
+ struct asic_fixed_properties *prop = &hdev->asic_prop;
+ unsigned long *mask = (unsigned long *) mask_arr;
+ bool is_idle = true;
+
+ struct gaudi2_tpc_idle_data tpc_idle_data = {
+ .tpc_fmt = "%-6d%-5d%-9s%#-14x%#-12x%#x\n",
+ .e = e,
+ .mask = mask,
+ .is_idle = &is_idle,
+ };
+ struct iterate_module_ctx tpc_iter = {
+ .fn = &gaudi2_is_tpc_engine_idle,
+ .data = &tpc_idle_data,
+ };
+
if (e && prop->tpc_enabled_mask)
hl_engine_data_sprintf(e,
- "\nCORE TPC is_idle QM_GLBL_STS0 QM_CGM_STS DMA_CORE_IDLE_IND_MASK\n"
- "---- --- -------- ------------ ---------- ----------------------\n");
+ "\nCORE TPC is_idle QM_GLBL_STS0 QM_CGM_STS STATUS\n"
+ "---- --- ------- ------------ ---------- ------\n");
gaudi2_iterate_tpcs(hdev, &tpc_iter);
+ return tpc_idle_data.is_idle;
+}
+
+static bool gaudi2_get_decoder_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+ struct engines_data *e)
+{
+ struct asic_fixed_properties *prop = &hdev->asic_prop;
+ unsigned long *mask = (unsigned long *) mask_arr;
+ const char *pcie_dec_fmt = "%-10d%-9s%#x\n";
+ const char *dec_fmt = "%-6d%-5d%-9s%#x\n";
+ bool is_idle = true, is_eng_idle;
+ u32 dec_swreg15, dec_enabled_bit;
+ int engine_idx, i, j;
+ u64 offset;
+
/* Decoders, two each Dcore and two shared PCIe decoders */
if (e && (prop->decoder_enabled_mask & (~PCIE_DEC_EN_MASK)))
hl_engine_data_sprintf(e,
@@ -6907,10 +6958,23 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
is_eng_idle ? "Y" : "N", dec_swreg15);
}
+ return is_idle;
+}
+
+static bool gaudi2_get_rotator_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+ struct engines_data *e)
+{
+ const char *rot_fmt = "%-6d%-5d%-9s%#-14x%#-14x%#x\n";
+ unsigned long *mask = (unsigned long *) mask_arr;
+ u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts;
+ bool is_idle = true, is_eng_idle;
+ int engine_idx, i;
+ u64 offset;
+
if (e)
hl_engine_data_sprintf(e,
- "\nCORE ROT is_idle QM_GLBL_STS0 QM_CGM_STS DMA_CORE_STS0\n"
- "---- ---- ------- ------------ ---------- -------------\n");
+ "\nCORE ROT is_idle QM_GLBL_STS0 QM_GLBL_STS1 QM_CGM_STS\n"
+ "---- --- ------- ------------ ------------ ----------\n");
for (i = 0 ; i < NUM_OF_ROT ; i++) {
engine_idx = GAUDI2_ENGINE_ID_ROT_0 + i;
@@ -6929,12 +6993,28 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
if (e)
hl_engine_data_sprintf(e, rot_fmt, i, 0, is_eng_idle ? "Y" : "N",
- qm_glbl_sts0, qm_cgm_sts, "-");
+ qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts);
}
return is_idle;
}
+bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+ struct engines_data *e)
+{
+ bool is_idle = true;
+
+ is_idle &= gaudi2_get_edma_idle_status(hdev, mask_arr, mask_len, e);
+ is_idle &= gaudi2_get_pdma_idle_status(hdev, mask_arr, mask_len, e);
+ is_idle &= gaudi2_get_nic_idle_status(hdev, mask_arr, mask_len, e);
+ is_idle &= gaudi2_get_mme_idle_status(hdev, mask_arr, mask_len, e);
+ is_idle &= gaudi2_get_tpc_idle_status(hdev, mask_arr, mask_len, e);
+ is_idle &= gaudi2_get_decoder_idle_status(hdev, mask_arr, mask_len, e);
+ is_idle &= gaudi2_get_rotator_idle_status(hdev, mask_arr, mask_len, e);
+
+ return is_idle;
+}
+
static void gaudi2_hw_queues_lock(struct hl_device *hdev)
__acquires(&gaudi2->hw_queues_lock)
{
--
2.39.2
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH 4/6] habanalabs: assert return value of hw_fini
2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
2023-02-27 11:13 ` [PATCH 2/6] habanalabs: add device id to all threads names Oded Gabbay
2023-02-27 11:13 ` [PATCH 3/6] habanalabs/gaudi2: break is_idle function into per-engine sub-routines Oded Gabbay
@ 2023-02-27 11:13 ` Oded Gabbay
2023-03-01 13:31 ` Stanislaw Gruszka
2023-02-27 11:13 ` [PATCH 5/6] habanalabs: use notifications and graceful reset for decoder Oded Gabbay
` (2 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
To: dri-devel; +Cc: Dafna Hirschfeld
From: Dafna Hirschfeld <dhirschfeld@habana.ai>
Since hw_fini return error code for failure indication, we should
check its return value. Currently it might only fail upon soft-reset
from hl_device_reset. Later patch will add hw_fini failure in case of
polling timeout in hard-reset.
Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/accel/habanalabs/common/device.c | 12 +++++++++---
drivers/accel/habanalabs/gaudi/gaudi.c | 7 ++++++-
drivers/accel/habanalabs/gaudi2/gaudi2.c | 7 ++++++-
drivers/accel/habanalabs/goya/goya.c | 7 ++++++-
4 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 7ade32487138..99e793dfb126 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1472,7 +1472,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
schedule_hard_reset = false, delay_reset, from_dev_release, from_watchdog_thread;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
struct hl_ctx *ctx;
- int i, rc;
+ int i, rc, hw_fini_rc;
if (!hdev->init_done) {
dev_err(hdev->dev, "Can't reset before initialization is done\n");
@@ -1634,7 +1634,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
}
/* Reset the H/W. It will be in idle state after this returns */
- hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
+ hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
if (hard_reset) {
hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
@@ -1661,6 +1661,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hl_ctx_put(ctx);
}
+ if (hw_fini_rc) {
+ rc = hw_fini_rc;
+ goto out_err;
+ }
/* Finished tear-down, starting to re-initialize */
if (hard_reset) {
@@ -2416,7 +2420,9 @@ void hl_device_fini(struct hl_device *hdev)
hl_cb_pool_fini(hdev);
/* Reset the H/W. It will be in idle state after this returns */
- hdev->asic_funcs->hw_fini(hdev, true, false);
+ rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+ if (rc)
+ dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index 26287084a9e0..60146fd4de6b 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -868,13 +868,18 @@ static int gaudi_early_init(struct hl_device *hdev)
rc = hl_fw_read_preboot_status(hdev);
if (rc) {
if (hdev->reset_on_preboot_fail)
+ /* we are already on failure flow, so don't check if hw_fini fails. */
hdev->asic_funcs->hw_fini(hdev, true, false);
goto pci_fini;
}
if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
- hdev->asic_funcs->hw_fini(hdev, true, false);
+ rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+ if (rc) {
+ dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+ goto pci_fini;
+ }
}
return 0;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 82448edfdfa0..f01fa4bca381 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2886,13 +2886,18 @@ static int gaudi2_early_init(struct hl_device *hdev)
rc = hl_fw_read_preboot_status(hdev);
if (rc) {
if (hdev->reset_on_preboot_fail)
+ /* we are already on failure flow, so don't check if hw_fini fails. */
hdev->asic_funcs->hw_fini(hdev, true, false);
goto pci_fini;
}
if (gaudi2_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
- hdev->asic_funcs->hw_fini(hdev, true, false);
+ rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+ if (rc) {
+ dev_err(hdev->dev, "failed to reset HW during early init (%d)\n", rc);
+ goto pci_fini;
+ }
}
return 0;
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index 7a45ab3ca43a..39f9e5de1f4c 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -669,13 +669,18 @@ static int goya_early_init(struct hl_device *hdev)
rc = hl_fw_read_preboot_status(hdev);
if (rc) {
if (hdev->reset_on_preboot_fail)
+ /* we are already on failure flow, so don't check if hw_fini fails. */
hdev->asic_funcs->hw_fini(hdev, true, false);
goto pci_fini;
}
if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
- hdev->asic_funcs->hw_fini(hdev, true, false);
+ rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+ if (rc) {
+ dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+ goto pci_fini;
+ }
}
if (!hdev->pldm) {
--
2.39.2
^ permalink raw reply related [flat|nested] 10+ messages in thread