All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oded Gabbay <ogabbay@kernel.org>
To: dri-devel@lists.freedesktop.org
Cc: Dafna Hirschfeld <dhirschfeld@habana.ai>
Subject: [PATCH 4/6] habanalabs: assert return value of hw_fini
Date: Mon, 27 Feb 2023 13:13:04 +0200	[thread overview]
Message-ID: <20230227111306.3985896-4-ogabbay@kernel.org> (raw)
In-Reply-To: <20230227111306.3985896-1-ogabbay@kernel.org>

From: Dafna Hirschfeld <dhirschfeld@habana.ai>

Since hw_fini return error code for failure indication, we should
check its return value. Currently it might only fail upon soft-reset
from hl_device_reset. Later patch will add hw_fini failure in case of
polling timeout in hard-reset.

Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c | 12 +++++++++---
 drivers/accel/habanalabs/gaudi/gaudi.c   |  7 ++++++-
 drivers/accel/habanalabs/gaudi2/gaudi2.c |  7 ++++++-
 drivers/accel/habanalabs/goya/goya.c     |  7 ++++++-
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 7ade32487138..99e793dfb126 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1472,7 +1472,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		schedule_hard_reset = false, delay_reset, from_dev_release, from_watchdog_thread;
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	struct hl_ctx *ctx;
-	int i, rc;
+	int i, rc, hw_fini_rc;
 
 	if (!hdev->init_done) {
 		dev_err(hdev->dev, "Can't reset before initialization is done\n");
@@ -1634,7 +1634,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	}
 
 	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
+	hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
 
 	if (hard_reset) {
 		hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
@@ -1661,6 +1661,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		hl_ctx_put(ctx);
 	}
 
+	if (hw_fini_rc) {
+		rc = hw_fini_rc;
+		goto out_err;
+	}
 	/* Finished tear-down, starting to re-initialize */
 
 	if (hard_reset) {
@@ -2416,7 +2420,9 @@ void hl_device_fini(struct hl_device *hdev)
 	hl_cb_pool_fini(hdev);
 
 	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, true, false);
+	rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+	if (rc)
+		dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
 
 	hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
 
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index 26287084a9e0..60146fd4de6b 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -868,13 +868,18 @@ static int gaudi_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}
 
 	if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+			goto pci_fini;
+		}
 	}
 
 	return 0;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 82448edfdfa0..f01fa4bca381 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2886,13 +2886,18 @@ static int gaudi2_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}
 
 	if (gaudi2_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW during early init (%d)\n", rc);
+			goto pci_fini;
+		}
 	}
 
 	return 0;
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index 7a45ab3ca43a..39f9e5de1f4c 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -669,13 +669,18 @@ static int goya_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}
 
 	if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+			goto pci_fini;
+		}
 	}
 
 	if (!hdev->pldm) {
-- 
2.39.2


  parent reply	other threads:[~2023-02-27 11:13 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
2023-02-27 11:13 ` [PATCH 2/6] habanalabs: add device id to all threads names Oded Gabbay
2023-03-01 13:21   ` Stanislaw Gruszka
2023-02-27 11:13 ` [PATCH 3/6] habanalabs/gaudi2: break is_idle function into per-engine sub-routines Oded Gabbay
2023-02-27 11:13 ` Oded Gabbay [this message]
2023-03-01 13:31   ` [PATCH 4/6] habanalabs: assert return value of hw_fini Stanislaw Gruszka
2023-02-27 11:13 ` [PATCH 5/6] habanalabs: use notifications and graceful reset for decoder Oded Gabbay
2023-03-01 13:34   ` Stanislaw Gruszka
2023-02-27 11:13 ` [PATCH 6/6] habanalabs/gaudi2: verify return code after scrubbing ARCs DCCMs Oded Gabbay
2023-03-01 13:20 ` [PATCH 1/6] habanalabs: add helper function to get vm hash node Stanislaw Gruszka

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230227111306.3985896-4-ogabbay@kernel.org \
    --to=ogabbay@kernel.org \
    --cc=dhirschfeld@habana.ai \
    --cc=dri-devel@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.