From: Oded Gabbay <ogabbay@kernel.org>
To: linux-kernel@vger.kernel.org
Subject: [PATCH 01/12] habanalabs: fix soft reset accounting
Date: Sun, 28 Nov 2021 21:34:24 +0200 [thread overview]
Message-ID: <20211128193435.266534-1-ogabbay@kernel.org> (raw)
A reset upon device release is not a soft-reset from the user/system point
of view. As such, we shouldn't count that reset in the statistics we
gather and expose to the monitoring applications.
We also shouldn't print "soft-reset" in the log messages when doing the
reset upon device release.
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/common/device.c | 50 ++++++++++++-------------
1 file changed, 25 insertions(+), 25 deletions(-)
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 484e0446381e..2b208007c26f 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -962,13 +962,13 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
*/
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
- bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
+ bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
+ reset_upon_device_release = false;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
int i, rc;
if (!hdev->init_done) {
- dev_err(hdev->dev,
- "Can't reset before initialization is done\n");
+ dev_err(hdev->dev, "Can't reset before initialization is done\n");
return 0;
}
@@ -988,6 +988,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
return -EINVAL;
}
+ reset_upon_device_release = true;
+
goto do_reset;
}
@@ -1024,12 +1026,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
if (hard_reset)
dev_info(hdev->dev, "Going to reset device\n");
- else if (flags & HL_DRV_RESET_DEV_RELEASE)
- dev_info(hdev->dev,
- "Going to reset device after it was released by user\n");
+ else if (reset_upon_device_release)
+ dev_info(hdev->dev, "Going to reset device after release by user\n");
else
- dev_info(hdev->dev,
- "Going to reset compute engines of inference device\n");
+ dev_info(hdev->dev, "Going to reset engines of inference device\n");
}
again:
@@ -1174,16 +1174,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
rc = hdev->asic_funcs->hw_init(hdev);
if (rc) {
- dev_err(hdev->dev,
- "failed to initialize the H/W after reset\n");
+ dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
goto out_err;
}
/* If device is not idle fail the reset process */
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
- dev_err(hdev->dev,
- "device is not idle (mask 0x%llx_%llx) after reset\n",
+ dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
idle_mask[1], idle_mask[0]);
rc = -EIO;
goto out_err;
@@ -1192,23 +1190,20 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
/* Check that the communication with the device is working */
rc = hdev->asic_funcs->test_queues(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed to detect if device is alive after reset\n");
+ dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
goto out_err;
}
if (hard_reset) {
rc = device_late_init(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed late init after hard reset\n");
+ dev_err(hdev->dev, "Failed late init after hard reset\n");
goto out_err;
}
rc = hl_vm_init(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed to init memory module after hard reset\n");
+ dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
goto out_err;
}
@@ -1216,8 +1211,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
} else {
rc = hdev->asic_funcs->soft_reset_late_init(hdev);
if (rc) {
- dev_err(hdev->dev,
- "Failed late init after soft reset\n");
+ if (reset_upon_device_release)
+ dev_err(hdev->dev,
+ "Failed late init in reset after device release\n");
+ else
+ dev_err(hdev->dev, "Failed late init after soft reset\n");
goto out_err;
}
}
@@ -1236,7 +1234,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
* the device will be operational although it shouldn't be
*/
hdev->asic_funcs->enable_events_from_fw(hdev);
- } else {
+ } else if (!reset_upon_device_release) {
hdev->soft_reset_cnt++;
}
@@ -1246,12 +1244,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->disabled = true;
if (hard_reset) {
- dev_err(hdev->dev,
- "Failed to reset! Device is NOT usable\n");
+ dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
hdev->hard_reset_cnt++;
+ } else if (reset_upon_device_release) {
+ dev_err(hdev->dev, "Failed to reset device after user release\n");
+ hard_reset = true;
+ goto again;
} else {
- dev_err(hdev->dev,
- "Failed to do soft-reset, trying hard reset\n");
+ dev_err(hdev->dev, "Failed to do soft-reset\n");
hdev->soft_reset_cnt++;
hard_reset = true;
goto again;
--
2.25.1
next reply other threads:[~2021-11-28 19:36 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-11-28 19:34 Oded Gabbay [this message]
2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
2021-11-28 19:34 ` [PATCH 03/12] habanalabs/gaudi: return EPERM on non hard-reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 04/12] habanalabs: move device boot warnings to the correct location Oded Gabbay
2021-11-28 19:34 ` [PATCH 05/12] habanalabs: fix race condition in multi CS completion Oded Gabbay
2021-11-28 19:34 ` [PATCH 06/12] habanalabs: add more info ioctls support during reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 07/12] habanalabs: add power information type to POWER_GET packet Oded Gabbay
2021-11-28 19:34 ` [PATCH 08/12] habanalabs: change misleading IRQ warning during reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 09/12] habanalabs: handle events during soft-reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 10/12] habanalabs: skip read fw errors if dynamic descriptor invalid Oded Gabbay
2021-11-28 19:34 ` [PATCH 11/12] habanalabs: add SOB information to signal submission uAPI Oded Gabbay
2021-11-28 19:34 ` [PATCH 12/12] habanalabs: enable access to info ioctl during hard reset Oded Gabbay
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211128193435.266534-1-ogabbay@kernel.org \
--to=ogabbay@kernel.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.