All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/12] habanalabs: fix soft reset accounting
@ 2021-11-28 19:34 Oded Gabbay
  2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
                   ` (10 more replies)
  0 siblings, 11 replies; 12+ messages in thread
From: Oded Gabbay @ 2021-11-28 19:34 UTC (permalink / raw)
  To: linux-kernel

Reset upon device release is not a soft-reset from user/system point
of view. As such, we shouldn't count that reset in the statistics we
gather and expose to the monitoring applications.

We also shouldn't print soft-reset when doing the reset upon device
release.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c | 50 ++++++++++++-------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 484e0446381e..2b208007c26f 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -962,13 +962,13 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
  */
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
-	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
+	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
+								reset_upon_device_release = false;
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	int i, rc;
 
 	if (!hdev->init_done) {
-		dev_err(hdev->dev,
-			"Can't reset before initialization is done\n");
+		dev_err(hdev->dev, "Can't reset before initialization is done\n");
 		return 0;
 	}
 
@@ -988,6 +988,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 			return -EINVAL;
 		}
 
+		reset_upon_device_release = true;
+
 		goto do_reset;
 	}
 
@@ -1024,12 +1026,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 		if (hard_reset)
 			dev_info(hdev->dev, "Going to reset device\n");
-		else if (flags & HL_DRV_RESET_DEV_RELEASE)
-			dev_info(hdev->dev,
-				"Going to reset device after it was released by user\n");
+		else if (reset_upon_device_release)
+			dev_info(hdev->dev, "Going to reset device after release by user\n");
 		else
-			dev_info(hdev->dev,
-				"Going to reset compute engines of inference device\n");
+			dev_info(hdev->dev, "Going to reset engines of inference device\n");
 	}
 
 again:
@@ -1174,16 +1174,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 	rc = hdev->asic_funcs->hw_init(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"failed to initialize the H/W after reset\n");
+		dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
 		goto out_err;
 	}
 
 	/* If device is not idle fail the reset process */
 	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
 			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
-		dev_err(hdev->dev,
-			"device is not idle (mask 0x%llx_%llx) after reset\n",
+		dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
 			idle_mask[1], idle_mask[0]);
 		rc = -EIO;
 		goto out_err;
@@ -1192,23 +1190,20 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	/* Check that the communication with the device is working */
 	rc = hdev->asic_funcs->test_queues(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"Failed to detect if device is alive after reset\n");
+		dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
 		goto out_err;
 	}
 
 	if (hard_reset) {
 		rc = device_late_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed late init after hard reset\n");
+			dev_err(hdev->dev, "Failed late init after hard reset\n");
 			goto out_err;
 		}
 
 		rc = hl_vm_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed to init memory module after hard reset\n");
+			dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
 			goto out_err;
 		}
 
@@ -1216,8 +1211,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	} else {
 		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed late init after soft reset\n");
+			if (reset_upon_device_release)
+				dev_err(hdev->dev,
+					"Failed late init in reset after device release\n");
+			else
+				dev_err(hdev->dev, "Failed late init after soft reset\n");
 			goto out_err;
 		}
 	}
@@ -1236,7 +1234,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		 * the device will be operational although it shouldn't be
 		 */
 		hdev->asic_funcs->enable_events_from_fw(hdev);
-	} else {
+	} else if (!reset_upon_device_release) {
 		hdev->soft_reset_cnt++;
 	}
 
@@ -1246,12 +1244,14 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	hdev->disabled = true;
 
 	if (hard_reset) {
-		dev_err(hdev->dev,
-			"Failed to reset! Device is NOT usable\n");
+		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
 		hdev->hard_reset_cnt++;
+	} else if (reset_upon_device_release) {
+		dev_err(hdev->dev, "Failed to reset device after user release\n");
+		hard_reset = true;
+		goto again;
 	} else {
-		dev_err(hdev->dev,
-			"Failed to do soft-reset, trying hard reset\n");
+		dev_err(hdev->dev, "Failed to do soft-reset\n");
 		hdev->soft_reset_cnt++;
 		hard_reset = true;
 		goto again;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-11-28 19:37 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2021-11-28 19:34 [PATCH 01/12] habanalabs: fix soft reset accounting Oded Gabbay
2021-11-28 19:34 ` [PATCH 02/12] habanalabs: rename late init after reset function Oded Gabbay
2021-11-28 19:34 ` [PATCH 03/12] habanalabs/gaudi: return EPERM on non hard-reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 04/12] habanalabs: move device boot warnings to the correct location Oded Gabbay
2021-11-28 19:34 ` [PATCH 05/12] habanalabs: fix race condition in multi CS completion Oded Gabbay
2021-11-28 19:34 ` [PATCH 06/12] habanalabs: add more info ioctls support during reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 07/12] habanalabs: add power information type to POWER_GET packet Oded Gabbay
2021-11-28 19:34 ` [PATCH 08/12] habanalabs: change misleading IRQ warning during reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 09/12] habanalabs: handle events during soft-reset Oded Gabbay
2021-11-28 19:34 ` [PATCH 10/12] habanalabs: skip read fw errors if dynamic descriptor invalid Oded Gabbay
2021-11-28 19:34 ` [PATCH 11/12] habanalabs: add SOB information to signal submission uAPI Oded Gabbay
2021-11-28 19:34 ` [PATCH 12/12] habanalabs: enable access to info ioctl during hard reset Oded Gabbay

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.