public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Oded Gabbay <ogabbay@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Ofir Bitton <obitton@habana.ai>
Subject: [PATCH 4/6] habanalabs: reset device upon FD close if not idle
Date: Sun,  6 Jun 2021 11:23:18 +0300	[thread overview]
Message-ID: <20210606082320.4319-4-ogabbay@kernel.org> (raw)
In-Reply-To: <20210606082320.4319-1-ogabbay@kernel.org>

From: Ofir Bitton <obitton@habana.ai>

If the device is not idle after the user closes the FD, we must reset
it, because the next user who tries to open the FD would otherwise
encounter a non-functional device.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/context.c      |  9 ---------
 drivers/misc/habanalabs/common/device.c       | 20 ++++++++++++++++---
 drivers/misc/habanalabs/common/habanalabs.h   |  1 +
 .../misc/habanalabs/common/habanalabs_drv.c   |  1 +
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 62d705889ca8..19b6b045219e 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -12,7 +12,6 @@
 static void hl_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
-	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	int i;
 
 	/* Release all allocated pending cb's, those cb's were never
@@ -57,14 +56,6 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 
 		/* Scrub both SRAM and DRAM */
 		hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
-
-		if ((!hdev->pldm) && (hdev->pdev) &&
-				(!hdev->asic_funcs->is_device_idle(hdev,
-					idle_mask,
-					HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)))
-			dev_notice(hdev->dev,
-					"device not idle after user context is closed (0x%llx, 0x%llx)\n",
-						idle_mask[0], idle_mask[1]);
 	} else {
 		dev_dbg(hdev->dev, "closing kernel context\n");
 		hdev->asic_funcs->ctx_fini(ctx);
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index bc58a91bf50a..0056282cec94 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -51,6 +51,8 @@ bool hl_device_operational(struct hl_device *hdev,
 
 static void hpriv_release(struct kref *ref)
 {
+	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+	bool device_is_idle = true;
 	struct hl_fpriv *hpriv;
 	struct hl_device *hdev;
 
@@ -71,7 +73,19 @@ static void hpriv_release(struct kref *ref)
 
 	kfree(hpriv);
 
-	if (hdev->reset_upon_device_release)
+	if ((!hdev->pldm) && (hdev->pdev) &&
+			(!hdev->asic_funcs->is_device_idle(hdev,
+				idle_mask,
+				HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) {
+		dev_err(hdev->dev,
+			"device not idle after user context is closed (0x%llx_%llx)\n",
+			idle_mask[1], idle_mask[0]);
+
+		device_is_idle = false;
+	}
+
+	if ((hdev->reset_if_device_not_idle && !device_is_idle)
+			|| hdev->reset_upon_device_release)
 		hl_device_reset(hdev, 0);
 }
 
@@ -1108,8 +1122,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
 			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
 		dev_err(hdev->dev,
-			"device is not idle (mask %#llx %#llx) after reset\n",
-			idle_mask[0], idle_mask[1]);
+			"device is not idle (mask 0x%llx_%llx) after reset\n",
+			idle_mask[1], idle_mask[0]);
 		rc = -EIO;
 		goto out_err;
 	}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 56d2f41f8893..bcb5bfdd7f20 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2311,6 +2311,7 @@ struct hl_device {
 	u8				rl_enable;
 	u8				reset_on_preboot_fail;
 	u8				reset_upon_device_release;
+	u8				reset_if_device_not_idle;
 };
 
 
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 137e7dc63d3b..b55dd1c55166 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -264,6 +264,7 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
 	hdev->bmc_enable = 1;
 	hdev->hard_reset_on_fw_events = 1;
 	hdev->reset_on_preboot_fail = 1;
+	hdev->reset_if_device_not_idle = 1;
 
 	hdev->reset_pcilink = 0;
 	hdev->axi_drain = 0;
-- 
2.25.1


  parent reply	other threads:[~2021-06-06  8:23 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-06  8:23 [PATCH 1/6] habanalabs/gaudi: don't use disabled ports in collective wait Oded Gabbay
2021-06-06  8:23 ` [PATCH 2/6] habanalabs/gaudi: add FW alive event support Oded Gabbay
2021-06-06  8:23 ` [PATCH 3/6] habanalabs: add debug flag to prevent failure on timeout Oded Gabbay
2021-06-06  8:23 ` Oded Gabbay [this message]
2021-06-06  8:23 ` [PATCH 5/6] habanalabs: skip valid test for boot_dev_sts regs Oded Gabbay
2021-06-06  8:23 ` [PATCH 6/6] habanalabs: fix mask to obtain page offset Oded Gabbay

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210606082320.4319-4-ogabbay@kernel.org \
    --to=ogabbay@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=obitton@habana.ai \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.