From: Oded Gabbay <ogabbay@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Ofir Bitton <obitton@habana.ai>
Subject: [PATCH 4/6] habanalabs: reset device upon FD close if not idle
Date: Sun, 6 Jun 2021 11:23:18 +0300 [thread overview]
Message-ID: <20210606082320.4319-4-ogabbay@kernel.org> (raw)
In-Reply-To: <20210606082320.4319-1-ogabbay@kernel.org>
From: Ofir Bitton <obitton@habana.ai>
If device is not idle after user closes the FD we must reset device
as next user that will try to open FD will encounter a non-functional
device.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
drivers/misc/habanalabs/common/context.c | 9 ---------
drivers/misc/habanalabs/common/device.c | 20 ++++++++++++++++---
drivers/misc/habanalabs/common/habanalabs.h | 1 +
.../misc/habanalabs/common/habanalabs_drv.c | 1 +
4 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 62d705889ca8..19b6b045219e 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -12,7 +12,6 @@
static void hl_ctx_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
- u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
int i;
/* Release all allocated pending cb's, those cb's were never
@@ -57,14 +56,6 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
/* Scrub both SRAM and DRAM */
hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
-
- if ((!hdev->pldm) && (hdev->pdev) &&
- (!hdev->asic_funcs->is_device_idle(hdev,
- idle_mask,
- HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)))
- dev_notice(hdev->dev,
- "device not idle after user context is closed (0x%llx, 0x%llx)\n",
- idle_mask[0], idle_mask[1]);
} else {
dev_dbg(hdev->dev, "closing kernel context\n");
hdev->asic_funcs->ctx_fini(ctx);
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index bc58a91bf50a..0056282cec94 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -51,6 +51,8 @@ bool hl_device_operational(struct hl_device *hdev,
static void hpriv_release(struct kref *ref)
{
+ u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
+ bool device_is_idle = true;
struct hl_fpriv *hpriv;
struct hl_device *hdev;
@@ -71,7 +73,19 @@ static void hpriv_release(struct kref *ref)
kfree(hpriv);
- if (hdev->reset_upon_device_release)
+ if ((!hdev->pldm) && (hdev->pdev) &&
+ (!hdev->asic_funcs->is_device_idle(hdev,
+ idle_mask,
+ HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) {
+ dev_err(hdev->dev,
+ "device not idle after user context is closed (0x%llx_%llx)\n",
+ idle_mask[1], idle_mask[0]);
+
+ device_is_idle = false;
+ }
+
+ if ((hdev->reset_if_device_not_idle && !device_is_idle)
+ || hdev->reset_upon_device_release)
hl_device_reset(hdev, 0);
}
@@ -1108,8 +1122,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
dev_err(hdev->dev,
- "device is not idle (mask %#llx %#llx) after reset\n",
- idle_mask[0], idle_mask[1]);
+ "device is not idle (mask 0x%llx_%llx) after reset\n",
+ idle_mask[1], idle_mask[0]);
rc = -EIO;
goto out_err;
}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 56d2f41f8893..bcb5bfdd7f20 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2311,6 +2311,7 @@ struct hl_device {
u8 rl_enable;
u8 reset_on_preboot_fail;
u8 reset_upon_device_release;
+ u8 reset_if_device_not_idle;
};
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 137e7dc63d3b..b55dd1c55166 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -264,6 +264,7 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
hdev->bmc_enable = 1;
hdev->hard_reset_on_fw_events = 1;
hdev->reset_on_preboot_fail = 1;
+ hdev->reset_if_device_not_idle = 1;
hdev->reset_pcilink = 0;
hdev->axi_drain = 0;
--
2.25.1
next prev parent reply other threads:[~2021-06-06 8:23 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-06-06 8:23 [PATCH 1/6] habanalabs/gaudi: don't use disabled ports in collective wait Oded Gabbay
2021-06-06 8:23 ` [PATCH 2/6] habanalabs/gaudi: add FW alive event support Oded Gabbay
2021-06-06 8:23 ` [PATCH 3/6] habanalabs: add debug flag to prevent failure on timeout Oded Gabbay
2021-06-06 8:23 ` Oded Gabbay [this message]
2021-06-06 8:23 ` [PATCH 5/6] habanalabs: skip valid test for boot_dev_sts regs Oded Gabbay
2021-06-06 8:23 ` [PATCH 6/6] habanalabs: fix mask to obtain page offset Oded Gabbay
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210606082320.4319-4-ogabbay@kernel.org \
--to=ogabbay@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=obitton@habana.ai \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox