dri-devel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] habanalabs: add helper function to get vm hash node
@ 2023-02-27 11:13 Oded Gabbay
  2023-02-27 11:13 ` [PATCH 2/6] habanalabs: add device id to all threads names Oded Gabbay
                   ` (5 more replies)
  0 siblings, 6 replies; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
  To: dri-devel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Add a helper function to search the vm hash for a node with a given
virtual address.
As opposed to the current code, this function explicitly returns NULL
when no node is found, instead of basing on the loop cursor object's
value.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/memory.c | 28 ++++++++++++++----------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c
index be0cba3b61ab..88f5178d2df7 100644
--- a/drivers/accel/habanalabs/common/memory.c
+++ b/drivers/accel/habanalabs/common/memory.c
@@ -1266,6 +1266,18 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device
 	return rc;
 }
 
+/* Should be called while the context's mem_hash_lock is taken */
+static struct hl_vm_hash_node *get_vm_hash_node_locked(struct hl_ctx *ctx, u64 vaddr)
+{
+	struct hl_vm_hash_node *hnode;
+
+	hash_for_each_possible(ctx->mem_hash, hnode, node, vaddr)
+		if (vaddr == hnode->vaddr)
+			return hnode;
+
+	return NULL;
+}
+
 /**
  * unmap_device_va() - unmap the given device virtual address.
  * @ctx: pointer to the context structure.
@@ -1281,10 +1293,10 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 {
 	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
 	u64 vaddr = args->unmap.device_virt_addr;
-	struct hl_vm_hash_node *hnode = NULL;
 	struct asic_fixed_properties *prop;
 	struct hl_device *hdev = ctx->hdev;
 	struct hl_userptr *userptr = NULL;
+	struct hl_vm_hash_node *hnode;
 	struct hl_va_range *va_range;
 	enum vm_type *vm_type;
 	bool is_userptr;
@@ -1294,15 +1306,10 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
 	/* protect from double entrance */
 	mutex_lock(&ctx->mem_hash_lock);
-	hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)vaddr)
-		if (vaddr == hnode->vaddr)
-			break;
-
+	hnode = get_vm_hash_node_locked(ctx, vaddr);
 	if (!hnode) {
 		mutex_unlock(&ctx->mem_hash_lock);
-		dev_err(hdev->dev,
-			"unmap failed, no mem hnode for vaddr 0x%llx\n",
-			vaddr);
+		dev_err(hdev->dev, "unmap failed, no mem hnode for vaddr 0x%llx\n", vaddr);
 		return -EINVAL;
 	}
 
@@ -1782,10 +1789,7 @@ static struct hl_vm_hash_node *memhash_node_export_get(struct hl_ctx *ctx, u64 a
 
 	/* get the memory handle */
 	mutex_lock(&ctx->mem_hash_lock);
-	hash_for_each_possible(ctx->mem_hash, hnode, node, (unsigned long)addr)
-		if (addr == hnode->vaddr)
-			break;
-
+	hnode = get_vm_hash_node_locked(ctx, addr);
 	if (!hnode) {
 		mutex_unlock(&ctx->mem_hash_lock);
 		dev_dbg(hdev->dev, "map address %#llx not found\n", addr);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/6] habanalabs: add device id to all threads names
  2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
@ 2023-02-27 11:13 ` Oded Gabbay
  2023-03-01 13:21   ` Stanislaw Gruszka
  2023-02-27 11:13 ` [PATCH 3/6] habanalabs/gaudi2: break is_idle function into per-engine sub-routines Oded Gabbay
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
  To: dri-devel; +Cc: Sagiv Ozeri

From: Sagiv Ozeri <sozeri@habana.ai>

Compute driver threads names will start with hlX-*, when X is the
device id.
This will help distinguish them from the NIC thread names.

Signed-off-by: Sagiv Ozeri <sozeri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index e544d00fe376..7ade32487138 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -840,7 +840,7 @@ static int device_early_init(struct hl_device *hdev)
 	}
 
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
-		snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
+		snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
 		hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
 		if (hdev->cq_wq[i] == NULL) {
 			dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
@@ -849,14 +849,16 @@ static int device_early_init(struct hl_device *hdev)
 		}
 	}
 
-	hdev->eq_wq = create_singlethread_workqueue("hl-events");
+	snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx);
+	hdev->eq_wq = create_singlethread_workqueue(workq_name);
 	if (hdev->eq_wq == NULL) {
 		dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
 		rc = -ENOMEM;
 		goto free_cq_wq;
 	}
 
-	hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0);
+	snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx);
+	hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
 	if (!hdev->cs_cmplt_wq) {
 		dev_err(hdev->dev,
 			"Failed to allocate CS completions workqueue\n");
@@ -864,7 +866,8 @@ static int device_early_init(struct hl_device *hdev)
 		goto free_eq_wq;
 	}
 
-	hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0);
+	snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx);
+	hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
 	if (!hdev->ts_free_obj_wq) {
 		dev_err(hdev->dev,
 			"Failed to allocate Timestamp registration free workqueue\n");
@@ -872,15 +875,15 @@ static int device_early_init(struct hl_device *hdev)
 		goto free_cs_cmplt_wq;
 	}
 
-	hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
+	snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx);
+	hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
 	if (!hdev->prefetch_wq) {
 		dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
 		rc = -ENOMEM;
 		goto free_ts_free_wq;
 	}
 
-	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
-					GFP_KERNEL);
+	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
 	if (!hdev->hl_chip_info) {
 		rc = -ENOMEM;
 		goto free_prefetch_wq;
@@ -892,7 +895,8 @@ static int device_early_init(struct hl_device *hdev)
 
 	hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
 
-	hdev->reset_wq = create_singlethread_workqueue("hl_device_reset");
+	snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx);
+	hdev->reset_wq = create_singlethread_workqueue(workq_name);
 	if (!hdev->reset_wq) {
 		rc = -ENOMEM;
 		dev_err(hdev->dev, "Failed to create device reset WQ\n");
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/6] habanalabs/gaudi2: break is_idle function into per-engine sub-routines
  2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
  2023-02-27 11:13 ` [PATCH 2/6] habanalabs: add device id to all threads names Oded Gabbay
@ 2023-02-27 11:13 ` Oded Gabbay
  2023-02-27 11:13 ` [PATCH 4/6] habanalabs: assert return value of hw_fini Oded Gabbay
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
  To: dri-devel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

is_idle() was too long, so break it up for readability.
In addition, we can now use the new sub-routines from other places.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 212 ++++++++++++++++-------
 1 file changed, 146 insertions(+), 66 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 2021ef9d4702..82448edfdfa0 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -6651,70 +6651,17 @@ static int gaudi2_compute_reset_late_init(struct hl_device *hdev)
 	return hl_fw_unmask_irq_arr(hdev, gaudi2->hw_events, irq_arr_size);
 }
 
-static void gaudi2_is_tpc_engine_idle(struct hl_device *hdev, int dcore, int inst, u32 offset,
-					struct iterate_module_ctx *ctx)
-{
-	struct gaudi2_tpc_idle_data *idle_data = ctx->data;
-	u32 tpc_cfg_sts, qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts;
-	bool is_eng_idle;
-	int engine_idx;
-
-	if ((dcore == 0) && (inst == (NUM_DCORE0_TPC - 1)))
-		engine_idx = GAUDI2_DCORE0_ENGINE_ID_TPC_6;
-	else
-		engine_idx = GAUDI2_DCORE0_ENGINE_ID_TPC_0 +
-				dcore * GAUDI2_ENGINE_ID_DCORE_OFFSET + inst;
-
-	tpc_cfg_sts = RREG32(mmDCORE0_TPC0_CFG_STATUS + offset);
-	qm_glbl_sts0 = RREG32(mmDCORE0_TPC0_QM_GLBL_STS0 + offset);
-	qm_glbl_sts1 = RREG32(mmDCORE0_TPC0_QM_GLBL_STS1 + offset);
-	qm_cgm_sts = RREG32(mmDCORE0_TPC0_QM_CGM_STS + offset);
-
-	is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts) &&
-						IS_TPC_IDLE(tpc_cfg_sts);
-	*(idle_data->is_idle) &= is_eng_idle;
-
-	if (idle_data->mask && !is_eng_idle)
-		set_bit(engine_idx, idle_data->mask);
-
-	if (idle_data->e)
-		hl_engine_data_sprintf(idle_data->e,
-					idle_data->tpc_fmt, dcore, inst,
-					is_eng_idle ? "Y" : "N",
-					qm_glbl_sts0, qm_cgm_sts, tpc_cfg_sts);
-}
-
-static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
-					struct engines_data *e)
+static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+		struct engines_data *e)
 {
-	u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts, dma_core_idle_ind_mask,
-		mme_arch_sts, dec_swreg15, dec_enabled_bit;
+	u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts, dma_core_idle_ind_mask;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	const char *rot_fmt = "%-6d%-5d%-9s%#-14x%#-12x%s\n";
 	unsigned long *mask = (unsigned long *) mask_arr;
 	const char *edma_fmt = "%-6d%-6d%-9s%#-14x%#x\n";
-	const char *mme_fmt = "%-5d%-6s%-9s%#-14x%#x\n";
-	const char *nic_fmt = "%-5d%-9s%#-14x%#-12x\n";
-	const char *pdma_fmt = "%-6d%-9s%#-14x%#x\n";
-	const char *pcie_dec_fmt = "%-10d%-9s%#x\n";
-	const char *dec_fmt = "%-6d%-5d%-9s%#x\n";
 	bool is_idle = true, is_eng_idle;
-	u64 offset;
-
-	struct gaudi2_tpc_idle_data tpc_idle_data = {
-		.tpc_fmt = "%-6d%-5d%-9s%#-14x%#-12x%#x\n",
-		.e = e,
-		.mask = mask,
-		.is_idle = &is_idle,
-	};
-	struct iterate_module_ctx tpc_iter = {
-		.fn = &gaudi2_is_tpc_engine_idle,
-		.data = &tpc_idle_data,
-	};
-
 	int engine_idx, i, j;
+	u64 offset;
 
-	/* EDMA, Two engines per Dcore */
 	if (e)
 		hl_engine_data_sprintf(e,
 			"\nCORE  EDMA  is_idle  QM_GLBL_STS0  DMA_CORE_IDLE_IND_MASK\n"
@@ -6753,7 +6700,19 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
 		}
 	}
 
-	/* PDMA, Two engines in Full chip */
+	return is_idle;
+}
+
+static bool gaudi2_get_pdma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+		struct engines_data *e)
+{
+	u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts, dma_core_idle_ind_mask;
+	unsigned long *mask = (unsigned long *) mask_arr;
+	const char *pdma_fmt = "%-6d%-9s%#-14x%#x\n";
+	bool is_idle = true, is_eng_idle;
+	int engine_idx, i;
+	u64 offset;
+
 	if (e)
 		hl_engine_data_sprintf(e,
 					"\nPDMA  is_idle  QM_GLBL_STS0  DMA_CORE_IDLE_IND_MASK\n"
@@ -6780,6 +6739,19 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
 						qm_glbl_sts0, dma_core_idle_ind_mask);
 	}
 
+	return is_idle;
+}
+
+static bool gaudi2_get_nic_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+		struct engines_data *e)
+{
+	unsigned long *mask = (unsigned long *) mask_arr;
+	const char *nic_fmt = "%-5d%-9s%#-14x%#-12x\n";
+	u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts;
+	bool is_idle = true, is_eng_idle;
+	int engine_idx, i;
+	u64 offset;
+
 	/* NIC, twelve macros in Full chip */
 	if (e && hdev->nic_ports_mask)
 		hl_engine_data_sprintf(e,
@@ -6813,6 +6785,19 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
 						qm_glbl_sts0, qm_cgm_sts);
 	}
 
+	return is_idle;
+}
+
+static bool gaudi2_get_mme_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+		struct engines_data *e)
+{
+	u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts, mme_arch_sts;
+	unsigned long *mask = (unsigned long *) mask_arr;
+	const char *mme_fmt = "%-5d%-6s%-9s%#-14x%#x\n";
+	bool is_idle = true, is_eng_idle;
+	int engine_idx, i;
+	u64 offset;
+
 	if (e)
 		hl_engine_data_sprintf(e,
 					"\nMME  Stub  is_idle  QM_GLBL_STS0  MME_ARCH_STATUS\n"
@@ -6843,16 +6828,82 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
 			set_bit(engine_idx, mask);
 	}
 
-	/*
-	 * TPC
-	 */
+	return is_idle;
+}
+
+static void gaudi2_is_tpc_engine_idle(struct hl_device *hdev, int dcore, int inst, u32 offset,
+					struct iterate_module_ctx *ctx)
+{
+	struct gaudi2_tpc_idle_data *idle_data = ctx->data;
+	u32 tpc_cfg_sts, qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts;
+	bool is_eng_idle;
+	int engine_idx;
+
+	if ((dcore == 0) && (inst == (NUM_DCORE0_TPC - 1)))
+		engine_idx = GAUDI2_DCORE0_ENGINE_ID_TPC_6;
+	else
+		engine_idx = GAUDI2_DCORE0_ENGINE_ID_TPC_0 +
+				dcore * GAUDI2_ENGINE_ID_DCORE_OFFSET + inst;
+
+	tpc_cfg_sts = RREG32(mmDCORE0_TPC0_CFG_STATUS + offset);
+	qm_glbl_sts0 = RREG32(mmDCORE0_TPC0_QM_GLBL_STS0 + offset);
+	qm_glbl_sts1 = RREG32(mmDCORE0_TPC0_QM_GLBL_STS1 + offset);
+	qm_cgm_sts = RREG32(mmDCORE0_TPC0_QM_CGM_STS + offset);
+
+	is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts) &&
+						IS_TPC_IDLE(tpc_cfg_sts);
+	*(idle_data->is_idle) &= is_eng_idle;
+
+	if (idle_data->mask && !is_eng_idle)
+		set_bit(engine_idx, idle_data->mask);
+
+	if (idle_data->e)
+		hl_engine_data_sprintf(idle_data->e,
+					idle_data->tpc_fmt, dcore, inst,
+					is_eng_idle ? "Y" : "N",
+					qm_glbl_sts0, qm_cgm_sts, tpc_cfg_sts);
+}
+
+static bool gaudi2_get_tpc_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+		struct engines_data *e)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	unsigned long *mask = (unsigned long *) mask_arr;
+	bool is_idle = true;
+
+	struct gaudi2_tpc_idle_data tpc_idle_data = {
+		.tpc_fmt = "%-6d%-5d%-9s%#-14x%#-12x%#x\n",
+		.e = e,
+		.mask = mask,
+		.is_idle = &is_idle,
+	};
+	struct iterate_module_ctx tpc_iter = {
+		.fn = &gaudi2_is_tpc_engine_idle,
+		.data = &tpc_idle_data,
+	};
+
 	if (e && prop->tpc_enabled_mask)
 		hl_engine_data_sprintf(e,
-			"\nCORE  TPC   is_idle  QM_GLBL_STS0  QM_CGM_STS  DMA_CORE_IDLE_IND_MASK\n"
-			"----  ---  --------  ------------  ----------  ----------------------\n");
+			"\nCORE  TPC  is_idle  QM_GLBL_STS0  QM_CGM_STS  STATUS\n"
+			"----  ---  -------  ------------  ----------  ------\n");
 
 	gaudi2_iterate_tpcs(hdev, &tpc_iter);
 
+	return tpc_idle_data.is_idle;
+}
+
+static bool gaudi2_get_decoder_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+		struct engines_data *e)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	unsigned long *mask = (unsigned long *) mask_arr;
+	const char *pcie_dec_fmt = "%-10d%-9s%#x\n";
+	const char *dec_fmt = "%-6d%-5d%-9s%#x\n";
+	bool is_idle = true, is_eng_idle;
+	u32 dec_swreg15, dec_enabled_bit;
+	int engine_idx, i, j;
+	u64 offset;
+
 	/* Decoders, two each Dcore and two shared PCIe decoders */
 	if (e && (prop->decoder_enabled_mask & (~PCIE_DEC_EN_MASK)))
 		hl_engine_data_sprintf(e,
@@ -6907,10 +6958,23 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
 						is_eng_idle ? "Y" : "N", dec_swreg15);
 	}
 
+	return is_idle;
+}
+
+static bool gaudi2_get_rotator_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+		struct engines_data *e)
+{
+	const char *rot_fmt = "%-6d%-5d%-9s%#-14x%#-14x%#x\n";
+	unsigned long *mask = (unsigned long *) mask_arr;
+	u32 qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts;
+	bool is_idle = true, is_eng_idle;
+	int engine_idx, i;
+	u64 offset;
+
 	if (e)
 		hl_engine_data_sprintf(e,
-			"\nCORE  ROT  is_idle  QM_GLBL_STS0  QM_CGM_STS  DMA_CORE_STS0\n"
-			"----  ----  -------  ------------  ----------  -------------\n");
+			"\nCORE  ROT  is_idle  QM_GLBL_STS0  QM_GLBL_STS1  QM_CGM_STS\n"
+			"----  ---  -------  ------------  ------------  ----------\n");
 
 	for (i = 0 ; i < NUM_OF_ROT ; i++) {
 		engine_idx = GAUDI2_ENGINE_ID_ROT_0 + i;
@@ -6929,12 +6993,28 @@ static bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask
 
 		if (e)
 			hl_engine_data_sprintf(e, rot_fmt, i, 0, is_eng_idle ? "Y" : "N",
-					qm_glbl_sts0, qm_cgm_sts, "-");
+						qm_glbl_sts0, qm_glbl_sts1, qm_cgm_sts);
 	}
 
 	return is_idle;
 }
 
+bool gaudi2_is_device_idle(struct hl_device *hdev, u64 *mask_arr, u8 mask_len,
+					struct engines_data *e)
+{
+	bool is_idle = true;
+
+	is_idle &= gaudi2_get_edma_idle_status(hdev, mask_arr, mask_len, e);
+	is_idle &= gaudi2_get_pdma_idle_status(hdev, mask_arr, mask_len, e);
+	is_idle &= gaudi2_get_nic_idle_status(hdev, mask_arr, mask_len, e);
+	is_idle &= gaudi2_get_mme_idle_status(hdev, mask_arr, mask_len, e);
+	is_idle &= gaudi2_get_tpc_idle_status(hdev, mask_arr, mask_len, e);
+	is_idle &= gaudi2_get_decoder_idle_status(hdev, mask_arr, mask_len, e);
+	is_idle &= gaudi2_get_rotator_idle_status(hdev, mask_arr, mask_len, e);
+
+	return is_idle;
+}
+
 static void gaudi2_hw_queues_lock(struct hl_device *hdev)
 	__acquires(&gaudi2->hw_queues_lock)
 {
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 4/6] habanalabs: assert return value of hw_fini
  2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
  2023-02-27 11:13 ` [PATCH 2/6] habanalabs: add device id to all threads names Oded Gabbay
  2023-02-27 11:13 ` [PATCH 3/6] habanalabs/gaudi2: break is_idle function into per-engine sub-routines Oded Gabbay
@ 2023-02-27 11:13 ` Oded Gabbay
  2023-03-01 13:31   ` Stanislaw Gruszka
  2023-02-27 11:13 ` [PATCH 5/6] habanalabs: use notifications and graceful reset for decoder Oded Gabbay
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
  To: dri-devel; +Cc: Dafna Hirschfeld

From: Dafna Hirschfeld <dhirschfeld@habana.ai>

Since hw_fini return error code for failure indication, we should
check its return value. Currently it might only fail upon soft-reset
from hl_device_reset. Later patch will add hw_fini failure in case of
polling timeout in hard-reset.

Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c | 12 +++++++++---
 drivers/accel/habanalabs/gaudi/gaudi.c   |  7 ++++++-
 drivers/accel/habanalabs/gaudi2/gaudi2.c |  7 ++++++-
 drivers/accel/habanalabs/goya/goya.c     |  7 ++++++-
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 7ade32487138..99e793dfb126 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1472,7 +1472,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		schedule_hard_reset = false, delay_reset, from_dev_release, from_watchdog_thread;
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	struct hl_ctx *ctx;
-	int i, rc;
+	int i, rc, hw_fini_rc;
 
 	if (!hdev->init_done) {
 		dev_err(hdev->dev, "Can't reset before initialization is done\n");
@@ -1634,7 +1634,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	}
 
 	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
+	hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
 
 	if (hard_reset) {
 		hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
@@ -1661,6 +1661,10 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		hl_ctx_put(ctx);
 	}
 
+	if (hw_fini_rc) {
+		rc = hw_fini_rc;
+		goto out_err;
+	}
 	/* Finished tear-down, starting to re-initialize */
 
 	if (hard_reset) {
@@ -2416,7 +2420,9 @@ void hl_device_fini(struct hl_device *hdev)
 	hl_cb_pool_fini(hdev);
 
 	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, true, false);
+	rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+	if (rc)
+		dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
 
 	hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
 
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index 26287084a9e0..60146fd4de6b 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -868,13 +868,18 @@ static int gaudi_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}
 
 	if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+			goto pci_fini;
+		}
 	}
 
 	return 0;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 82448edfdfa0..f01fa4bca381 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2886,13 +2886,18 @@ static int gaudi2_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}
 
 	if (gaudi2_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW during early init (%d)\n", rc);
+			goto pci_fini;
+		}
 	}
 
 	return 0;
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index 7a45ab3ca43a..39f9e5de1f4c 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -669,13 +669,18 @@ static int goya_early_init(struct hl_device *hdev)
 	rc = hl_fw_read_preboot_status(hdev);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
+			/* we are already on failure flow, so don't check if hw_fini fails. */
 			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}
 
 	if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_dbg(hdev->dev, "H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true, false);
+		rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+		if (rc) {
+			dev_err(hdev->dev, "failed to reset HW in dirty state (%d)\n", rc);
+			goto pci_fini;
+		}
 	}
 
 	if (!hdev->pldm) {
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 5/6] habanalabs: use notifications and graceful reset for decoder
  2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
                   ` (2 preceding siblings ...)
  2023-02-27 11:13 ` [PATCH 4/6] habanalabs: assert return value of hw_fini Oded Gabbay
@ 2023-02-27 11:13 ` Oded Gabbay
  2023-03-01 13:34   ` Stanislaw Gruszka
  2023-02-27 11:13 ` [PATCH 6/6] habanalabs/gaudi2: verify return code after scrubbing ARCs DCCMs Oded Gabbay
  2023-03-01 13:20 ` [PATCH 1/6] habanalabs: add helper function to get vm hash node Stanislaw Gruszka
  5 siblings, 1 reply; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
  To: dri-devel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Add notifications to user in case of decoder abnormal interrupts, and
use the graceful reset mechanism if reset is required.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/decoder.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/accel/habanalabs/common/decoder.c b/drivers/accel/habanalabs/common/decoder.c
index 2aab14d74b53..69c78c1784b4 100644
--- a/drivers/accel/habanalabs/common/decoder.c
+++ b/drivers/accel/habanalabs/common/decoder.c
@@ -46,7 +46,7 @@ static void dec_print_abnrm_intr_source(struct hl_device *hdev, u32 irq_status)
 static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_id)
 {
 	bool reset_required = false;
-	u32 irq_status;
+	u32 irq_status, event_mask;
 
 	irq_status = RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
 
@@ -54,17 +54,27 @@ static void dec_error_intr_work(struct hl_device *hdev, u32 base_addr, u32 core_
 
 	dec_print_abnrm_intr_source(hdev, irq_status);
 
-	if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK)
-		reset_required = true;
-
 	/* Clear the interrupt */
 	WREG32(base_addr + VCMD_IRQ_STATUS_OFFSET, irq_status);
 
 	/* Flush the interrupt clear */
 	RREG32(base_addr + VCMD_IRQ_STATUS_OFFSET);
 
-	if (reset_required)
-		hl_device_reset(hdev, HL_DRV_RESET_HARD);
+	if (irq_status & VCMD_IRQ_STATUS_TIMEOUT_MASK) {
+		reset_required = true;
+		event_mask = HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+	} else if (irq_status & VCMD_IRQ_STATUS_CMDERR_MASK) {
+		event_mask = HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
+	} else {
+		event_mask = HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
+	}
+
+	if (reset_required) {
+		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
+		hl_device_cond_reset(hdev, 0, event_mask);
+	} else {
+		hl_notifier_event_send_all(hdev, event_mask);
+	}
 }
 
 static void dec_completion_abnrm(struct work_struct *work)
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 6/6] habanalabs/gaudi2: verify return code after scrubbing ARCs DCCMs
  2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
                   ` (3 preceding siblings ...)
  2023-02-27 11:13 ` [PATCH 5/6] habanalabs: use notifications and graceful reset for decoder Oded Gabbay
@ 2023-02-27 11:13 ` Oded Gabbay
  2023-03-01 13:20 ` [PATCH 1/6] habanalabs: add helper function to get vm hash node Stanislaw Gruszka
  5 siblings, 0 replies; 10+ messages in thread
From: Oded Gabbay @ 2023-02-27 11:13 UTC (permalink / raw)
  To: dri-devel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

In case the KDMA fails scrubbing the DCCMs (following a soft-reset
upon device release), the driver will only print failure until reset
flow ends, rather than escalating it into a hard-reset.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 26 ++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index f01fa4bca381..2186f8bd547e 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -3024,16 +3024,21 @@ static int gaudi2_scrub_arc_dccm(struct hl_device *hdev, u32 cpu_id)
 	return 0;
 }
 
-static void gaudi2_scrub_arcs_dccm(struct hl_device *hdev)
+static int gaudi2_scrub_arcs_dccm(struct hl_device *hdev)
 {
 	u16 arc_id;
+	int rc;
 
 	for (arc_id = CPU_ID_SCHED_ARC0 ; arc_id < CPU_ID_MAX ; arc_id++) {
 		if (!gaudi2_is_arc_enabled(hdev, arc_id))
 			continue;
 
-		gaudi2_scrub_arc_dccm(hdev, arc_id);
+		rc = gaudi2_scrub_arc_dccm(hdev, arc_id);
+		if (rc)
+			return rc;
 	}
+
+	return 0;
 }
 
 static int gaudi2_late_init(struct hl_device *hdev)
@@ -3057,7 +3062,13 @@ static int gaudi2_late_init(struct hl_device *hdev)
 	}
 
 	gaudi2_init_arcs(hdev);
-	gaudi2_scrub_arcs_dccm(hdev);
+
+	rc = gaudi2_scrub_arcs_dccm(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to scrub arcs DCCM\n");
+		goto disable_pci_access;
+	}
+
 	gaudi2_init_security(hdev);
 
 	return 0;
@@ -6643,12 +6654,19 @@ static int gaudi2_compute_reset_late_init(struct hl_device *hdev)
 {
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
 	size_t irq_arr_size;
+	int rc;
 
 	/* TODO: missing gaudi2_nic_resume.
 	 * Until implemented nic_hw_cap_initialized will remain zeroed
 	 */
 	gaudi2_init_arcs(hdev);
-	gaudi2_scrub_arcs_dccm(hdev);
+
+	rc = gaudi2_scrub_arcs_dccm(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to scrub arcs DCCM\n");
+		return rc;
+	}
+
 	gaudi2_init_security(hdev);
 
 	/* Unmask all IRQs since some could have been received during the soft reset */
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/6] habanalabs: add helper function to get vm hash node
  2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
                   ` (4 preceding siblings ...)
  2023-02-27 11:13 ` [PATCH 6/6] habanalabs/gaudi2: verify return code after scrubbing ARCs DCCMs Oded Gabbay
@ 2023-03-01 13:20 ` Stanislaw Gruszka
  5 siblings, 0 replies; 10+ messages in thread
From: Stanislaw Gruszka @ 2023-03-01 13:20 UTC (permalink / raw)
  To: Oded Gabbay; +Cc: Tomer Tayar, dri-devel

On Mon, Feb 27, 2023 at 01:13:01PM +0200, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
> 
> Add a helper function to search the vm hash for a node with a given
> virtual address.
> As opposed to the current code, this function explicitly returns NULL
> when no node is found, instead of basing on the loop cursor object's
> value.
> 
> Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>

Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/6] habanalabs: add device id to all threads names
  2023-02-27 11:13 ` [PATCH 2/6] habanalabs: add device id to all threads names Oded Gabbay
@ 2023-03-01 13:21   ` Stanislaw Gruszka
  0 siblings, 0 replies; 10+ messages in thread
From: Stanislaw Gruszka @ 2023-03-01 13:21 UTC (permalink / raw)
  To: Oded Gabbay; +Cc: Sagiv Ozeri, dri-devel

On Mon, Feb 27, 2023 at 01:13:02PM +0200, Oded Gabbay wrote:
> From: Sagiv Ozeri <sozeri@habana.ai>
> 
> Compute driver threads names will start with hlX-*, when X is the
> device id.
> This will help distinguish them from the NIC thread names.
> 
> Signed-off-by: Sagiv Ozeri <sozeri@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/6] habanalabs: assert return value of hw_fini
  2023-02-27 11:13 ` [PATCH 4/6] habanalabs: assert return value of hw_fini Oded Gabbay
@ 2023-03-01 13:31   ` Stanislaw Gruszka
  0 siblings, 0 replies; 10+ messages in thread
From: Stanislaw Gruszka @ 2023-03-01 13:31 UTC (permalink / raw)
  To: Oded Gabbay; +Cc: Dafna Hirschfeld, dri-devel

On Mon, Feb 27, 2023 at 01:13:04PM +0200, Oded Gabbay wrote:
> From: Dafna Hirschfeld <dhirschfeld@habana.ai>
> 
> Since hw_fini return error code for failure indication, we should
> check its return value. Currently it might only fail upon soft-reset
> from hl_device_reset. Later patch will add hw_fini failure in case of
> polling timeout in hard-reset.
> 
> Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/6] habanalabs: use notifications and graceful reset for decoder
  2023-02-27 11:13 ` [PATCH 5/6] habanalabs: use notifications and graceful reset for decoder Oded Gabbay
@ 2023-03-01 13:34   ` Stanislaw Gruszka
  0 siblings, 0 replies; 10+ messages in thread
From: Stanislaw Gruszka @ 2023-03-01 13:34 UTC (permalink / raw)
  To: Oded Gabbay; +Cc: Tomer Tayar, dri-devel

On Mon, Feb 27, 2023 at 01:13:05PM +0200, Oded Gabbay wrote:
> From: Tomer Tayar <ttayar@habana.ai>
> 
> Add notifications to user in case of decoder abnormal interrupts, and
> use the graceful reset mechanism if reset is required.
> 
> Signed-off-by: Tomer Tayar <ttayar@habana.ai>
> Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2023-03-01 13:34 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-02-27 11:13 [PATCH 1/6] habanalabs: add helper function to get vm hash node Oded Gabbay
2023-02-27 11:13 ` [PATCH 2/6] habanalabs: add device id to all threads names Oded Gabbay
2023-03-01 13:21   ` Stanislaw Gruszka
2023-02-27 11:13 ` [PATCH 3/6] habanalabs/gaudi2: break is_idle function into per-engine sub-routines Oded Gabbay
2023-02-27 11:13 ` [PATCH 4/6] habanalabs: assert return value of hw_fini Oded Gabbay
2023-03-01 13:31   ` Stanislaw Gruszka
2023-02-27 11:13 ` [PATCH 5/6] habanalabs: use notifications and graceful reset for decoder Oded Gabbay
2023-03-01 13:34   ` Stanislaw Gruszka
2023-02-27 11:13 ` [PATCH 6/6] habanalabs/gaudi2: verify return code after scrubbing ARCs DCCMs Oded Gabbay
2023-03-01 13:20 ` [PATCH 1/6] habanalabs: add helper function to get vm hash node Stanislaw Gruszka

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox