public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings'
@ 2023-01-08 17:20 Oded Gabbay
  2023-01-08 17:20 ` [PATCH 2/7] habanalabs: add set engines masks ASIC function Oded Gabbay
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-01-08 17:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: Koby Elbaz

From: Koby Elbaz <kelbaz@habana.ai>

When HL_INFO_USER_MAPPINGS IOCTL is called, we copy_to_user from
a dynamically allocated memory - 'user_mappings'.
Since freeing/allocating it happens at runtime (upon a page fault),
it is not unlikely to access it even before it has been initially
allocated (i.e., accessing a NULL pointer).

The solution is to simply mark the point at which the err info has been
collected, and that way know whether err info (either page fault
or RAZWI) is available to be read.

Signed-off-by: Koby Elbaz <kelbaz@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c      |  4 +++
 drivers/accel/habanalabs/common/habanalabs.h  |  4 +++
 .../accel/habanalabs/common/habanalabs_drv.c  |  2 ++
 .../habanalabs/common/habanalabs_ioctl.c      | 36 ++++++++++++-------
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index e1b5a2c34986..6a05ab3fda23 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2441,6 +2441,8 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
 	memcpy(&razwi_info->razwi.engine_id[0], &engine_id[0],
 			num_of_engines * sizeof(u16));
 	razwi_info->razwi.flags = flags;
+
+	razwi_info->razwi_info_available = true;
 }
 
 void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
@@ -2526,6 +2528,8 @@ void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is
 	pgf_info->page_fault.addr = addr;
 	pgf_info->page_fault.engine_id = eng_id;
 	hl_capture_user_mappings(hdev, is_pmmu);
+
+	pgf_info->page_fault_info_available = true;
 }
 
 void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index e578645acba9..cd474422163d 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -2984,12 +2984,14 @@ struct undefined_opcode_info {
  *                       Since we're looking for the page-fault's root cause,
  *                       we don't care of the others that might follow it-
  *                       so once changed to 1, it will remain that way.
+ * @page_fault_info_available: indicates that a page fault info is now available.
  */
 struct page_fault_info {
 	struct hl_page_fault_info	page_fault;
 	struct hl_user_mapping		*user_mappings;
 	u64				num_of_user_mappings;
 	atomic_t			page_fault_detected;
+	bool				page_fault_info_available;
 };
 
 /**
@@ -3000,10 +3002,12 @@ struct page_fault_info {
  *                  Since we're looking for the RAZWI's root cause,
  *                  we don't care of the others that might follow it-
  *                  so once changed to 1, it will remain that way.
+ * @razwi_info_available: indicates that a RAZWI info is now available.
  */
 struct razwi_info {
 	struct hl_info_razwi_event	razwi;
 	atomic_t			razwi_detected;
+	bool				razwi_info_available;
 };
 
 /**
diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c
index d7fe0af33bca..03dae57dc838 100644
--- a/drivers/accel/habanalabs/common/habanalabs_drv.c
+++ b/drivers/accel/habanalabs/common/habanalabs_drv.c
@@ -225,6 +225,8 @@ int hl_device_open(struct inode *inode, struct file *filp)
 	atomic_set(&hdev->captured_err_info.razwi_info.razwi_detected, 0);
 	atomic_set(&hdev->captured_err_info.page_fault_info.page_fault_detected, 0);
 	hdev->captured_err_info.undef_opcode.write_enable = true;
+	hdev->captured_err_info.razwi_info.razwi_info_available = false;
+	hdev->captured_err_info.page_fault_info.page_fault_info_available = false;
 
 	hdev->open_counter++;
 	hdev->last_successful_open_jif = jiffies;
diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
index 949d38527160..72493bf94ba3 100644
--- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c
@@ -607,16 +607,20 @@ static int cs_timeout_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 
 static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
 	struct hl_device *hdev = hpriv->hdev;
 	u32 max_size = args->return_size;
-	struct hl_info_razwi_event *info = &hdev->captured_err_info.razwi_info.razwi;
-	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	struct razwi_info *razwi_info;
 
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_info_razwi_event)))
-				? -EFAULT : 0;
+	razwi_info = &hdev->captured_err_info.razwi_info;
+	if (!razwi_info->razwi_info_available)
+		return 0;
+
+	return copy_to_user(out, &razwi_info->razwi,
+			min_t(size_t, max_size, sizeof(struct hl_info_razwi_event))) ? -EFAULT : 0;
 }
 
 static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
@@ -786,16 +790,20 @@ static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 
 static int page_fault_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
 	struct hl_device *hdev = hpriv->hdev;
 	u32 max_size = args->return_size;
-	struct hl_page_fault_info *info = &hdev->captured_err_info.page_fault_info.page_fault;
-	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	struct page_fault_info *pgf_info;
 
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_page_fault_info)))
-				? -EFAULT : 0;
+	pgf_info = &hdev->captured_err_info.page_fault_info;
+	if (!pgf_info->page_fault_info_available)
+		return 0;
+
+	return copy_to_user(out, &pgf_info->page_fault,
+			min_t(size_t, max_size, sizeof(struct hl_page_fault_info))) ? -EFAULT : 0;
 }
 
 static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
@@ -806,18 +814,20 @@ static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	struct page_fault_info *pgf_info;
 	u64 actual_size;
 
-	pgf_info = &hdev->captured_err_info.page_fault_info;
-	args->array_size = pgf_info->num_of_user_mappings;
-
 	if (!out)
 		return -EINVAL;
 
+	pgf_info = &hdev->captured_err_info.page_fault_info;
+	if (!pgf_info->page_fault_info_available)
+		return 0;
+
+	args->array_size = pgf_info->num_of_user_mappings;
+
 	actual_size = pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping);
 	if (user_buf_size < actual_size)
 		return -ENOMEM;
 
-	return copy_to_user(out, pgf_info->user_mappings, min_t(size_t, user_buf_size, actual_size))
-				? -EFAULT : 0;
+	return copy_to_user(out, pgf_info->user_mappings, actual_size) ? -EFAULT : 0;
 }
 
 static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/7] habanalabs: add set engines masks ASIC function
  2023-01-08 17:20 [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings' Oded Gabbay
@ 2023-01-08 17:20 ` Oded Gabbay
  2023-01-08 17:20 ` [PATCH 3/7] habanalabs/gaudi2: fix log for sob value overflow/underflow Oded Gabbay
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-01-08 17:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

This function shall be used whenever the components' enable/binning
masks should be updated.

Usage is in one of the below cases:
- update user (or default) component masks
- update when getting the masks from FW (either CPUCP or COMMS)

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/firmware_if.c | 12 +++++---
 drivers/accel/habanalabs/common/habanalabs.h  |  2 ++
 drivers/accel/habanalabs/gaudi/gaudi.c        |  6 ++++
 drivers/accel/habanalabs/gaudi2/gaudi2.c      | 30 +++++++++++++------
 drivers/accel/habanalabs/goya/goya.c          |  6 ++++
 5 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index eb000e035026..ef228087ef55 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -2647,7 +2647,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 				fw_loader->dynamic_loader.comm_desc.cur_fw_ver);
 
 		if (rc)
-			goto out;
+			return rc;
 
 		/* read binning info from preboot */
 		if (hdev->support_preboot_binning) {
@@ -2660,15 +2660,19 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 
 			rc = hdev->asic_funcs->set_dram_properties(hdev);
 			if (rc)
-				goto out;
+				return rc;
+
+			rc = hdev->asic_funcs->set_binning_masks(hdev);
+			if (rc)
+				return rc;
 
 			dev_dbg(hdev->dev,
 				"Read binning masks: tpc: 0x%llx, dram: 0x%llx, edma: 0x%x, dec: 0x%x, rot:0x%x\n",
 				hdev->tpc_binning, hdev->dram_binning, hdev->edma_binning,
 				hdev->decoder_binning, hdev->rotator_binning);
 		}
-out:
-		return rc;
+
+		return 0;
 	}
 
 	/* load boot fit to FW */
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index cd474422163d..0b7fe4afd92d 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -1549,6 +1549,7 @@ struct engines_data {
  * @set_engine_cores: set a config command to engine cores
  * @send_device_activity: indication to FW about device availability
  * @set_dram_properties: set DRAM related properties.
+ * @set_binning_masks: set binning/enable masks for all relevant components.
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -1687,6 +1688,7 @@ struct hl_asic_funcs {
 					u32 num_cores, u32 core_command);
 	int (*send_device_activity)(struct hl_device *hdev, bool open);
 	int (*set_dram_properties)(struct hl_device *hdev);
+	int (*set_binning_masks)(struct hl_device *hdev);
 };
 
 
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c
index 733916f38752..71debe862c86 100644
--- a/drivers/accel/habanalabs/gaudi/gaudi.c
+++ b/drivers/accel/habanalabs/gaudi/gaudi.c
@@ -9135,6 +9135,11 @@ static int gaudi_set_dram_properties(struct hl_device *hdev)
 	return 0;
 }
 
+static int gaudi_set_binning_masks(struct hl_device *hdev)
+{
+	return 0;
+}
+
 static void gaudi_check_if_razwi_happened(struct hl_device *hdev)
 {
 }
@@ -9262,6 +9267,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.set_dram_bar_base = gaudi_set_hbm_bar_base,
 	.send_device_activity = gaudi_send_device_activity,
 	.set_dram_properties = gaudi_set_dram_properties,
+	.set_binning_masks = gaudi_set_binning_masks,
 };
 
 /**
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 4529a64d49b6..0f3e690041af 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -2437,6 +2437,25 @@ static int gaudi2_set_cluster_binning_masks(struct hl_device *hdev)
 	return 0;
 }
 
+static int gaudi2_set_binning_masks(struct hl_device *hdev)
+{
+	int rc;
+
+	rc = gaudi2_set_cluster_binning_masks(hdev);
+	if (rc)
+		return rc;
+
+	rc = gaudi2_set_tpc_binning_masks(hdev);
+	if (rc)
+		return rc;
+
+	rc = gaudi2_set_dec_binning_masks(hdev);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
 static int gaudi2_cpucp_info_get(struct hl_device *hdev)
 {
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
@@ -2492,15 +2511,7 @@ static int gaudi2_cpucp_info_get(struct hl_device *hdev)
 	if (rc)
 		return rc;
 
-	rc = gaudi2_set_cluster_binning_masks(hdev);
-	if (rc)
-		return rc;
-
-	rc = gaudi2_set_tpc_binning_masks(hdev);
-	if (rc)
-		return rc;
-
-	rc = gaudi2_set_dec_binning_masks(hdev);
+	rc = hdev->asic_funcs->set_binning_masks(hdev);
 	if (rc)
 		return rc;
 
@@ -10597,6 +10608,7 @@ static const struct hl_asic_funcs gaudi2_funcs = {
 	.set_engine_cores = gaudi2_set_engine_cores,
 	.send_device_activity = gaudi2_send_device_activity,
 	.set_dram_properties = gaudi2_set_dram_properties,
+	.set_binning_masks = gaudi2_set_binning_masks,
 };
 
 void gaudi2_set_asic_funcs(struct hl_device *hdev)
diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c
index ee0c7db16270..2b135e856607 100644
--- a/drivers/accel/habanalabs/goya/goya.c
+++ b/drivers/accel/habanalabs/goya/goya.c
@@ -5425,6 +5425,11 @@ static int goya_set_dram_properties(struct hl_device *hdev)
 	return 0;
 }
 
+static int goya_set_binning_masks(struct hl_device *hdev)
+{
+	return 0;
+}
+
 static int goya_send_device_activity(struct hl_device *hdev, bool open)
 {
 	return 0;
@@ -5524,6 +5529,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.set_dram_bar_base = goya_set_ddr_bar_base,
 	.send_device_activity = goya_send_device_activity,
 	.set_dram_properties = goya_set_dram_properties,
+	.set_binning_masks = goya_set_binning_masks,
 };
 
 /*
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/7] habanalabs/gaudi2: fix log for sob value overflow/underflow
  2023-01-08 17:20 [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings' Oded Gabbay
  2023-01-08 17:20 ` [PATCH 2/7] habanalabs: add set engines masks ASIC function Oded Gabbay
@ 2023-01-08 17:20 ` Oded Gabbay
  2023-01-08 17:20 ` [PATCH 4/7] habanalabs: define events to trace PCI LBW access Oded Gabbay
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-01-08 17:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: Carmit Carmel

From: Carmit Carmel <ccarmel@habana.ai>

The value in SM_SEI_CAUSE includes the SOB index and not the SOB group
index.
Remove usage of log_mask in sm_sei_cause structure as it was never
used.

Signed-off-by: Carmit Carmel <ccarmel@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 0f3e690041af..503a52db203f 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -676,14 +676,13 @@ static const char * const gaudi2_kdma_core_interrupts_cause[GAUDI2_NUM_OF_DMA_CO
 struct gaudi2_sm_sei_cause_data {
 	const char *cause_name;
 	const char *log_name;
-	u32 log_mask;
 };
 
 static const struct gaudi2_sm_sei_cause_data
 gaudi2_sm_sei_cause[GAUDI2_NUM_OF_SM_SEI_ERR_CAUSE] = {
-	{"calculated SO value overflow/underflow", "SOB group ID", 0x7FF},
-	{"payload address of monitor is not aligned to 4B", "monitor addr", 0xFFFF},
-	{"armed monitor write got BRESP (SLVERR or DECERR)", "AXI id", 0xFFFF},
+	{"calculated SO value overflow/underflow", "SOB ID"},
+	{"payload address of monitor is not aligned to 4B", "monitor addr"},
+	{"armed monitor write got BRESP (SLVERR or DECERR)", "AXI id"},
 };
 
 static const char * const
@@ -8418,7 +8417,7 @@ static int gaudi2_handle_sm_err(struct hl_device *hdev, u16 event_type, u8 sm_in
 				"err cause: %s. %s: 0x%X\n",
 				gaudi2_sm_sei_cause[i].cause_name,
 				gaudi2_sm_sei_cause[i].log_name,
-				sei_cause_log & gaudi2_sm_sei_cause[i].log_mask);
+				sei_cause_log);
 			error_count++;
 			break;
 		}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/7] habanalabs: define events to trace PCI LBW access
  2023-01-08 17:20 [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings' Oded Gabbay
  2023-01-08 17:20 ` [PATCH 2/7] habanalabs: add set engines masks ASIC function Oded Gabbay
  2023-01-08 17:20 ` [PATCH 3/7] habanalabs/gaudi2: fix log for sob value overflow/underflow Oded Gabbay
@ 2023-01-08 17:20 ` Oded Gabbay
  2023-01-08 17:20 ` [PATCH 5/7] habanalabs: trace LBW reads/writes Oded Gabbay
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-01-08 17:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

There are cases where it may be useful to dump the whole LBW configs.
Yet, doing so by spamming the kernel log will probably obscure other
important messages, since LBW accesses are done in sheer volume.
To address this, we add trace events for those accesses too.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 include/trace/events/habanalabs.h | 39 +++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/include/trace/events/habanalabs.h b/include/trace/events/habanalabs.h
index 10233e13cee4..951643e6a7a9 100644
--- a/include/trace/events/habanalabs.h
+++ b/include/trace/events/habanalabs.h
@@ -123,6 +123,45 @@ DEFINE_EVENT(habanalabs_comms_template, habanalabs_comms_wait_status_done,
 	TP_PROTO(struct device *dev, char *op_str),
 	TP_ARGS(dev, op_str));
 
+DECLARE_EVENT_CLASS(habanalabs_reg_access_template,
+	TP_PROTO(struct device *dev, u32 addr, u32 val),
+
+	TP_ARGS(dev, addr, val),
+
+	TP_STRUCT__entry(
+		__string(dname, dev_name(dev))
+		__field(u32, addr)
+		__field(u32, val)
+	),
+
+	TP_fast_assign(
+		__assign_str(dname, dev_name(dev));
+		__entry->addr = addr;
+		__entry->val = val;
+	),
+
+	TP_printk("%s: addr: %#x, val: %#x",
+		__get_str(dname),
+		__entry->addr,
+		__entry->val)
+);
+
+DEFINE_EVENT(habanalabs_reg_access_template, habanalabs_rreg32,
+	TP_PROTO(struct device *dev, u32 addr, u32 val),
+	TP_ARGS(dev, addr, val));
+
+DEFINE_EVENT(habanalabs_reg_access_template, habanalabs_wreg32,
+	TP_PROTO(struct device *dev, u32 addr, u32 val),
+	TP_ARGS(dev, addr, val));
+
+DEFINE_EVENT(habanalabs_reg_access_template, habanalabs_elbi_read,
+	TP_PROTO(struct device *dev, u32 addr, u32 val),
+	TP_ARGS(dev, addr, val));
+
+DEFINE_EVENT(habanalabs_reg_access_template, habanalabs_elbi_write,
+	TP_PROTO(struct device *dev, u32 addr, u32 val),
+	TP_ARGS(dev, addr, val));
+
 #endif /* if !defined(_TRACE_HABANALABS_H) || defined(TRACE_HEADER_MULTI_READ) */
 
 /* This part must be outside protection */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 5/7] habanalabs: trace LBW reads/writes
  2023-01-08 17:20 [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings' Oded Gabbay
                   ` (2 preceding siblings ...)
  2023-01-08 17:20 ` [PATCH 4/7] habanalabs: define events to trace PCI LBW access Oded Gabbay
@ 2023-01-08 17:20 ` Oded Gabbay
  2023-01-08 17:20 ` [PATCH 6/7] habanalabs/gaudi2: remove use of razwi info received from f/w Oded Gabbay
  2023-01-08 17:20 ` [PATCH 7/7] habanalabs: extend fatal messages to contain PCI info Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-01-08 17:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: Ohad Sharabi

From: Ohad Sharabi <osharabi@habana.ai>

Add traces to LBW reads/writes.
This may be handy when debugging configuration failures or events, or
when tracking the configuration flow.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c  | 10 +++++++++-
 drivers/accel/habanalabs/common/pci/pci.c | 10 +++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 6a05ab3fda23..722a5beb0974 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -2401,7 +2401,12 @@ void hl_device_fini(struct hl_device *hdev)
  */
 inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
 {
-	return readl(hdev->rmmio + reg);
+	u32 val = readl(hdev->rmmio + reg);
+
+	if (unlikely(trace_habanalabs_rreg32_enabled()))
+		trace_habanalabs_rreg32(hdev->dev, reg, val);
+
+	return val;
 }
 
 /*
@@ -2416,6 +2421,9 @@ inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
  */
 inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
 {
+	if (unlikely(trace_habanalabs_wreg32_enabled()))
+		trace_habanalabs_wreg32(hdev->dev, reg, val);
+
 	writel(val, hdev->rmmio + reg);
 }
 
diff --git a/drivers/accel/habanalabs/common/pci/pci.c b/drivers/accel/habanalabs/common/pci/pci.c
index 5fe3da5fba30..d1f4c695baf2 100644
--- a/drivers/accel/habanalabs/common/pci/pci.c
+++ b/drivers/accel/habanalabs/common/pci/pci.c
@@ -10,6 +10,8 @@
 
 #include <linux/pci.h>
 
+#include <trace/events/habanalabs.h>
+
 #define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC	(HL_PCI_ELBI_TIMEOUT_MSEC * 100)
 
 #define IATU_REGION_CTRL_REGION_EN_MASK		BIT(31)
@@ -120,6 +122,9 @@ int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data)
 	if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE) {
 		pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_DATA, data);
 
+		if (unlikely(trace_habanalabs_elbi_read_enabled()))
+			trace_habanalabs_elbi_read(hdev->dev, (u32) addr, val);
+
 		return 0;
 	}
 
@@ -179,8 +184,11 @@ static int hl_pci_elbi_write(struct hl_device *hdev, u64 addr, u32 data)
 		usleep_range(300, 500);
 	}
 
-	if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE)
+	if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE) {
+		if (unlikely(trace_habanalabs_elbi_write_enabled()))
+			trace_habanalabs_elbi_write(hdev->dev, (u32) addr, val);
 		return 0;
+	}
 
 	if (val & PCI_CONFIG_ELBI_STS_ERR)
 		return -EIO;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 6/7] habanalabs/gaudi2: remove use of razwi info received from f/w
  2023-01-08 17:20 [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings' Oded Gabbay
                   ` (3 preceding siblings ...)
  2023-01-08 17:20 ` [PATCH 5/7] habanalabs: trace LBW reads/writes Oded Gabbay
@ 2023-01-08 17:20 ` Oded Gabbay
  2023-01-08 17:20 ` [PATCH 7/7] habanalabs: extend fatal messages to contain PCI info Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-01-08 17:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: Dani Liberman

From: Dani Liberman <dliberman@habana.ai>

Because the f/w does not update the razwi info when sending events,
remove the use of it.
The driver is responsible for checking whether a razwi happened and for
collecting the razwi data.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 193 +++++++----------------
 1 file changed, 57 insertions(+), 136 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 503a52db203f..2b5cd058f5ad 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7081,7 +7081,6 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
 
 static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 			u64 rtr_mstr_if_base_addr, bool is_write, char *name,
-			bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info,
 			enum gaudi2_engine_id id, u64 *event_mask)
 {
 	u32 razwi_hi, razwi_lo, razwi_xy;
@@ -7089,26 +7088,14 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 	u8 rd_wr_flag;
 
 	if (is_write) {
-		if (read_razwi_regs) {
-			razwi_hi = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HI);
-			razwi_lo = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_LO);
-			razwi_xy = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_XY);
-		} else {
-			razwi_hi = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_hi_reg);
-			razwi_lo = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_lo_reg);
-			razwi_xy = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_id_reg);
-		}
+		razwi_hi = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HI);
+		razwi_lo = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_LO);
+		razwi_xy = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_XY);
 		rd_wr_flag = HL_RAZWI_WRITE;
 	} else {
-		if (read_razwi_regs) {
-			razwi_hi = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HI);
-			razwi_lo = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_LO);
-			razwi_xy = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_XY);
-		} else {
-			razwi_hi = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_hi_reg);
-			razwi_lo = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_lo_reg);
-			razwi_xy = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_id_reg);
-		}
+		razwi_hi = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HI);
+		razwi_lo = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_LO);
+		razwi_xy = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_XY);
 		rd_wr_flag = HL_RAZWI_READ;
 	}
 
@@ -7122,7 +7109,6 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev,
 
 static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev,
 			u64 rtr_mstr_if_base_addr, bool is_write, char *name,
-			bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info,
 			enum gaudi2_engine_id id, u64 *event_mask)
 {
 	u32 razwi_addr, razwi_xy;
@@ -7130,24 +7116,12 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev,
 	u8 rd_wr_flag;
 
 	if (is_write) {
-		if (read_razwi_regs) {
-			razwi_addr = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI);
-			razwi_xy = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_XY);
-		} else {
-			razwi_addr = le32_to_cpu(razwi_info->lbw.rr_aw_razwi_reg);
-			razwi_xy = le32_to_cpu(razwi_info->lbw.rr_aw_razwi_id_reg);
-		}
-
+		razwi_addr = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI);
+		razwi_xy = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_XY);
 		rd_wr_flag = HL_RAZWI_WRITE;
 	} else {
-		if (read_razwi_regs) {
-			razwi_addr = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI);
-			razwi_xy = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_XY);
-		} else {
-			razwi_addr = le32_to_cpu(razwi_info->lbw.rr_ar_razwi_reg);
-			razwi_xy = le32_to_cpu(razwi_info->lbw.rr_ar_razwi_id_reg);
-		}
-
+		razwi_addr = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI);
+		razwi_xy = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_XY);
 		rd_wr_flag = HL_RAZWI_READ;
 	}
 
@@ -7208,19 +7182,15 @@ static enum gaudi2_engine_id gaudi2_razwi_calc_engine_id(struct hl_device *hdev,
  */
 static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 				enum razwi_event_sources module, u8 module_idx,
-				u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info,
-				u64 *event_mask)
+				u8 module_sub_idx, u64 *event_mask)
 {
-	bool via_sft = false, read_razwi_regs = false;
+	bool via_sft = false;
 	u32 rtr_id, dcore_id, dcore_rtr_id, sft_id, eng_id;
 	u64 rtr_mstr_if_base_addr;
 	u32 hbw_shrd_aw = 0, hbw_shrd_ar = 0;
 	u32 lbw_shrd_aw = 0, lbw_shrd_ar = 0;
 	char initiator_name[64];
 
-	if (hdev->pldm || !(hdev->fw_components & FW_TYPE_LINUX) || !razwi_info)
-		read_razwi_regs = true;
-
 	switch (module) {
 	case RAZWI_TPC:
 		rtr_id = gaudi2_tpc_initiator_rtr_id[module_idx];
@@ -7286,23 +7256,6 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 		return;
 	}
 
-	if (!read_razwi_regs) {
-		if (le32_to_cpu(razwi_info->razwi_happened_mask) & RAZWI_HAPPENED_HBW) {
-			hbw_shrd_aw = le32_to_cpu(razwi_info->razwi_happened_mask) &
-								RAZWI_HAPPENED_AW;
-			hbw_shrd_ar = le32_to_cpu(razwi_info->razwi_happened_mask) &
-								RAZWI_HAPPENED_AR;
-		} else if (le32_to_cpu(razwi_info->razwi_happened_mask) & RAZWI_HAPPENED_LBW) {
-			lbw_shrd_aw = le32_to_cpu(razwi_info->razwi_happened_mask) &
-								RAZWI_HAPPENED_AW;
-			lbw_shrd_ar = le32_to_cpu(razwi_info->razwi_happened_mask) &
-								RAZWI_HAPPENED_AR;
-		}
-		rtr_mstr_if_base_addr = 0;
-
-		goto dump_info;
-	}
-
 	/* Find router mstr_if register base */
 	if (via_sft) {
 		rtr_mstr_if_base_addr = mmSFT0_HBW_RTR_IF0_RTR_CTRL_BASE +
@@ -7320,7 +7273,6 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 
 	/* Find out event cause by reading "RAZWI_HAPPENED" registers */
 	hbw_shrd_aw = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED);
-
 	hbw_shrd_ar = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED);
 
 	if (via_sft) {
@@ -7333,58 +7285,43 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev,
 				RTR_LBW_MSTR_IF_OFFSET;
 
 		lbw_shrd_aw = RREG32(base + RR_SHRD_LBW_AW_RAZWI_HAPPENED);
-
 		lbw_shrd_ar = RREG32(base + RR_SHRD_LBW_AR_RAZWI_HAPPENED);
 	} else {
 		lbw_shrd_aw = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED);
-
 		lbw_shrd_ar = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED);
 	}
 
-dump_info:
-	/* check if there is no RR razwi indication at all */
-	if (!hbw_shrd_aw && !hbw_shrd_ar && !lbw_shrd_aw && !lbw_shrd_ar)
-		return;
-
 	eng_id = gaudi2_razwi_calc_engine_id(hdev, module, module_idx);
 	if (hbw_shrd_aw) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true,
-						initiator_name, read_razwi_regs, razwi_info,
-						eng_id, event_mask);
+						initiator_name, eng_id, event_mask);
 
 		/* Clear event indication */
-		if (read_razwi_regs)
-			WREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED, hbw_shrd_aw);
+		WREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED, hbw_shrd_aw);
 	}
 
 	if (hbw_shrd_ar) {
 		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false,
-						initiator_name, read_razwi_regs, razwi_info,
-						eng_id, event_mask);
+						initiator_name, eng_id, event_mask);
 
 		/* Clear event indication */
-		if (read_razwi_regs)
-			WREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED, hbw_shrd_ar);
+		WREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED, hbw_shrd_ar);
 	}
 
 	if (lbw_shrd_aw) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true,
-						initiator_name, read_razwi_regs, razwi_info,
-						eng_id, event_mask);
+						initiator_name, eng_id, event_mask);
 
 		/* Clear event indication */
-		if (read_razwi_regs)
-			WREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED, lbw_shrd_aw);
+		WREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED, lbw_shrd_aw);
 	}
 
 	if (lbw_shrd_ar) {
 		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false,
-						initiator_name, read_razwi_regs, razwi_info,
-						eng_id, event_mask);
+						initiator_name, eng_id, event_mask);
 
 		/* Clear event indication */
-		if (read_razwi_regs)
-			WREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED, lbw_shrd_ar);
+		WREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED, lbw_shrd_ar);
 	}
 }
 
@@ -7396,42 +7333,38 @@ static void gaudi2_check_if_razwi_happened(struct hl_device *hdev)
 	/* check all TPCs */
 	for (mod_idx = 0 ; mod_idx < (NUM_OF_TPC_PER_DCORE * NUM_OF_DCORES + 1) ; mod_idx++) {
 		if (prop->tpc_enabled_mask & BIT(mod_idx))
-			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, mod_idx, 0, NULL,
-								NULL);
+			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, mod_idx, 0, NULL);
 	}
 
 	/* check all MMEs */
 	for (mod_idx = 0 ; mod_idx < (NUM_OF_MME_PER_DCORE * NUM_OF_DCORES) ; mod_idx++)
 		for (sub_mod = MME_WAP0 ; sub_mod < MME_INITIATORS_MAX ; sub_mod++)
 			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mod_idx,
-									sub_mod, NULL, NULL);
+									sub_mod, NULL);
 
 	/* check all EDMAs */
 	for (mod_idx = 0 ; mod_idx < (NUM_OF_EDMA_PER_DCORE * NUM_OF_DCORES) ; mod_idx++)
 		if (prop->edma_enabled_mask & BIT(mod_idx))
-			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_EDMA, mod_idx, 0, NULL,
-								NULL);
+			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_EDMA, mod_idx, 0, NULL);
 
 	/* check all PDMAs */
 	for (mod_idx = 0 ; mod_idx < NUM_OF_PDMA ; mod_idx++)
-		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, mod_idx, 0, NULL,
-							NULL);
+		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, mod_idx, 0, NULL);
 
 	/* check all NICs */
 	for (mod_idx = 0 ; mod_idx < NIC_NUMBER_OF_PORTS ; mod_idx++)
 		if (hdev->nic_ports_mask & BIT(mod_idx))
 			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_NIC, mod_idx >> 1, 0,
-								NULL, NULL);
+								NULL);
 
 	/* check all DECs */
 	for (mod_idx = 0 ; mod_idx < NUMBER_OF_DEC ; mod_idx++)
 		if (prop->decoder_enabled_mask & BIT(mod_idx))
-			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, mod_idx, 0, NULL,
-								NULL);
+			gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, mod_idx, 0, NULL);
 
 	/* check all ROTs */
 	for (mod_idx = 0 ; mod_idx < NUM_OF_ROT ; mod_idx++)
-		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, mod_idx, 0, NULL, NULL);
+		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, mod_idx, 0, NULL);
 }
 
 static const char *gaudi2_get_initiators_name(u32 rtr_id)
@@ -7818,7 +7751,7 @@ static int _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base, u16
 }
 
 static int gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type,
-					struct hl_eq_razwi_info *razwi_info, u64 *event_mask)
+					bool extended_err_check, u64 *event_mask)
 {
 	enum razwi_event_sources module;
 	u32 error_count = 0;
@@ -7871,9 +7804,9 @@ static int gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type,
 		error_count += _gaudi2_handle_qm_sei_err(hdev,
 					qman_base + NIC_QM_OFFSET, event_type);
 
-	/* check if RAZWI happened */
-	if (razwi_info)
-		gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, razwi_info, event_mask);
+	if (extended_err_check)
+		/* check if RAZWI happened */
+		gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, event_mask);
 
 	return error_count;
 }
@@ -8042,8 +7975,7 @@ static int gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, u16 event
 		}
 
 	/* check if RAZWI happened */
-	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, rot_index, 0,
-						&razwi_with_intr_cause->razwi_info, event_mask);
+	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, rot_index, 0, event_mask);
 
 	return error_count;
 }
@@ -8064,14 +7996,13 @@ static int gaudi2_tpc_ack_interrupts(struct hl_device *hdev,  u8 tpc_index, u16
 		}
 
 	/* check if RAZWI happened */
-	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, tpc_index, 0,
-						&razwi_with_intr_cause->razwi_info, event_mask);
+	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, tpc_index, 0, event_mask);
 
 	return error_count;
 }
 
 static int gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, u16 event_type,
-				struct hl_eq_razwi_info *razwi_info, u64 *event_mask)
+					u64 *event_mask)
 {
 	u32 sts_addr, sts_val, sts_clr_val = 0, error_count = 0;
 	int i;
@@ -8098,8 +8029,7 @@ static int gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, u16 event
 	}
 
 	/* check if RAZWI happened */
-	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, dec_index, 0, razwi_info,
-						event_mask);
+	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, dec_index, 0, event_mask);
 
 	/* Write 1 clear errors */
 	WREG32(sts_addr, sts_clr_val);
@@ -8108,7 +8038,7 @@ static int gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, u16 event
 }
 
 static int gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, u16 event_type,
-				struct hl_eq_razwi_info *razwi_info, u64 *event_mask)
+					u64 *event_mask)
 {
 	u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0, error_count = 0;
 	int i;
@@ -8129,8 +8059,7 @@ static int gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, u16 event
 
 	/* check if RAZWI happened */
 	for (i = MME_WRITE ; i < MME_INITIATORS_MAX ; i++)
-		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, i, razwi_info,
-							event_mask);
+		gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, i, event_mask);
 
 	WREG32(sts_clr_addr, sts_clr_val);
 
@@ -8153,7 +8082,7 @@ static int gaudi2_handle_mme_sbte_err(struct hl_device *hdev, u16 event_type,
 }
 
 static int gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, u16 event_type,
-					struct hl_eq_razwi_info *razwi_info, u64 *event_mask)
+					u64 *event_mask)
 {
 	u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0, error_count = 0;
 	int i;
@@ -8173,10 +8102,8 @@ static int gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, u16 e
 	}
 
 	/* check if RAZWI happened on WAP0/1 */
-	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP0, razwi_info,
-						event_mask);
-	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP1, razwi_info,
-						event_mask);
+	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP0, event_mask);
+	gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP1, event_mask);
 
 	WREG32(sts_clr_addr, sts_clr_val);
 
@@ -8226,29 +8153,29 @@ static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev,
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
-		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true,
-							NULL, GAUDI2_ENGINE_ID_PCIE, event_mask);
+		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE",
+							GAUDI2_ENGINE_ID_PCIE, event_mask);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
-		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true,
-							NULL, GAUDI2_ENGINE_ID_PCIE, event_mask);
+		gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE",
+							GAUDI2_ENGINE_ID_PCIE, event_mask);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
-		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true,
-							NULL, GAUDI2_ENGINE_ID_PCIE, event_mask);
+		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE",
+							GAUDI2_ENGINE_ID_PCIE, event_mask);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 
 	razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED;
 	if (RREG32(razwi_happened_addr)) {
-		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true,
-							NULL, GAUDI2_ENGINE_ID_PCIE, event_mask);
+		gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE",
+							GAUDI2_ENGINE_ID_PCIE, event_mask);
 		WREG32(razwi_happened_addr, 0x1);
 	}
 }
@@ -8912,8 +8839,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
 	case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
-		error_count = gaudi2_handle_qm_sei_err(hdev, event_type,
-					&eq_entry->razwi_info, &event_mask);
+		error_count = gaudi2_handle_qm_sei_err(hdev, event_type, true, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8922,7 +8848,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE;
 		error_count = gaudi2_handle_rot_err(hdev, index, event_type,
 					&eq_entry->razwi_with_intr_cause, &event_mask);
-		error_count += gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask);
+		error_count += gaudi2_handle_qm_sei_err(hdev, event_type, false, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8930,14 +8856,13 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP;
 		error_count = gaudi2_tpc_ack_interrupts(hdev, index, event_type,
 						&eq_entry->razwi_with_intr_cause, &event_mask);
-		error_count += gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask);
+		error_count += gaudi2_handle_qm_sei_err(hdev, event_type, false, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
 	case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE:
 		index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE;
-		error_count = gaudi2_handle_dec_err(hdev, index, event_type,
-						&eq_entry->razwi_info, &event_mask);
+		error_count = gaudi2_handle_dec_err(hdev, index, event_type, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8985,8 +8910,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_DEC9_SPI:
 		index = (event_type - GAUDI2_EVENT_DEC0_SPI) /
 				(GAUDI2_EVENT_DEC1_SPI - GAUDI2_EVENT_DEC0_SPI);
-		error_count = gaudi2_handle_dec_err(hdev, index, event_type,
-					&eq_entry->razwi_info, &event_mask);
+		error_count = gaudi2_handle_dec_err(hdev, index, event_type, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -8997,9 +8921,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		index = (event_type - GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE) /
 				(GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE -
 						GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE);
-		error_count = gaudi2_handle_mme_err(hdev, index, event_type,
-				&eq_entry->razwi_info, &event_mask);
-		error_count += gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask);
+		error_count = gaudi2_handle_mme_err(hdev, index, event_type, &event_mask);
+		error_count += gaudi2_handle_qm_sei_err(hdev, event_type, false, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -9010,8 +8933,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		index = (event_type - GAUDI2_EVENT_MME0_QMAN_SW_ERROR) /
 				(GAUDI2_EVENT_MME1_QMAN_SW_ERROR -
 					GAUDI2_EVENT_MME0_QMAN_SW_ERROR);
-		error_count = gaudi2_handle_mme_err(hdev, index, event_type,
-					&eq_entry->razwi_info, &event_mask);
+		error_count = gaudi2_handle_mme_err(hdev, index, event_type, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
@@ -9022,8 +8944,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		index = (event_type - GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID) /
 				(GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID -
 					GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID);
-		error_count = gaudi2_handle_mme_wap_err(hdev, index, event_type,
-					&eq_entry->razwi_info, &event_mask);
+		error_count = gaudi2_handle_mme_wap_err(hdev, index, event_type, &event_mask);
 		event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
 		break;
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 7/7] habanalabs: extend fatal messages to contain PCI info
  2023-01-08 17:20 [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings' Oded Gabbay
                   ` (4 preceding siblings ...)
  2023-01-08 17:20 ` [PATCH 6/7] habanalabs/gaudi2: remove use of razwi info received from f/w Oded Gabbay
@ 2023-01-08 17:20 ` Oded Gabbay
  5 siblings, 0 replies; 7+ messages in thread
From: Oded Gabbay @ 2023-01-08 17:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: Moti Haimovski

From: Moti Haimovski <mhaimovski@habana.ai>

This commit attaches the PCI device address to driver fatal messages
in order to ease debugging in multi-device setups.

Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/device.c | 38 ++++++++++++++++--------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 722a5beb0974..2b6971463f12 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1563,7 +1563,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 		if (rc == -EBUSY) {
 			if (hdev->device_fini_pending) {
 				dev_crit(hdev->dev,
-					"Failed to kill all open processes, stopping hard reset\n");
+					"%s Failed to kill all open processes, stopping hard reset\n",
+					dev_name(&(hdev)->pdev->dev));
 				goto out_err;
 			}
 
@@ -1573,7 +1574,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 
 		if (rc) {
 			dev_crit(hdev->dev,
-				"Failed to kill all open processes, stopping hard reset\n");
+				"%s Failed to kill all open processes, stopping hard reset\n",
+				dev_name(&(hdev)->pdev->dev));
 			goto out_err;
 		}
 
@@ -1624,14 +1626,16 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 			 * ensure driver puts the driver in a unusable state
 			 */
 			dev_crit(hdev->dev,
-				"Consecutive FW fatal errors received, stopping hard reset\n");
+				"%s Consecutive FW fatal errors received, stopping hard reset\n",
+				dev_name(&(hdev)->pdev->dev));
 			rc = -EIO;
 			goto out_err;
 		}
 
 		if (hdev->kernel_ctx) {
 			dev_crit(hdev->dev,
-				"kernel ctx was alive during hard reset, something is terribly wrong\n");
+				"%s kernel ctx was alive during hard reset, something is terribly wrong\n",
+				dev_name(&(hdev)->pdev->dev));
 			rc = -EBUSY;
 			goto out_err;
 		}
@@ -1749,9 +1753,13 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	hdev->reset_info.needs_reset = false;
 
 	if (hard_reset)
-		dev_info(hdev->dev, "Successfully finished resetting the device\n");
+		dev_info(hdev->dev,
+			 "Successfully finished resetting the %s device\n",
+			 dev_name(&(hdev)->pdev->dev));
 	else
-		dev_dbg(hdev->dev, "Successfully finished resetting the device\n");
+		dev_dbg(hdev->dev,
+			"Successfully finished resetting the %s device\n",
+			dev_name(&(hdev)->pdev->dev));
 
 	if (hard_reset) {
 		hdev->reset_info.hard_reset_cnt++;
@@ -1786,7 +1794,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	hdev->reset_info.in_compute_reset = 0;
 
 	if (hard_reset) {
-		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
+		dev_err(hdev->dev,
+			"%s Failed to reset! Device is NOT usable\n",
+			dev_name(&(hdev)->pdev->dev));
 		hdev->reset_info.hard_reset_cnt++;
 	} else if (reset_upon_device_release) {
 		spin_unlock(&hdev->reset_info.lock);
@@ -2185,7 +2195,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	}
 
 	dev_notice(hdev->dev,
-		"Successfully added device to habanalabs driver\n");
+		"Successfully added device %s to habanalabs driver\n",
+		dev_name(&(hdev)->pdev->dev));
 
 	hdev->init_done = true;
 
@@ -2234,11 +2245,11 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		device_cdev_sysfs_add(hdev);
 	if (hdev->pdev)
 		dev_err(&hdev->pdev->dev,
-			"Failed to initialize hl%d. Device is NOT usable !\n",
-			hdev->cdev_idx);
+			"Failed to initialize hl%d. Device %s is NOT usable !\n",
+			hdev->cdev_idx, dev_name(&(hdev)->pdev->dev));
 	else
-		pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
-			hdev->cdev_idx);
+		pr_err("Failed to initialize hl%d. Device %s is NOT usable !\n",
+			hdev->cdev_idx, dev_name(&(hdev)->pdev->dev));
 
 	return rc;
 }
@@ -2294,7 +2305,8 @@ void hl_device_fini(struct hl_device *hdev)
 
 		if (ktime_compare(ktime_get(), timeout) > 0) {
 			dev_crit(hdev->dev,
-				"Failed to remove device because reset function did not finish\n");
+				"%s Failed to remove device because reset function did not finish\n",
+				dev_name(&(hdev)->pdev->dev));
 			return;
 		}
 	}
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-01-08 17:21 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-08 17:20 [PATCH 1/7] habanalabs: protect access to dynamic mem 'user_mappings' Oded Gabbay
2023-01-08 17:20 ` [PATCH 2/7] habanalabs: add set engines masks ASIC function Oded Gabbay
2023-01-08 17:20 ` [PATCH 3/7] habanalabs/gaudi2: fix log for sob value overflow/underflow Oded Gabbay
2023-01-08 17:20 ` [PATCH 4/7] habanalabs: define events to trace PCI LBW access Oded Gabbay
2023-01-08 17:20 ` [PATCH 5/7] habanalabs: trace LBW reads/writes Oded Gabbay
2023-01-08 17:20 ` [PATCH 6/7] habanalabs/gaudi2: remove use of razwi info received from f/w Oded Gabbay
2023-01-08 17:20 ` [PATCH 7/7] habanalabs: extend fatal messages to contain PCI info Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox