public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] accel/habanalabs: report 3 instances of Infineon second stage
@ 2023-12-07 12:24 Oded Gabbay
  2023-12-07 12:24 ` [PATCH 2/5] accel/habanalabs/gaudi2: add zero padding when printing QM CP instruction Oded Gabbay
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Oded Gabbay @ 2023-12-07 12:24 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Ariel Suller

From: Ariel Suller <asuller@habana.ai>

The Infineon controller's second stage has 3 instances whose versions
need to be reported by the driver.

Signed-off-by: Ariel Suller <asuller@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/common/sysfs.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c
index 8d2164691d81..c940c5f1d109 100644
--- a/drivers/accel/habanalabs/common/sysfs.c
+++ b/drivers/accel/habanalabs/common/sysfs.c
@@ -8,6 +8,7 @@
 #include "habanalabs.h"
 
 #include <linux/pci.h>
+#include <linux/types.h>
 
 static ssize_t clk_max_freq_mhz_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
@@ -80,12 +81,27 @@ static ssize_t vrm_ver_show(struct device *dev, struct device_attribute *attr, c
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	struct cpucp_info *cpucp_info;
+	u32 infineon_second_stage_version;
+	u32 infineon_second_stage_first_instance;
+	u32 infineon_second_stage_second_instance;
+	u32 infineon_second_stage_third_instance;
+	u32 mask = 0xff;
 
 	cpucp_info = &hdev->asic_prop.cpucp_info;
 
+	infineon_second_stage_version = le32_to_cpu(cpucp_info->infineon_second_stage_version);
+	infineon_second_stage_first_instance = infineon_second_stage_version & mask;
+	infineon_second_stage_second_instance =
+					(infineon_second_stage_version >> 8) & mask;
+	infineon_second_stage_third_instance =
+					(infineon_second_stage_version >> 16) & mask;
+
 	if (cpucp_info->infineon_second_stage_version)
-		return sprintf(buf, "%#04x %#04x\n", le32_to_cpu(cpucp_info->infineon_version),
-				le32_to_cpu(cpucp_info->infineon_second_stage_version));
+		return sprintf(buf, "%#04x %#04x:%#04x:%#04x\n",
+				le32_to_cpu(cpucp_info->infineon_version),
+				infineon_second_stage_first_instance,
+				infineon_second_stage_second_instance,
+				infineon_second_stage_third_instance);
 	else
 		return sprintf(buf, "%#04x\n", le32_to_cpu(cpucp_info->infineon_version));
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/5] accel/habanalabs/gaudi2: add zero padding when printing QM CP instruction
  2023-12-07 12:24 [PATCH 1/5] accel/habanalabs: report 3 instances of Infineon second stage Oded Gabbay
@ 2023-12-07 12:24 ` Oded Gabbay
  2023-12-07 12:24 ` [PATCH 3/5] accel/habanalabs: update debugfs-driver-habanalabs with the device-name directory Oded Gabbay
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2023-12-07 12:24 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

QM instructions are in multiples of 64 bits and the command type is in
the upper bits of the first QWORD.
To make it clearer that an undefined command is due to a type of 0x0,
always print all 64 bits and add zero padding if needed.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index bf537c2082cd..f81b57649b00 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7884,7 +7884,7 @@ static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base,
 	cp_current_inst = ((u64) hi) << 32 | lo;
 
 	dev_info(hdev->dev,
-		"LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
+		"LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#018llx}\n",
 		is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst);
 
 	if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/5] accel/habanalabs: update debugfs-driver-habanalabs with the device-name directory
  2023-12-07 12:24 [PATCH 1/5] accel/habanalabs: report 3 instances of Infineon second stage Oded Gabbay
  2023-12-07 12:24 ` [PATCH 2/5] accel/habanalabs/gaudi2: add zero padding when printing QM CP instruction Oded Gabbay
@ 2023-12-07 12:24 ` Oded Gabbay
  2023-12-07 12:24 ` [PATCH 4/5] accel/habanalabs: add parent_device sysfs attribute Oded Gabbay
  2023-12-07 12:24 ` [PATCH 5/5] accel/habanalabs/gaudi2: avoid overriding existing undefined opcode data Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2023-12-07 12:24 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

The device debugfs directory was modified to be named as the
parent device name.
Update the paths accordingly.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 .../ABI/testing/debugfs-driver-habanalabs     | 72 +++++++++----------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index 042fd125fbc9..a7a432dc4015 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -1,4 +1,4 @@
-What:           /sys/kernel/debug/accel/<n>/addr
+What:           /sys/kernel/debug/accel/<parent_device>/addr
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -8,34 +8,34 @@ Description:    Sets the device address to be used for read or write through
                 only when the IOMMU is disabled.
                 The acceptable value is a string that starts with "0x"
 
-What:           /sys/kernel/debug/accel/<n>/clk_gate
+What:           /sys/kernel/debug/accel/<parent_device>/clk_gate
 Date:           May 2020
 KernelVersion:  5.8
 Contact:        ogabbay@kernel.org
 Description:    This setting is now deprecated as clock gating is handled solely by the f/w
 
-What:           /sys/kernel/debug/accel/<n>/command_buffers
+What:           /sys/kernel/debug/accel/<parent_device>/command_buffers
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
 Description:    Displays a list with information about the currently allocated
                 command buffers
 
-What:           /sys/kernel/debug/accel/<n>/command_submission
+What:           /sys/kernel/debug/accel/<parent_device>/command_submission
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
 Description:    Displays a list with information about the currently active
                 command submissions
 
-What:           /sys/kernel/debug/accel/<n>/command_submission_jobs
+What:           /sys/kernel/debug/accel/<parent_device>/command_submission_jobs
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
 Description:    Displays a list with detailed information about each JOB (CB) of
                 each active command submission
 
-What:           /sys/kernel/debug/accel/<n>/data32
+What:           /sys/kernel/debug/accel/<parent_device>/data32
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -50,7 +50,7 @@ Description:    Allows the root user to read or write directly through the
                 If the IOMMU is disabled, it also allows the root user to read
                 or write from the host a device VA of a host mapped memory
 
-What:           /sys/kernel/debug/accel/<n>/data64
+What:           /sys/kernel/debug/accel/<parent_device>/data64
 Date:           Jan 2020
 KernelVersion:  5.6
 Contact:        ogabbay@kernel.org
@@ -65,7 +65,7 @@ Description:    Allows the root user to read or write 64 bit data directly
                 If the IOMMU is disabled, it also allows the root user to read
                 or write from the host a device VA of a host mapped memory
 
-What:           /sys/kernel/debug/accel/<n>/data_dma
+What:           /sys/kernel/debug/accel/<parent_device>/data_dma
 Date:           Apr 2021
 KernelVersion:  5.13
 Contact:        ogabbay@kernel.org
@@ -83,7 +83,7 @@ Description:    Allows the root user to read from the device's internal
                 workloads.
                 Only supported on GAUDI at this stage.
 
-What:           /sys/kernel/debug/accel/<n>/device
+What:           /sys/kernel/debug/accel/<parent_device>/device
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -91,14 +91,14 @@ Description:    Enables the root user to set the device to specific state.
                 Valid values are "disable", "enable", "suspend", "resume".
                 User can read this property to see the valid values
 
-What:           /sys/kernel/debug/accel/<n>/device_release_watchdog_timeout
+What:           /sys/kernel/debug/accel/<parent_device>/device_release_watchdog_timeout
 Date:           Oct 2022
 KernelVersion:  6.2
 Contact:        ttayar@habana.ai
 Description:    The watchdog timeout value in seconds for a device release upon
                 certain error cases, after which the device is reset.
 
-What:           /sys/kernel/debug/accel/<n>/dma_size
+What:           /sys/kernel/debug/accel/<parent_device>/dma_size
 Date:           Apr 2021
 KernelVersion:  5.13
 Contact:        ogabbay@kernel.org
@@ -108,7 +108,7 @@ Description:    Specify the size of the DMA transaction when using DMA to read
                 When the write is finished, the user can read the "data_dma"
                 blob
 
-What:           /sys/kernel/debug/accel/<n>/dump_razwi_events
+What:           /sys/kernel/debug/accel/<parent_device>/dump_razwi_events
 Date:           Aug 2022
 KernelVersion:  5.20
 Contact:        fkassabri@habana.ai
@@ -117,7 +117,7 @@ Description:    Dumps all razwi events to dmesg if exist.
                 the routine will clear the status register.
                 Usage: cat dump_razwi_events
 
-What:           /sys/kernel/debug/accel/<n>/dump_security_violations
+What:           /sys/kernel/debug/accel/<parent_device>/dump_security_violations
 Date:           Jan 2021
 KernelVersion:  5.12
 Contact:        ogabbay@kernel.org
@@ -125,14 +125,14 @@ Description:    Dumps all security violations to dmesg. This will also ack
                 all security violations meanings those violations will not be
                 dumped next time user calls this API
 
-What:           /sys/kernel/debug/accel/<n>/engines
+What:           /sys/kernel/debug/accel/<parent_device>/engines
 Date:           Jul 2019
 KernelVersion:  5.3
 Contact:        ogabbay@kernel.org
 Description:    Displays the status registers values of the device engines and
                 their derived idle status
 
-What:           /sys/kernel/debug/accel/<n>/i2c_addr
+What:           /sys/kernel/debug/accel/<parent_device>/i2c_addr
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -140,7 +140,7 @@ Description:    Sets I2C device address for I2C transaction that is generated
                 by the device's CPU, Not available when device is loaded with secured
                 firmware
 
-What:           /sys/kernel/debug/accel/<n>/i2c_bus
+What:           /sys/kernel/debug/accel/<parent_device>/i2c_bus
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -148,7 +148,7 @@ Description:    Sets I2C bus address for I2C transaction that is generated by
                 the device's CPU, Not available when device is loaded with secured
                 firmware
 
-What:           /sys/kernel/debug/accel/<n>/i2c_data
+What:           /sys/kernel/debug/accel/<parent_device>/i2c_data
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -157,7 +157,7 @@ Description:    Triggers an I2C transaction that is generated by the device's
                 reading from the file generates a read transaction, Not available
                 when device is loaded with secured firmware
 
-What:           /sys/kernel/debug/accel/<n>/i2c_len
+What:           /sys/kernel/debug/accel/<parent_device>/i2c_len
 Date:           Dec 2021
 KernelVersion:  5.17
 Contact:        obitton@habana.ai
@@ -165,7 +165,7 @@ Description:    Sets I2C length in bytes for I2C transaction that is generated b
                 the device's CPU, Not available when device is loaded with secured
                 firmware
 
-What:           /sys/kernel/debug/accel/<n>/i2c_reg
+What:           /sys/kernel/debug/accel/<parent_device>/i2c_reg
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -173,35 +173,35 @@ Description:    Sets I2C register id for I2C transaction that is generated by
                 the device's CPU, Not available when device is loaded with secured
                 firmware
 
-What:           /sys/kernel/debug/accel/<n>/led0
+What:           /sys/kernel/debug/accel/<parent_device>/led0
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
 Description:    Sets the state of the first S/W led on the device, Not available
                 when device is loaded with secured firmware
 
-What:           /sys/kernel/debug/accel/<n>/led1
+What:           /sys/kernel/debug/accel/<parent_device>/led1
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
 Description:    Sets the state of the second S/W led on the device, Not available
                 when device is loaded with secured firmware
 
-What:           /sys/kernel/debug/accel/<n>/led2
+What:           /sys/kernel/debug/accel/<parent_device>/led2
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
 Description:    Sets the state of the third S/W led on the device, Not available
                 when device is loaded with secured firmware
 
-What:           /sys/kernel/debug/accel/<n>/memory_scrub
+What:           /sys/kernel/debug/accel/<parent_device>/memory_scrub
 Date:           May 2022
 KernelVersion:  5.19
 Contact:        dhirschfeld@habana.ai
 Description:    Allows the root user to scrub the dram memory. The scrubbing
                 value can be set using the debugfs file memory_scrub_val.
 
-What:           /sys/kernel/debug/accel/<n>/memory_scrub_val
+What:           /sys/kernel/debug/accel/<parent_device>/memory_scrub_val
 Date:           May 2022
 KernelVersion:  5.19
 Contact:        dhirschfeld@habana.ai
@@ -209,7 +209,7 @@ Description:    The value to which the dram will be set to when the user
                 scrubs the dram using 'memory_scrub' debugfs file and
                 the scrubbing value when using module param 'memory_scrub'
 
-What:           /sys/kernel/debug/accel/<n>/mmu
+What:           /sys/kernel/debug/accel/<parent_device>/mmu
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -219,7 +219,7 @@ Description:    Displays the hop values and physical address for a given ASID
                 e.g. to display info about VA 0x1000 for ASID 1 you need to do:
                 echo "1 0x1000" > /sys/kernel/debug/accel/0/mmu
 
-What:           /sys/kernel/debug/accel/<n>/mmu_error
+What:           /sys/kernel/debug/accel/<parent_device>/mmu_error
 Date:           Mar 2021
 KernelVersion:  5.12
 Contact:        fkassabri@habana.ai
@@ -229,7 +229,7 @@ Description:    Check and display page fault or access violation mmu errors for
                 echo "0x200" > /sys/kernel/debug/accel/0/mmu_error
                 cat /sys/kernel/debug/accel/0/mmu_error
 
-What:           /sys/kernel/debug/accel/<n>/monitor_dump
+What:           /sys/kernel/debug/accel/<parent_device>/monitor_dump
 Date:           Mar 2022
 KernelVersion:  5.19
 Contact:        osharabi@habana.ai
@@ -243,7 +243,7 @@ Description:    Allows the root user to dump monitors status from the device's
                 This interface doesn't support concurrency in the same device.
                 Only supported on GAUDI.
 
-What:           /sys/kernel/debug/accel/<n>/monitor_dump_trig
+What:           /sys/kernel/debug/accel/<parent_device>/monitor_dump_trig
 Date:           Mar 2022
 KernelVersion:  5.19
 Contact:        osharabi@habana.ai
@@ -253,14 +253,14 @@ Description:    Triggers dump of monitor data. The value to trigger the operatio
                 When the write is finished, the user can read the "monitor_dump"
                 blob
 
-What:           /sys/kernel/debug/accel/<n>/set_power_state
+What:           /sys/kernel/debug/accel/<parent_device>/set_power_state
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
 Description:    Sets the PCI power state. Valid values are "1" for D0 and "2"
                 for D3Hot
 
-What:           /sys/kernel/debug/accel/<n>/skip_reset_on_timeout
+What:           /sys/kernel/debug/accel/<parent_device>/skip_reset_on_timeout
 Date:           Jun 2021
 KernelVersion:  5.13
 Contact:        ynudelman@habana.ai
@@ -268,7 +268,7 @@ Description:    Sets the skip reset on timeout option for the device. Value of
                 "0" means device will be reset in case some CS has timed out,
                 otherwise it will not be reset.
 
-What:           /sys/kernel/debug/accel/<n>/state_dump
+What:           /sys/kernel/debug/accel/<parent_device>/state_dump
 Date:           Oct 2021
 KernelVersion:  5.15
 Contact:        ynudelman@habana.ai
@@ -279,7 +279,7 @@ Description:    Gets the state dump occurring on a CS timeout or failure.
                 Writing an integer X discards X state dumps, so that the
                 next read would return X+1-st newest state dump.
 
-What:           /sys/kernel/debug/accel/<n>/stop_on_err
+What:           /sys/kernel/debug/accel/<parent_device>/stop_on_err
 Date:           Mar 2020
 KernelVersion:  5.6
 Contact:        ogabbay@kernel.org
@@ -287,13 +287,13 @@ Description:    Sets the stop-on_error option for the device engines. Value of
                 "0" is for disable, otherwise enable.
                 Relevant only for GOYA and GAUDI.
 
-What:           /sys/kernel/debug/accel/<n>/timeout_locked
+What:           /sys/kernel/debug/accel/<parent_device>/timeout_locked
 Date:           Sep 2021
 KernelVersion:  5.16
 Contact:        obitton@habana.ai
 Description:    Sets the command submission timeout value in seconds.
 
-What:           /sys/kernel/debug/accel/<n>/userptr
+What:           /sys/kernel/debug/accel/<parent_device>/userptr
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
@@ -301,7 +301,7 @@ Description:    Displays a list with information about the current user
                 pointers (user virtual addresses) that are pinned and mapped
                 to DMA addresses
 
-What:           /sys/kernel/debug/accel/<n>/userptr_lookup
+What:           /sys/kernel/debug/accel/<parent_device>/userptr_lookup
 Date:           Oct 2021
 KernelVersion:  5.15
 Contact:        ogabbay@kernel.org
@@ -309,7 +309,7 @@ Description:    Allows to search for specific user pointers (user virtual
                 addresses) that are pinned and mapped to DMA addresses, and see
                 their resolution to the specific dma address.
 
-What:           /sys/kernel/debug/accel/<n>/vm
+What:           /sys/kernel/debug/accel/<parent_device>/vm
 Date:           Jan 2019
 KernelVersion:  5.1
 Contact:        ogabbay@kernel.org
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/5] accel/habanalabs: add parent_device sysfs attribute
  2023-12-07 12:24 [PATCH 1/5] accel/habanalabs: report 3 instances of Infineon second stage Oded Gabbay
  2023-12-07 12:24 ` [PATCH 2/5] accel/habanalabs/gaudi2: add zero padding when printing QM CP instruction Oded Gabbay
  2023-12-07 12:24 ` [PATCH 3/5] accel/habanalabs: update debugfs-driver-habanalabs with the device-name directory Oded Gabbay
@ 2023-12-07 12:24 ` Oded Gabbay
  2023-12-07 12:24 ` [PATCH 5/5] accel/habanalabs/gaudi2: avoid overriding existing undefined opcode data Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2023-12-07 12:24 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

The device debugfs directory was modified to be named as the
device-name.
This name is the parent device name, i.e. either the PCI address in case
of an ASIC, or the simulator device name in case of a simulator.

This change makes it more difficult for a user to access the debugfs
directory for a specific accel device, because the accel minor id alone
is no longer enough; more device-dependent operations are needed to
get the device name.

To make it easier to get this name, add a 'parent_device' sysfs
attribute that the user can read using the minor id before accessing
debugfs.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 Documentation/ABI/testing/sysfs-driver-habanalabs | 6 ++++++
 drivers/accel/habanalabs/common/habanalabs.h      | 3 +++
 drivers/accel/habanalabs/common/sysfs.c           | 9 +++++++++
 3 files changed, 18 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-driver-habanalabs b/Documentation/ABI/testing/sysfs-driver-habanalabs
index 89fe3b09d4ad..4244f5af4b54 100644
--- a/Documentation/ABI/testing/sysfs-driver-habanalabs
+++ b/Documentation/ABI/testing/sysfs-driver-habanalabs
@@ -155,6 +155,12 @@ KernelVersion:  not yet upstreamed
 Contact:        ogabbay@kernel.org
 Description:    Displays the device's module id
 
+What:           /sys/class/accel/accel<n>/device/parent_device
+Date:           Nov 2023
+KernelVersion:  6.8
+Contact:        ttayar@habana.ai
+Description:    Displays the name of the parent device of the accel device
+
 What:           /sys/class/accel/accel<n>/device/pci_addr
 Date:           Jan 2019
 KernelVersion:  5.1
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index dd3fe3ddc00a..2a900c9941fe 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -3521,6 +3521,9 @@ struct hl_device {
 	u8				heartbeat;
 };
 
+/* Retrieve PCI device name in case of a PCI device or dev name in simulator */
+#define HL_DEV_NAME(hdev)	\
+		((hdev)->pdev ? dev_name(&(hdev)->pdev->dev) : "NA-DEVICE")
 
 /**
  * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure
diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c
index c940c5f1d109..8a9f98832157 100644
--- a/drivers/accel/habanalabs/common/sysfs.c
+++ b/drivers/accel/habanalabs/common/sysfs.c
@@ -410,6 +410,13 @@ static ssize_t module_id_show(struct device *dev,
 	return sprintf(buf, "%u\n", le32_to_cpu(hdev->asic_prop.cpucp_info.card_location));
 }
 
+static ssize_t parent_device_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hl_device *hdev = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%s\n", HL_DEV_NAME(hdev));
+}
+
 static DEVICE_ATTR_RO(armcp_kernel_ver);
 static DEVICE_ATTR_RO(armcp_ver);
 static DEVICE_ATTR_RO(cpld_ver);
@@ -430,6 +437,7 @@ static DEVICE_ATTR_RO(uboot_ver);
 static DEVICE_ATTR_RO(fw_os_ver);
 static DEVICE_ATTR_RO(security_enabled);
 static DEVICE_ATTR_RO(module_id);
+static DEVICE_ATTR_RO(parent_device);
 
 static struct bin_attribute bin_attr_eeprom = {
 	.attr = {.name = "eeprom", .mode = (0444)},
@@ -456,6 +464,7 @@ static struct attribute *hl_dev_attrs[] = {
 	&dev_attr_fw_os_ver.attr,
 	&dev_attr_security_enabled.attr,
 	&dev_attr_module_id.attr,
+	&dev_attr_parent_device.attr,
 	NULL,
 };
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/5] accel/habanalabs/gaudi2: avoid overriding existing undefined opcode data
  2023-12-07 12:24 [PATCH 1/5] accel/habanalabs: report 3 instances of Infineon second stage Oded Gabbay
                   ` (2 preceding siblings ...)
  2023-12-07 12:24 ` [PATCH 4/5] accel/habanalabs: add parent_device sysfs attribute Oded Gabbay
@ 2023-12-07 12:24 ` Oded Gabbay
  3 siblings, 0 replies; 5+ messages in thread
From: Oded Gabbay @ 2023-12-07 12:24 UTC (permalink / raw)
  To: dri-devel, linux-kernel; +Cc: Tomer Tayar

From: Tomer Tayar <ttayar@habana.ai>

Part of the undefined opcode data is updated in
gaudi2_handle_qman_err_generic() and some in
handle_lower_qman_data_on_err().
However, the 'write_enable' flag is checked only in
gaudi2_handle_qman_err_generic(), and information of more than a single
error can be mixed there.

Moreover, handle_lower_qman_data_on_err() is called only for the lower
QMAN, so for an error in the upper QMAN only partial info is available.

Move all the data update to be done in a single place, protected by the
'write_enable' flag.
As mainly the lower QMAN's info is interesting, avoid saving the partial
info for the upper QMAN.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/accel/habanalabs/gaudi2/gaudi2.c | 40 +++++++++++-------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index f81b57649b00..e0e5615ef9b0 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -7858,10 +7858,11 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 	return !!ecc_data->is_critical;
 }
 
-static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask)
+static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u32 engine_id)
 {
-	u32 lo, hi, cq_ptr_size, cp_sts;
+	struct undefined_opcode_info *undef_opcode = &hdev->captured_err_info.undef_opcode;
 	u64 cq_ptr, cp_current_inst;
+	u32 lo, hi, cq_size, cp_sts;
 	bool is_arc_cq;
 
 	cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET);
@@ -7871,12 +7872,12 @@ static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base,
 		lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_STS_OFFSET);
 		hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_STS_OFFSET);
 		cq_ptr = ((u64) hi) << 32 | lo;
-		cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_STS_OFFSET);
+		cq_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_STS_OFFSET);
 	} else {
 		lo = RREG32(qman_base + QM_CQ_PTR_LO_STS_4_OFFSET);
 		hi = RREG32(qman_base + QM_CQ_PTR_HI_STS_4_OFFSET);
 		cq_ptr = ((u64) hi) << 32 | lo;
-		cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_STS_4_OFFSET);
+		cq_size = RREG32(qman_base + QM_CQ_TSIZE_STS_4_OFFSET);
 	}
 
 	lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
@@ -7885,12 +7886,16 @@ static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base,
 
 	dev_info(hdev->dev,
 		"LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#018llx}\n",
-		is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst);
+		is_arc_cq ? "ARC_" : "", cq_ptr, cq_size, cp_current_inst);
 
-	if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
-		hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
-		hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
-		hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
+	if (undef_opcode->write_enable) {
+		memset(undef_opcode, 0, sizeof(*undef_opcode));
+		undef_opcode->timestamp = ktime_get();
+		undef_opcode->cq_addr = cq_ptr;
+		undef_opcode->cq_size = cq_size;
+		undef_opcode->engine_id = engine_id;
+		undef_opcode->stream_id = QMAN_STREAMS;
+		undef_opcode->write_enable = 0;
 	}
 }
 
@@ -7929,19 +7934,12 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
 				error_count++;
 			}
 
-		/* check for undefined opcode */
-		if (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK) {
+		/* Check for undefined opcode error in lower QM */
+		if ((i == QMAN_STREAMS) &&
+				(glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK)) {
+			handle_lower_qman_data_on_err(hdev, qman_base,
+							gaudi2_queue_id_to_engine_id[qid_base]);
 			*event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
-			if (hdev->captured_err_info.undef_opcode.write_enable) {
-				memset(&hdev->captured_err_info.undef_opcode, 0,
-						sizeof(hdev->captured_err_info.undef_opcode));
-				hdev->captured_err_info.undef_opcode.timestamp = ktime_get();
-				hdev->captured_err_info.undef_opcode.engine_id =
-							gaudi2_queue_id_to_engine_id[qid_base];
-			}
-
-			if (i == QMAN_STREAMS)
-				handle_lower_qman_data_on_err(hdev, qman_base, *event_mask);
 		}
 	}
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-12-07 12:25 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-12-07 12:24 [PATCH 1/5] accel/habanalabs: report 3 instances of Infineon second stage Oded Gabbay
2023-12-07 12:24 ` [PATCH 2/5] accel/habanalabs/gaudi2: add zero padding when printing QM CP instruction Oded Gabbay
2023-12-07 12:24 ` [PATCH 3/5] accel/habanalabs: update debugfs-driver-habanalabs with the device-name directory Oded Gabbay
2023-12-07 12:24 ` [PATCH 4/5] accel/habanalabs: add parent_device sysfs attribute Oded Gabbay
2023-12-07 12:24 ` [PATCH 5/5] accel/habanalabs/gaudi2: avoid overriding existing undefined opcode data Oded Gabbay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox