* [RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-17 1:13 ` Wang, Yang(Kevin)
2025-01-17 5:03 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 02/15] drm/amdgpu: add a flag to track ras debugfs creation status Jiang Liu
` (14 subsequent siblings)
15 siblings, 2 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Add helper functions to track status for ras manager and ip blocks.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 38 +++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 +++++++
3 files changed, 85 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5e55a44f9eef..f0f773659faf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -377,12 +377,28 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block);
#define AMDGPU_MAX_IP_NUM 16
+enum amdgpu_marker {
+ // Markers for IRQs, used for both ip blocks and ras blocks.
+ AMDGPU_MARKER_IRQ0 = 32,
+ AMDGPU_MARKER_IRQ1,
+ AMDGPU_MARKER_IRQ2,
+ AMDGPU_MARKER_IRQ3,
+ AMDGPU_MARKER_IRQ4,
+ AMDGPU_MARKER_IRQ5,
+ AMDGPU_MARKER_IRQ6,
+ AMDGPU_MARKER_IRQ7,
+ AMDGPU_MARKER_IRQ_MAX = 63,
+};
+
+#define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx))
+
struct amdgpu_ip_block_status {
bool valid;
bool sw;
bool hw;
bool late_initialized;
bool hang;
+ uint64_t markers;
};
struct amdgpu_ip_block_version {
@@ -410,6 +426,28 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
const struct amdgpu_ip_block_version *ip_block_version);
+static inline void amdgpu_ip_block_set_marker(struct amdgpu_ip_block *ip_block,
+ enum amdgpu_marker marker)
+{
+ WARN_ON(marker > 63);
+ WARN_ON(ip_block->status.markers & (0x1ull << marker));
+ ip_block->status.markers |= 0x1ull << (int)marker;
+}
+
+static inline bool amdgpu_ip_block_test_and_clear_marker(struct amdgpu_ip_block *ip_block,
+ enum amdgpu_marker marker)
+{
+ bool set = false;
+ uint64_t value = 0x1ull << (int)marker;
+
+ if ((ip_block->status.markers & value) != 0) {
+ ip_block->status.markers &= ~value;
+ set = true;
+ }
+
+ return set;
+}
+
/*
* BIOS.
*/
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f0924aa3f4e4..5e19d820ab34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -5207,3 +5207,40 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
return con->is_rma;
}
+
+bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker)
+{
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+
+ if (obj && obj->markers & (0x1ull << marker))
+ return true;
+
+ return false;
+}
+
+void amdgpu_ras_set_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker)
+{
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+
+ WARN_ON(marker > 63);
+ WARN_ON(obj->markers & (0x1ull << marker));
+ if (obj)
+ obj->markers |= 0x1ull << marker;
+}
+
+bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker)
+{
+ bool set = false;
+ uint64_t value = 0x1ull << marker;
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+
+ if (obj && (obj->markers & value) != 0) {
+ obj->markers &= ~value;
+ set = true;
+ }
+
+ return set;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 82db986c36a0..35881087b17b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -634,6 +634,8 @@ struct ras_manager {
struct ras_common_if head;
/* reference count */
int use;
+ /* Flags for status tracking */
+ uint64_t markers;
/* ras block link */
struct list_head node;
/* the device */
@@ -977,4 +979,12 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
const char *fmt, ...);
bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
+
+bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker);
+void amdgpu_ras_set_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker);
+bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head,
+ int marker);
#endif
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* RE: [RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager
2025-01-13 1:42 ` [RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager Jiang Liu
@ 2025-01-17 1:13 ` Wang, Yang(Kevin)
2025-01-17 5:03 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Wang, Yang(Kevin) @ 2025-01-17 1:13 UTC (permalink / raw)
To: Jiang Liu, Deucher, Alexander, Koenig, Christian, Pan, Xinhui,
airlied@gmail.com, simona@ffwll.ch, Khatri, Sunil, Lazar, Lijo,
Zhang, Hawking, Limonciello, Mario, Chen, Xiaogang, Russell, Kent,
shuox.liu@linux.alibaba.com, amd-gfx@lists.freedesktop.org
[AMD Official Use Only - AMD Internal Distribution Only]
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Jiang Liu
Sent: Monday, January 13, 2025 09:42
To: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Pan, Xinhui <Xinhui.Pan@amd.com>; airlied@gmail.com; simona@ffwll.ch; Khatri, Sunil <Sunil.Khatri@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Limonciello, Mario <Mario.Limonciello@amd.com>; Chen, Xiaogang <Xiaogang.Chen@amd.com>; Russell, Kent <Kent.Russell@amd.com>; shuox.liu@linux.alibaba.com; amd-gfx@lists.freedesktop.org
Cc: Jiang Liu <gerry@linux.alibaba.com>
Subject: [RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager
Add helper functions to track status for ras manager and ip blocks.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 38 +++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 +++++++
3 files changed, 85 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5e55a44f9eef..f0f773659faf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -377,12 +377,28 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block);
#define AMDGPU_MAX_IP_NUM 16
+enum amdgpu_marker {
+ // Markers for IRQs, used for both ip blocks and ras blocks.
+ AMDGPU_MARKER_IRQ0 = 32,
+ AMDGPU_MARKER_IRQ1,
+ AMDGPU_MARKER_IRQ2,
+ AMDGPU_MARKER_IRQ3,
+ AMDGPU_MARKER_IRQ4,
+ AMDGPU_MARKER_IRQ5,
+ AMDGPU_MARKER_IRQ6,
+ AMDGPU_MARKER_IRQ7,
+ AMDGPU_MARKER_IRQ_MAX = 63,
+};
+
+#define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx))
+
struct amdgpu_ip_block_status {
bool valid;
bool sw;
bool hw;
bool late_initialized;
bool hang;
+ uint64_t markers;
};
struct amdgpu_ip_block_version {
@@ -410,6 +426,28 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
const struct amdgpu_ip_block_version *ip_block_version);
+static inline void amdgpu_ip_block_set_marker(struct amdgpu_ip_block *ip_block,
+ enum amdgpu_marker marker)
+{
+ WARN_ON(marker > 63);
+ WARN_ON(ip_block->status.markers & (0x1ull << marker));
+ ip_block->status.markers |= 0x1ull << (int)marker;
+}
+
+static inline bool amdgpu_ip_block_test_and_clear_marker(struct amdgpu_ip_block *ip_block,
+ enum amdgpu_marker marker)
+{
+ bool set = false;
+ uint64_t value = 0x1ull << (int)marker;
+
+ if ((ip_block->status.markers & value) != 0) {
+ ip_block->status.markers &= ~value;
+ set = true;
+ }
+
+ return set;
+}
+
/*
* BIOS.
*/
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f0924aa3f4e4..5e19d820ab34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -5207,3 +5207,40 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
return con->is_rma;
}
+
+bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker)
+{
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+
+ if (obj && obj->markers & (0x1ull << marker))
+ return true;
+
+ return false;
+}
+
+void amdgpu_ras_set_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker)
+{
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+
+ WARN_ON(marker > 63);
+ WARN_ON(obj->markers & (0x1ull << marker));
[kevin]:
It's best to check for null pointers before accessing member variables
Best Regards,
Kevin
+ if (obj)
+ obj->markers |= 0x1ull << marker;
+}
+
+bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker)
+{
+ bool set = false;
+ uint64_t value = 0x1ull << marker;
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+
+ if (obj && (obj->markers & value) != 0) {
+ obj->markers &= ~value;
+ set = true;
+ }
+
+ return set;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 82db986c36a0..35881087b17b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -634,6 +634,8 @@ struct ras_manager {
struct ras_common_if head;
/* reference count */
int use;
+ /* Flags for status tracking */
+ uint64_t markers;
/* ras block link */
struct list_head node;
/* the device */
@@ -977,4 +979,12 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
const char *fmt, ...);
bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
+
+bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker);
+void amdgpu_ras_set_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head, int marker);
+bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
+ struct ras_common_if *head,
+ int marker);
#endif
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager
2025-01-13 1:42 ` [RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager Jiang Liu
2025-01-17 1:13 ` Wang, Yang(Kevin)
@ 2025-01-17 5:03 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 5:03 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Add helper functions to track status for ras manager and ip blocks.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 38 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 +++++++
> 3 files changed, 85 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5e55a44f9eef..f0f773659faf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -377,12 +377,28 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block);
>
> #define AMDGPU_MAX_IP_NUM 16
>
> +enum amdgpu_marker {
> + // Markers for IRQs, used for both ip blocks and ras blocks.
> + AMDGPU_MARKER_IRQ0 = 32,
> + AMDGPU_MARKER_IRQ1,
> + AMDGPU_MARKER_IRQ2,
> + AMDGPU_MARKER_IRQ3,
> + AMDGPU_MARKER_IRQ4,
> + AMDGPU_MARKER_IRQ5,
> + AMDGPU_MARKER_IRQ6,
> + AMDGPU_MARKER_IRQ7,
> + AMDGPU_MARKER_IRQ_MAX = 63,
> +};
> +
> +#define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx))
> +
> struct amdgpu_ip_block_status {
> bool valid;
> bool sw;
> bool hw;
> bool late_initialized;
> bool hang;
> + uint64_t markers;
> };
>
These fine-grained levels maintained at the IP layer don't look like a
proper solution. Either an IP or RAS block has the required IRQs
enabled, or it has them disabled. Unwinding them needs to be tracked at
the IRQ object layer and not here.
Thanks,
Lijo
> struct amdgpu_ip_block_version {
> @@ -410,6 +426,28 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
> int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
> const struct amdgpu_ip_block_version *ip_block_version);
>
> +static inline void amdgpu_ip_block_set_marker(struct amdgpu_ip_block *ip_block,
> + enum amdgpu_marker marker)
> +{
> + WARN_ON(marker > 63);
> + WARN_ON(ip_block->status.markers & (0x1ull << marker));
> + ip_block->status.markers |= 0x1ull << (int)marker;
> +}
> +
> +static inline bool amdgpu_ip_block_test_and_clear_marker(struct amdgpu_ip_block *ip_block,
> + enum amdgpu_marker marker)
> +{
> + bool set = false;
> + uint64_t value = 0x1ull << (int)marker;
> +
> + if ((ip_block->status.markers & value) != 0) {
> + ip_block->status.markers &= ~value;
> + set = true;
> + }
> +
> + return set;
> +}
> +
> /*
> * BIOS.
> */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index f0924aa3f4e4..5e19d820ab34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -5207,3 +5207,40 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
>
> return con->is_rma;
> }
> +
> +bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker)
> +{
> + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> + if (obj && obj->markers & (0x1ull << marker))
> + return true;
> +
> + return false;
> +}
> +
> +void amdgpu_ras_set_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker)
> +{
> + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> + WARN_ON(marker > 63);
> + WARN_ON(obj->markers & (0x1ull << marker));
> + if (obj)
> + obj->markers |= 0x1ull << marker;
> +}
> +
> +bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker)
> +{
> + bool set = false;
> + uint64_t value = 0x1ull << marker;
> + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
> +
> + if (obj && (obj->markers & value) != 0) {
> + obj->markers &= ~value;
> + set = true;
> + }
> +
> + return set;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 82db986c36a0..35881087b17b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -634,6 +634,8 @@ struct ras_manager {
> struct ras_common_if head;
> /* reference count */
> int use;
> + /* Flags for status tracking */
> + uint64_t markers;
> /* ras block link */
> struct list_head node;
> /* the device */
> @@ -977,4 +979,12 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
> const char *fmt, ...);
>
> bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
> +
> +bool amdgpu_ras_test_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker);
> +void amdgpu_ras_set_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head, int marker);
> +bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev,
> + struct ras_common_if *head,
> + int marker);
> #endif
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 02/15] drm/amdgpu: add a flag to track ras debugfs creation status
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
2025-01-13 1:42 ` [RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-17 5:24 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 03/15] drm/amdgpu: free all resources on error recovery path of amdgpu_ras_init() Jiang Liu
` (13 subsequent siblings)
15 siblings, 1 reply; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Add a flag to track ras debugfs creation status, to avoid possible
incorrect reference count management for ras block object in function
amdgpu_ras_aca_is_supported().
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++--
2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f0f773659faf..09b63a622728 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -378,6 +378,8 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block);
#define AMDGPU_MAX_IP_NUM 16
enum amdgpu_marker {
+ // Markers for ras blocks.
+ AMDGPU_MARKER_RAS_DEBUGFS,
// Markers for IRQs, used for both ip blocks and ras blocks.
AMDGPU_MARKER_IRQ0 = 32,
AMDGPU_MARKER_IRQ1,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5e19d820ab34..c10ea3fd3e16 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1997,7 +1997,8 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
- if (!obj || !dir)
+ if (!obj || !dir ||
+ amdgpu_ras_test_marker(adev, &head->head, AMDGPU_MARKER_RAS_DEBUGFS))
return;
get_obj(obj);
@@ -2008,6 +2009,8 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
obj, &amdgpu_ras_debugfs_ops);
+
+ amdgpu_ras_set_marker(adev, &head->head, AMDGPU_MARKER_RAS_DEBUGFS);
}
static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
@@ -2136,7 +2139,9 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
if (IS_ENABLED(CONFIG_DEBUG_FS)) {
list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
- if (ip_obj)
+ if (ip_obj &&
+ amdgpu_ras_test_and_clear_marker(adev, &ip_obj->head,
+ AMDGPU_MARKER_RAS_DEBUGFS))
put_obj(ip_obj);
}
}
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 02/15] drm/amdgpu: add a flag to track ras debugfs creation status
2025-01-13 1:42 ` [RFC v2 02/15] drm/amdgpu: add a flag to track ras debugfs creation status Jiang Liu
@ 2025-01-17 5:24 ` Lazar, Lijo
0 siblings, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 5:24 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Add a flag to track ras debugfs creation status, to avoid possible
> incorrect reference count management for ras block object in function
> amdgpu_ras_aca_is_supported().
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++--
> 2 files changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f0f773659faf..09b63a622728 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -378,6 +378,8 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block);
> #define AMDGPU_MAX_IP_NUM 16
>
> enum amdgpu_marker {
> + // Markers for ras blocks.
> + AMDGPU_MARKER_RAS_DEBUGFS,
As mentioned in patch 1, keeping a global tracker at this layer is not
the right solution. The expectation is for each object/IP block to keep
its own state info.
Thanks,
Lijo
> // Markers for IRQs, used for both ip blocks and ras blocks.
> AMDGPU_MARKER_IRQ0 = 32,
> AMDGPU_MARKER_IRQ1,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5e19d820ab34..c10ea3fd3e16 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1997,7 +1997,8 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
> {
> struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
>
> - if (!obj || !dir)
> + if (!obj || !dir ||
> + amdgpu_ras_test_marker(adev, &head->head, AMDGPU_MARKER_RAS_DEBUGFS))
> return;
>
> get_obj(obj);
> @@ -2008,6 +2009,8 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
>
> debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
> obj, &amdgpu_ras_debugfs_ops);
> +
> + amdgpu_ras_set_marker(adev, &head->head, AMDGPU_MARKER_RAS_DEBUGFS);
> }
>
> static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev)
> @@ -2136,7 +2139,9 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
> if (IS_ENABLED(CONFIG_DEBUG_FS)) {
> list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
> ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
> - if (ip_obj)
> + if (ip_obj &&
> + amdgpu_ras_test_and_clear_marker(adev, &ip_obj->head,
> + AMDGPU_MARKER_RAS_DEBUGFS))
> put_obj(ip_obj);
> }
> }
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 03/15] drm/amdgpu: free all resources on error recovery path of amdgpu_ras_init()
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
2025-01-13 1:42 ` [RFC v2 01/15] drm/amdgpu: add helper functions to track status for ras manager Jiang Liu
2025-01-13 1:42 ` [RFC v2 02/15] drm/amdgpu: add a flag to track ras debugfs creation status Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-16 21:02 ` Mario Limonciello
2025-01-17 5:39 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 04/15] drm/amdgpu: introduce a flag to track refcount held for features Jiang Liu
` (12 subsequent siblings)
15 siblings, 2 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Free all allocated resources on error recovery path in function
amdgpu_ras_init().
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c10ea3fd3e16..6b508a9b1abe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3864,6 +3864,7 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_block_list *ras_node, *tmp;
int r;
if (con)
@@ -3953,20 +3954,20 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
* to handle fatal error */
r = amdgpu_nbio_ras_sw_init(adev);
if (r)
- return r;
+ goto release_con;
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_controller_interrupt) {
r = adev->nbio.ras->init_ras_controller_interrupt(adev);
if (r)
- goto release_con;
+ goto free_blocks;
}
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_err_event_athub_interrupt) {
r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
if (r)
- goto release_con;
+ goto free_blocks;
}
/* Packed socket_id to ras feature mask bits[31:29] */
@@ -3982,7 +3983,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
- goto release_con;
+ goto free_blocks;
}
if (amdgpu_ras_aca_is_supported(adev)) {
@@ -3991,7 +3992,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
else
r = amdgpu_mca_init(adev);
if (r)
- goto release_con;
+ goto clear_ras_fs;
}
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
@@ -3999,6 +4000,14 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
adev->ras_hw_enabled, adev->ras_enabled);
return 0;
+
+clear_ras_fs:
+ amdgpu_ras_fs_fini(adev);
+free_blocks:
+ list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
+ list_del(&ras_node->node);
+ kfree(ras_node);
+ }
release_con:
amdgpu_ras_set_context(adev, NULL);
kfree(con);
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 03/15] drm/amdgpu: free all resources on error recovery path of amdgpu_ras_init()
2025-01-13 1:42 ` [RFC v2 03/15] drm/amdgpu: free all resources on error recovery path of amdgpu_ras_init() Jiang Liu
@ 2025-01-16 21:02 ` Mario Limonciello
2025-01-17 5:39 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Mario Limonciello @ 2025-01-16 21:02 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, lijo.lazar, Hawking.Zhang,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/12/2025 19:42, Jiang Liu wrote:
> Free all allocated resources on error recovery path in function
> amdgpu_ras_init().
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 ++++++++++++++-----
> 1 file changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index c10ea3fd3e16..6b508a9b1abe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3864,6 +3864,7 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
> int amdgpu_ras_init(struct amdgpu_device *adev)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct amdgpu_ras_block_list *ras_node, *tmp;
> int r;
>
> if (con)
> @@ -3953,20 +3954,20 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
> * to handle fatal error */
> r = amdgpu_nbio_ras_sw_init(adev);
> if (r)
> - return r;
> + goto release_con;
>
> if (adev->nbio.ras &&
> adev->nbio.ras->init_ras_controller_interrupt) {
> r = adev->nbio.ras->init_ras_controller_interrupt(adev);
> if (r)
> - goto release_con;
> + goto free_blocks;
> }
>
> if (adev->nbio.ras &&
> adev->nbio.ras->init_ras_err_event_athub_interrupt) {
> r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
> if (r)
> - goto release_con;
> + goto free_blocks;
> }
>
> /* Packed socket_id to ras feature mask bits[31:29] */
> @@ -3982,7 +3983,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>
> if (amdgpu_ras_fs_init(adev)) {
> r = -EINVAL;
> - goto release_con;
> + goto free_blocks;
Reviewing this shows there is a mistake in amdgpu_ras_fs_init(). If
sysfs fails to init there it still returns 0.
Please modify amdgpu_ras_fs_init() to 'return r' and then use that
value. IE:
r = amdgpu_ras_fs_init(adev);
if (r)
goto free_blocks;
> }
>
> if (amdgpu_ras_aca_is_supported(adev)) {
> @@ -3991,7 +3992,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
> else
> r = amdgpu_mca_init(adev);
> if (r)
> - goto release_con;
> + goto clear_ras_fs;
> }
>
> dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
> @@ -3999,6 +4000,14 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
> adev->ras_hw_enabled, adev->ras_enabled);
>
> return 0;
> +
> +clear_ras_fs:
> + amdgpu_ras_fs_fini(adev);
> +free_blocks:
> + list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
> + list_del(&ras_node->node);
> + kfree(ras_node);
> + }
> release_con:
> amdgpu_ras_set_context(adev, NULL);
> kfree(con);
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [RFC v2 03/15] drm/amdgpu: free all resources on error recovery path of amdgpu_ras_init()
2025-01-13 1:42 ` [RFC v2 03/15] drm/amdgpu: free all resources on error recovery path of amdgpu_ras_init() Jiang Liu
2025-01-16 21:02 ` Mario Limonciello
@ 2025-01-17 5:39 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 5:39 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Free all allocated resources on error recovery path in function
> amdgpu_ras_init().
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 ++++++++++++++-----
> 1 file changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index c10ea3fd3e16..6b508a9b1abe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3864,6 +3864,7 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
> int amdgpu_ras_init(struct amdgpu_device *adev)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct amdgpu_ras_block_list *ras_node, *tmp;
> int r;
>
> if (con)
> @@ -3953,20 +3954,20 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
> * to handle fatal error */
> r = amdgpu_nbio_ras_sw_init(adev);
> if (r)
> - return r;
> + goto release_con;
>
> if (adev->nbio.ras &&
> adev->nbio.ras->init_ras_controller_interrupt) {
> r = adev->nbio.ras->init_ras_controller_interrupt(adev);
> if (r)
> - goto release_con;
> + goto free_blocks;
> }
>
> if (adev->nbio.ras &&
> adev->nbio.ras->init_ras_err_event_athub_interrupt) {
> r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
> if (r)
> - goto release_con;
> + goto free_blocks;
> }
>
> /* Packed socket_id to ras feature mask bits[31:29] */
> @@ -3982,7 +3983,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>
> if (amdgpu_ras_fs_init(adev)) {
> r = -EINVAL;
> - goto release_con;
> + goto free_blocks;
> }
>
> if (amdgpu_ras_aca_is_supported(adev)) {
> @@ -3991,7 +3992,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
> else
> r = amdgpu_mca_init(adev);
> if (r)
> - goto release_con;
> + goto clear_ras_fs;
> }
>
> dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
> @@ -3999,6 +4000,14 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
> adev->ras_hw_enabled, adev->ras_enabled);
>
> return 0;
> +
> +clear_ras_fs:
> + amdgpu_ras_fs_fini(adev);
> +free_blocks:
> + list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
> + list_del(&ras_node->node);
> + kfree(ras_node);
Suggest to add amdgpu_nbio_ras_sw_fini which calls something like
amdgpu_ras_unregister_ras_block instead of this.
Thanks,
Lijo
> + }
> release_con:
> amdgpu_ras_set_context(adev, NULL);
> kfree(con);
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 04/15] drm/amdgpu: introduce a flag to track refcount held for features
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (2 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 03/15] drm/amdgpu: free all resources on error recovery path of amdgpu_ras_init() Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-17 5:46 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 05/15] drm/amdgpu: enhance amdgpu_ras_block_late_fini() Jiang Liu
` (11 subsequent siblings)
15 siblings, 1 reply; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Currently we track the refcount on ras block object for features by
checking `if (obj && amdgpu_ras_is_feature_enabled(adev, head))`,
which is a little unreliable. So introduce a dedicated flag to track
the reference count.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++--
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 09b63a622728..24ef39b706e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -380,6 +380,7 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block);
enum amdgpu_marker {
// Markers for ras blocks.
AMDGPU_MARKER_RAS_DEBUGFS,
+ AMDGPU_MARKER_RAS_FEATURE,
// Markers for IRQs, used for both ip blocks and ras blocks.
AMDGPU_MARKER_IRQ0 = 32,
AMDGPU_MARKER_IRQ1,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6b508a9b1abe..f0cd14ff78a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -775,15 +775,20 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
obj = amdgpu_ras_create_obj(adev, head);
if (!obj)
return -EINVAL;
- } else {
+ amdgpu_ras_set_marker(adev, head, AMDGPU_MARKER_RAS_FEATURE);
+ } else if (!amdgpu_ras_test_marker(adev, head,
+ AMDGPU_MARKER_RAS_FEATURE)) {
/* In case we create obj somewhere else */
get_obj(obj);
+ amdgpu_ras_set_marker(adev, head, AMDGPU_MARKER_RAS_FEATURE);
}
con->features |= BIT(head->block);
} else {
if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
con->features &= ~BIT(head->block);
- put_obj(obj);
+ if (amdgpu_ras_test_and_clear_marker(adev, head,
+ AMDGPU_MARKER_RAS_FEATURE))
+ put_obj(obj);
}
}
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 04/15] drm/amdgpu: introduce a flag to track refcount held for features
2025-01-13 1:42 ` [RFC v2 04/15] drm/amdgpu: introduce a flag to track refcount held for features Jiang Liu
@ 2025-01-17 5:46 ` Lazar, Lijo
0 siblings, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 5:46 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Currently we track the refcount on ras block object for features by
> checking `if (obj && amdgpu_ras_is_feature_enabled(adev, head))`,
> which is a little unreliable. So introduce a dedicated flag to track
> the reference count.
>
Please clarify more on this. I see con->features available to track the
status.
Thanks,
Lijo
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++--
> 2 files changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 09b63a622728..24ef39b706e3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -380,6 +380,7 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block);
> enum amdgpu_marker {
> // Markers for ras blocks.
> AMDGPU_MARKER_RAS_DEBUGFS,
> + AMDGPU_MARKER_RAS_FEATURE,
> // Markers for IRQs, used for both ip blocks and ras blocks.
> AMDGPU_MARKER_IRQ0 = 32,
> AMDGPU_MARKER_IRQ1,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6b508a9b1abe..f0cd14ff78a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -775,15 +775,20 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> obj = amdgpu_ras_create_obj(adev, head);
> if (!obj)
> return -EINVAL;
> - } else {
> + amdgpu_ras_set_marker(adev, head, AMDGPU_MARKER_RAS_FEATURE);
> + } else if (!amdgpu_ras_test_marker(adev, head,
> + AMDGPU_MARKER_RAS_FEATURE)) {
> /* In case we create obj somewhere else */
> get_obj(obj);
> + amdgpu_ras_set_marker(adev, head, AMDGPU_MARKER_RAS_FEATURE);
> }
> con->features |= BIT(head->block);
> } else {
> if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
> con->features &= ~BIT(head->block);
> - put_obj(obj);
> + if (amdgpu_ras_test_and_clear_marker(adev, head,
> + AMDGPU_MARKER_RAS_FEATURE))
> + put_obj(obj);
> }
> }
>
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 05/15] drm/amdgpu: enhance amdgpu_ras_block_late_fini()
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (3 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 04/15] drm/amdgpu: introduce a flag to track refcount held for features Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-16 21:10 ` Mario Limonciello
2025-01-17 5:54 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 06/15] drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR Jiang Liu
` (10 subsequent siblings)
15 siblings, 2 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Enhance amdgpu_ras_block_late_fini() to revert what has been done
by amdgpu_ras_block_late_init(), and fix a possible resource leak
in amdgpu_ras_block_late_init().
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f0cd14ff78a7..7bbab7297c97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4104,13 +4104,13 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
ras_obj->hw_ops->query_ras_error_status)) {
r = amdgpu_ras_sysfs_create(adev, ras_block);
if (r)
- goto interrupt;
+ goto cleanup;
/* Those are the cached values at init.
*/
query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
if (!query_info)
- return -ENOMEM;
+ goto cleanup;
memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
@@ -4123,11 +4123,8 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
return 0;
-interrupt:
- if (ras_obj->ras_cb)
- amdgpu_ras_interrupt_remove_handler(adev, ras_block);
cleanup:
- amdgpu_ras_feature_enable(adev, ras_block, 0);
+ amdgpu_ras_block_late_fini(adev, ras_block);
return r;
}
@@ -4142,9 +4139,16 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
struct ras_common_if *ras_block)
{
struct amdgpu_ras_block_object *ras_obj;
+
if (!ras_block)
return;
+ amdgpu_ras_feature_enable(adev, ras_block, 0);
+
+ /* in resume/reset phase, no need to delete ras fs node */
+ if (adev->in_suspend || amdgpu_in_reset(adev))
+ return;
+
amdgpu_ras_sysfs_remove(adev, ras_block);
ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 05/15] drm/amdgpu: enhance amdgpu_ras_block_late_fini()
2025-01-13 1:42 ` [RFC v2 05/15] drm/amdgpu: enhance amdgpu_ras_block_late_fini() Jiang Liu
@ 2025-01-16 21:10 ` Mario Limonciello
2025-01-17 5:54 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Mario Limonciello @ 2025-01-16 21:10 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, lijo.lazar, Hawking.Zhang,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/12/2025 19:42, Jiang Liu wrote:
> Enhance amdgpu_ras_block_late_fini() to revert what has been done
> by amdgpu_ras_block_late_init(), and fix a possible resource leakage
> in function amdgpu_ras_block_late_init().
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++++------
> 1 file changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index f0cd14ff78a7..7bbab7297c97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4104,13 +4104,13 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
> ras_obj->hw_ops->query_ras_error_status)) {
> r = amdgpu_ras_sysfs_create(adev, ras_block);
> if (r)
> - goto interrupt;
> + goto cleanup;
>
> /* Those are the cached values at init.
> */
> query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
> if (!query_info)
> - return -ENOMEM;
> + goto cleanup;
AFAICT you still need to set "r = -ENOMEM" here for this error flow.
> memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
>
> if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
> @@ -4123,11 +4123,8 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
>
> return 0;
>
> -interrupt:
> - if (ras_obj->ras_cb)
> - amdgpu_ras_interrupt_remove_handler(adev, ras_block);
> cleanup:
> - amdgpu_ras_feature_enable(adev, ras_block, 0);
> + amdgpu_ras_block_late_fini(adev, ras_block);
> return r;
> }
>
> @@ -4142,9 +4139,16 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> struct ras_common_if *ras_block)
> {
> struct amdgpu_ras_block_object *ras_obj;
> +
> if (!ras_block)
> return;
>
> + amdgpu_ras_feature_enable(adev, ras_block, 0);
> +
> + /* in resume/reset phase, no need to delete ras fs node */
> + if (adev->in_suspend || amdgpu_in_reset(adev))
> + return;
> +
> amdgpu_ras_sysfs_remove(adev, ras_block);
>
> ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [RFC v2 05/15] drm/amdgpu: enhance amdgpu_ras_block_late_fini()
2025-01-13 1:42 ` [RFC v2 05/15] drm/amdgpu: enhance amdgpu_ras_block_late_fini() Jiang Liu
2025-01-16 21:10 ` Mario Limonciello
@ 2025-01-17 5:54 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 5:54 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Enhance amdgpu_ras_block_late_fini() to revert what has been done
> by amdgpu_ras_block_late_init(), and fix a possible resource leakage
> in function amdgpu_ras_block_late_init().
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++++------
> 1 file changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index f0cd14ff78a7..7bbab7297c97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4104,13 +4104,13 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
> ras_obj->hw_ops->query_ras_error_status)) {
> r = amdgpu_ras_sysfs_create(adev, ras_block);
> if (r)
> - goto interrupt;
> + goto cleanup;
>
> /* Those are the cached values at init.
> */
> query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
> if (!query_info)
> - return -ENOMEM;
> + goto cleanup;
> memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
>
> if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
> @@ -4123,11 +4123,8 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
>
> return 0;
>
> -interrupt:
> - if (ras_obj->ras_cb)
> - amdgpu_ras_interrupt_remove_handler(adev, ras_block);
> cleanup:
> - amdgpu_ras_feature_enable(adev, ras_block, 0);
> + amdgpu_ras_block_late_fini(adev, ras_block);
> return r;
> }
>
> @@ -4142,9 +4139,16 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> struct ras_common_if *ras_block)
> {
> struct amdgpu_ras_block_object *ras_obj;
> +
> if (!ras_block)
> return;
>
> + amdgpu_ras_feature_enable(adev, ras_block, 0);
> +
> + /* in resume/reset phase, no need to delete ras fs node */
> + if (adev->in_suspend || amdgpu_in_reset(adev))
> + return;
> +
I guess late_init will get called during resume or after reset. Won't it
go through the sysfs creation logic at that time?
Thanks,
Lijo
> amdgpu_ras_sysfs_remove(adev, ras_block);
>
> ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 06/15] drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (4 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 05/15] drm/amdgpu: enhance amdgpu_ras_block_late_fini() Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-16 21:19 ` Mario Limonciello
2025-01-17 6:09 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 07/15] drm/amdgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini() Jiang Liu
` (9 subsequent siblings)
15 siblings, 2 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Enhance amdgpu_ras_pre_fini() to better support suspend/resume by:
1) fixing a possible resource leak: amdgpu_release_ras_context() only
calls kfree(con) but doesn't release the resources associated with the
con object.
2) calling amdgpu_ras_pre_fini() in amdgpu_device_suspend() to undo what
has been done by amdgpu_ras_late_init(), because amdgpu_device_resume()
will invoke amdgpu_ras_late_init() on resume.
3) moving amdgpu_ras_recovery_fini() from amdgpu_ras_pre_fini() to
amdgpu_ras_fini().
4) moving the call of `obj->ras_fini()` from amdgpu_ras_fini() to
amdgpu_ras_pre_fini().
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++++++++---------
2 files changed, 31 insertions(+), 19 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0a121aab5c74..2bfe113e17c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4613,6 +4613,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return 0;
release_ras_con:
+ amdgpu_ras_pre_fini(adev);
+ amdgpu_ras_fini(adev);
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, true);
@@ -4627,8 +4629,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->virt.ops = NULL;
r = -EAGAIN;
}
- amdgpu_release_ras_context(adev);
-
failed:
amdgpu_vf_error_trans_all(adev);
@@ -4921,6 +4921,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
cancel_delayed_work_sync(&adev->delayed_init_work);
+ /* disable ras feature must before hw fini */
+ amdgpu_ras_pre_fini(adev);
amdgpu_ras_suspend(adev);
amdgpu_device_ip_suspend_phase1(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7bbab7297c97..5ac63f9cffda 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4270,42 +4270,49 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_block_list *node, *tmp;
+ struct amdgpu_ras_block_object *obj;
- if (!adev->ras_enabled || !con)
- return 0;
+ if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
+ goto disable;
+ list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
+ obj = node->ras_obj;
+ if (!obj)
+ continue;
+
+ if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
+ continue;
+
+ if (obj->ras_fini)
+ obj->ras_fini(adev, &obj->ras_comm);
+ else
+ amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
+ }
+disable:
/* Need disable ras on all IPs here before ip [hw/sw]fini */
- if (AMDGPU_RAS_GET_FEATURES(con->features))
+ if (con && AMDGPU_RAS_GET_FEATURES(con->features))
amdgpu_ras_disable_all_features(adev, 0);
- amdgpu_ras_recovery_fini(adev);
+
return 0;
}
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras_block_list *ras_node, *tmp;
- struct amdgpu_ras_block_object *obj = NULL;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
if (!adev->ras_enabled || !con)
- return 0;
+ goto out_free_context;
list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
- if (ras_node->ras_obj) {
- obj = ras_node->ras_obj;
- if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
- obj->ras_fini)
- obj->ras_fini(adev, &obj->ras_comm);
- else
- amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
- }
-
/* Clear ras blocks from ras_list and free ras block list node */
list_del(&ras_node->node);
kfree(ras_node);
}
+ amdgpu_ras_recovery_fini(adev);
amdgpu_ras_fs_fini(adev);
amdgpu_ras_interrupt_remove_all(adev);
@@ -4323,8 +4330,11 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
cancel_delayed_work_sync(&con->ras_counte_delay_work);
- amdgpu_ras_set_context(adev, NULL);
- kfree(con);
+out_free_context:
+ if (con) {
+ amdgpu_ras_set_context(adev, NULL);
+ kfree(con);
+ }
return 0;
}
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 06/15] drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR
2025-01-13 1:42 ` [RFC v2 06/15] drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR Jiang Liu
@ 2025-01-16 21:19 ` Mario Limonciello
2025-01-17 6:09 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Mario Limonciello @ 2025-01-16 21:19 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, lijo.lazar, Hawking.Zhang,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/12/2025 19:42, Jiang Liu wrote:
> Enhance amdgpu_ras_pre_fini() to better support suspend/resume by:
> 1) fix possible resource leakage. amdgpu_release_ras_context() only
> kfree(con) but doesn't release resources associated with the con
> object.
> 2) call amdgpu_ras_pre_fini() in amdgpu_device_suspend() to undo what
> has been done by amdgpu_ras_late_init(), because amdgpu_device_resume()
> will invoke amdgpu_ras_late_init() on resume.
> 3) move amdgpu_ras_recovery_fini() from amdgpu_ras_pre_fini() to
> amdgpu_ras_fini()
> 4) move calling of `obj->ras_fini()` from amdgpu_ras_fini() to
> amdgpu_ras_pre_fini().
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++++++++---------
> 2 files changed, 31 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0a121aab5c74..2bfe113e17c7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4613,6 +4613,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> return 0;
>
> release_ras_con:
> + amdgpu_ras_pre_fini(adev);
> + amdgpu_ras_fini(adev);
> if (amdgpu_sriov_vf(adev))
> amdgpu_virt_release_full_gpu(adev, true);
>
> @@ -4627,8 +4629,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> adev->virt.ops = NULL;
> r = -EAGAIN;
> }
> - amdgpu_release_ras_context(adev);
> -
> failed:
> amdgpu_vf_error_trans_all(adev);
>
> @@ -4921,6 +4921,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>
> cancel_delayed_work_sync(&adev->delayed_init_work);
>
> + /* disable ras feature must before hw fini */
> + amdgpu_ras_pre_fini(adev);
> amdgpu_ras_suspend(adev);
>
> amdgpu_device_ip_suspend_phase1(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 7bbab7297c97..5ac63f9cffda 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4270,42 +4270,49 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
> int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct amdgpu_ras_block_list *node, *tmp;
> + struct amdgpu_ras_block_object *obj;
>
> - if (!adev->ras_enabled || !con)
> - return 0;
> + if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
> + goto disable;
>
> + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> + obj = node->ras_obj;
> + if (!obj)
> + continue;
> +
> + if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
> + continue;
> +
> + if (obj->ras_fini)
> + obj->ras_fini(adev, &obj->ras_comm);
> + else
> + amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
> + }
>
> +disable:
> /* Need disable ras on all IPs here before ip [hw/sw]fini */
> - if (AMDGPU_RAS_GET_FEATURES(con->features))
> + if (con && AMDGPU_RAS_GET_FEATURES(con->features))
> amdgpu_ras_disable_all_features(adev, 0);
> - amdgpu_ras_recovery_fini(adev);
> +
> return 0;
> }
>
> int amdgpu_ras_fini(struct amdgpu_device *adev)
> {
> struct amdgpu_ras_block_list *ras_node, *tmp;
> - struct amdgpu_ras_block_object *obj = NULL;
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>
> if (!adev->ras_enabled || !con)
> - return 0;
> + goto out_free_context;
This block doesn't make sense for two reasons.
1) Why would you have a context if adev->ras_enabled was false?
2) If amdgpu_ras_get_context() returned NULL, then jumping to
out_free_context and checking con again doesn't make sense - you already
knew it was NULL.
>
> list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
> - if (ras_node->ras_obj) {
> - obj = ras_node->ras_obj;
> - if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
> - obj->ras_fini)
> - obj->ras_fini(adev, &obj->ras_comm);
> - else
> - amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
> - }
> -
> /* Clear ras blocks from ras_list and free ras block list node */
> list_del(&ras_node->node);
> kfree(ras_node);
> }
>
> + amdgpu_ras_recovery_fini(adev);
> amdgpu_ras_fs_fini(adev);
> amdgpu_ras_interrupt_remove_all(adev);
>
> @@ -4323,8 +4330,11 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>
> cancel_delayed_work_sync(&con->ras_counte_delay_work);
>
> - amdgpu_ras_set_context(adev, NULL);
> - kfree(con);
> +out_free_context:
> + if (con) {
> + amdgpu_ras_set_context(adev, NULL);
> + kfree(con);
> + }
>
> return 0;
> }
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [RFC v2 06/15] drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR
2025-01-13 1:42 ` [RFC v2 06/15] drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR Jiang Liu
2025-01-16 21:19 ` Mario Limonciello
@ 2025-01-17 6:09 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 6:09 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx, Tao Zhou
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Enhance amdgpu_ras_pre_fini() to better support suspend/resume by:
> 1) fix possible resource leakage. amdgpu_release_ras_context() only
> kfree(con) but doesn't release resources associated with the con
> object.
> 2) call amdgpu_ras_pre_fini() in amdgpu_device_suspend() to undo what
> has been done by amdgpu_ras_late_init(), because amdgpu_device_resume()
> will invoke amdgpu_ras_late_init() on resume.
> 3) move amdgpu_ras_recovery_fini() from amdgpu_ras_pre_fini() to
> amdgpu_ras_fini()
> 4) move calling of `obj->ras_fini()` from amdgpu_ras_fini() to
> amdgpu_ras_pre_fini().
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++++++++---------
> 2 files changed, 31 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0a121aab5c74..2bfe113e17c7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4613,6 +4613,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> return 0;
>
> release_ras_con:
> + amdgpu_ras_pre_fini(adev);
> + amdgpu_ras_fini(adev);
> if (amdgpu_sriov_vf(adev))
> amdgpu_virt_release_full_gpu(adev, true);
>
> @@ -4627,8 +4629,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> adev->virt.ops = NULL;
> r = -EAGAIN;
> }
> - amdgpu_release_ras_context(adev);
> -
> failed:
> amdgpu_vf_error_trans_all(adev);
>
> @@ -4921,6 +4921,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>
> cancel_delayed_work_sync(&adev->delayed_init_work);
>
> + /* disable ras feature must before hw fini */
> + amdgpu_ras_pre_fini(adev);
> amdgpu_ras_suspend(adev);
Based on the usages above, it makes more sense to keep
amdgpu_ras_pre_fini as a static function and call it from
ras_fini/ras_suspend (this contains the calls at the ras layer and avoids
adding another new public interface).
Copying Tao to take a look.
Thanks,
Lijo
>
> amdgpu_device_ip_suspend_phase1(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 7bbab7297c97..5ac63f9cffda 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4270,42 +4270,49 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
> int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct amdgpu_ras_block_list *node, *tmp;
> + struct amdgpu_ras_block_object *obj;
>
> - if (!adev->ras_enabled || !con)
> - return 0;
> + if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
> + goto disable;
>
> + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> + obj = node->ras_obj;
> + if (!obj)
> + continue;
> +
> + if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
> + continue;
> +
> + if (obj->ras_fini)
> + obj->ras_fini(adev, &obj->ras_comm);
> + else
> + amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
> + }
>
> +disable:
> /* Need disable ras on all IPs here before ip [hw/sw]fini */
> - if (AMDGPU_RAS_GET_FEATURES(con->features))
> + if (con && AMDGPU_RAS_GET_FEATURES(con->features))
> amdgpu_ras_disable_all_features(adev, 0);
> - amdgpu_ras_recovery_fini(adev);
> +
> return 0;
> }
>
> int amdgpu_ras_fini(struct amdgpu_device *adev)
> {
> struct amdgpu_ras_block_list *ras_node, *tmp;
> - struct amdgpu_ras_block_object *obj = NULL;
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>
> if (!adev->ras_enabled || !con)
> - return 0;
> + goto out_free_context;
>
> list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
> - if (ras_node->ras_obj) {
> - obj = ras_node->ras_obj;
> - if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
> - obj->ras_fini)
> - obj->ras_fini(adev, &obj->ras_comm);
> - else
> - amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
> - }
> -
> /* Clear ras blocks from ras_list and free ras block list node */
> list_del(&ras_node->node);
> kfree(ras_node);
> }
>
> + amdgpu_ras_recovery_fini(adev);
> amdgpu_ras_fs_fini(adev);
> amdgpu_ras_interrupt_remove_all(adev);
>
> @@ -4323,8 +4330,11 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>
> cancel_delayed_work_sync(&con->ras_counte_delay_work);
>
> - amdgpu_ras_set_context(adev, NULL);
> - kfree(con);
> +out_free_context:
> + if (con) {
> + amdgpu_ras_set_context(adev, NULL);
> + kfree(con);
> + }
>
> return 0;
> }
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 07/15] drm/amdgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini()
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (5 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 06/15] drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-16 21:25 ` Mario Limonciello
` (2 more replies)
2025-01-13 1:42 ` [RFC v2 08/15] drm/amdgpu: make IP block state machine works in stack like way Jiang Liu
` (8 subsequent siblings)
15 siblings, 3 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini() to keep the same
style as other code.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++--------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 +++---
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +-
drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +-
drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +-
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +-
14 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2bfe113e17c7..6cbd19ad0fa5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4613,7 +4613,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return 0;
release_ras_con:
- amdgpu_ras_pre_fini(adev);
+ amdgpu_ras_early_fini(adev);
amdgpu_ras_fini(adev);
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, true);
@@ -4705,7 +4705,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_xcp_cfg_sysfs_fini(adev);
/* disable ras feature must before hw fini */
- amdgpu_ras_pre_fini(adev);
+ amdgpu_ras_early_fini(adev);
amdgpu_ttm_set_buffer_funcs_status(adev, false);
@@ -4922,7 +4922,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
cancel_delayed_work_sync(&adev->delayed_init_work);
/* disable ras feature must before hw fini */
- amdgpu_ras_pre_fini(adev);
+ amdgpu_ras_early_fini(adev);
amdgpu_ras_suspend(adev);
amdgpu_device_ip_suspend_phase1(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 6d5d81f0dc4e..2e7c09530ec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -918,7 +918,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index b6d2eb049f54..80248930082c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -300,7 +300,7 @@ int amdgpu_jpeg_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index d085687a47ea..c75ce91f94ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -71,6 +71,6 @@ int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5ac63f9cffda..b11e3eb2b100 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4124,7 +4124,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
return 0;
cleanup:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
@@ -4135,7 +4135,7 @@ static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
}
/* helper function to remove ras fs node and interrupt handler */
-void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
+void amdgpu_ras_block_early_fini(struct amdgpu_device *adev,
struct ras_common_if *ras_block)
{
struct amdgpu_ras_block_object *ras_obj;
@@ -4156,10 +4156,10 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
amdgpu_ras_interrupt_remove_handler(adev, ras_block);
}
-static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
+static void amdgpu_ras_block_early_fini_default(struct amdgpu_device *adev,
struct ras_common_if *ras_block)
{
- return amdgpu_ras_block_late_fini(adev, ras_block);
+ return amdgpu_ras_block_early_fini(adev, ras_block);
}
/* do some init work after IP late init as dependence.
@@ -4267,7 +4267,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
}
/* do some fini work before IP fini as dependence */
-int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
+int amdgpu_ras_early_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct amdgpu_ras_block_list *node, *tmp;
@@ -4284,10 +4284,10 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
continue;
- if (obj->ras_fini)
- obj->ras_fini(adev, &obj->ras_comm);
+ if (obj->ras_early_fini)
+ obj->ras_early_fini(adev, &obj->ras_comm);
else
- amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
+ amdgpu_ras_block_early_fini_default(adev, &obj->ras_comm);
}
disable:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 35881087b17b..3a6f70b75e47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -711,7 +711,7 @@ struct amdgpu_ras_block_object {
int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
enum amdgpu_ras_block block, uint32_t sub_block_index);
int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
- void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
+ void (*ras_early_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
ras_ih_cb ras_cb;
const struct amdgpu_ras_block_hw_ops *hw_ops;
};
@@ -825,13 +825,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
/* called in ip_init and ip_fini */
int amdgpu_ras_init(struct amdgpu_device *adev);
int amdgpu_ras_late_init(struct amdgpu_device *adev);
+int amdgpu_ras_early_fini(struct amdgpu_device *adev);
int amdgpu_ras_fini(struct amdgpu_device *adev);
-int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block);
-void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
+void amdgpu_ras_block_early_fini(struct amdgpu_device *adev,
struct ras_common_if *ras_block);
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 174badca27e7..1a1834e47b50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -116,7 +116,7 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index eafe20d8fe0b..dd787f5f2f23 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -336,7 +336,7 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 83faf6e6788a..3ab80399d2ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1248,7 +1248,7 @@ int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 74b4349e345a..825c331f48f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1174,7 +1174,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 2ba185875baa..ce70acfbf22c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -5080,7 +5080,7 @@ static int gfx_v9_4_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 88f9771c1686..28bc2f946e91 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -1311,7 +1311,7 @@ static int jpeg_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_comm
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index e646e5cef0a2..467283165a3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -789,7 +789,7 @@ static int mmhub_v1_8_ras_late_init(struct amdgpu_device *adev, struct ras_commo
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index ecdc027f8220..063b3bafd134 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1996,7 +1996,7 @@ static int vcn_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 07/15] drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini()
2025-01-13 1:42 ` [RFC v2 07/15] drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini() Jiang Liu
@ 2025-01-16 21:25 ` Mario Limonciello
2025-01-17 1:19 ` Wang, Yang(Kevin)
2025-01-17 8:37 ` Lazar, Lijo
2 siblings, 0 replies; 39+ messages in thread
From: Mario Limonciello @ 2025-01-16 21:25 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, lijo.lazar, Hawking.Zhang,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/12/2025 19:42, Jiang Liu wrote:
> Rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini(), to keep same
> style with other code.
Besides amdgpu_ras_pre_fini() -> amdgpu_ras_early_fini(), you also
renamed amdgpu_ras_block_late_fini() -> amdgpu_ras_block_early_fini().
Is that really intended? If so, the commit message needs to be amended.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 +++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +-
> 14 files changed, 25 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2bfe113e17c7..6cbd19ad0fa5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4613,7 +4613,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> return 0;
>
> release_ras_con:
> - amdgpu_ras_pre_fini(adev);
> + amdgpu_ras_early_fini(adev);
> amdgpu_ras_fini(adev);
> if (amdgpu_sriov_vf(adev))
> amdgpu_virt_release_full_gpu(adev, true);
> @@ -4705,7 +4705,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
> amdgpu_xcp_cfg_sysfs_fini(adev);
>
> /* disable ras feature must before hw fini */
> - amdgpu_ras_pre_fini(adev);
> + amdgpu_ras_early_fini(adev);
>
> amdgpu_ttm_set_buffer_funcs_status(adev, false);
>
> @@ -4922,7 +4922,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
> cancel_delayed_work_sync(&adev->delayed_init_work);
>
> /* disable ras feature must before hw fini */
> - amdgpu_ras_pre_fini(adev);
> + amdgpu_ras_early_fini(adev);
> amdgpu_ras_suspend(adev);
>
> amdgpu_device_ip_suspend_phase1(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 6d5d81f0dc4e..2e7c09530ec1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -918,7 +918,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
>
> return 0;
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
> index b6d2eb049f54..80248930082c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
> @@ -300,7 +300,7 @@ int amdgpu_jpeg_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> index d085687a47ea..c75ce91f94ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> @@ -71,6 +71,6 @@ int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
>
> return 0;
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5ac63f9cffda..b11e3eb2b100 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4124,7 +4124,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
> return 0;
>
> cleanup:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> @@ -4135,7 +4135,7 @@ static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
> }
>
> /* helper function to remove ras fs node and interrupt handler */
> -void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> +void amdgpu_ras_block_early_fini(struct amdgpu_device *adev,
> struct ras_common_if *ras_block)
> {
> struct amdgpu_ras_block_object *ras_obj;
> @@ -4156,10 +4156,10 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> amdgpu_ras_interrupt_remove_handler(adev, ras_block);
> }
>
> -static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
> +static void amdgpu_ras_block_early_fini_default(struct amdgpu_device *adev,
> struct ras_common_if *ras_block)
> {
> - return amdgpu_ras_block_late_fini(adev, ras_block);
> + return amdgpu_ras_block_early_fini(adev, ras_block);
> }
>
> /* do some init work after IP late init as dependence.
> @@ -4267,7 +4267,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
> }
>
> /* do some fini work before IP fini as dependence */
> -int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> +int amdgpu_ras_early_fini(struct amdgpu_device *adev)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct amdgpu_ras_block_list *node, *tmp;
> @@ -4284,10 +4284,10 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
> continue;
>
> - if (obj->ras_fini)
> - obj->ras_fini(adev, &obj->ras_comm);
> + if (obj->ras_early_fini)
> + obj->ras_early_fini(adev, &obj->ras_comm);
> else
> - amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
> + amdgpu_ras_block_early_fini_default(adev, &obj->ras_comm);
> }
>
> disable:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 35881087b17b..3a6f70b75e47 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -711,7 +711,7 @@ struct amdgpu_ras_block_object {
> int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
> enum amdgpu_ras_block block, uint32_t sub_block_index);
> int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
> - void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
> + void (*ras_early_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
> ras_ih_cb ras_cb;
> const struct amdgpu_ras_block_hw_ops *hw_ops;
> };
> @@ -825,13 +825,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
> /* called in ip_init and ip_fini */
> int amdgpu_ras_init(struct amdgpu_device *adev);
> int amdgpu_ras_late_init(struct amdgpu_device *adev);
> +int amdgpu_ras_early_fini(struct amdgpu_device *adev);
> int amdgpu_ras_fini(struct amdgpu_device *adev);
> -int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
>
> int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
> struct ras_common_if *ras_block);
>
> -void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> +void amdgpu_ras_block_early_fini(struct amdgpu_device *adev,
> struct ras_common_if *ras_block);
>
> int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 174badca27e7..1a1834e47b50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -116,7 +116,7 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index eafe20d8fe0b..dd787f5f2f23 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -336,7 +336,7 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index 83faf6e6788a..3ab80399d2ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -1248,7 +1248,7 @@ int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 74b4349e345a..825c331f48f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -1174,7 +1174,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 2ba185875baa..ce70acfbf22c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -5080,7 +5080,7 @@ static int gfx_v9_4_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> index 88f9771c1686..28bc2f946e91 100644
> --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> @@ -1311,7 +1311,7 @@ static int jpeg_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_comm
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> index e646e5cef0a2..467283165a3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> @@ -789,7 +789,7 @@ static int mmhub_v1_8_ras_late_init(struct amdgpu_device *adev, struct ras_commo
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> index ecdc027f8220..063b3bafd134 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> @@ -1996,7 +1996,7 @@ static int vcn_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
^ permalink raw reply [flat|nested] 39+ messages in thread
* RE: [RFC v2 07/15] drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini()
2025-01-13 1:42 ` [RFC v2 07/15] drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini() Jiang Liu
2025-01-16 21:25 ` Mario Limonciello
@ 2025-01-17 1:19 ` Wang, Yang(Kevin)
2025-01-17 8:37 ` Lazar, Lijo
2 siblings, 0 replies; 39+ messages in thread
From: Wang, Yang(Kevin) @ 2025-01-17 1:19 UTC (permalink / raw)
To: Jiang Liu, Deucher, Alexander, Koenig, Christian, Pan, Xinhui,
airlied@gmail.com, simona@ffwll.ch, Khatri, Sunil, Lazar, Lijo,
Zhang, Hawking, Limonciello, Mario, Chen, Xiaogang, Russell, Kent,
shuox.liu@linux.alibaba.com, amd-gfx@lists.freedesktop.org
[AMD Official Use Only - AMD Internal Distribution Only]
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Jiang Liu
Sent: Monday, January 13, 2025 09:42
To: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Pan, Xinhui <Xinhui.Pan@amd.com>; airlied@gmail.com; simona@ffwll.ch; Khatri, Sunil <Sunil.Khatri@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Limonciello, Mario <Mario.Limonciello@amd.com>; Chen, Xiaogang <Xiaogang.Chen@amd.com>; Russell, Kent <Kent.Russell@amd.com>; shuox.liu@linux.alibaba.com; amd-gfx@lists.freedesktop.org
Cc: Jiang Liu <gerry@linux.alibaba.com>
Subject: [RFC v2 07/15] drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini()
Rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini(), to keep same style with other code.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++--------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 +++---
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +-
drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +-
drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +-
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +-
14 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2bfe113e17c7..6cbd19ad0fa5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4613,7 +4613,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return 0;
release_ras_con:
- amdgpu_ras_pre_fini(adev);
+ amdgpu_ras_early_fini(adev);
amdgpu_ras_fini(adev);
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, true);
@@ -4705,7 +4705,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_xcp_cfg_sysfs_fini(adev);
/* disable ras feature must before hw fini */
- amdgpu_ras_pre_fini(adev);
+ amdgpu_ras_early_fini(adev);
amdgpu_ttm_set_buffer_funcs_status(adev, false);
@@ -4922,7 +4922,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
cancel_delayed_work_sync(&adev->delayed_init_work);
/* disable ras feature must before hw fini */
- amdgpu_ras_pre_fini(adev);
+ amdgpu_ras_early_fini(adev);
amdgpu_ras_suspend(adev);
amdgpu_device_ip_suspend_phase1(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 6d5d81f0dc4e..2e7c09530ec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -918,7 +918,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
return 0;
late_fini:
[kevin]:
It would be better to rename the label above (late_fini) as well.
Best Regards,
Kevin
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index b6d2eb049f54..80248930082c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -300,7 +300,7 @@ int amdgpu_jpeg_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index d085687a47ea..c75ce91f94ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -71,6 +71,6 @@ int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5ac63f9cffda..b11e3eb2b100 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4124,7 +4124,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
return 0;
cleanup:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
@@ -4135,7 +4135,7 @@ static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
}
/* helper function to remove ras fs node and interrupt handler */
-void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
+void amdgpu_ras_block_early_fini(struct amdgpu_device *adev,
struct ras_common_if *ras_block)
{
struct amdgpu_ras_block_object *ras_obj;
@@ -4156,10 +4156,10 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
amdgpu_ras_interrupt_remove_handler(adev, ras_block);
}
-static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
+static void amdgpu_ras_block_early_fini_default(struct amdgpu_device *adev,
struct ras_common_if *ras_block)
{
- return amdgpu_ras_block_late_fini(adev, ras_block);
+ return amdgpu_ras_block_early_fini(adev, ras_block);
}
/* do some init work after IP late init as dependence.
@@ -4267,7 +4267,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
}
/* do some fini work before IP fini as dependence */
-int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
+int amdgpu_ras_early_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct amdgpu_ras_block_list *node, *tmp;
@@ -4284,10 +4284,10 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
continue;
- if (obj->ras_fini)
- obj->ras_fini(adev, &obj->ras_comm);
+ if (obj->ras_early_fini)
+ obj->ras_early_fini(adev, &obj->ras_comm);
else
- amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
+ amdgpu_ras_block_early_fini_default(adev, &obj->ras_comm);
}
disable:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 35881087b17b..3a6f70b75e47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -711,7 +711,7 @@ struct amdgpu_ras_block_object {
int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
enum amdgpu_ras_block block, uint32_t sub_block_index);
int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
- void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
+ void (*ras_early_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
ras_ih_cb ras_cb;
const struct amdgpu_ras_block_hw_ops *hw_ops;
};
@@ -825,13 +825,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
/* called in ip_init and ip_fini */
int amdgpu_ras_init(struct amdgpu_device *adev);
int amdgpu_ras_late_init(struct amdgpu_device *adev);
+int amdgpu_ras_early_fini(struct amdgpu_device *adev);
int amdgpu_ras_fini(struct amdgpu_device *adev);
-int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block);
-void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
+void amdgpu_ras_block_early_fini(struct amdgpu_device *adev,
struct ras_common_if *ras_block);
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 174badca27e7..1a1834e47b50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -116,7 +116,7 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index eafe20d8fe0b..dd787f5f2f23 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -336,7 +336,7 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 83faf6e6788a..3ab80399d2ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1248,7 +1248,7 @@ int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 74b4349e345a..825c331f48f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1174,7 +1174,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 2ba185875baa..ce70acfbf22c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -5080,7 +5080,7 @@ static int gfx_v9_4_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 88f9771c1686..28bc2f946e91 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -1311,7 +1311,7 @@ static int jpeg_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_comm
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
index e646e5cef0a2..467283165a3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
@@ -789,7 +789,7 @@ static int mmhub_v1_8_ras_late_init(struct amdgpu_device *adev, struct ras_commo
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index ecdc027f8220..063b3bafd134 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1996,7 +1996,7 @@ static int vcn_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
return 0;
late_fini:
- amdgpu_ras_block_late_fini(adev, ras_block);
+ amdgpu_ras_block_early_fini(adev, ras_block);
return r;
}
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 07/15] drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini()
2025-01-13 1:42 ` [RFC v2 07/15] drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini() Jiang Liu
2025-01-16 21:25 ` Mario Limonciello
2025-01-17 1:19 ` Wang, Yang(Kevin)
@ 2025-01-17 8:37 ` Lazar, Lijo
2 siblings, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 8:37 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini(), to keep same
> style with other code.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 +++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +-
> 14 files changed, 25 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2bfe113e17c7..6cbd19ad0fa5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4613,7 +4613,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> return 0;
>
> release_ras_con:
> - amdgpu_ras_pre_fini(adev);
> + amdgpu_ras_early_fini(adev);
> amdgpu_ras_fini(adev);
> if (amdgpu_sriov_vf(adev))
> amdgpu_virt_release_full_gpu(adev, true);
> @@ -4705,7 +4705,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
> amdgpu_xcp_cfg_sysfs_fini(adev);
>
> /* disable ras feature must before hw fini */
> - amdgpu_ras_pre_fini(adev);
> + amdgpu_ras_early_fini(adev);
>
> amdgpu_ttm_set_buffer_funcs_status(adev, false);
>
> @@ -4922,7 +4922,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
> cancel_delayed_work_sync(&adev->delayed_init_work);
>
> /* disable ras feature must before hw fini */
> - amdgpu_ras_pre_fini(adev);
> + amdgpu_ras_early_fini(adev);
> amdgpu_ras_suspend(adev);
>
> amdgpu_device_ip_suspend_phase1(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 6d5d81f0dc4e..2e7c09530ec1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -918,7 +918,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
>
> return 0;
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
> index b6d2eb049f54..80248930082c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
> @@ -300,7 +300,7 @@ int amdgpu_jpeg_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
This feels confusing - calling early_fini from late_init
Thanks,
Lijo
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> index d085687a47ea..c75ce91f94ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
> @@ -71,6 +71,6 @@ int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
>
> return 0;
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 5ac63f9cffda..b11e3eb2b100 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4124,7 +4124,7 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
> return 0;
>
> cleanup:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> @@ -4135,7 +4135,7 @@ static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
> }
>
> /* helper function to remove ras fs node and interrupt handler */
> -void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> +void amdgpu_ras_block_early_fini(struct amdgpu_device *adev,
> struct ras_common_if *ras_block)
> {
> struct amdgpu_ras_block_object *ras_obj;
> @@ -4156,10 +4156,10 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> amdgpu_ras_interrupt_remove_handler(adev, ras_block);
> }
>
> -static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
> +static void amdgpu_ras_block_early_fini_default(struct amdgpu_device *adev,
> struct ras_common_if *ras_block)
> {
> - return amdgpu_ras_block_late_fini(adev, ras_block);
> + return amdgpu_ras_block_early_fini(adev, ras_block);
> }
>
> /* do some init work after IP late init as dependence.
> @@ -4267,7 +4267,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
> }
>
> /* do some fini work before IP fini as dependence */
> -int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> +int amdgpu_ras_early_fini(struct amdgpu_device *adev)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct amdgpu_ras_block_list *node, *tmp;
> @@ -4284,10 +4284,10 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
> if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
> continue;
>
> - if (obj->ras_fini)
> - obj->ras_fini(adev, &obj->ras_comm);
> + if (obj->ras_early_fini)
> + obj->ras_early_fini(adev, &obj->ras_comm);
> else
> - amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
> + amdgpu_ras_block_early_fini_default(adev, &obj->ras_comm);
> }
>
> disable:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 35881087b17b..3a6f70b75e47 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -711,7 +711,7 @@ struct amdgpu_ras_block_object {
> int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj,
> enum amdgpu_ras_block block, uint32_t sub_block_index);
> int (*ras_late_init)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
> - void (*ras_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
> + void (*ras_early_fini)(struct amdgpu_device *adev, struct ras_common_if *ras_block);
> ras_ih_cb ras_cb;
> const struct amdgpu_ras_block_hw_ops *hw_ops;
> };
> @@ -825,13 +825,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
> /* called in ip_init and ip_fini */
> int amdgpu_ras_init(struct amdgpu_device *adev);
> int amdgpu_ras_late_init(struct amdgpu_device *adev);
> +int amdgpu_ras_early_fini(struct amdgpu_device *adev);
> int amdgpu_ras_fini(struct amdgpu_device *adev);
> -int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
>
> int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
> struct ras_common_if *ras_block);
>
> -void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> +void amdgpu_ras_block_early_fini(struct amdgpu_device *adev,
> struct ras_common_if *ras_block);
>
> int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 174badca27e7..1a1834e47b50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -116,7 +116,7 @@ int amdgpu_sdma_ras_late_init(struct amdgpu_device *adev,
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index eafe20d8fe0b..dd787f5f2f23 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -336,7 +336,7 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index 83faf6e6788a..3ab80399d2ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -1248,7 +1248,7 @@ int amdgpu_vcn_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 74b4349e345a..825c331f48f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -1174,7 +1174,7 @@ static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_comm
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 2ba185875baa..ce70acfbf22c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -5080,7 +5080,7 @@ static int gfx_v9_4_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> index 88f9771c1686..28bc2f946e91 100644
> --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
> @@ -1311,7 +1311,7 @@ static int jpeg_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_comm
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> index e646e5cef0a2..467283165a3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c
> @@ -789,7 +789,7 @@ static int mmhub_v1_8_ras_late_init(struct amdgpu_device *adev, struct ras_commo
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> index ecdc027f8220..063b3bafd134 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> @@ -1996,7 +1996,7 @@ static int vcn_v4_0_3_ras_late_init(struct amdgpu_device *adev, struct ras_commo
> return 0;
>
> late_fini:
> - amdgpu_ras_block_late_fini(adev, ras_block);
> + amdgpu_ras_block_early_fini(adev, ras_block);
>
> return r;
> }
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 08/15] drm/amdgpu: make IP block state machine works in stack like way
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (6 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 07/15] drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini() Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-17 8:45 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 09/15] drm/amdgpu_dm: enhance amdgpu_dm_early_fini() for PM ops Jiang Liu
` (7 subsequent siblings)
15 siblings, 1 reply; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
There are some mismatches between IP block state machine and its
associated status flags, especially about the meaning of
`status.late_initialized`. So let's make the state machine and
associated status flags work in a stack-like way as below:
Callback Status
early_init: valid = true
sw_init: sw = true
hw_init: hw = true
late_init: late_initialized = true
early_fini: late_initialized = false
hw_fini: hw = false
sw_fini: sw = false
late_fini: valid = false
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6cbd19ad0fa5..6b503fb7e366 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3376,6 +3376,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
}
+
+ adev->ip_blocks[i].status.late_initialized = false;
}
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
@@ -3445,15 +3447,14 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
}
}
adev->ip_blocks[i].status.sw = false;
- adev->ip_blocks[i].status.valid = false;
}
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!adev->ip_blocks[i].status.late_initialized)
+ if (!adev->ip_blocks[i].status.valid)
continue;
if (adev->ip_blocks[i].version->funcs->late_fini)
adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
- adev->ip_blocks[i].status.late_initialized = false;
+ adev->ip_blocks[i].status.valid = false;
}
amdgpu_ras_fini(adev);
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [RFC v2 08/15] drm/amdgpu: make IP block state machine works in stack like way
2025-01-13 1:42 ` [RFC v2 08/15] drm/amdgpu: make IP block state machine works in stack like way Jiang Liu
@ 2025-01-17 8:45 ` Lazar, Lijo
0 siblings, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 8:45 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> There are some mismatches between IP block state machine and its
> associated status flags, especially about the meaning of
> `status.late_initialized`. So let's make the state machine and
> associated status flags work in a stack-like way as below:
> Callback Status
> early_init: valid = true
> sw_init: sw = true
> hw_init: hw = true
> late_init: late_initialized = true
> early_fini: late_initialized = false
Changing the state like this is confusing. The intention of late_fini is
to reverse the steps in late_init. The current logic reads straightforwardly:
if the IP is not late_initialized, there is no need to late_fini. This change
makes that complicated.
Thanks,
Lijo
> hw_fini: hw = false
> sw_fini: sw = false
> late_fini: valid = false
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 6cbd19ad0fa5..6b503fb7e366 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3376,6 +3376,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
> DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
> adev->ip_blocks[i].version->funcs->name, r);
> }
> +
> + adev->ip_blocks[i].status.late_initialized = false;
> }
>
> amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> @@ -3445,15 +3447,14 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
> }
> }
> adev->ip_blocks[i].status.sw = false;
> - adev->ip_blocks[i].status.valid = false;
> }
>
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> - if (!adev->ip_blocks[i].status.late_initialized)
> + if (!adev->ip_blocks[i].status.valid)
> continue;
> if (adev->ip_blocks[i].version->funcs->late_fini)
> adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
> - adev->ip_blocks[i].status.late_initialized = false;
> + adev->ip_blocks[i].status.valid = false;
> }
>
> amdgpu_ras_fini(adev);
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 09/15] drm/amdgpu_dm: enhance amdgpu_dm_early_fini() for PM ops
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (7 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 08/15] drm/amdgpu: make IP block state machine works in stack like way Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-16 21:30 ` Mario Limonciello
2025-01-13 1:42 ` [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way Jiang Liu
` (6 subsequent siblings)
15 siblings, 1 reply; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Enhance amdgpu_dm_early_fini() so it can be called in power
management operations.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 9121abe0e5ef..7b900b293c0d 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -2175,6 +2175,9 @@ static int amdgpu_dm_early_fini(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
+ if (adev->in_s0ix || adev->in_s3 || adev->in_s4 || adev->in_suspend)
+ return 0;
+
amdgpu_dm_audio_fini(adev);
return 0;
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [RFC v2 09/15] drm/amdgpu_dm: enhance amdgpu_dm_early_fini() for PM ops
2025-01-13 1:42 ` [RFC v2 09/15] drm/amdgpu_dm: enhance amdgpu_dm_early_fini() for PM ops Jiang Liu
@ 2025-01-16 21:30 ` Mario Limonciello
0 siblings, 0 replies; 39+ messages in thread
From: Mario Limonciello @ 2025-01-16 21:30 UTC (permalink / raw)
To: Wentland, Harry, Jiang Liu, alexander.deucher, christian.koenig,
Xinhui.Pan, airlied, simona, sunil.khatri, lijo.lazar,
Hawking.Zhang, xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/12/2025 19:42, Jiang Liu wrote:
> Enhance amdgpu_dm_early_fini() so it can be called in power
> management operations.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 9121abe0e5ef..7b900b293c0d 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -2175,6 +2175,9 @@ static int amdgpu_dm_early_fini(struct amdgpu_ip_block *ip_block)
> {
> struct amdgpu_device *adev = ip_block->adev;
>
> + if (adev->in_s0ix || adev->in_s3 || adev->in_s4 || adev->in_suspend)
> + return 0;
> +
I'm a bit confused how this has even happened over suspend. The call
stack I see is:
amdgpu_pci_remove()
->amdgpu_driver_unload_kms()
->->amdgpu_device_fini_hw()
->->-> amdgpu_device_ip_fini_early()
->->->-> adev->ip_blocks[i].version->funcs->early_fini()
Is there another call stack I'm missing?
> amdgpu_dm_audio_fini(adev);
>
> return 0;
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (8 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 09/15] drm/amdgpu_dm: enhance amdgpu_dm_early_fini() for PM ops Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-13 22:27 ` Mario Limonciello
2025-01-17 8:54 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 11/15] drm/amdgpu: convert ip block bool flags into an enum Jiang Liu
` (5 subsequent siblings)
15 siblings, 2 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Make the device state machine work in a stack-like way to better support
suspend/resume by making the following changes:
1. amdgpu_driver_load_kms()
amdgpu_device_init()
amdgpu_device_ip_early_init()
ip_blocks[i].early_init()
ip_blocks[i].status.valid = true
amdgpu_device_ip_init()
amdgpu_ras_init()
ip_blocks[i].sw_init()
ip_blocks[i].status.sw = true
ip_blocks[i].hw_init()
ip_blocks[i].status.hw = true
amdgpu_device_ip_late_init()
ip_blocks[i].late_init()
ip_blocks[i].status.late_initialized = true
amdgpu_ras_late_init()
ras_blocks[i].ras_late_init()
amdgpu_ras_feature_enable_on_boot()
2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
amdgpu_device_suspend()
amdgpu_ras_early_fini()
ras_blocks[i].ras_early_fini()
amdgpu_ras_feature_disable()
amdgpu_ras_suspend()
amdgpu_ras_disable_all_features()
+++ ip_blocks[i].early_fini()
+++ ip_blocks[i].status.late_initialized = false
ip_blocks[i].suspend()
3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
amdgpu_device_resume()
amdgpu_device_ip_resume()
ip_blocks[i].resume()
amdgpu_device_ip_late_init()
ip_blocks[i].late_init()
ip_blocks[i].status.late_initialized = true
amdgpu_ras_late_init()
ras_blocks[i].ras_late_init()
amdgpu_ras_feature_enable_on_boot()
amdgpu_ras_resume()
amdgpu_ras_enable_all_features()
4. amdgpu_driver_unload_kms()
amdgpu_device_fini_hw()
amdgpu_ras_early_fini()
ras_blocks[i].ras_early_fini()
+++ ip_blocks[i].early_fini()
+++ ip_blocks[i].status.late_initialized = false
ip_blocks[i].hw_fini()
ip_blocks[i].status.hw = false
5. amdgpu_driver_release_kms()
amdgpu_device_fini_sw()
amdgpu_device_ip_fini()
ip_blocks[i].sw_fini()
ip_blocks[i].status.sw = false
--- ip_blocks[i].status.valid = false
+++ amdgpu_ras_fini()
ip_blocks[i].late_fini()
+++ ip_blocks[i].status.valid = false
--- ip_blocks[i].status.late_initialized = false
--- amdgpu_ras_fini()
The main changes include:
1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
2) set ip_blocks[i].status.late_initialized to false after calling
callback `early_fini`. We have audited all usages of the
late_initialized flag and no functional changes found.
3) only set ip_blocks[i].status.valid = false after calling the
`late_fini` callback.
4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.
One task remains: analyzing the GPU-reset-related state machine
transitions.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6b503fb7e366..c2e4057ecd82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3449,6 +3449,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
adev->ip_blocks[i].status.sw = false;
}
+ amdgpu_ras_fini(adev);
+
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.valid)
continue;
@@ -3457,8 +3459,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
adev->ip_blocks[i].status.valid = false;
}
- amdgpu_ras_fini(adev);
-
return 0;
}
@@ -3516,6 +3516,24 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
dev_warn(adev->dev, "Failed to disallow df cstate");
+ for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
+ if (!adev->ip_blocks[i].status.valid)
+ continue;
+ if (!adev->ip_blocks[i].status.late_initialized)
+ continue;
+
+ if (adev->ip_blocks[i].version->funcs->early_fini) {
+ r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
+ if (r) {
+ DRM_ERROR(" of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name, r);
+ return r;
+ }
+ }
+
+ adev->ip_blocks[i].status.late_initialized = false;
+ }
+
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.valid)
continue;
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way
2025-01-13 1:42 ` [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way Jiang Liu
@ 2025-01-13 22:27 ` Mario Limonciello
2025-01-14 1:58 ` Gerry Liu
2025-01-17 8:54 ` Lazar, Lijo
1 sibling, 1 reply; 39+ messages in thread
From: Mario Limonciello @ 2025-01-13 22:27 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, lijo.lazar, Hawking.Zhang,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/12/2025 19:42, Jiang Liu wrote:
> Make the device state machine work in stack like way to better support
> suspend/resume by following changes:
>
> 1. amdgpu_driver_load_kms()
> amdgpu_device_init()
> amdgpu_device_ip_early_init()
> ip_blocks[i].early_init()
> ip_blocks[i].status.valid = true
> amdgpu_device_ip_init()
> amdgpu_ras_init()
> ip_blocks[i].sw_init()
> ip_blocks[i].status.sw = true
> ip_blocks[i].hw_init()
> ip_blocks[i].status.hw = true
> amdgpu_device_ip_late_init()
> ip_blocks[i].late_init()
> ip_blocks[i].status.late_initialized = true
> amdgpu_ras_late_init()
> ras_blocks[i].ras_late_init()
> amdgpu_ras_feature_enable_on_boot()
>
> 2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
> amdgpu_device_suspend()
> amdgpu_ras_early_fini()
> ras_blocks[i].ras_early_fini()
> amdgpu_ras_feature_disable()
> amdgpu_ras_suspend()
> amdgpu_ras_disable_all_features()
> +++ ip_blocks[i].early_fini()
> +++ ip_blocks[i].status.late_initialized = false
> ip_blocks[i].suspend()
>
> 3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
> amdgpu_device_resume()
> amdgpu_device_ip_resume()
> ip_blocks[i].resume()
> amdgpu_device_ip_late_init()
> ip_blocks[i].late_init()
> ip_blocks[i].status.late_initialized = true
> amdgpu_ras_late_init()
> ras_blocks[i].ras_late_init()
> amdgpu_ras_feature_enable_on_boot()
> amdgpu_ras_resume()
> amdgpu_ras_enable_all_features()
>
> 4. amdgpu_driver_unload_kms()
> amdgpu_device_fini_hw()
> amdgpu_ras_early_fini()
> ras_blocks[i].ras_early_fini()
> +++ ip_blocks[i].early_fini()
> +++ ip_blocks[i].status.late_initialized = false
> ip_blocks[i].hw_fini()
> ip_blocks[i].status.hw = false
>
> 5. amdgpu_driver_release_kms()
> amdgpu_device_fini_sw()
> amdgpu_device_ip_fini()
> ip_blocks[i].sw_fini()
> ip_blocks[i].status.sw = false
> --- ip_blocks[i].status.valid = false
> +++ amdgpu_ras_fini()
> ip_blocks[i].late_fini()
> +++ ip_blocks[i].status.valid = false
> --- ip_blocks[i].status.late_initialized = false
> --- amdgpu_ras_fini()
>
> The main changes include:
> 1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
> 2) set ip_blocks[i].status.late_initialized to false after calling
> callback `early_fini`. We have audited all usages of the
> late_initialized flag and no functional changes found.
> 3) only set ip_blocks[i].status.valid = false after calling the
> `late_fini` callback.
> 4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.
>
> There's one more task left to analyze GPU reset related state machine
> transitions.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Ideally I think you should swap the order of patch 10 and 11, what do
you think?
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++--
> 1 file changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 6b503fb7e366..c2e4057ecd82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3449,6 +3449,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
> adev->ip_blocks[i].status.sw = false;
> }
>
> + amdgpu_ras_fini(adev);
> +
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> if (!adev->ip_blocks[i].status.valid)
> continue;
> @@ -3457,8 +3459,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
> adev->ip_blocks[i].status.valid = false;
> }
>
> - amdgpu_ras_fini(adev);
> -
> return 0;
> }
>
> @@ -3516,6 +3516,24 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
> if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> dev_warn(adev->dev, "Failed to disallow df cstate");
>
> + for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> + if (!adev->ip_blocks[i].status.valid)
> + continue;
> + if (!adev->ip_blocks[i].status.late_initialized)
> + continue;
> +
> + if (adev->ip_blocks[i].version->funcs->early_fini) {
> + r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
> + if (r) {
> + DRM_ERROR(" of IP block <%s> failed %d\n",
> + adev->ip_blocks[i].version->funcs->name, r);
> + return r;
> + }
> + }
> +
> + adev->ip_blocks[i].status.late_initialized = false;
> + }
> +
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> if (!adev->ip_blocks[i].status.valid)
> continue;
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way
2025-01-13 22:27 ` Mario Limonciello
@ 2025-01-14 1:58 ` Gerry Liu
2025-01-15 19:36 ` Mario Limonciello
0 siblings, 1 reply; 39+ messages in thread
From: Gerry Liu @ 2025-01-14 1:58 UTC (permalink / raw)
To: Mario Limonciello
Cc: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, xiaogang.chen,
Kent.Russell, shuox.liu, amd-gfx
> 2025年1月14日 06:27,Mario Limonciello <mario.limonciello@amd.com> 写道:
>
> On 1/12/2025 19:42, Jiang Liu wrote:
>> Make the device state machine work in stack like way to better support
>> suspend/resume by following changes:
>> 1. amdgpu_driver_load_kms()
>> amdgpu_device_init()
>> amdgpu_device_ip_early_init()
>> ip_blocks[i].early_init()
>> ip_blocks[i].status.valid = true
>> amdgpu_device_ip_init()
>> amdgpu_ras_init()
>> ip_blocks[i].sw_init()
>> ip_blocks[i].status.sw = true
>> ip_blocks[i].hw_init()
>> ip_blocks[i].status.hw = true
>> amdgpu_device_ip_late_init()
>> ip_blocks[i].late_init()
>> ip_blocks[i].status.late_initialized = true
>> amdgpu_ras_late_init()
>> ras_blocks[i].ras_late_init()
>> amdgpu_ras_feature_enable_on_boot()
>> 2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
>> amdgpu_device_suspend()
>> amdgpu_ras_early_fini()
>> ras_blocks[i].ras_early_fini()
>> amdgpu_ras_feature_disable()
>> amdgpu_ras_suspend()
>> amdgpu_ras_disable_all_features()
>> +++ ip_blocks[i].early_fini()
>> +++ ip_blocks[i].status.late_initialized = false
>> ip_blocks[i].suspend()
>> 3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
>> amdgpu_device_resume()
>> amdgpu_device_ip_resume()
>> ip_blocks[i].resume()
>> amdgpu_device_ip_late_init()
>> ip_blocks[i].late_init()
>> ip_blocks[i].status.late_initialized = true
>> amdgpu_ras_late_init()
>> ras_blocks[i].ras_late_init()
>> amdgpu_ras_feature_enable_on_boot()
>> amdgpu_ras_resume()
>> amdgpu_ras_enable_all_features()
>> 4. amdgpu_driver_unload_kms()
>> amdgpu_device_fini_hw()
>> amdgpu_ras_early_fini()
>> ras_blocks[i].ras_early_fini()
>> +++ ip_blocks[i].early_fini()
>> +++ ip_blocks[i].status.late_initialized = false
>> ip_blocks[i].hw_fini()
>> ip_blocks[i].status.hw = false
>> 5. amdgpu_driver_release_kms()
>> amdgpu_device_fini_sw()
>> amdgpu_device_ip_fini()
>> ip_blocks[i].sw_fini()
>> ip_blocks[i].status.sw = false
>> --- ip_blocks[i].status.valid = false
>> +++ amdgpu_ras_fini()
>> ip_blocks[i].late_fini()
>> +++ ip_blocks[i].status.valid = false
>> --- ip_blocks[i].status.late_initialized = false
>> --- amdgpu_ras_fini()
>> The main changes include:
>> 1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
>> 2) set ip_blocks[i].status.late_initialized to false after calling
>> callback `early_fini`. We have auditted all usages of the
>> late_initialized flag and no functional changes found.
>> 3) only set ip_blocks[i].status.valid = false after calling the
>> `late_fini` callback.
>> 4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.
>> There's one more task left to analyze GPU reset related state machine
>> transitions.
>> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
>
> Ideally I think you should swap the order of patch 10 and 11, what do you think?
I realized this when working on patch 11: many changes introduced by patch 10 are changed again by patch 11.
But swapping these patches will cause too much rework. How about folding these two patches instead?
>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++--
>> 1 file changed, 20 insertions(+), 2 deletions(-)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 6b503fb7e366..c2e4057ecd82 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3449,6 +3449,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
>> adev->ip_blocks[i].status.sw = false;
>> }
>> + amdgpu_ras_fini(adev);
>> +
>> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>> if (!adev->ip_blocks[i].status.valid)
>> continue;
>> @@ -3457,8 +3459,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
>> adev->ip_blocks[i].status.valid = false;
>> }
>> - amdgpu_ras_fini(adev);
>> -
>> return 0;
>> }
>> @@ -3516,6 +3516,24 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
>> if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
>> dev_warn(adev->dev, "Failed to disallow df cstate");
>> + for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>> + if (!adev->ip_blocks[i].status.valid)
>> + continue;
>> + if (!adev->ip_blocks[i].status.late_initialized)
>> + continue;
>> +
>> + if (adev->ip_blocks[i].version->funcs->early_fini) {
>> + r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
>> + if (r) {
>> + DRM_ERROR(" of IP block <%s> failed %d\n",
>> + adev->ip_blocks[i].version->funcs->name, r);
>> + return r;
>> + }
>> + }
>> +
>> + adev->ip_blocks[i].status.late_initialized = false;
>> + }
>> +
>> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>> if (!adev->ip_blocks[i].status.valid)
>> continue;
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way
2025-01-14 1:58 ` Gerry Liu
@ 2025-01-15 19:36 ` Mario Limonciello
0 siblings, 0 replies; 39+ messages in thread
From: Mario Limonciello @ 2025-01-15 19:36 UTC (permalink / raw)
To: Gerry Liu
Cc: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, xiaogang.chen,
Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 19:58, Gerry Liu wrote:
>
>
>> 2025年1月14日 06:27,Mario Limonciello <mario.limonciello@amd.com> 写道:
>>
>> On 1/12/2025 19:42, Jiang Liu wrote:
>>> Make the device state machine work in stack like way to better support
>>> suspend/resume by following changes:
>>> 1. amdgpu_driver_load_kms()
>>> amdgpu_device_init()
>>> amdgpu_device_ip_early_init()
>>> ip_blocks[i].early_init()
>>> ip_blocks[i].status.valid = true
>>> amdgpu_device_ip_init()
>>> amdgpu_ras_init()
>>> ip_blocks[i].sw_init()
>>> ip_blocks[i].status.sw = true
>>> ip_blocks[i].hw_init()
>>> ip_blocks[i].status.hw = true
>>> amdgpu_device_ip_late_init()
>>> ip_blocks[i].late_init()
>>> ip_blocks[i].status.late_initialized = true
>>> amdgpu_ras_late_init()
>>> ras_blocks[i].ras_late_init()
>>> amdgpu_ras_feature_enable_on_boot()
>>> 2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
>>> amdgpu_device_suspend()
>>> amdgpu_ras_early_fini()
>>> ras_blocks[i].ras_early_fini()
>>> amdgpu_ras_feature_disable()
>>> amdgpu_ras_suspend()
>>> amdgpu_ras_disable_all_features()
>>> +++ ip_blocks[i].early_fini()
>>> +++ ip_blocks[i].status.late_initialized = false
>>> ip_blocks[i].suspend()
>>> 3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
>>> amdgpu_device_resume()
>>> amdgpu_device_ip_resume()
>>> ip_blocks[i].resume()
>>> amdgpu_device_ip_late_init()
>>> ip_blocks[i].late_init()
>>> ip_blocks[i].status.late_initialized = true
>>> amdgpu_ras_late_init()
>>> ras_blocks[i].ras_late_init()
>>> amdgpu_ras_feature_enable_on_boot()
>>> amdgpu_ras_resume()
>>> amdgpu_ras_enable_all_features()
>>> 4. amdgpu_driver_unload_kms()
>>> amdgpu_device_fini_hw()
>>> amdgpu_ras_early_fini()
>>> ras_blocks[i].ras_early_fini()
>>> +++ ip_blocks[i].early_fini()
>>> +++ ip_blocks[i].status.late_initialized = false
>>> ip_blocks[i].hw_fini()
>>> ip_blocks[i].status.hw = false
>>> 5. amdgpu_driver_release_kms()
>>> amdgpu_device_fini_sw()
>>> amdgpu_device_ip_fini()
>>> ip_blocks[i].sw_fini()
>>> ip_blocks[i].status.sw = false
>>> --- ip_blocks[i].status.valid = false
>>> +++ amdgpu_ras_fini()
>>> ip_blocks[i].late_fini()
>>> +++ ip_blocks[i].status.valid = false
>>> --- ip_blocks[i].status.late_initialized = false
>>> --- amdgpu_ras_fini()
>>> The main changes include:
>>> 1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
>>> 2) set ip_blocks[i].status.late_initialized to false after calling
>>> callback `early_fini`. We have auditted all usages of the
>>> late_initialized flag and no functional changes found.
>>> 3) only set ip_blocks[i].status.valid = false after calling the
>>> `late_fini` callback.
>>> 4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.
>>> There's one more task left to analyze GPU reset related state machine
>>> transitions.
>>> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
>>
>> Ideally I think you should swap the order of patch 10 and 11, what do you think?
> I realized this when working patch 11, many changes introduced by patch 10 are changed again by patch 11.
> But swapping these patches will cause too much rework. How about folding these two patches instead?
It might be too big of a patch because it changes a lot all at the same time.
Re-ordering is painful but leads to more readable patches and less
ping-pong of code.
I think you can do something like this (I've done this myself on big
patch series):
1) squash the two patches together
git rebase -i HEAD~15
2) Spit it out as a patch file
git format-patch -1 $SHA
3) Check out the commit before this combined one
git checkout -b gerry/rebase $SHA~1
4) Apply the patch using patch -p1 (NOT git am)
patch -p1 < foo.patch
5) Use vscode (specifically) to stage the lines that should go into
patch 10.
6) Commit patch 10
7) Stage the lines that go into patch 11.
8) Commit patch 11
9) Note those two commit hashes
10) Go back to your original branch
git checkout gerry/original
11) Run a rebase again, but swap out the "squashed" hash with the two
hashes you made.
IE if your two hashes are aaaabbb and bbbccc and it's currently
pick abc123
change it to
pick aaaabbb
pick bbbccc
Then finish the rebase and it should swap them all out for you!
>
>
>>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++--
>>> 1 file changed, 20 insertions(+), 2 deletions(-)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 6b503fb7e366..c2e4057ecd82 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3449,6 +3449,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
>>> adev->ip_blocks[i].status.sw = false;
>>> }
>>> + amdgpu_ras_fini(adev);
>>> +
>>> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>>> if (!adev->ip_blocks[i].status.valid)
>>> continue;
>>> @@ -3457,8 +3459,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
>>> adev->ip_blocks[i].status.valid = false;
>>> }
>>> - amdgpu_ras_fini(adev);
>>> -
>>> return 0;
>>> }
>>> @@ -3516,6 +3516,24 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
>>> if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
>>> dev_warn(adev->dev, "Failed to disallow df cstate");
>>> + for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>>> + if (!adev->ip_blocks[i].status.valid)
>>> + continue;
>>> + if (!adev->ip_blocks[i].status.late_initialized)
>>> + continue;
>>> +
>>> + if (adev->ip_blocks[i].version->funcs->early_fini) {
>>> + r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
>>> + if (r) {
>>> + DRM_ERROR(" of IP block <%s> failed %d\n",
>>> + adev->ip_blocks[i].version->funcs->name, r);
>>> + return r;
>>> + }
>>> + }
>>> +
>>> + adev->ip_blocks[i].status.late_initialized = false;
>>> + }
>>> +
>>> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>>> if (!adev->ip_blocks[i].status.valid)
>>> continue;
>
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way
2025-01-13 1:42 ` [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way Jiang Liu
2025-01-13 22:27 ` Mario Limonciello
@ 2025-01-17 8:54 ` Lazar, Lijo
1 sibling, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 8:54 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Make the device state machine work in stack like way to better support
> suspend/resume by following changes:
>
> 1. amdgpu_driver_load_kms()
> amdgpu_device_init()
> amdgpu_device_ip_early_init()
> ip_blocks[i].early_init()
> ip_blocks[i].status.valid = true
> amdgpu_device_ip_init()
> amdgpu_ras_init()
> ip_blocks[i].sw_init()
> ip_blocks[i].status.sw = true
> ip_blocks[i].hw_init()
> ip_blocks[i].status.hw = true
> amdgpu_device_ip_late_init()
> ip_blocks[i].late_init()
> ip_blocks[i].status.late_initialized = true
> amdgpu_ras_late_init()
> ras_blocks[i].ras_late_init()
> amdgpu_ras_feature_enable_on_boot()
>
> 2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
> amdgpu_device_suspend()
> amdgpu_ras_early_fini()
> ras_blocks[i].ras_early_fini()
> amdgpu_ras_feature_disable()
> amdgpu_ras_suspend()
> amdgpu_ras_disable_all_features()
> +++ ip_blocks[i].early_fini()
> +++ ip_blocks[i].status.late_initialized = false
As said in the previous patch, please don't add confusion. You could
maintain a state machine like early fini done/late fini done etc., but
please don't introduce this kind of confusing thing.
Thanks,
Lijo
> ip_blocks[i].suspend()
>
> 3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
> amdgpu_device_resume()
> amdgpu_device_ip_resume()
> ip_blocks[i].resume()
> amdgpu_device_ip_late_init()
> ip_blocks[i].late_init()
> ip_blocks[i].status.late_initialized = true
> amdgpu_ras_late_init()
> ras_blocks[i].ras_late_init()
> amdgpu_ras_feature_enable_on_boot()
> amdgpu_ras_resume()
> amdgpu_ras_enable_all_features()
>
> 4. amdgpu_driver_unload_kms()
> amdgpu_device_fini_hw()
> amdgpu_ras_early_fini()
> ras_blocks[i].ras_early_fini()
> +++ ip_blocks[i].early_fini()
> +++ ip_blocks[i].status.late_initialized = false
> ip_blocks[i].hw_fini()
> ip_blocks[i].status.hw = false
>
> 5. amdgpu_driver_release_kms()
> amdgpu_device_fini_sw()
> amdgpu_device_ip_fini()
> ip_blocks[i].sw_fini()
> ip_blocks[i].status.sw = false
> --- ip_blocks[i].status.valid = false
> +++ amdgpu_ras_fini()
> ip_blocks[i].late_fini()
> +++ ip_blocks[i].status.valid = false
> --- ip_blocks[i].status.late_initialized = false
> --- amdgpu_ras_fini()
>
> The main changes include:
> 1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
> 2) set ip_blocks[i].status.late_initialized to false after calling
> callback `early_fini`. We have auditted all usages of the
> late_initialized flag and no functional changes found.
> 3) only set ip_blocks[i].status.valid = false after calling the
> `late_fini` callback.
> 4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.
>
> There's one more task left to analyze GPU reset related state machine
> transitions.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++--
> 1 file changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 6b503fb7e366..c2e4057ecd82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3449,6 +3449,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
> adev->ip_blocks[i].status.sw = false;
> }
>
> + amdgpu_ras_fini(adev);
> +
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> if (!adev->ip_blocks[i].status.valid)
> continue;
> @@ -3457,8 +3459,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
> adev->ip_blocks[i].status.valid = false;
> }
>
> - amdgpu_ras_fini(adev);
> -
> return 0;
> }
>
> @@ -3516,6 +3516,24 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
> if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> dev_warn(adev->dev, "Failed to disallow df cstate");
>
> + for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> + if (!adev->ip_blocks[i].status.valid)
> + continue;
> + if (!adev->ip_blocks[i].status.late_initialized)
> + continue;
> +
> + if (adev->ip_blocks[i].version->funcs->early_fini) {
> + r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
> + if (r) {
> + DRM_ERROR(" of IP block <%s> failed %d\n",
> + adev->ip_blocks[i].version->funcs->name, r);
> + return r;
> + }
> + }
> +
> + adev->ip_blocks[i].status.late_initialized = false;
> + }
> +
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> if (!adev->ip_blocks[i].status.valid)
> continue;
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 11/15] drm/amdgpu: convert ip block bool flags into an enum
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (9 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 10/15] drm/admgpu: make device state machine work in stack like way Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-17 8:57 ` Lazar, Lijo
2025-01-13 1:42 ` [RFC v2 12/15] drm/amdgpu: introduce IP block iterators to reduce duplicated code Jiang Liu
` (4 subsequent siblings)
15 siblings, 1 reply; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Convert ip block bool flags into an enumeration, to explicitly mark
current state of the ip block. Also introduce helper functions to
manipulate the ip block state. Now the state machine works as below:
Callbacks State after successfully executing the callback
AMDGPU_IP_STATE_INVALID
.early_init() AMDGPU_IP_STATE_EARLY
.sw_init() AMDGPU_IP_STATE_SW
.hw_init() AMDGPU_IP_STATE_HW
.late_init() AMDGPU_IP_STATE_LATE
.early_fini() AMDGPU_IP_STATE_HW
.hw_fini() AMDGPU_IP_STATE_SW
.sw_fini() AMDGPU_IP_STATE_EARLY
.late_fini() AMDGPU_IP_STATE_INVALID
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/aldebaran.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 48 ++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 144 ++++++++++++--------
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 4 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 6 +-
drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 2 +-
drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 2 +-
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 2 +-
9 files changed, 143 insertions(+), 73 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index e13fbd974141..b2bad8837b64 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -301,7 +301,7 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
return r;
}
}
- adev->ip_blocks[i].status.late_initialized = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
}
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 24ef39b706e3..f3275a281280 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -395,11 +395,32 @@ enum amdgpu_marker {
#define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx))
+/**
+ * States for ip block state machine.
+ *
+ * The IP block state machine has five states and the associated state
+ * transition works in stack like way as below:
+ * Callbacks State after successfully execute callback
+ * AMDGPU_IP_STATE_INVALID
+ * .early_init() AMDGPU_IP_STATE_EARLY
+ * .sw_init() AMDGPU_IP_STATE_SW
+ * .hw_init() AMDGPU_IP_STATE_HW
+ * .late_init() AMDGPU_IP_STATE_LATE
+ * .early_fini() AMDGPU_IP_STATE_HW
+ * .hw_fini() AMDGPU_IP_STATE_SW
+ * .sw_fini() AMDGPU_IP_STATE_EARLY
+ * .late_fini() AMDGPU_IP_STATE_INVALID
+ */
+enum amdgpu_device_ip_state {
+ AMDGPU_IP_STATE_INVALID = 0,
+ AMDGPU_IP_STATE_EARLY = 1,
+ AMDGPU_IP_STATE_SW = 2,
+ AMDGPU_IP_STATE_HW = 3,
+ AMDGPU_IP_STATE_LATE = 4,
+};
+
struct amdgpu_ip_block_status {
- bool valid;
- bool sw;
- bool hw;
- bool late_initialized;
+ enum amdgpu_device_ip_state state;
bool hang;
uint64_t markers;
};
@@ -429,6 +450,25 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
const struct amdgpu_ip_block_version *ip_block_version);
+void amdgpu_device_ip_set_state(struct amdgpu_device *adev, int index,
+ enum amdgpu_device_ip_state state);
+enum amdgpu_device_ip_state amdgpu_device_ip_state(struct amdgpu_device *adev,
+ int index);
+bool amdgpu_device_ip_valid(struct amdgpu_device *adev, int index);
+void amdgpu_ip_block_set_state(struct amdgpu_ip_block *ip_block,
+ enum amdgpu_device_ip_state state);
+
+static inline enum amdgpu_device_ip_state
+amdgpu_ip_block_state(struct amdgpu_ip_block *ip_block)
+{
+ return ip_block->status.state;
+}
+
+static inline bool amdgpu_ip_block_valid(struct amdgpu_ip_block *ip_block)
+{
+ return ip_block->status.state != AMDGPU_IP_STATE_INVALID;
+}
+
static inline void amdgpu_ip_block_set_marker(struct amdgpu_ip_block *ip_block,
enum amdgpu_marker marker)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c2e4057ecd82..fcfbdcfd1fa3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -297,7 +297,7 @@ int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
}
}
- ip_block->status.hw = false;
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_SW);
return 0;
}
@@ -315,7 +315,7 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
}
}
- ip_block->status.hw = true;
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
return 0;
}
@@ -2164,7 +2164,7 @@ int amdgpu_device_ip_set_clockgating_state(void *dev,
int i, r = 0;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].version->type != block_type)
continue;
@@ -2198,7 +2198,7 @@ int amdgpu_device_ip_set_powergating_state(void *dev,
int i, r = 0;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].version->type != block_type)
continue;
@@ -2230,7 +2230,7 @@ void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
int i;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
@@ -2252,7 +2252,7 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
int i, r;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].version->type == block_type) {
if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
@@ -2284,7 +2284,7 @@ bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
for (i = 0; i < adev->num_ip_blocks; i++) {
if (adev->ip_blocks[i].version->type == block_type)
- return adev->ip_blocks[i].status.valid;
+ return amdgpu_device_ip_valid(adev, i);
}
return false;
@@ -2375,6 +2375,29 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
return 0;
}
+void amdgpu_device_ip_set_state(struct amdgpu_device *adev, int index,
+ enum amdgpu_device_ip_state state)
+{
+ amdgpu_ip_block_set_state(&adev->ip_blocks[index], state);
+}
+
+enum amdgpu_device_ip_state amdgpu_device_ip_state(struct amdgpu_device *adev,
+ int index)
+{
+ return amdgpu_ip_block_state(&adev->ip_blocks[index]);
+}
+
+bool amdgpu_device_ip_valid(struct amdgpu_device *adev, int index)
+{
+ return amdgpu_ip_block_valid(&adev->ip_blocks[index]);
+}
+
+void amdgpu_ip_block_set_state(struct amdgpu_ip_block *ip_block,
+ enum amdgpu_device_ip_state state)
+{
+ ip_block->status.state = state;
+}
+
/**
* amdgpu_device_enable_virtual_display - enable virtual display feature
*
@@ -2671,20 +2694,21 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
DRM_WARN("disabled ip block: %d <%s>\n",
i, adev->ip_blocks[i].version->funcs->name);
- adev->ip_blocks[i].status.valid = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
} else if (ip_block->version->funcs->early_init) {
r = ip_block->version->funcs->early_init(ip_block);
if (r == -ENOENT) {
- adev->ip_blocks[i].status.valid = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
} else if (r) {
DRM_ERROR("early_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
total = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
} else {
- adev->ip_blocks[i].status.valid = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
}
} else {
- adev->ip_blocks[i].status.valid = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
}
/* get the vbios after the asic_funcs are set up */
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
@@ -2715,7 +2739,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
return -ENODEV;
ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
- if (ip_block->status.valid != false)
+ if (amdgpu_ip_block_valid(ip_block))
amdgpu_amdkfd_device_probe(adev);
adev->cg_flags &= amdgpu_cg_mask;
@@ -2729,9 +2753,9 @@ static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
int i, r;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.sw)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
continue;
- if (adev->ip_blocks[i].status.hw)
+ if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
continue;
if (!amdgpu_ip_member_of_hwini(
adev, adev->ip_blocks[i].version->type))
@@ -2745,7 +2769,7 @@ static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
adev->ip_blocks[i].version->funcs->name, r);
return r;
}
- adev->ip_blocks[i].status.hw = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
}
}
@@ -2757,9 +2781,9 @@ static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
int i, r;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.sw)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
continue;
- if (adev->ip_blocks[i].status.hw)
+ if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
continue;
if (!amdgpu_ip_member_of_hwini(
adev, adev->ip_blocks[i].version->type))
@@ -2770,7 +2794,7 @@ static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].version->funcs->name, r);
return r;
}
- adev->ip_blocks[i].status.hw = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
}
return 0;
@@ -2791,11 +2815,11 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
AMD_IP_BLOCK_TYPE_PSP))
break;
- if (!adev->ip_blocks[i].status.sw)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
continue;
/* no need to do the fw loading again if already done*/
- if (adev->ip_blocks[i].status.hw == true)
+ if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
break;
if (amdgpu_in_reset(adev) || adev->in_suspend) {
@@ -2809,7 +2833,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
adev->ip_blocks[i].version->funcs->name, r);
return r;
}
- adev->ip_blocks[i].status.hw = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
}
break;
}
@@ -2900,7 +2924,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
return r;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].version->funcs->sw_init) {
r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
@@ -2910,7 +2934,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
goto init_failed;
}
}
- adev->ip_blocks[i].status.sw = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
if (!amdgpu_ip_member_of_hwini(
adev, adev->ip_blocks[i].version->type))
@@ -2923,7 +2947,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
DRM_ERROR("hw_init %d failed %d\n", i, r);
goto init_failed;
}
- adev->ip_blocks[i].status.hw = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
/* need to do gmc hw init early so we can allocate gpu mem */
/* Try to reserve bad pages early */
@@ -2945,7 +2969,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
goto init_failed;
}
- adev->ip_blocks[i].status.hw = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
/* right after GMC hw init, we create CSA */
if (adev->gfx.mcbp) {
@@ -3130,7 +3154,7 @@ int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
for (j = 0; j < adev->num_ip_blocks; j++) {
i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
- if (!adev->ip_blocks[i].status.late_initialized)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_LATE)
continue;
/* skip CG for GFX, SDMA on S0ix */
if (adev->in_s0ix &&
@@ -3167,7 +3191,7 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
for (j = 0; j < adev->num_ip_blocks; j++) {
i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
- if (!adev->ip_blocks[i].status.late_initialized)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_LATE)
continue;
/* skip PG for GFX, SDMA on S0ix */
if (adev->in_s0ix &&
@@ -3246,7 +3270,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
int i = 0, r;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.hw)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
continue;
if (adev->ip_blocks[i].version->funcs->late_init) {
r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
@@ -3256,7 +3280,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
return r;
}
}
- adev->ip_blocks[i].status.late_initialized = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
}
r = amdgpu_ras_late_init(adev);
@@ -3336,7 +3360,7 @@ static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
}
}
- ip_block->status.hw = false;
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_SW);
}
/**
@@ -3354,7 +3378,7 @@ static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
return;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.hw)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
@@ -3377,7 +3401,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
adev->ip_blocks[i].version->funcs->name, r);
}
- adev->ip_blocks[i].status.late_initialized = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
}
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
@@ -3389,7 +3413,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
amdgpu_device_smu_fini_early(adev);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!adev->ip_blocks[i].status.hw)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
continue;
amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
@@ -3427,7 +3451,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_amdkfd_device_fini_sw(adev);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!adev->ip_blocks[i].status.sw)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
@@ -3446,17 +3470,17 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
adev->ip_blocks[i].version->funcs->name, r);
}
}
- adev->ip_blocks[i].status.sw = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
}
amdgpu_ras_fini(adev);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].version->funcs->late_fini)
adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
- adev->ip_blocks[i].status.valid = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
}
return 0;
@@ -3517,9 +3541,9 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
dev_warn(adev->dev, "Failed to disallow df cstate");
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
- if (!adev->ip_blocks[i].status.late_initialized)
+ if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_LATE)
continue;
if (adev->ip_blocks[i].version->funcs->early_fini) {
@@ -3531,11 +3555,11 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
}
}
- adev->ip_blocks[i].status.late_initialized = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
}
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
/* displays are handled separately */
@@ -3570,7 +3594,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
/* displays are handled in phase1 */
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
@@ -3578,7 +3602,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
/* PSP lost connection when err_event_athub occurs */
if (amdgpu_ras_intr_triggered() &&
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
- adev->ip_blocks[i].status.hw = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
continue;
}
@@ -3620,7 +3644,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
/* XXX handle errors */
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
- adev->ip_blocks[i].status.hw = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
/* handle putting the SMC in the appropriate state */
if (!amdgpu_sriov_vf(adev)) {
@@ -3687,12 +3711,12 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
struct amdgpu_ip_block *block;
block = &adev->ip_blocks[i];
- block->status.hw = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
if (block->version->type != ip_order[j] ||
- !block->status.valid)
+ !amdgpu_device_ip_valid(adev, i))
continue;
r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
@@ -3701,7 +3725,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
block->version->funcs->name);
return r;
}
- block->status.hw = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
}
}
@@ -3731,7 +3755,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
if (!block)
continue;
- if (block->status.valid && !block->status.hw) {
+ if (amdgpu_ip_block_valid(block) &&
+ amdgpu_ip_block_state(block) < AMDGPU_IP_STATE_HW) {
if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
r = amdgpu_ip_block_resume(block);
} else {
@@ -3743,7 +3768,7 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
block->version->funcs->name);
break;
}
- block->status.hw = true;
+ amdgpu_ip_block_set_state(block, AMDGPU_IP_STATE_HW);
}
}
@@ -3767,7 +3792,8 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
int i, r;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
+ if (!amdgpu_device_ip_valid(adev, i) ||
+ amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
@@ -3801,7 +3827,8 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
int i, r;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
+ if (!amdgpu_device_ip_valid(adev, i) ||
+ amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
@@ -3835,7 +3862,8 @@ static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
int i, r;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
+ if (!amdgpu_device_ip_valid(adev, i) ||
+ amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
@@ -4888,7 +4916,7 @@ int amdgpu_device_prepare(struct drm_device *dev)
flush_delayed_work(&adev->gfx.gfx_off_delay_work);
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
continue;
@@ -5090,7 +5118,7 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
return true;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].version->funcs->check_soft_reset)
adev->ip_blocks[i].status.hang =
@@ -5120,7 +5148,7 @@ static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
int i, r = 0;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].status.hang &&
adev->ip_blocks[i].version->funcs->pre_soft_reset) {
@@ -5150,7 +5178,7 @@ static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
return true;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
@@ -5182,7 +5210,7 @@ static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
int i, r = 0;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].status.hang &&
adev->ip_blocks[i].version->funcs->soft_reset) {
@@ -5211,7 +5239,7 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
int i, r = 0;
for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
if (adev->ip_blocks[i].status.hang &&
adev->ip_blocks[i].version->funcs->post_soft_reset)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index f908355df07c..33030a0bfef2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -515,7 +515,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
for (i = 0; i < adev->num_ip_blocks; i++)
if (adev->ip_blocks[i].version->type == type &&
- adev->ip_blocks[i].status.valid)
+ amdgpu_device_ip_valid(adev, i))
break;
if (i == adev->num_ip_blocks)
@@ -636,7 +636,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
type = amdgpu_ip_get_block_type(adev, info->query_hw_ip.type);
ip_block = amdgpu_device_ip_get_ip_block(adev, type);
- if (!ip_block || !ip_block->status.valid)
+ if (!ip_block || !amdgpu_ip_block_valid(ip_block))
return -EINVAL;
if (adev->xcp_mgr && adev->xcp_mgr->num_xcps > 0 &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a441dffca45b..26a0d9050dca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -3897,7 +3897,8 @@ static ssize_t psp_usbc_pd_fw_sysfs_read(struct device *dev,
int ret;
ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);
- if (!ip_block || !ip_block->status.late_initialized) {
+ if (!ip_block ||
+ amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE) {
dev_info(adev->dev, "PSP block is not ready yet\n.");
return -EBUSY;
}
@@ -3929,7 +3930,8 @@ static ssize_t psp_usbc_pd_fw_sysfs_write(struct device *dev,
struct amdgpu_ip_block *ip_block;
ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);
- if (!ip_block || !ip_block->status.late_initialized) {
+ if (!ip_block ||
+ amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE) {
dev_err(adev->dev, "PSP block is not ready yet.");
return -EBUSY;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index dabfbdf6f1ce..eb72dac61c83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -31,9 +31,9 @@ static int amdgpu_reset_xgmi_reset_on_init_suspend(struct amdgpu_device *adev)
int i;
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!adev->ip_blocks[i].status.valid)
+ if (!amdgpu_device_ip_valid(adev, i))
continue;
- if (!adev->ip_blocks[i].status.hw)
+ if (amdgpu_device_ip_state(adev, i) <= AMDGPU_IP_STATE_HW)
continue;
/* displays are handled in phase1 */
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
@@ -41,7 +41,7 @@ static int amdgpu_reset_xgmi_reset_on_init_suspend(struct amdgpu_device *adev)
/* XXX handle errors */
amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
- adev->ip_blocks[i].status.hw = false;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
}
/* VCN FW shared region is in frambuffer, there are some flags
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index 2594467bdd87..c9479a92a9df 100644
--- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -204,7 +204,7 @@ static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
return r;
}
}
- adev->ip_blocks[i].status.late_initialized = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
}
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
index 70569ea906bc..75dc5cb1e1ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
@@ -205,7 +205,7 @@ static int smu_v13_0_10_mode2_restore_ip(struct amdgpu_device *adev)
return r;
}
}
- adev->ip_blocks[i].status.late_initialized = true;
+ amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
}
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 8ca793c222ff..c75402d606c3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -230,7 +230,7 @@ static bool is_vcn_enabled(struct amdgpu_device *adev)
for (i = 0; i < adev->num_ip_blocks; i++) {
if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_VCN ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_JPEG) &&
- !adev->ip_blocks[i].status.valid)
+ !amdgpu_device_ip_valid(adev, i))
return false;
}
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [RFC v2 11/15] drm/amdgpu: convert ip block bool flags into an enum
2025-01-13 1:42 ` [RFC v2 11/15] drm/amdgpu: convert ip block bool flags into an enum Jiang Liu
@ 2025-01-17 8:57 ` Lazar, Lijo
0 siblings, 0 replies; 39+ messages in thread
From: Lazar, Lijo @ 2025-01-17 8:57 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Convert the ip block bool flags into an enumeration, to explicitly mark
> the current state of the ip block. Also introduce helper functions to
> manipulate the ip block state. The state machine now works as follows:
> Callbacks State after the callback executes successfully
> AMDGPU_IP_STATE_INVALID
> .early_init() AMDGPU_IP_STATE_EARLY
> .sw_init() AMDGPU_IP_STATE_SW
> .hw_init() AMDGPU_IP_STATE_HW
> .late_init() AMDGPU_IP_STATE_LATE
> .early_fini() AMDGPU_IP_STATE_HW
> .hw_fini() AMDGPU_IP_STATE_SW
> .sw_fini() AMDGPU_IP_STATE_EARLY
> .late_fini() AMDGPU_IP_STATE_INVALID
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> ---
> drivers/gpu/drm/amd/amdgpu/aldebaran.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 48 ++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 144 ++++++++++++--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 4 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 6 +-
> drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 2 +-
> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 2 +-
> 9 files changed, 143 insertions(+), 73 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index e13fbd974141..b2bad8837b64 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -301,7 +301,7 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
> return r;
> }
> }
> - adev->ip_blocks[i].status.late_initialized = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
> }
>
> amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 24ef39b706e3..f3275a281280 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -395,11 +395,32 @@ enum amdgpu_marker {
>
> #define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx))
>
> +/**
> + * States for ip block state machine.
> + *
> + * The IP block state machine has five states and the associated state
> + * transition works in stack like way as below:
> + * Callbacks State after successfully execute callback
> + * AMDGPU_IP_STATE_INVALID
> + * .early_init() AMDGPU_IP_STATE_EARLY
> + * .sw_init() AMDGPU_IP_STATE_SW
> + * .hw_init() AMDGPU_IP_STATE_HW
> + * .late_init() AMDGPU_IP_STATE_LATE
> + * .early_fini() AMDGPU_IP_STATE_HW
> + * .hw_fini() AMDGPU_IP_STATE_SW
> + * .sw_fini() AMDGPU_IP_STATE_EARLY
> + * .late_fini() AMDGPU_IP_STATE_INVALID
> + */
> +enum amdgpu_device_ip_state {
> + AMDGPU_IP_STATE_INVALID = 0,
> + AMDGPU_IP_STATE_EARLY = 1,
> + AMDGPU_IP_STATE_SW = 2,
> + AMDGPU_IP_STATE_HW = 3,
> + AMDGPU_IP_STATE_LATE = 4,
I would suggest adding separate INIT/FINI stages for each state, e.g.
EARLY_INIT_DONE/EARLY_FINI_DONE. That would make the code easier to read.
Thanks,
Lijo
> +};
> +
> struct amdgpu_ip_block_status {
> - bool valid;
> - bool sw;
> - bool hw;
> - bool late_initialized;
> + enum amdgpu_device_ip_state state;
> bool hang;
> uint64_t markers;
> };
> @@ -429,6 +450,25 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
> int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
> const struct amdgpu_ip_block_version *ip_block_version);
>
> +void amdgpu_device_ip_set_state(struct amdgpu_device *adev, int index,
> + enum amdgpu_device_ip_state state);
> +enum amdgpu_device_ip_state amdgpu_device_ip_state(struct amdgpu_device *adev,
> + int index);
> +bool amdgpu_device_ip_valid(struct amdgpu_device *adev, int index);
> +void amdgpu_ip_block_set_state(struct amdgpu_ip_block *ip_block,
> + enum amdgpu_device_ip_state state);
> +
> +static inline enum amdgpu_device_ip_state
> +amdgpu_ip_block_state(struct amdgpu_ip_block *ip_block)
> +{
> + return ip_block->status.state;
> +}
> +
> +static inline bool amdgpu_ip_block_valid(struct amdgpu_ip_block *ip_block)
> +{
> + return ip_block->status.state != AMDGPU_IP_STATE_INVALID;
> +}
> +
> static inline void amdgpu_ip_block_set_marker(struct amdgpu_ip_block *ip_block,
> enum amdgpu_marker marker)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index c2e4057ecd82..fcfbdcfd1fa3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -297,7 +297,7 @@ int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
> }
> }
>
> - ip_block->status.hw = false;
> + amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_SW);
> return 0;
> }
>
> @@ -315,7 +315,7 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
> }
> }
>
> - ip_block->status.hw = true;
> + amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
> return 0;
> }
>
> @@ -2164,7 +2164,7 @@ int amdgpu_device_ip_set_clockgating_state(void *dev,
> int i, r = 0;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].version->type != block_type)
> continue;
> @@ -2198,7 +2198,7 @@ int amdgpu_device_ip_set_powergating_state(void *dev,
> int i, r = 0;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].version->type != block_type)
> continue;
> @@ -2230,7 +2230,7 @@ void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
> int i;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
> adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
> @@ -2252,7 +2252,7 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
> int i, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].version->type == block_type) {
> if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
> @@ -2284,7 +2284,7 @@ bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> if (adev->ip_blocks[i].version->type == block_type)
> - return adev->ip_blocks[i].status.valid;
> + return amdgpu_device_ip_valid(adev, i);
> }
> return false;
>
> @@ -2375,6 +2375,29 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
> return 0;
> }
>
> +void amdgpu_device_ip_set_state(struct amdgpu_device *adev, int index,
> + enum amdgpu_device_ip_state state)
> +{
> + amdgpu_ip_block_set_state(&adev->ip_blocks[index], state);
> +}
> +
> +enum amdgpu_device_ip_state amdgpu_device_ip_state(struct amdgpu_device *adev,
> + int index)
> +{
> + return amdgpu_ip_block_state(&adev->ip_blocks[index]);
> +}
> +
> +bool amdgpu_device_ip_valid(struct amdgpu_device *adev, int index)
> +{
> + return amdgpu_ip_block_valid(&adev->ip_blocks[index]);
> +}
> +
> +void amdgpu_ip_block_set_state(struct amdgpu_ip_block *ip_block,
> + enum amdgpu_device_ip_state state)
> +{
> + ip_block->status.state = state;
> +}
> +
> /**
> * amdgpu_device_enable_virtual_display - enable virtual display feature
> *
> @@ -2671,20 +2694,21 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
> if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
> DRM_WARN("disabled ip block: %d <%s>\n",
> i, adev->ip_blocks[i].version->funcs->name);
> - adev->ip_blocks[i].status.valid = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
> } else if (ip_block->version->funcs->early_init) {
> r = ip_block->version->funcs->early_init(ip_block);
> if (r == -ENOENT) {
> - adev->ip_blocks[i].status.valid = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
> } else if (r) {
> DRM_ERROR("early_init of IP block <%s> failed %d\n",
> adev->ip_blocks[i].version->funcs->name, r);
> total = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
> } else {
> - adev->ip_blocks[i].status.valid = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
> }
> } else {
> - adev->ip_blocks[i].status.valid = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
> }
> /* get the vbios after the asic_funcs are set up */
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
> @@ -2715,7 +2739,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
> return -ENODEV;
>
> ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
> - if (ip_block->status.valid != false)
> + if (amdgpu_ip_block_valid(ip_block))
> amdgpu_amdkfd_device_probe(adev);
>
> adev->cg_flags &= amdgpu_cg_mask;
> @@ -2729,9 +2753,9 @@ static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
> int i, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.sw)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
> continue;
> - if (adev->ip_blocks[i].status.hw)
> + if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
> continue;
> if (!amdgpu_ip_member_of_hwini(
> adev, adev->ip_blocks[i].version->type))
> @@ -2745,7 +2769,7 @@ static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
> adev->ip_blocks[i].version->funcs->name, r);
> return r;
> }
> - adev->ip_blocks[i].status.hw = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
> }
> }
>
> @@ -2757,9 +2781,9 @@ static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
> int i, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.sw)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
> continue;
> - if (adev->ip_blocks[i].status.hw)
> + if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
> continue;
> if (!amdgpu_ip_member_of_hwini(
> adev, adev->ip_blocks[i].version->type))
> @@ -2770,7 +2794,7 @@ static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
> adev->ip_blocks[i].version->funcs->name, r);
> return r;
> }
> - adev->ip_blocks[i].status.hw = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
> }
>
> return 0;
> @@ -2791,11 +2815,11 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
> AMD_IP_BLOCK_TYPE_PSP))
> break;
>
> - if (!adev->ip_blocks[i].status.sw)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
> continue;
>
> /* no need to do the fw loading again if already done*/
> - if (adev->ip_blocks[i].status.hw == true)
> + if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
> break;
>
> if (amdgpu_in_reset(adev) || adev->in_suspend) {
> @@ -2809,7 +2833,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
> adev->ip_blocks[i].version->funcs->name, r);
> return r;
> }
> - adev->ip_blocks[i].status.hw = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
> }
> break;
> }
> @@ -2900,7 +2924,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
> return r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].version->funcs->sw_init) {
> r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
> @@ -2910,7 +2934,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
> goto init_failed;
> }
> }
> - adev->ip_blocks[i].status.sw = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
>
> if (!amdgpu_ip_member_of_hwini(
> adev, adev->ip_blocks[i].version->type))
> @@ -2923,7 +2947,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
> DRM_ERROR("hw_init %d failed %d\n", i, r);
> goto init_failed;
> }
> - adev->ip_blocks[i].status.hw = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
> } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
> /* need to do gmc hw init early so we can allocate gpu mem */
> /* Try to reserve bad pages early */
> @@ -2945,7 +2969,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
> DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
> goto init_failed;
> }
> - adev->ip_blocks[i].status.hw = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
>
> /* right after GMC hw init, we create CSA */
> if (adev->gfx.mcbp) {
> @@ -3130,7 +3154,7 @@ int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
>
> for (j = 0; j < adev->num_ip_blocks; j++) {
> i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
> - if (!adev->ip_blocks[i].status.late_initialized)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_LATE)
> continue;
> /* skip CG for GFX, SDMA on S0ix */
> if (adev->in_s0ix &&
> @@ -3167,7 +3191,7 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
>
> for (j = 0; j < adev->num_ip_blocks; j++) {
> i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
> - if (!adev->ip_blocks[i].status.late_initialized)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_LATE)
> continue;
> /* skip PG for GFX, SDMA on S0ix */
> if (adev->in_s0ix &&
> @@ -3246,7 +3270,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
> int i = 0, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.hw)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
> continue;
> if (adev->ip_blocks[i].version->funcs->late_init) {
> r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
> @@ -3256,7 +3280,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
> return r;
> }
> }
> - adev->ip_blocks[i].status.late_initialized = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
> }
>
> r = amdgpu_ras_late_init(adev);
> @@ -3336,7 +3360,7 @@ static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
> }
> }
>
> - ip_block->status.hw = false;
> + amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_SW);
> }
>
> /**
> @@ -3354,7 +3378,7 @@ static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
> return;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.hw)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
> continue;
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
> amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
> @@ -3377,7 +3401,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
> adev->ip_blocks[i].version->funcs->name, r);
> }
>
> - adev->ip_blocks[i].status.late_initialized = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
> }
>
> amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> @@ -3389,7 +3413,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
> amdgpu_device_smu_fini_early(adev);
>
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> - if (!adev->ip_blocks[i].status.hw)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
> continue;
>
> amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
> @@ -3427,7 +3451,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
> amdgpu_amdkfd_device_fini_sw(adev);
>
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> - if (!adev->ip_blocks[i].status.sw)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
> continue;
>
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
> @@ -3446,17 +3470,17 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
> adev->ip_blocks[i].version->funcs->name, r);
> }
> }
> - adev->ip_blocks[i].status.sw = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
> }
>
> amdgpu_ras_fini(adev);
>
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].version->funcs->late_fini)
> adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
> - adev->ip_blocks[i].status.valid = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
> }
>
> return 0;
> @@ -3517,9 +3541,9 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
> dev_warn(adev->dev, "Failed to disallow df cstate");
>
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> - if (!adev->ip_blocks[i].status.late_initialized)
> + if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_LATE)
> continue;
>
> if (adev->ip_blocks[i].version->funcs->early_fini) {
> @@ -3531,11 +3555,11 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
> }
> }
>
> - adev->ip_blocks[i].status.late_initialized = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
> }
>
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
>
> /* displays are handled separately */
> @@ -3570,7 +3594,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
> amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
>
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> /* displays are handled in phase1 */
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
> @@ -3578,7 +3602,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
> /* PSP lost connection when err_event_athub occurs */
> if (amdgpu_ras_intr_triggered() &&
> adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
> - adev->ip_blocks[i].status.hw = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
> continue;
> }
>
> @@ -3620,7 +3644,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
>
> /* XXX handle errors */
> r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
> - adev->ip_blocks[i].status.hw = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
>
> /* handle putting the SMC in the appropriate state */
> if (!amdgpu_sriov_vf(adev)) {
> @@ -3687,12 +3711,12 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
> struct amdgpu_ip_block *block;
>
> block = &adev->ip_blocks[i];
> - block->status.hw = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
>
> for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
>
> if (block->version->type != ip_order[j] ||
> - !block->status.valid)
> + !amdgpu_device_ip_valid(adev, i))
> continue;
>
> r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
> @@ -3701,7 +3725,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
> block->version->funcs->name);
> return r;
> }
> - block->status.hw = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
> }
> }
>
> @@ -3731,7 +3755,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
> if (!block)
> continue;
>
> - if (block->status.valid && !block->status.hw) {
> + if (amdgpu_ip_block_valid(block) &&
> + amdgpu_ip_block_state(block) < AMDGPU_IP_STATE_HW) {
> if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
> r = amdgpu_ip_block_resume(block);
> } else {
> @@ -3743,7 +3768,7 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
> block->version->funcs->name);
> break;
> }
> - block->status.hw = true;
> + amdgpu_ip_block_set_state(block, AMDGPU_IP_STATE_HW);
> }
> }
>
> @@ -3767,7 +3792,8 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
> int i, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
> + if (!amdgpu_device_ip_valid(adev, i) ||
> + amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
> continue;
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> @@ -3801,7 +3827,8 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
> int i, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
> + if (!amdgpu_device_ip_valid(adev, i) ||
> + amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
> continue;
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
> adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
> @@ -3835,7 +3862,8 @@ static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
> int i, r;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
> + if (!amdgpu_device_ip_valid(adev, i) ||
> + amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
> continue;
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
> r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
> @@ -4888,7 +4916,7 @@ int amdgpu_device_prepare(struct drm_device *dev)
> flush_delayed_work(&adev->gfx.gfx_off_delay_work);
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
> continue;
> @@ -5090,7 +5118,7 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
> return true;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].version->funcs->check_soft_reset)
> adev->ip_blocks[i].status.hang =
> @@ -5120,7 +5148,7 @@ static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
> int i, r = 0;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].status.hang &&
> adev->ip_blocks[i].version->funcs->pre_soft_reset) {
> @@ -5150,7 +5178,7 @@ static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
> return true;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
> (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
> @@ -5182,7 +5210,7 @@ static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
> int i, r = 0;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].status.hang &&
> adev->ip_blocks[i].version->funcs->soft_reset) {
> @@ -5211,7 +5239,7 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
> int i, r = 0;
>
> for (i = 0; i < adev->num_ip_blocks; i++) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> if (adev->ip_blocks[i].status.hang &&
> adev->ip_blocks[i].version->funcs->post_soft_reset)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index f908355df07c..33030a0bfef2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -515,7 +515,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
>
> for (i = 0; i < adev->num_ip_blocks; i++)
> if (adev->ip_blocks[i].version->type == type &&
> - adev->ip_blocks[i].status.valid)
> + amdgpu_device_ip_valid(adev, i))
> break;
>
> if (i == adev->num_ip_blocks)
> @@ -636,7 +636,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> type = amdgpu_ip_get_block_type(adev, info->query_hw_ip.type);
> ip_block = amdgpu_device_ip_get_ip_block(adev, type);
>
> - if (!ip_block || !ip_block->status.valid)
> + if (!ip_block || !amdgpu_ip_block_valid(ip_block))
> return -EINVAL;
>
> if (adev->xcp_mgr && adev->xcp_mgr->num_xcps > 0 &&
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index a441dffca45b..26a0d9050dca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -3897,7 +3897,8 @@ static ssize_t psp_usbc_pd_fw_sysfs_read(struct device *dev,
> int ret;
>
> ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);
> - if (!ip_block || !ip_block->status.late_initialized) {
> + if (!ip_block ||
> + amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE) {
> dev_info(adev->dev, "PSP block is not ready yet\n.");
> return -EBUSY;
> }
> @@ -3929,7 +3930,8 @@ static ssize_t psp_usbc_pd_fw_sysfs_write(struct device *dev,
> struct amdgpu_ip_block *ip_block;
>
> ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);
> - if (!ip_block || !ip_block->status.late_initialized) {
> + if (!ip_block ||
> + amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE) {
> dev_err(adev->dev, "PSP block is not ready yet.");
> return -EBUSY;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index dabfbdf6f1ce..eb72dac61c83 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -31,9 +31,9 @@ static int amdgpu_reset_xgmi_reset_on_init_suspend(struct amdgpu_device *adev)
> int i;
>
> for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> - if (!adev->ip_blocks[i].status.valid)
> + if (!amdgpu_device_ip_valid(adev, i))
> continue;
> - if (!adev->ip_blocks[i].status.hw)
> + if (amdgpu_device_ip_state(adev, i) <= AMDGPU_IP_STATE_HW)
> continue;
> /* displays are handled in phase1 */
> if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
> @@ -41,7 +41,7 @@ static int amdgpu_reset_xgmi_reset_on_init_suspend(struct amdgpu_device *adev)
>
> /* XXX handle errors */
> amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
> - adev->ip_blocks[i].status.hw = false;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
> }
>
> /* VCN FW shared region is in frambuffer, there are some flags
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> index 2594467bdd87..c9479a92a9df 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -204,7 +204,7 @@ static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
> return r;
> }
> }
> - adev->ip_blocks[i].status.late_initialized = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
> }
>
> amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
> index 70569ea906bc..75dc5cb1e1ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
> @@ -205,7 +205,7 @@ static int smu_v13_0_10_mode2_restore_ip(struct amdgpu_device *adev)
> return r;
> }
> }
> - adev->ip_blocks[i].status.late_initialized = true;
> + amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
> }
>
> amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 8ca793c222ff..c75402d606c3 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -230,7 +230,7 @@ static bool is_vcn_enabled(struct amdgpu_device *adev)
> for (i = 0; i < adev->num_ip_blocks; i++) {
> if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_VCN ||
> adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_JPEG) &&
> - !adev->ip_blocks[i].status.valid)
> + !amdgpu_device_ip_valid(adev, i))
> return false;
> }
>
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 12/15] drm/amdgpu: introduce IP block iterators to reduce duplicated code
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (10 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 11/15] drm/amdgpu: convert ip block bool flags into an enum Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-13 1:42 ` [RFC v2 13/15] drm/amdgpu: walk IP blocks in reverse order when shutdown Jiang Liu
` (3 subsequent siblings)
15 siblings, 0 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Introduce the following IP block iterators to reduce duplicated code:
- amdgpu_for_each_ip_block
- amdgpu_for_each_ip_block_reverse
- amdgpu_for_each_ip_block_valid
- amdgpu_for_each_ip_block_valid_reverse
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/aldebaran.c | 46 +-
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 30 +-
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 3 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 479 ++++++++----------
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 10 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 16 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 14 +-
drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 50 +-
drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 51 +-
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +-
11 files changed, 337 insertions(+), 378 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index b2bad8837b64..9fcce99544d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -73,19 +73,18 @@ aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
{
- int r, i;
+ int r;
+ struct amdgpu_ip_block *ip_block;
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ amdgpu_for_each_ip_block_reverse(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
- r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_suspend(ip_block);
if (r)
return r;
}
@@ -201,7 +200,7 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
{
struct amdgpu_firmware_info *ucode_list[AMDGPU_UCODE_ID_MAXIMUM];
struct amdgpu_firmware_info *ucode;
- struct amdgpu_ip_block *cmn_block;
+ struct amdgpu_ip_block *cmn_block, *ip_block;
int ucode_count = 0;
int i, r;
@@ -269,39 +268,32 @@ static int aldebaran_mode2_restore_ip(struct amdgpu_device *adev)
return r;
}
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
- r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_resume(ip_block);
if (r)
return r;
}
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_COMMON))
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_COMMON))
continue;
- if (adev->ip_blocks[i].version->funcs->late_init) {
- r = adev->ip_blocks[i].version->funcs->late_init(
- &adev->ip_blocks[i]);
+ if (ip_block->version->funcs->late_init) {
+ r = ip_block->version->funcs->late_init(ip_block);
if (r) {
dev_err(adev->dev,
"late_init of IP block <%s> failed %d after reset\n",
- adev->ip_blocks[i].version->funcs->name,
- r);
+ ip_block->version->funcs->name, r);
return r;
}
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_LATE);
}
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f3275a281280..8707ca566ea7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -450,11 +450,9 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
const struct amdgpu_ip_block_version *ip_block_version);
-void amdgpu_device_ip_set_state(struct amdgpu_device *adev, int index,
- enum amdgpu_device_ip_state state);
-enum amdgpu_device_ip_state amdgpu_device_ip_state(struct amdgpu_device *adev,
- int index);
-bool amdgpu_device_ip_valid(struct amdgpu_device *adev, int index);
+enum amdgpu_device_ip_state
+amdgpu_device_ip_state(struct amdgpu_device *adev, int index);
+
void amdgpu_ip_block_set_state(struct amdgpu_ip_block *ip_block,
enum amdgpu_device_ip_state state);
@@ -491,6 +489,28 @@ static inline bool amdgpu_ip_block_test_and_clear_marker(struct amdgpu_ip_block
return set;
}
+#define amdgpu_for_each_ip_block(adev, entry) \
+ for (int __i = 0; /* visit every block, lowest index first */ \
+ __i < (adev)->num_ip_blocks && ((entry) = &(adev)->ip_blocks[__i]); \
+ __i++)
+
+#define amdgpu_for_each_ip_block_reverse(adev, entry) \
+ for (int __i = (adev)->num_ip_blocks - 1; /* start at the last valid index */ \
+ __i >= 0 && ((entry) = &(adev)->ip_blocks[__i]); \
+ __i--)
+
+#define amdgpu_for_each_ip_block_valid(adev, entry) \
+ for (int __i = 0; /* ascending, like the open-coded loops this replaces */ \
+ __i < (adev)->num_ip_blocks && ((entry) = &(adev)->ip_blocks[__i]); \
+ __i++) \
+ if (!amdgpu_ip_block_valid((entry))) {} else /* else-safe filter on valid blocks */
+
+#define amdgpu_for_each_ip_block_valid_reverse(adev, entry) \
+ for (int __i = (adev)->num_ip_blocks - 1; /* start at the last valid index */ \
+ __i >= 0 && ((entry) = &(adev)->ip_blocks[__i]); \
+ __i--) \
+ if (!amdgpu_ip_block_valid((entry))) {} else /* else-safe filter on valid blocks */
+
/*
* BIOS.
*/
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index 824f9da5b6ce..d3a30b9a26a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -282,8 +282,7 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
/* dump the ip state for each ip */
drm_printf(&p, "IP Dump\n");
- for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
- ip_block = &coredump->adev->ip_blocks[i];
+ amdgpu_for_each_ip_block(coredump->adev, ip_block) {
if (ip_block->version->funcs->print_ip_state) {
drm_printf(&p, "IP: %s\n", ip_block->version->funcs->name);
ip_block->version->funcs->print_ip_state(ip_block, &p);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fcfbdcfd1fa3..af356226fbca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2161,20 +2161,18 @@ int amdgpu_device_ip_set_clockgating_state(void *dev,
enum amd_clockgating_state state)
{
struct amdgpu_device *adev = dev;
- int i, r = 0;
+ struct amdgpu_ip_block *ip_block;
+ int r = 0;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->version->type != block_type)
continue;
- if (adev->ip_blocks[i].version->type != block_type)
+ if (!ip_block->version->funcs->set_clockgating_state)
continue;
- if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
- continue;
- r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
- &adev->ip_blocks[i], state);
+ r = ip_block->version->funcs->set_clockgating_state(ip_block, state);
if (r)
DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
}
return r;
}
@@ -2195,20 +2193,18 @@ int amdgpu_device_ip_set_powergating_state(void *dev,
enum amd_powergating_state state)
{
struct amdgpu_device *adev = dev;
- int i, r = 0;
+ int r = 0;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->version->type != block_type)
continue;
- if (adev->ip_blocks[i].version->type != block_type)
+ if (!ip_block->version->funcs->set_powergating_state)
continue;
- if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
- continue;
- r = adev->ip_blocks[i].version->funcs->set_powergating_state(
- &adev->ip_blocks[i], state);
+ r = ip_block->version->funcs->set_powergating_state(ip_block, state);
if (r)
DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
}
return r;
}
@@ -2227,13 +2223,11 @@ int amdgpu_device_ip_set_powergating_state(void *dev,
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
u64 *flags)
{
- int i;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
- adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->version->funcs->get_clockgating_state)
+ ip_block->version->funcs->get_clockgating_state((void *)adev, flags);
}
}
@@ -2249,15 +2243,13 @@ void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
enum amd_ip_block_type block_type)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (adev->ip_blocks[i].version->type == block_type) {
- if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
- r = adev->ip_blocks[i].version->funcs->wait_for_idle(
- &adev->ip_blocks[i]);
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->version->type == block_type) {
+ if (ip_block->version->funcs->wait_for_idle) {
+ r = ip_block->version->funcs->wait_for_idle(ip_block);
if (r)
return r;
}
@@ -2280,11 +2272,11 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
enum amd_ip_block_type block_type)
{
- int i;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (adev->ip_blocks[i].version->type == block_type)
- return amdgpu_device_ip_valid(adev, i);
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (ip_block->version->type == block_type)
+ return amdgpu_ip_block_valid(ip_block);
}
return false;
@@ -2303,11 +2295,11 @@ struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
enum amd_ip_block_type type)
{
- int i;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++)
- if (adev->ip_blocks[i].version->type == type)
- return &adev->ip_blocks[i];
+ amdgpu_for_each_ip_block(adev, ip_block)
+ if (ip_block->version->type == type)
+ return ip_block;
return NULL;
}
@@ -2375,23 +2367,12 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
return 0;
}
-void amdgpu_device_ip_set_state(struct amdgpu_device *adev, int index,
- enum amdgpu_device_ip_state state)
-{
- amdgpu_ip_block_set_state(&adev->ip_blocks[index], state);
-}
-
enum amdgpu_device_ip_state amdgpu_device_ip_state(struct amdgpu_device *adev,
int index)
{
return amdgpu_ip_block_state(&adev->ip_blocks[index]);
}
-bool amdgpu_device_ip_valid(struct amdgpu_device *adev, int index)
-{
- return amdgpu_ip_block_valid(&adev->ip_blocks[index]);
-}
-
void amdgpu_ip_block_set_state(struct amdgpu_ip_block *ip_block,
enum amdgpu_device_ip_state state)
{
@@ -2693,25 +2674,24 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
DRM_WARN("disabled ip block: %d <%s>\n",
- i, adev->ip_blocks[i].version->funcs->name);
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
+ i, ip_block->version->funcs->name);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_INVALID);
} else if (ip_block->version->funcs->early_init) {
r = ip_block->version->funcs->early_init(ip_block);
if (r == -ENOENT) {
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_INVALID);
} else if (r) {
DRM_ERROR("early_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
total = false;
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
} else {
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_EARLY);
}
} else {
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_EARLY);
}
/* get the vbios after the asic_funcs are set up */
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_COMMON) {
r = amdgpu_device_parse_gpu_info_fw(adev);
if (r)
return r;
@@ -2750,26 +2730,26 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_SW)
continue;
- if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
+ if (amdgpu_ip_block_state(ip_block) >= AMDGPU_IP_STATE_HW)
continue;
- if (!amdgpu_ip_member_of_hwini(
- adev, adev->ip_blocks[i].version->type))
+ if (!amdgpu_ip_member_of_hwini(adev, ip_block->version->type))
continue;
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
- (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
- r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ (amdgpu_sriov_vf(adev) && (ip_block->version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_IH) {
+ r = ip_block->version->funcs->hw_init(ip_block);
if (r) {
DRM_ERROR("hw_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
return r;
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
}
}
@@ -2778,23 +2758,23 @@ static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_SW)
continue;
- if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
+ if (amdgpu_ip_block_state(ip_block) >= AMDGPU_IP_STATE_HW)
continue;
- if (!amdgpu_ip_member_of_hwini(
- adev, adev->ip_blocks[i].version->type))
+ if (!amdgpu_ip_member_of_hwini(adev, ip_block->version->type))
continue;
- r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
+ r = ip_block->version->funcs->hw_init(ip_block);
if (r) {
DRM_ERROR("hw_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
return r;
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
}
return 0;
@@ -2803,37 +2783,37 @@ static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
{
int r = 0;
- int i;
uint32_t smu_version;
+ struct amdgpu_ip_block *ip_block;
if (adev->asic_type >= CHIP_VEGA10) {
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (ip_block->version->type != AMD_IP_BLOCK_TYPE_PSP)
continue;
if (!amdgpu_ip_member_of_hwini(adev,
AMD_IP_BLOCK_TYPE_PSP))
break;
- if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_SW)
continue;
/* no need to do the fw loading again if already done*/
- if (amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
+ if (amdgpu_ip_block_state(ip_block) >= AMDGPU_IP_STATE_HW)
break;
if (amdgpu_in_reset(adev) || adev->in_suspend) {
- r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_resume(ip_block);
if (r)
return r;
} else {
- r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
+ r = ip_block->version->funcs->hw_init(ip_block);
if (r) {
DRM_ERROR("hw_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
return r;
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
}
break;
}
@@ -2917,38 +2897,37 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
bool init_badpage;
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
r = amdgpu_ras_init(adev);
if (r)
return r;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (adev->ip_blocks[i].version->funcs->sw_init) {
- r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->version->funcs->sw_init) {
+ r = ip_block->version->funcs->sw_init(ip_block);
if (r) {
DRM_ERROR("sw_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
goto init_failed;
}
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_SW);
- if (!amdgpu_ip_member_of_hwini(
- adev, adev->ip_blocks[i].version->type))
+ if (!amdgpu_ip_member_of_hwini(adev, ip_block->version->type))
continue;
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_COMMON) {
/* need to do common hw init early so everything is set up for gmc */
- r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
+ r = ip_block->version->funcs->hw_init(ip_block);
if (r) {
- DRM_ERROR("hw_init %d failed %d\n", i, r);
+ DRM_ERROR("hw_init %s failed %d\n",
+ ip_block->version->funcs->name, r);
goto init_failed;
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
- } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
+ } else if (ip_block->version->type == AMD_IP_BLOCK_TYPE_GMC) {
/* need to do gmc hw init early so we can allocate gpu mem */
/* Try to reserve bad pages early */
if (amdgpu_sriov_vf(adev))
@@ -2959,9 +2938,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
goto init_failed;
}
- r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
+ r = ip_block->version->funcs->hw_init(ip_block);
if (r) {
- DRM_ERROR("hw_init %d failed %d\n", i, r);
+ DRM_ERROR("hw_init %s failed %d\n",
+ ip_block->version->funcs->name, r);
goto init_failed;
}
r = amdgpu_device_wb_init(adev);
@@ -2969,7 +2949,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
goto init_failed;
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
/* right after GMC hw init, we create CSA */
if (adev->gfx.mcbp) {
@@ -3267,20 +3247,21 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
struct amdgpu_gpu_instance *gpu_instance;
+ struct amdgpu_ip_block *ip_block;
int i = 0, r;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_HW)
continue;
- if (adev->ip_blocks[i].version->funcs->late_init) {
- r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
+ if (ip_block->version->funcs->late_init) {
+ r = ip_block->version->funcs->late_init(ip_block);
if (r) {
DRM_ERROR("late_init of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
return r;
}
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_LATE);
}
r = amdgpu_ras_late_init(adev);
@@ -3372,16 +3353,16 @@ static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
*/
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
- int i;
+ struct amdgpu_ip_block *ip_block;
if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
return;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_HW)
continue;
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
- amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
+ amdgpu_ip_block_hw_fini(ip_block);
break;
}
}
@@ -3389,19 +3370,20 @@ static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!adev->ip_blocks[i].version->funcs->early_fini)
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (!ip_block->version->funcs->early_fini)
continue;
- r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
+ r = ip_block->version->funcs->early_fini(ip_block);
if (r) {
DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
}
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
@@ -3412,11 +3394,11 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
/* Workaround for ASICs need to disable SMC first */
amdgpu_device_smu_fini_early(adev);
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_HW)
+ amdgpu_for_each_ip_block_reverse(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_HW)
continue;
- amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
+ amdgpu_ip_block_hw_fini(ip_block);
}
if (amdgpu_sriov_vf(adev)) {
@@ -3440,7 +3422,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
amdgpu_virt_release_ras_err_handler_data(adev);
@@ -3450,11 +3433,11 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_amdkfd_device_fini_sw(adev);
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_SW)
+ amdgpu_for_each_ip_block_reverse(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_SW)
continue;
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_GMC) {
amdgpu_ucode_free_bo(adev);
amdgpu_free_static_csa(&adev->virt.csa_obj);
amdgpu_device_wb_fini(adev);
@@ -3462,25 +3445,23 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_ib_pool_fini(adev);
amdgpu_seq64_fini(adev);
}
- if (adev->ip_blocks[i].version->funcs->sw_fini) {
- r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
+ if (ip_block->version->funcs->sw_fini) {
+ r = ip_block->version->funcs->sw_fini(ip_block);
/* XXX handle errors */
if (r) {
DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
}
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_EARLY);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_EARLY);
}
amdgpu_ras_fini(adev);
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (adev->ip_blocks[i].version->funcs->late_fini)
- adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_INVALID);
+ amdgpu_for_each_ip_block_valid_reverse(adev, ip_block) {
+ if (ip_block->version->funcs->late_fini)
+ ip_block->version->funcs->late_fini(ip_block);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_INVALID);
}
return 0;
@@ -3527,7 +3508,8 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
*/
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
@@ -3540,34 +3522,29 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
dev_warn(adev->dev, "Failed to disallow df cstate");
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (amdgpu_device_ip_state(adev, i) < AMDGPU_IP_STATE_LATE)
+ amdgpu_for_each_ip_block_valid_reverse(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE)
continue;
- if (adev->ip_blocks[i].version->funcs->early_fini) {
- r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
+ if (ip_block->version->funcs->early_fini) {
+ r = ip_block->version->funcs->early_fini(ip_block);
if (r) {
DRM_ERROR(" of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
+ ip_block->version->funcs->name, r);
return r;
}
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_HW);
}
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
-
+ amdgpu_for_each_ip_block_valid_reverse(adev, ip_block) {
/* displays are handled separately */
- if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
+ if (ip_block->version->type != AMD_IP_BLOCK_TYPE_DCE)
continue;
/* XXX handle errors */
- r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_suspend(ip_block);
if (r)
return r;
}
@@ -3588,27 +3565,25 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
if (adev->in_s0ix)
amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
+ amdgpu_for_each_ip_block_valid_reverse(adev, ip_block) {
/* displays are handled in phase1 */
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_DCE)
continue;
/* PSP lost connection when err_event_athub occurs */
if (amdgpu_ras_intr_triggered() &&
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_PSP) {
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_SW);
continue;
}
/* skip unnecessary suspend if we do not initialize them yet */
- if (!amdgpu_ip_member_of_hwini(
- adev, adev->ip_blocks[i].version->type))
+ if (!amdgpu_ip_member_of_hwini(adev, ip_block->version->type))
continue;
/* skip suspend of gfx/mes and psp for S0ix
@@ -3617,17 +3592,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
* so no need to suspend it.
*/
if (adev->in_s0ix &&
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
+ (ip_block->version->type == AMD_IP_BLOCK_TYPE_PSP ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_MES))
continue;
/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
if (adev->in_s0ix &&
(amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
IP_VERSION(5, 0, 0)) &&
- (adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ (ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
/* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
@@ -3639,16 +3613,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
*/
if (amdgpu_in_reset(adev) &&
(adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_PSP)
continue;
/* XXX handle errors */
- r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
+ r = amdgpu_ip_block_suspend(ip_block);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_SW);
/* handle putting the SMC in the appropriate state */
if (!amdgpu_sriov_vf(adev)) {
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
if (r) {
DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
@@ -3697,7 +3671,8 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *block;
static enum amd_ip_block_type ip_order[] = {
AMD_IP_BLOCK_TYPE_COMMON,
@@ -3706,26 +3681,24 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
AMD_IP_BLOCK_TYPE_IH,
};
- for (i = 0; i < adev->num_ip_blocks; i++) {
+ amdgpu_for_each_ip_block(adev, block) {
int j;
- struct amdgpu_ip_block *block;
- block = &adev->ip_blocks[i];
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
+ amdgpu_ip_block_set_state(block, AMDGPU_IP_STATE_SW);
for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
if (block->version->type != ip_order[j] ||
- !amdgpu_device_ip_valid(adev, i))
+ !amdgpu_ip_block_valid(block))
continue;
- r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
+ r = block->version->funcs->hw_init(block);
if (r) {
dev_err(adev->dev, "RE-INIT-early: %s failed\n",
block->version->funcs->name);
return r;
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_HW);
+ amdgpu_ip_block_set_state(block, AMDGPU_IP_STATE_HW);
}
}
@@ -3789,18 +3762,18 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i) ||
- amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) >= AMDGPU_IP_STATE_HW)
continue;
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_IH ||
+ (ip_block->version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
- r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_resume(ip_block);
if (r)
return r;
}
@@ -3824,19 +3797,19 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i) ||
- amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) >= AMDGPU_IP_STATE_HW)
continue;
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_IH ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_DCE ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_PSP)
continue;
- r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_resume(ip_block);
if (r)
return r;
}
@@ -3859,14 +3832,14 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i) ||
- amdgpu_device_ip_state(adev, i) >= AMDGPU_IP_STATE_HW)
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) >= AMDGPU_IP_STATE_HW)
continue;
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
- r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_DCE) {
+ r = amdgpu_ip_block_resume(ip_block);
if (r)
return r;
}
@@ -4901,7 +4874,8 @@ static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mo
int amdgpu_device_prepare(struct drm_device *dev)
{
struct amdgpu_device *adev = drm_to_adev(dev);
- int i, r;
+ struct amdgpu_ip_block *ip_block;
+ int r;
amdgpu_choose_low_power_state(adev);
@@ -4915,12 +4889,10 @@ int amdgpu_device_prepare(struct drm_device *dev)
flush_delayed_work(&adev->gfx.gfx_off_delay_work);
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (!ip_block->version->funcs->prepare_suspend)
continue;
- if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
- continue;
- r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
+ r = ip_block->version->funcs->prepare_suspend(ip_block);
if (r)
goto unprepare;
}
@@ -5108,8 +5080,8 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
*/
static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
{
- int i;
bool asic_hang = false;
+ struct amdgpu_ip_block *block;
if (amdgpu_sriov_vf(adev))
return true;
@@ -5117,15 +5089,11 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
if (amdgpu_asic_need_full_reset(adev))
return true;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (adev->ip_blocks[i].version->funcs->check_soft_reset)
- adev->ip_blocks[i].status.hang =
- adev->ip_blocks[i].version->funcs->check_soft_reset(
- &adev->ip_blocks[i]);
- if (adev->ip_blocks[i].status.hang) {
- dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
+ amdgpu_for_each_ip_block_valid(adev, block) {
+ if (block->version->funcs->check_soft_reset)
+ block->status.hang = block->version->funcs->check_soft_reset(block);
+ if (block->status.hang) {
+ dev_info(adev->dev, "IP block:%s is hung!\n", block->version->funcs->name);
asic_hang = true;
}
}
@@ -5145,14 +5113,13 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
{
- int i, r = 0;
+ int r;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (adev->ip_blocks[i].status.hang &&
- adev->ip_blocks[i].version->funcs->pre_soft_reset) {
- r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->status.hang &&
+ ip_block->version->funcs->pre_soft_reset) {
+ r = ip_block->version->funcs->pre_soft_reset(ip_block);
if (r)
return r;
}
@@ -5172,20 +5139,18 @@ static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
*/
static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
{
- int i;
+ struct amdgpu_ip_block *ip_block;
if (amdgpu_asic_need_full_reset(adev))
return true;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
- if (adev->ip_blocks[i].status.hang) {
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if ((ip_block->version->type == AMD_IP_BLOCK_TYPE_GMC) ||
+ (ip_block->version->type == AMD_IP_BLOCK_TYPE_SMC) ||
+ (ip_block->version->type == AMD_IP_BLOCK_TYPE_ACP) ||
+ (ip_block->version->type == AMD_IP_BLOCK_TYPE_DCE) ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_PSP) {
+ if (ip_block->status.hang) {
dev_info(adev->dev, "Some block need full reset!\n");
return true;
}
@@ -5207,14 +5172,13 @@ static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
{
- int i, r = 0;
+ int r = 0;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (adev->ip_blocks[i].status.hang &&
- adev->ip_blocks[i].version->funcs->soft_reset) {
- r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->status.hang &&
+ ip_block->version->funcs->soft_reset) {
+ r = ip_block->version->funcs->soft_reset(ip_block);
if (r)
return r;
}
@@ -5236,14 +5200,13 @@ static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
{
- int i, r = 0;
+ int r = 0;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (adev->ip_blocks[i].status.hang &&
- adev->ip_blocks[i].version->funcs->post_soft_reset)
- r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->status.hang &&
+ ip_block->version->funcs->post_soft_reset)
+ r = ip_block->version->funcs->post_soft_reset(ip_block);
if (r)
return r;
}
@@ -5473,6 +5436,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
int i, r = 0;
struct amdgpu_job *job = NULL;
struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
+ struct amdgpu_ip_block *ip_block;
bool need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
@@ -5532,10 +5496,9 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
dev_info(tmp_adev->dev, "Dumping IP State\n");
/* Trigger ip dump before we reset the asic */
- for (i = 0; i < tmp_adev->num_ip_blocks; i++)
- if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
- tmp_adev->ip_blocks[i].version->funcs
- ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
+ amdgpu_for_each_ip_block(tmp_adev, ip_block)
+ if (ip_block->version->funcs->dump_ip_state)
+ ip_block->version->funcs->dump_ip_state((void *)ip_block);
dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 100f04475943..07340fa4a30e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -36,13 +36,13 @@
static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
struct amdgpu_job *job)
{
- int i;
+ struct amdgpu_ip_block *block;
dev_info(adev->dev, "Dumping IP State\n");
- for (i = 0; i < adev->num_ip_blocks; i++)
- if (adev->ip_blocks[i].version->funcs->dump_ip_state)
- adev->ip_blocks[i].version->funcs
- ->dump_ip_state((void *)&adev->ip_blocks[i]);
+ amdgpu_for_each_ip_block(adev, block) {
+ if (block->version->funcs->dump_ip_state)
+ block->version->funcs->dump_ip_state((void *)block);
+ }
dev_info(adev->dev, "Dumping IP State Completed\n");
amdgpu_coredump(adev, true, false, job);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 33030a0bfef2..5582b1433621 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -400,6 +400,8 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
enum amd_ip_block_type type;
unsigned int num_rings = 0;
unsigned int i, j;
+ bool found = false;
+ struct amdgpu_ip_block *ip_block;
if (info->query_hw_ip.ip_instance >= AMDGPU_HW_IP_INSTANCE_MAX_COUNT)
return -EINVAL;
@@ -513,19 +515,21 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
return -EINVAL;
}
- for (i = 0; i < adev->num_ip_blocks; i++)
- if (adev->ip_blocks[i].version->type == type &&
- amdgpu_device_ip_valid(adev, i))
+ amdgpu_for_each_ip_block_valid(adev, ip_block) {
+ if (ip_block->version->type == type) {
+ found = true;
break;
+ }
+ }
- if (i == adev->num_ip_blocks)
+ if (!found)
return 0;
num_rings = min(amdgpu_ctx_num_entities[info->query_hw_ip.type],
num_rings);
- result->hw_ip_version_major = adev->ip_blocks[i].version->major;
- result->hw_ip_version_minor = adev->ip_blocks[i].version->minor;
+ result->hw_ip_version_major = ip_block->version->major;
+ result->hw_ip_version_minor = ip_block->version->minor;
if (adev->asic_type >= CHIP_VEGA10) {
switch (type) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 26a0d9050dca..3d7b715cd369 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -3897,8 +3897,7 @@ static ssize_t psp_usbc_pd_fw_sysfs_read(struct device *dev,
int ret;
ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);
- if (!ip_block ||
- amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE) {
+ if (!ip_block || amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE) {
dev_info(adev->dev, "PSP block is not ready yet\n.");
return -EBUSY;
}
@@ -3930,8 +3929,7 @@ static ssize_t psp_usbc_pd_fw_sysfs_write(struct device *dev,
struct amdgpu_ip_block *ip_block;
ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);
- if (!ip_block ||
- amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE) {
+ if (!ip_block || amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_LATE) {
dev_err(adev->dev, "PSP block is not ready yet.");
return -EBUSY;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index eb72dac61c83..fafec228f57e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -28,20 +28,18 @@
static int amdgpu_reset_xgmi_reset_on_init_suspend(struct amdgpu_device *adev)
{
- int i;
+ struct amdgpu_ip_block *ip_block;
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!amdgpu_device_ip_valid(adev, i))
- continue;
- if (amdgpu_device_ip_state(adev, i) <= AMDGPU_IP_STATE_HW)
+ amdgpu_for_each_ip_block_valid_reverse(adev, ip_block) {
+ if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_HW)
continue;
/* displays are handled in phase1 */
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_DCE)
continue;
/* XXX handle errors */
- amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_SW);
+ amdgpu_ip_block_suspend(ip_block);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_SW);
}
/* VCN FW shared region is in frambuffer, there are some flags
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index c9479a92a9df..01f2f9bd8dd0 100644
--- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -69,19 +69,18 @@ sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device *adev)
{
- int r, i;
+ int r;
+ struct amdgpu_ip_block *ip_block;
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ amdgpu_for_each_ip_block_reverse(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
- r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_suspend(ip_block);
if (r)
return r;
}
@@ -148,8 +147,9 @@ sienna_cichlid_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
{
- int i, r;
+ int r;
struct psp_context *psp = &adev->psp;
+ struct amdgpu_ip_block *ip_block;
r = psp_rlc_autoload_start(psp);
if (r) {
@@ -167,44 +167,38 @@ static int sienna_cichlid_mode2_restore_ip(struct amdgpu_device *adev)
return r;
}
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
- r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (ip_block->version->type == AMD_IP_BLOCK_TYPE_IH) {
+ r = amdgpu_ip_block_resume(ip_block);
if (r)
return r;
}
}
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
- r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_resume(ip_block);
if (r)
return r;
}
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
- if (adev->ip_blocks[i].version->funcs->late_init) {
- r = adev->ip_blocks[i].version->funcs->late_init(
- &adev->ip_blocks[i]);
+ if (ip_block->version->funcs->late_init) {
+ r = ip_block->version->funcs->late_init(ip_block);
if (r) {
dev_err(adev->dev,
"late_init of IP block <%s> failed %d after reset\n",
- adev->ip_blocks[i].version->funcs->name,
- r);
+ ip_block->version->funcs->name, r);
return r;
}
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_LATE);
}
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
index 75dc5cb1e1ec..90b20f658c8b 100644
--- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
@@ -66,21 +66,19 @@ smu_v13_0_10_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
static int smu_v13_0_10_mode2_suspend_ip(struct amdgpu_device *adev)
{
- int r, i;
+ int r;
+ struct amdgpu_ip_block *ip_block;
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
- for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_MES))
+ amdgpu_for_each_ip_block_reverse(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_MES))
continue;
- r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_suspend(ip_block);
if (r)
return r;
}
@@ -144,6 +142,7 @@ static int smu_v13_0_10_mode2_restore_ip(struct amdgpu_device *adev)
struct amdgpu_firmware_info *ucode;
struct amdgpu_firmware_info *ucode_list[2];
int ucode_count = 0;
+ struct amdgpu_ip_block *ip_block;
for (i = 0; i < adev->firmware.max_ucodes; i++) {
ucode = &adev->firmware.ucode[i];
@@ -172,40 +171,32 @@ static int smu_v13_0_10_mode2_restore_ip(struct amdgpu_device *adev)
amdgpu_dpm_enable_gfx_features(adev);
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_MES ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_MES ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
- r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
+ r = amdgpu_ip_block_resume(ip_block);
if (r)
return r;
}
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if (!(adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_GFX ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_MES ||
- adev->ip_blocks[i].version->type ==
- AMD_IP_BLOCK_TYPE_SDMA))
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if (!(ip_block->version->type == AMD_IP_BLOCK_TYPE_GFX ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_MES ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
- if (adev->ip_blocks[i].version->funcs->late_init) {
- r = adev->ip_blocks[i].version->funcs->late_init(
- &adev->ip_blocks[i]);
+ if (ip_block->version->funcs->late_init) {
+ r = ip_block->version->funcs->late_init(ip_block);
if (r) {
dev_err(adev->dev,
"late_init of IP block <%s> failed %d after reset\n",
- adev->ip_blocks[i].version->funcs->name,
- r);
+ ip_block->version->funcs->name, r);
return r;
}
}
- amdgpu_device_ip_set_state(adev, i, AMDGPU_IP_STATE_LATE);
+ amdgpu_ip_block_set_state(ip_block, AMDGPU_IP_STATE_LATE);
}
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index c75402d606c3..1f8450fb1cc1 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -225,12 +225,12 @@ static int smu_set_gfx_imu_enable(struct smu_context *smu)
static bool is_vcn_enabled(struct amdgpu_device *adev)
{
- int i;
+ struct amdgpu_ip_block *ip_block;
- for (i = 0; i < adev->num_ip_blocks; i++) {
- if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_VCN ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_JPEG) &&
- !amdgpu_device_ip_valid(adev, i))
+ amdgpu_for_each_ip_block(adev, ip_block) {
+ if ((ip_block->version->type == AMD_IP_BLOCK_TYPE_VCN ||
+ ip_block->version->type == AMD_IP_BLOCK_TYPE_JPEG) &&
+ !amdgpu_ip_block_valid(ip_block))
return false;
}
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread* [RFC v2 13/15] drm/amdgpu: walk IP blocks in reverse order when shutdown
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (11 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 12/15] drm/amdgpu: introduce IP block iterators to reduce duplicated code Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-13 22:28 ` Mario Limonciello
2025-01-13 1:42 ` [RFC v2 14/15] drm/amdgpu/nbio: improve the way to manage irq reference count Jiang Liu
` (2 subsequent siblings)
15 siblings, 1 reply; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Walk IP blocks in reverse order in amdgpu_device_ip_fini_early() and
amdgpu_device_smu_fini_early(), to stay consistent with the other fini
functions.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index af356226fbca..a1501344f336 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3358,7 +3358,7 @@ static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
return;
- amdgpu_for_each_ip_block(adev, ip_block) {
+ amdgpu_for_each_ip_block_reverse(adev, ip_block) {
if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_HW)
continue;
if (ip_block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
@@ -3373,7 +3373,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
int r;
struct amdgpu_ip_block *ip_block;
- amdgpu_for_each_ip_block(adev, ip_block) {
+ amdgpu_for_each_ip_block_reverse(adev, ip_block) {
if (!ip_block->version->funcs->early_fini)
continue;
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread* Re: [RFC v2 13/15] drm/amdgpu: walk IP blocks in reverse order when shutdown
2025-01-13 1:42 ` [RFC v2 13/15] drm/amdgpu: walk IP blocks in reverse order when shutdown Jiang Liu
@ 2025-01-13 22:28 ` Mario Limonciello
0 siblings, 0 replies; 39+ messages in thread
From: Mario Limonciello @ 2025-01-13 22:28 UTC (permalink / raw)
To: Jiang Liu, alexander.deucher, christian.koenig, Xinhui.Pan,
airlied, simona, sunil.khatri, lijo.lazar, Hawking.Zhang,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
On 1/12/2025 19:42, Jiang Liu wrote:
> Walk IP blocks in reverse order in function amdgpu_device_ip_fini_early
> and amdgpu_device_smu_fini_early, to keep consistence with other finish
> functions.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index af356226fbca..a1501344f336 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3358,7 +3358,7 @@ static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
> if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
> return;
>
> - amdgpu_for_each_ip_block(adev, ip_block) {
> + amdgpu_for_each_ip_block_reverse(adev, ip_block) {
> if (amdgpu_ip_block_state(ip_block) < AMDGPU_IP_STATE_HW)
> continue;
> if (ip_block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
> @@ -3373,7 +3373,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
> int r;
> struct amdgpu_ip_block *ip_block;
>
> - amdgpu_for_each_ip_block(adev, ip_block) {
> + amdgpu_for_each_ip_block_reverse(adev, ip_block) {
> if (!ip_block->version->funcs->early_fini)
> continue;
>
^ permalink raw reply [flat|nested] 39+ messages in thread
* [RFC v2 14/15] drm/amdgpu/nbio: improve the way to manage irq reference count
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (12 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 13/15] drm/amdgpu: walk IP blocks in reverse order when shutdown Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-13 1:42 ` [RFC v2 15/15] drm/amdgpu/asic: make ip block operations symmetric by .early_fini() Jiang Liu
2025-01-20 6:27 ` [RFC v2 00/15] Enhance device state machine to better support suspend/resume Zhang, Hawking
15 siblings, 0 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Refactor nbio-related code to improve the way the irq reference count is
managed. Originally amdgpu_irq_get() is called from ip_blocks[].late_init
and amdgpu_irq_put() is called from ip_blocks[].hw_fini. This asymmetric
design may cause issues under certain conditions. So:
1) introduce amdgpu_nbio_ras_early_fini() to undo work done by
amdgpu_nbio_ras_late_init().
2) remove call of amdgpu_irq_put in xxxx_hw_fini().
3) record the status where reference count is held for specific irq.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 16 +++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 1 +
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 1 +
drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 1 +
drivers/gpu/drm/amd/amdgpu/soc15.c | 16 ----------------
5 files changed, 18 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index c75ce91f94ab..b8a69ceec2e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -64,13 +64,27 @@ int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *
r = amdgpu_irq_get(adev, &adev->nbio.ras_controller_irq, 0);
if (r)
goto late_fini;
+ amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ0);
r = amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
if (r)
goto late_fini;
+ amdgpu_ras_set_marker(adev, ras_block, AMDGPU_MARKER_IRQ1);
}
return 0;
late_fini:
- amdgpu_ras_block_early_fini(adev, ras_block);
+ amdgpu_nbio_ras_early_fini(adev, ras_block);
return r;
}
+
+void amdgpu_nbio_ras_early_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block)
+{
+ if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+ if (amdgpu_ras_test_and_clear_marker(adev, ras_block, AMDGPU_MARKER_IRQ0))
+ amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
+ if (amdgpu_ras_test_and_clear_marker(adev, ras_block, AMDGPU_MARKER_IRQ1))
+ amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+ }
+
+ amdgpu_ras_block_early_fini(adev, ras_block);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 79c2f807b9fe..e1edf75602c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -117,6 +117,7 @@ struct amdgpu_nbio {
int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
+void amdgpu_nbio_ras_early_fini(struct amdgpu_device *adev, struct ras_common_if *ras_block);
u64 amdgpu_nbio_get_pcie_replay_count(struct amdgpu_device *adev);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index a26a9be58eac..c27d0fbf9cec 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -665,6 +665,7 @@ struct amdgpu_nbio_ras nbio_v7_4_ras = {
},
.hw_ops = &nbio_v7_4_ras_hw_ops,
.ras_late_init = amdgpu_nbio_ras_late_init,
+ .ras_early_fini = amdgpu_nbio_ras_early_fini,
},
.handle_ras_controller_intr_no_bifring = nbio_v7_4_handle_ras_controller_intr_no_bifring,
.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index 8a0a63ac88d2..684a38a16247 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -703,6 +703,7 @@ struct amdgpu_nbio_ras nbio_v7_9_ras = {
},
.hw_ops = &nbio_v7_9_ras_hw_ops,
.ras_late_init = amdgpu_nbio_ras_late_init,
+ .ras_early_fini = amdgpu_nbio_ras_early_fini,
},
.handle_ras_controller_intr_no_bifring = nbio_v7_9_handle_ras_controller_intr_no_bifring,
.handle_ras_err_event_athub_intr_no_bifring = nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index a59b4c36cad7..5aabb55d2d25 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1328,22 +1328,6 @@ static int soc15_common_hw_fini(struct amdgpu_ip_block *ip_block)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_put_irq(adev);
- /*
- * For minimal init, late_init is not called, hence RAS irqs are not
- * enabled.
- */
- if ((!amdgpu_sriov_vf(adev)) &&
- (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) &&
- adev->nbio.ras_if &&
- amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
- if (adev->nbio.ras &&
- adev->nbio.ras->init_ras_controller_interrupt)
- amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
- if (adev->nbio.ras &&
- adev->nbio.ras->init_ras_err_event_athub_interrupt)
- amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
- }
-
return 0;
}
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread* [RFC v2 15/15] drm/amdgpu/asic: make ip block operations symmetric by .early_fini()
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (13 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 14/15] drm/amdgpu/nbio: improve the way to manage irq reference count Jiang Liu
@ 2025-01-13 1:42 ` Jiang Liu
2025-01-20 6:27 ` [RFC v2 00/15] Enhance device state machine to better support suspend/resume Zhang, Hawking
15 siblings, 0 replies; 39+ messages in thread
From: Jiang Liu @ 2025-01-13 1:42 UTC (permalink / raw)
To: alexander.deucher, christian.koenig, Xinhui.Pan, airlied, simona,
sunil.khatri, lijo.lazar, Hawking.Zhang, mario.limonciello,
xiaogang.chen, Kent.Russell, shuox.liu, amd-gfx
Cc: Jiang Liu
Make ip block operations for asic symmetric by making use of the
.early_fini() hook, which will undo work done by the .late_init() hook.
1) introduce xxx_common_early_fini() for nv/soc15/soc21/soc24.
2) move `enable_doorbell_selfring_aperture(adev, false)` from .hw_init()
into .early_fini().
3) call xgpu_nv_mailbox_put_irq() for nv.c to avoid possible resource
leakage.
4) use flags to track irq reference count usage.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
drivers/gpu/drm/amd/amdgpu/nv.c | 14 +++++++++++-
drivers/gpu/drm/amd/amdgpu/soc15.c | 22 +++++++++++--------
drivers/gpu/drm/amd/amdgpu/soc21.c | 35 ++++++++++++++++++++----------
drivers/gpu/drm/amd/amdgpu/soc24.c | 22 +++++++++++--------
4 files changed, 63 insertions(+), 30 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 47db483c3516..cc68f6ab538b 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -973,6 +973,18 @@ static int nv_common_late_init(struct amdgpu_ip_block *ip_block)
return 0;
}
+static int nv_common_early_fini(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+
+ adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
+
+ if (amdgpu_sriov_vf(adev))
+ xgpu_nv_mailbox_put_irq(adev);
+
+ return 0;
+}
+
static int nv_common_sw_init(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -1019,7 +1031,6 @@ static int nv_common_hw_fini(struct amdgpu_ip_block *ip_block)
* selfring doorbell.
*/
adev->nbio.funcs->enable_doorbell_aperture(adev, false);
- adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
return 0;
}
@@ -1095,6 +1106,7 @@ static const struct amd_ip_funcs nv_common_ip_funcs = {
.name = "nv_common",
.early_init = nv_common_early_init,
.late_init = nv_common_late_init,
+ .early_fini = nv_common_early_fini,
.sw_init = nv_common_sw_init,
.hw_init = nv_common_hw_init,
.hw_fini = nv_common_hw_fini,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 5aabb55d2d25..c6a83bbd4414 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1246,6 +1246,18 @@ static int soc15_common_late_init(struct amdgpu_ip_block *ip_block)
return 0;
}
+static int soc15_common_early_fini(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+
+ adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
+
+ if (amdgpu_sriov_vf(adev))
+ xgpu_ai_mailbox_put_irq(adev);
+
+ return 0;
+}
+
static int soc15_common_sw_init(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -1317,16 +1329,7 @@ static int soc15_common_hw_fini(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
- /* Disable the doorbell aperture and selfring doorbell aperture
- * separately in hw_fini because soc15_enable_doorbell_aperture
- * has been removed and there is no need to delay disabling
- * selfring doorbell.
- */
adev->nbio.funcs->enable_doorbell_aperture(adev, false);
- adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
-
- if (amdgpu_sriov_vf(adev))
- xgpu_ai_mailbox_put_irq(adev);
return 0;
}
@@ -1496,6 +1499,7 @@ static const struct amd_ip_funcs soc15_common_ip_funcs = {
.name = "soc15_common",
.early_init = soc15_common_early_init,
.late_init = soc15_common_late_init,
+ .early_fini = soc15_common_early_fini,
.sw_init = soc15_common_sw_init,
.sw_fini = soc15_common_sw_fini,
.hw_init = soc15_common_hw_init,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c
index 62ad67d0b598..1635b96e2706 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -796,6 +796,7 @@ static int soc21_common_early_init(struct amdgpu_ip_block *ip_block)
static int soc21_common_late_init(struct amdgpu_ip_block *ip_block)
{
+ int r;
struct amdgpu_device *adev = ip_block->adev;
if (amdgpu_sriov_vf(adev)) {
@@ -816,12 +817,16 @@ static int soc21_common_late_init(struct amdgpu_ip_block *ip_block)
}
} else {
if (adev->nbio.ras &&
- adev->nbio.ras_err_event_athub_irq.funcs)
+ adev->nbio.ras_err_event_athub_irq.funcs) {
/* don't need to fail gpu late init
* if enabling athub_err_event interrupt failed
* nbio v4_3 only support fatal error hanlding
* just enable the interrupt directly */
- amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+ r = amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+ if (r)
+ return r;
+ amdgpu_ip_block_set_marker(ip_block, AMDGPU_MARKER_IRQ0);
+ }
}
/* Enable selfring doorbell aperture late because doorbell BAR
@@ -832,6 +837,22 @@ static int soc21_common_late_init(struct amdgpu_ip_block *ip_block)
return 0;
}
+static int soc21_common_early_fini(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+
+ adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
+
+ if (amdgpu_sriov_vf(adev)) {
+ xgpu_nv_mailbox_put_irq(adev);
+ } else {
+ if (amdgpu_ip_block_test_and_clear_marker(ip_block, AMDGPU_MARKER_IRQ0))
+ amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
+ }
+
+ return 0;
+}
+
static int soc21_common_sw_init(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -872,15 +893,6 @@ static int soc21_common_hw_fini(struct amdgpu_ip_block *ip_block)
* selfring doorbell.
*/
adev->nbio.funcs->enable_doorbell_aperture(adev, false);
- adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
-
- if (amdgpu_sriov_vf(adev)) {
- xgpu_nv_mailbox_put_irq(adev);
- } else {
- if (adev->nbio.ras &&
- adev->nbio.ras_err_event_athub_irq.funcs)
- amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
- }
return 0;
}
@@ -985,6 +997,7 @@ static const struct amd_ip_funcs soc21_common_ip_funcs = {
.name = "soc21_common",
.early_init = soc21_common_early_init,
.late_init = soc21_common_late_init,
+ .early_fini = soc21_common_early_fini,
.sw_init = soc21_common_sw_init,
.hw_init = soc21_common_hw_init,
.hw_fini = soc21_common_hw_fini,
diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c
index 6b8e078ee7c7..7d5c8d4180b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -465,6 +465,18 @@ static int soc24_common_late_init(struct amdgpu_ip_block *ip_block)
return 0;
}
+static int soc24_common_early_fini(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+
+ adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
+
+ if (amdgpu_sriov_vf(adev))
+ xgpu_nv_mailbox_put_irq(adev);
+
+ return 0;
+}
+
static int soc24_common_sw_init(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -509,15 +521,6 @@ static int soc24_common_hw_fini(struct amdgpu_ip_block *ip_block)
* selfring doorbell.
*/
adev->nbio.funcs->enable_doorbell_aperture(adev, false);
- adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
-
- if (amdgpu_sriov_vf(adev)) {
- xgpu_nv_mailbox_put_irq(adev);
- } else {
- if (adev->nbio.ras &&
- adev->nbio.ras_err_event_athub_irq.funcs)
- amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
- }
return 0;
}
@@ -590,6 +593,7 @@ static const struct amd_ip_funcs soc24_common_ip_funcs = {
.name = "soc24_common",
.early_init = soc24_common_early_init,
.late_init = soc24_common_late_init,
+ .early_fini = soc24_common_early_fini,
.sw_init = soc24_common_sw_init,
.hw_init = soc24_common_hw_init,
.hw_fini = soc24_common_hw_fini,
--
2.43.5
^ permalink raw reply related [flat|nested] 39+ messages in thread* RE: [RFC v2 00/15] Enhance device state machine to better support suspend/resume
2025-01-13 1:42 [RFC v2 00/15] Enhance device state machine to better support suspend/resume Jiang Liu
` (14 preceding siblings ...)
2025-01-13 1:42 ` [RFC v2 15/15] drm/amdgpu/asic: make ip block operations symmetric by .early_fini() Jiang Liu
@ 2025-01-20 6:27 ` Zhang, Hawking
2025-01-23 0:02 ` Mika Laitio
15 siblings, 1 reply; 39+ messages in thread
From: Zhang, Hawking @ 2025-01-20 6:27 UTC (permalink / raw)
To: Jiang Liu, Deucher, Alexander, Koenig, Christian, Pan, Xinhui,
airlied@gmail.com, simona@ffwll.ch, Khatri, Sunil, Lazar, Lijo,
Limonciello, Mario, Chen, Xiaogang, Russell, Kent,
shuox.liu@linux.alibaba.com, amd-gfx@lists.freedesktop.org
[AMD Official Use Only - AMD Internal Distribution Only]
Thanks for the patches.
We currently have no plans to include RAS programming as part of the device suspend/resume sequence. Instead, we are focusing on a series of clean up patches and a new RAS software module that will eliminate all legacy code/workarounds and also the changes you proposed here. It is not necessary to make this interim change in the upstream.
Regards,
Hawking
-----Original Message-----
From: Jiang Liu <gerry@linux.alibaba.com>
Sent: Monday, January 13, 2025 09:42
To: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Pan, Xinhui <Xinhui.Pan@amd.com>; airlied@gmail.com; simona@ffwll.ch; Khatri, Sunil <Sunil.Khatri@amd.com>; Lazar, Lijo <Lijo.Lazar@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>; Limonciello, Mario <Mario.Limonciello@amd.com>; Chen, Xiaogang <Xiaogang.Chen@amd.com>; Russell, Kent <Kent.Russell@amd.com>; shuox.liu@linux.alibaba.com; amd-gfx@lists.freedesktop.org
Cc: Jiang Liu <gerry@linux.alibaba.com>
Subject: [RFC v2 00/15] Enhance device state machine to better support suspend/resume
Recently, while testing suspend/resume functionality with AMD GPUs, we encountered several resource-tracking-related bugs, such as double buffer free, use after free and unbalanced irq reference count.
We have tried to solve these issues case by case, but found that this may not be the right way. Especially for the unbalanced irq reference count, new issues appear once we fix the currently known ones. After analyzing the related source code, we found that there may be some fundamental implementation flaws behind these resource tracking issues.
The amdgpu driver has two major state machines to drive the device management flow: one is for ip blocks, the other is for ras blocks.
The hook points defined in struct amd_ip_funcs for device setup/teardown are symmetric, but the implementation is asymmetric, sometimes even ambiguous. The two most obvious issues we noticed are:
1) amdgpu_irq_get() is called from .late_init() but amdgpu_irq_put()
is called from .hw_fini() instead of .early_fini().
2) the way to reset ip_block.status.valid/sw/hw/late_initialized doesn't
match the way to set those flags.
When taking device suspend/resume into account, in addition to device probe/remove, things get much more complex. Some issues arise because many suspend/resume implementations directly reuse the .hw_init/.hw_fini/.late_init hook points.
So we try to fix those issues by two enhancements/refinements to current device management state machines.
The first change is to make the ip block state machine and associated status flags work in stack-like way as below:
Callbacks State after successfully execute callback
AMDGPU_IP_STATE_INVALID
.early_init() AMDGPU_IP_STATE_EARLY
.sw_init() AMDGPU_IP_STATE_SW
.hw_init() AMDGPU_IP_STATE_HW
.late_init() AMDGPU_IP_STATE_LATE
.early_fini() AMDGPU_IP_STATE_HW
.hw_fini() AMDGPU_IP_STATE_SW
.sw_fini() AMDGPU_IP_STATE_EARLY
.late_fini() AMDGPU_IP_STATE_INVALID
Also do the same thing for the ras block state machine, though it's much simpler.
The second change is to fine-tune the overall device management workflow as below:
1. amdgpu_driver_load_kms()
amdgpu_device_init()
amdgpu_device_ip_early_init()
ip_blocks[i].early_init()
ip_blocks[i].status.valid = true
amdgpu_device_ip_init()
amdgpu_ras_init()
ip_blocks[i].sw_init()
ip_blocks[i].status.sw = true
ip_blocks[i].hw_init()
ip_blocks[i].status.hw = true
amdgpu_device_ip_late_init()
ip_blocks[i].late_init()
ip_blocks[i].status.late_initialized = true
amdgpu_ras_late_init()
ras_blocks[i].ras_late_init()
amdgpu_ras_feature_enable_on_boot()
2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
amdgpu_device_suspend()
amdgpu_ras_early_fini()
ras_blocks[i].ras_early_fini()
amdgpu_ras_feature_disable()
amdgpu_ras_suspend()
amdgpu_ras_disable_all_features()
+++ ip_blocks[i].early_fini()
+++ ip_blocks[i].status.late_initialized = false
ip_blocks[i].suspend()
3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
amdgpu_device_resume()
amdgpu_device_ip_resume()
ip_blocks[i].resume()
amdgpu_device_ip_late_init()
ip_blocks[i].late_init()
ip_blocks[i].status.late_initialized = true
amdgpu_ras_late_init()
ras_blocks[i].ras_late_init()
amdgpu_ras_feature_enable_on_boot()
amdgpu_ras_resume()
amdgpu_ras_enable_all_features()
4. amdgpu_driver_unload_kms()
amdgpu_device_fini_hw()
amdgpu_ras_early_fini()
ras_blocks[i].ras_early_fini()
+++ ip_blocks[i].early_fini()
+++ ip_blocks[i].status.late_initialized = false
ip_blocks[i].hw_fini()
ip_blocks[i].status.hw = false
5. amdgpu_driver_release_kms()
amdgpu_device_fini_sw()
amdgpu_device_ip_fini()
ip_blocks[i].sw_fini()
ip_blocks[i].status.sw = false
--- ip_blocks[i].status.valid = false
+++ amdgpu_ras_fini()
ip_blocks[i].late_fini()
+++ ip_blocks[i].status.valid = false
--- ip_blocks[i].status.late_initialized = false
--- amdgpu_ras_fini()
The main changes include:
1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
Currently there's only one ip block which provides an `early_fini`
callback. We have added a check of `in_s3` to keep the current behavior
in function amdgpu_dm_early_fini(), so there should be no functional
changes.
2) set ip_blocks[i].status.late_initialized to false after calling
the `early_fini` callback. We have audited all usages of the
late_initialized flag and found no functional changes.
3) only set ip_blocks[i].status.valid = false after calling the
`late_fini` callback.
4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.
Then we try to refine each subsystem, such as nbio, asic, etc., to follow the new design. Currently we have only taken nbio and asic as examples to show the proposed changes. Once we have confirmed that this is the right way to go, we will handle the remaining subsystems.
This is at an early stage and we are requesting comments; any comments and suggestions are welcome!
v2:
- remove patch 1 in v1, it already got merged
- convert status bool flags for ip block into enum
- introduce iterators to walk ip blocks
- refine the way to define status markers
- split amdgpu_dm related change into a dedicated patch
- add patch 13 to walk ip blocks in reverse order when shutdown
Jiang Liu (15):
drm/amdgpu: add helper functions to track status for ras manager
drm/amdgpu: add a flag to track ras debugfs creation status
drm/amdgpu: free all resources on error recovery path of
amdgpu_ras_init()
drm/amdgpu: introduce a flag to track refcount held for features
drm/amdgpu: enhance amdgpu_ras_block_late_fini()
drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR
drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini()
drm/amdgpu: make IP block state machine works in stack like way
drm/amdgpu_dm: enhance amdgpu_dm_early_fini() for PM ops
drm/admgpu: make device state machine work in stack like way
drm/amdgpu: convert ip block bool flags into an enum
drm/amdgpu: introduce IP block iterators to reduce duplicated code
drm/amdgpu: walk IP blocks in reverse order when shutdown
drm/amdgpu/nbio: improve the way to manage irq reference count
drm/amdgpu/asic: make ip block operations symmetric by .early_fini()
drivers/gpu/drm/amd/amdgpu/aldebaran.c | 46 +-
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 109 +++-
.../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 3 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 504 +++++++++---------
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 10 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 18 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 16 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 142 +++--
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 16 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 14 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +-
drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +-
drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +-
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 1 +
drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 1 +
drivers/gpu/drm/amd/amdgpu/nv.c | 14 +-
drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 50 +-
drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 51 +-
drivers/gpu/drm/amd/amdgpu/soc15.c | 38 +-
drivers/gpu/drm/amd/amdgpu/soc21.c | 35 +-
drivers/gpu/drm/amd/amdgpu/soc24.c | 22 +-
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +-
.../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +-
32 files changed, 668 insertions(+), 460 deletions(-)
--
2.43.5
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [RFC v2 00/15] Enhance device state machine to better support suspend/resume
2025-01-20 6:27 ` [RFC v2 00/15] Enhance device state machine to better support suspend/resume Zhang, Hawking
@ 2025-01-23 0:02 ` Mika Laitio
0 siblings, 0 replies; 39+ messages in thread
From: Mika Laitio @ 2025-01-23 0:02 UTC (permalink / raw)
To: Zhang, Hawking
Cc: Jiang Liu, Deucher, Alexander, Koenig, Christian, Pan, Xinhui,
airlied@gmail.com, simona@ffwll.ch, Khatri, Sunil, Lazar, Lijo,
Limonciello, Mario, Chen, Xiaogang, Russell, Kent,
shuox.liu@linux.alibaba.com, amd-gfx@lists.freedesktop.org
[-- Attachment #1: Type: text/plain, Size: 11319 bytes --]
Is the latest version of this patch series (with possible fixes based on the
comments) maintained/available on some git tree for testing?
On Sun, Jan 19, 2025 at 10:28 PM Zhang, Hawking <Hawking.Zhang@amd.com>
wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> Thanks for the patches.
>
> We currently have no plans to include RAS programming as part of the
> device suspend/resume sequence. Instead, we are focusing on a series of
> clean up patches and a new RAS software module that will eliminate all
> legacy code/workarounds and also the changes you proposed here. It is not
> necessary to make this interim change in the upstream.
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: Jiang Liu <gerry@linux.alibaba.com>
> Sent: Monday, January 13, 2025 09:42
> To: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <
> Christian.Koenig@amd.com>; Pan, Xinhui <Xinhui.Pan@amd.com>;
> airlied@gmail.com; simona@ffwll.ch; Khatri, Sunil <Sunil.Khatri@amd.com>;
> Lazar, Lijo <Lijo.Lazar@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>;
> Limonciello, Mario <Mario.Limonciello@amd.com>; Chen, Xiaogang <
> Xiaogang.Chen@amd.com>; Russell, Kent <Kent.Russell@amd.com>;
> shuox.liu@linux.alibaba.com; amd-gfx@lists.freedesktop.org
> Cc: Jiang Liu <gerry@linux.alibaba.com>
> Subject: [RFC v2 00/15] Enhance device state machine to better support
> suspend/resume
>
> Recently we were testing suspend/resume functionality with AMD GPUs and
> encountered several resource tracking related bugs, such as double
> buffer free, use after free and unbalanced irq reference count.
>
> We have tried to solve these issues case by case, but found that may not
> be the right way. Especially for the unbalanced irq reference count,
> new issues appear once we fix the currently known ones.
> After analyzing the related source code, we found that there may be some
> fundamental implementation flaws behind these resource tracking issues.
>
> The amdgpu driver has two major state machines to drive the device
> management flow, one is for ip blocks, the other is for ras blocks.
> The hook points defined in struct amd_ip_funcs for device setup/teardown
> are symmetric, but the implementation is asymmetric, sometimes even
> ambiguous. The two most obvious issues we noticed are:
> 1) amdgpu_irq_get() is called from .late_init() but amdgpu_irq_put()
> is called from .hw_fini() instead of .early_fini().
> 2) the way to reset ip_block.status.valid/sw/hw/late_initialized doesn't
> match the way to set those flags.
>
> When taking device suspend/resume into account, in addition to device
> probe/remove, things get much more complex. Some issues arise because many
> suspend/resume implementations directly reuse .hw_init/.hw_fini/ .late_init
> hook points.
>
> So we try to fix those issues with two enhancements/refinements to the
> current device management state machines.
>
> The first change is to make the ip block state machine and associated
> status flags work in a stack-like way, as below:
> Callbacks State after successfully executing callback
> AMDGPU_IP_STATE_INVALID
> .early_init() AMDGPU_IP_STATE_EARLY
> .sw_init() AMDGPU_IP_STATE_SW
> .hw_init() AMDGPU_IP_STATE_HW
> .late_init() AMDGPU_IP_STATE_LATE
> .early_fini() AMDGPU_IP_STATE_HW
> .hw_fini() AMDGPU_IP_STATE_SW
> .sw_fini() AMDGPU_IP_STATE_EARLY
> .late_fini() AMDGPU_IP_STATE_INVALID
>
> Also do the same thing for the ras block state machine, though it's much
> simpler.
>
> The second change is to fine-tune the overall device management workflow as
> below:
> 1. amdgpu_driver_load_kms()
> amdgpu_device_init()
> amdgpu_device_ip_early_init()
> ip_blocks[i].early_init()
> ip_blocks[i].status.valid = true
> amdgpu_device_ip_init()
> amdgpu_ras_init()
> ip_blocks[i].sw_init()
> ip_blocks[i].status.sw = true
> ip_blocks[i].hw_init()
> ip_blocks[i].status.hw = true
> amdgpu_device_ip_late_init()
> ip_blocks[i].late_init()
> ip_blocks[i].status.late_initialized = true
> amdgpu_ras_late_init()
> ras_blocks[i].ras_late_init()
> amdgpu_ras_feature_enable_on_boot()
>
> 2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
> amdgpu_device_suspend()
> amdgpu_ras_early_fini()
> ras_blocks[i].ras_early_fini()
> amdgpu_ras_feature_disable()
> amdgpu_ras_suspend()
> amdgpu_ras_disable_all_features()
> +++ ip_blocks[i].early_fini()
> +++ ip_blocks[i].status.late_initialized = false
> ip_blocks[i].suspend()
>
> 3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
> amdgpu_device_resume()
> amdgpu_device_ip_resume()
> ip_blocks[i].resume()
> amdgpu_device_ip_late_init()
> ip_blocks[i].late_init()
> ip_blocks[i].status.late_initialized = true
> amdgpu_ras_late_init()
> ras_blocks[i].ras_late_init()
> amdgpu_ras_feature_enable_on_boot()
> amdgpu_ras_resume()
> amdgpu_ras_enable_all_features()
>
> 4. amdgpu_driver_unload_kms()
> amdgpu_device_fini_hw()
> amdgpu_ras_early_fini()
> ras_blocks[i].ras_early_fini()
> +++ ip_blocks[i].early_fini()
> +++ ip_blocks[i].status.late_initialized = false
> ip_blocks[i].hw_fini()
> ip_blocks[i].status.hw = false
>
> 5. amdgpu_driver_release_kms()
> amdgpu_device_fini_sw()
> amdgpu_device_ip_fini()
> ip_blocks[i].sw_fini()
> ip_blocks[i].status.sw = false
> --- ip_blocks[i].status.valid = false
> +++ amdgpu_ras_fini()
> ip_blocks[i].late_fini()
> +++ ip_blocks[i].status.valid = false
> --- ip_blocks[i].status.late_initialized = false
> --- amdgpu_ras_fini()
>
> The main changes include:
> 1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
> Currently there's only one ip block which provides an `early_fini`
> callback. We have added a check of `in_s3` to keep the current behavior in
> function amdgpu_dm_early_fini(). So there should be no functional
> changes.
> 2) set ip_blocks[i].status.late_initialized to false after calling
> callback `early_fini`. We have audited all usages of the
> late_initialized flag and found no functional changes.
> 3) only set ip_blocks[i].status.valid = false after calling the
> `late_fini` callback.
> 4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.
>
> Then we try to refine each subsystem, such as nbio, asic etc, to follow
> the new design. Currently we have only taken nbio and asic as examples
> to show the proposed changes. Once we have confirmed that's the right way
> to go, we will handle the remaining subsystems.
>
> This is at an early stage and we are requesting comments; any comments and
> suggestions are welcome!
>
>
> v2:
> - remove patch 1 in v1, it already got merged
> - convert status bool flags for ip block into enum
> - introduce iterators to walk ip blocks
> - refine the way to define status markers
> - split amdgpu_dm related change into a dedicated patch
> - add patch 13 to walk ip blocks in reverse order when shutdown
>
> Jiang Liu (15):
> drm/amdgpu: add helper functions to track status for ras manager
> drm/amdgpu: add a flag to track ras debugfs creation status
> drm/amdgpu: free all resources on error recovery path of
> amdgpu_ras_init()
> drm/amdgpu: introduce a flag to track refcount held for features
> drm/amdgpu: enhance amdgpu_ras_block_late_fini()
> drm/amdgpu: enhance amdgpu_ras_pre_fini() to better support SR
> drm/admgpu: rename amdgpu_ras_pre_fini() to amdgpu_ras_early_fini()
> drm/amdgpu: make IP block state machine works in stack like way
> drm/amdgpu_dm: enhance amdgpu_dm_early_fini() for PM ops
> drm/admgpu: make device state machine work in stack like way
> drm/amdgpu: convert ip block bool flags into an enum
> drm/amdgpu: introduce IP block iterators to reduce duplicated code
> drm/amdgpu: walk IP blocks in reverse order when shutdown
> drm/amdgpu/nbio: improve the way to manage irq reference count
> drm/amdgpu/asic: make ip block operations symmetric by .early_fini()
>
> drivers/gpu/drm/amd/amdgpu/aldebaran.c | 46 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 109 +++-
> .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 504 +++++++++---------
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 10 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 18 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 16 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 142 +++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 16 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 14 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 1 +
> drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 1 +
> drivers/gpu/drm/amd/amdgpu/nv.c | 14 +-
> drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 50 +-
> drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 51 +-
> drivers/gpu/drm/amd/amdgpu/soc15.c | 38 +-
> drivers/gpu/drm/amd/amdgpu/soc21.c | 35 +-
> drivers/gpu/drm/amd/amdgpu/soc24.c | 22 +-
> drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 2 +-
> .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +
> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 10 +-
> 32 files changed, 668 insertions(+), 460 deletions(-)
>
> --
> 2.43.5
>
>
[-- Attachment #2: Type: text/html, Size: 13784 bytes --]
^ permalink raw reply [flat|nested] 39+ messages in thread