amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu: Add wrapper function for dpc state
@ 2025-07-31  6:14 Lijo Lazar
  2025-07-31  6:14 ` [PATCH 2/3] drm/amdgpu: Set dpc status appropriately Lijo Lazar
  2025-07-31  6:14 ` [PATCH 3/3] drm/amdgpu: Prevent hardware access in dpc state Lijo Lazar
  0 siblings, 2 replies; 4+ messages in thread
From: Lijo Lazar @ 2025-07-31  6:14 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Alexander.Deucher, Ce.Sun

Use wrapper functions to set/indicate dpc status.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 11 +++++++++++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f4f80cb2f706..ea66322c279b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5707,7 +5707,7 @@ int amdgpu_device_link_reset(struct amdgpu_device *adev)
 
 	dev_info(adev->dev, "GPU link reset\n");
 
-	if (!adev->pcie_reset_ctx.occurs_dpc)
+	if (!amdgpu_reset_in_dpc(adev))
 		ret = amdgpu_dpm_link_reset(adev);
 
 	if (ret)
@@ -6158,7 +6158,7 @@ static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
 			list_add_tail(&tmp_adev->reset_list, device_list);
 			if (adev->shutdown)
 				tmp_adev->shutdown = true;
-			if (adev->pcie_reset_ctx.occurs_dpc)
+			if (amdgpu_reset_in_dpc(adev))
 				tmp_adev->pcie_reset_ctx.in_link_reset = true;
 		}
 		if (!list_is_first(&adev->reset_list, device_list))
@@ -6234,9 +6234,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
 		drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
 
 		/* disable ras on ALL IPs */
-		if (!need_emergency_restart &&
-		      (!adev->pcie_reset_ctx.occurs_dpc) &&
-		      amdgpu_device_ip_need_full_reset(tmp_adev))
+		if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
+		    amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -6264,10 +6263,10 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
 
 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list, reset_list) {
-		if (adev->pcie_reset_ctx.occurs_dpc)
+		if (amdgpu_reset_in_dpc(adev))
 			tmp_adev->no_hw_access = true;
 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
-		if (adev->pcie_reset_ctx.occurs_dpc)
+		if (amdgpu_reset_in_dpc(adev))
 			tmp_adev->no_hw_access = false;
 		/*TODO Should we stop ?*/
 		if (r) {
@@ -6900,7 +6899,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 
 		if (hive)
 			mutex_lock(&hive->hive_lock);
-		adev->pcie_reset_ctx.occurs_dpc = true;
+		amdgpu_reset_set_dpc_status(adev, true);
 		memset(&reset_context, 0, sizeof(reset_context));
 		INIT_LIST_HEAD(&device_list);
 
@@ -7063,7 +7062,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 	amdgpu_device_sched_resume(&device_list, NULL, NULL);
 	amdgpu_device_gpu_resume(adev, &device_list, false);
 	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
-	adev->pcie_reset_ctx.occurs_dpc = false;
+	amdgpu_reset_set_dpc_status(adev, false);
 
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 4d9b9701139b..3a806953338f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -160,4 +160,15 @@ int amdgpu_reset_do_xgmi_reset_on_init(
 
 bool amdgpu_reset_in_recovery(struct amdgpu_device *adev);
 
+static inline void amdgpu_reset_set_dpc_status(struct amdgpu_device *adev,
+					       bool status)
+{
+	adev->pcie_reset_ctx.occurs_dpc = status;
+}
+
+static inline bool amdgpu_reset_in_dpc(struct amdgpu_device *adev)
+{
+	return adev->pcie_reset_ctx.occurs_dpc;
+}
+
 #endif
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/3] drm/amdgpu: Set dpc status appropriately
  2025-07-31  6:14 [PATCH 1/3] drm/amdgpu: Add wrapper function for dpc state Lijo Lazar
@ 2025-07-31  6:14 ` Lijo Lazar
  2025-07-31  6:14 ` [PATCH 3/3] drm/amdgpu: Prevent hardware access in dpc state Lijo Lazar
  1 sibling, 0 replies; 4+ messages in thread
From: Lijo Lazar @ 2025-07-31  6:14 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Alexander.Deucher, Ce.Sun

Set the dpc status based on hardware state. Also, clear the status before
reinitialization after a successful reset.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ea66322c279b..076ad472a95e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5836,6 +5836,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 		amdgpu_set_init_level(tmp_adev, init_level);
 		if (full_reset) {
 			/* post card */
+			amdgpu_reset_set_dpc_status(tmp_adev, false);
 			amdgpu_ras_clear_err_state(tmp_adev);
 			r = amdgpu_device_asic_init(tmp_adev);
 			if (r) {
@@ -6882,11 +6883,6 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 
 	dev_info(adev->dev, "PCI error: detected callback!!\n");
 
-	if (!amdgpu_dpm_is_link_reset_supported(adev)) {
-		dev_warn(adev->dev, "No support for XGMI hive yet...\n");
-		return PCI_ERS_RESULT_DISCONNECT;
-	}
-
 	adev->pci_channel_state = state;
 
 	switch (state) {
@@ -6896,10 +6892,23 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 	case pci_channel_io_frozen:
 		/* Fatal error, prepare for slot reset */
 		dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
+		if (hive) {
+			/* Hive devices should be able to support FW based
+			 * link reset on other devices, if not return.
+			 */
+			if (!amdgpu_dpm_is_link_reset_supported(adev)) {
+				dev_warn(adev->dev,
+					 "No support for XGMI hive yet...\n");
+				return PCI_ERS_RESULT_DISCONNECT;
+			}
+			/* Set dpc status only if device is part of hive
+			 * Non-hive devices should be able to recover after
+			 * link reset.
+			 */
+			amdgpu_reset_set_dpc_status(adev, true);
 
-		if (hive)
 			mutex_lock(&hive->hive_lock);
-		amdgpu_reset_set_dpc_status(adev, true);
+		}
 		memset(&reset_context, 0, sizeof(reset_context));
 		INIT_LIST_HEAD(&device_list);
 
@@ -7062,7 +7071,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 	amdgpu_device_sched_resume(&device_list, NULL, NULL);
 	amdgpu_device_gpu_resume(adev, &device_list, false);
 	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
-	amdgpu_reset_set_dpc_status(adev, false);
 
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/3] drm/amdgpu: Prevent hardware access in dpc state
  2025-07-31  6:14 [PATCH 1/3] drm/amdgpu: Add wrapper function for dpc state Lijo Lazar
  2025-07-31  6:14 ` [PATCH 2/3] drm/amdgpu: Set dpc status appropriately Lijo Lazar
@ 2025-07-31  6:14 ` Lijo Lazar
  2025-08-01 15:40   ` Sun, Ce(Overlord)
  1 sibling, 1 reply; 4+ messages in thread
From: Lijo Lazar @ 2025-07-31  6:14 UTC (permalink / raw)
  To: amd-gfx; +Cc: Hawking.Zhang, Alexander.Deucher, Ce.Sun

Don't allow hardware access while in dpc state.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ----
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 1 +
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 076ad472a95e..cfd72faec16e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6264,11 +6264,7 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
 
 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list, reset_list) {
-		if (amdgpu_reset_in_dpc(adev))
-			tmp_adev->no_hw_access = true;
 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
-		if (amdgpu_reset_in_dpc(adev))
-			tmp_adev->no_hw_access = false;
 		/*TODO Should we stop ?*/
 		if (r) {
 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 3a806953338f..2f92b3be40f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -164,6 +164,7 @@ static inline void amdgpu_reset_set_dpc_status(struct amdgpu_device *adev,
 					       bool status)
 {
 	adev->pcie_reset_ctx.occurs_dpc = status;
+	adev->no_hw_access = status;
 }
 
 static inline bool amdgpu_reset_in_dpc(struct amdgpu_device *adev)
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: Prevent hardware access in dpc state
  2025-07-31  6:14 ` [PATCH 3/3] drm/amdgpu: Prevent hardware access in dpc state Lijo Lazar
@ 2025-08-01 15:40   ` Sun, Ce(Overlord)
  0 siblings, 0 replies; 4+ messages in thread
From: Sun, Ce(Overlord) @ 2025-08-01 15:40 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx@lists.freedesktop.org
  Cc: Zhang, Hawking, Deucher, Alexander

[-- Attachment #1: Type: text/plain, Size: 2320 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]

The series is:
Reviewed-by: Ce Sun <cesun102@amd.com>

获取 Outlook for iOS<https://aka.ms/o0ukef>
________________________________
发件人: Lazar, Lijo <Lijo.Lazar@amd.com>
发送时间: Thursday, July 31, 2025 2:14:07 PM
收件人: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
抄送: Zhang, Hawking <Hawking.Zhang@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Sun, Ce(Overlord) <Ce.Sun@amd.com>
主题: [PATCH 3/3] drm/amdgpu: Prevent hardware access in dpc state

Don't allow hardware access while in dpc state.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ----
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 1 +
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 076ad472a95e..cfd72faec16e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6264,11 +6264,7 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,

 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
         list_for_each_entry(tmp_adev, device_list, reset_list) {
-               if (amdgpu_reset_in_dpc(adev))
-                       tmp_adev->no_hw_access = true;
                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
-               if (amdgpu_reset_in_dpc(adev))
-                       tmp_adev->no_hw_access = false;
                 /*TODO Should we stop ?*/
                 if (r) {
                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 3a806953338f..2f92b3be40f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -164,6 +164,7 @@ static inline void amdgpu_reset_set_dpc_status(struct amdgpu_device *adev,
                                                bool status)
 {
         adev->pcie_reset_ctx.occurs_dpc = status;
+       adev->no_hw_access = status;
 }

 static inline bool amdgpu_reset_in_dpc(struct amdgpu_device *adev)
--
2.49.0


[-- Attachment #2: Type: text/html, Size: 4928 bytes --]

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-08-01 15:40 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2025-07-31  6:14 [PATCH 1/3] drm/amdgpu: Add wrapper function for dpc state Lijo Lazar
2025-07-31  6:14 ` [PATCH 2/3] drm/amdgpu: Set dpc status appropriately Lijo Lazar
2025-07-31  6:14 ` [PATCH 3/3] drm/amdgpu: Prevent hardware access in dpc state Lijo Lazar
2025-08-01 15:40   ` Sun, Ce(Overlord)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).