[PATCHv3 1/3] drm/amdgpu: Implement MES Suspend and Resume APIs for GFX11

AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCHv3 1/3] drm/amdgpu: Implement MES Suspend and Resume APIs for GFX11
@ 2024-08-16 18:01 Mukul Joshi
  2024-08-16 18:01 ` [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES Mukul Joshi
  2024-08-16 18:01 ` [PATCHv3 3/3] drm/amdkfd: Update BadOpcode Interrupt handling " Mukul Joshi
  0 siblings, 2 replies; 6+ messages in thread
From: Mukul Joshi @ 2024-08-16 18:01 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling, alexander.deucher, Mukul Joshi

Add implementation for MES Suspend and Resume APIs to unmap/map
all queues for GFX11. Support for GFX12 will be added when the
corresponding firmware support is in place.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
v1->v2:
- Add MES FW version check.
- Update amdgpu_mes_suspend/amdgpu_mes_resume handling.

v2->v3:
- No change.

 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 71 +++++++++++++------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 32 ++++++++++-
 3 files changed, 69 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 04a4f0dfec15..44c74a08987d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -501,60 +501,50 @@ int amdgpu_mes_remove_gang(struct amdgpu_device *adev, int gang_id)
 
 int amdgpu_mes_suspend(struct amdgpu_device *adev)
 {
-	struct idr *idp;
-	struct amdgpu_mes_process *process;
-	struct amdgpu_mes_gang *gang;
 	struct mes_suspend_gang_input input;
-	int r, pasid;
+	int r;
+
+	if (!amdgpu_mes_suspend_resume_all_supported(adev))
+		return 0;
+
+	memset(&input, 0x0, sizeof(struct mes_suspend_gang_input));
+	input.suspend_all_gangs = 1;
 
 	/*
 	 * Avoid taking any other locks under MES lock to avoid circular
 	 * lock dependencies.
 	 */
 	amdgpu_mes_lock(&adev->mes);
-
-	idp = &adev->mes.pasid_idr;
-
-	idr_for_each_entry(idp, process, pasid) {
-		list_for_each_entry(gang, &process->gang_list, list) {
-			r = adev->mes.funcs->suspend_gang(&adev->mes, &input);
-			if (r)
-				DRM_ERROR("failed to suspend pasid %d gangid %d",
-					 pasid, gang->gang_id);
-		}
-	}
-
+	r = adev->mes.funcs->suspend_gang(&adev->mes, &input);
 	amdgpu_mes_unlock(&adev->mes);
-	return 0;
+	if (r)
+		DRM_ERROR("failed to suspend all gangs");
+
+	return r;
 }
 
 int amdgpu_mes_resume(struct amdgpu_device *adev)
 {
-	struct idr *idp;
-	struct amdgpu_mes_process *process;
-	struct amdgpu_mes_gang *gang;
 	struct mes_resume_gang_input input;
-	int r, pasid;
+	int r;
+
+	if (!amdgpu_mes_suspend_resume_all_supported(adev))
+		return 0;
+
+	memset(&input, 0x0, sizeof(struct mes_resume_gang_input));
+	input.resume_all_gangs = 1;
 
 	/*
 	 * Avoid taking any other locks under MES lock to avoid circular
 	 * lock dependencies.
 	 */
 	amdgpu_mes_lock(&adev->mes);
-
-	idp = &adev->mes.pasid_idr;
-
-	idr_for_each_entry(idp, process, pasid) {
-		list_for_each_entry(gang, &process->gang_list, list) {
-			r = adev->mes.funcs->resume_gang(&adev->mes, &input);
-			if (r)
-				DRM_ERROR("failed to resume pasid %d gangid %d",
-					 pasid, gang->gang_id);
-		}
-	}
-
+	r = adev->mes.funcs->resume_gang(&adev->mes, &input);
 	amdgpu_mes_unlock(&adev->mes);
-	return 0;
+	if (r)
+		DRM_ERROR("failed to resume all gangs");
+
+	return r;
 }
 
 static int amdgpu_mes_queue_alloc_mqd(struct amdgpu_device *adev,
@@ -1651,6 +1641,19 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe)
 	return r;
 }
 
+bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
+{
+	uint32_t mes_rev = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
+	bool is_supported = false;
+
+	if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0) &&
+	    amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(12, 0, 0) &&
+	    mes_rev >= 0x63)
+		is_supported = true;
+
+	return is_supported;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 5c8867d2380a..a5b1ea60cac8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -503,4 +503,6 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
 	memalloc_noreclaim_restore(mes->saved_flags);
 	mutex_unlock(&mes->mutex_hidden);
 }
+
+bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
 #endif /* __AMDGPU_MES_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 6f5a80519af9..8edcd85a1261 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -441,13 +441,41 @@ static int mes_v11_0_unmap_legacy_queue(struct amdgpu_mes *mes,
 static int mes_v11_0_suspend_gang(struct amdgpu_mes *mes,
 				  struct mes_suspend_gang_input *input)
 {
-	return 0;
+	union MESAPI__SUSPEND mes_suspend_gang_pkt;
+
+	memset(&mes_suspend_gang_pkt, 0, sizeof(mes_suspend_gang_pkt));
+
+	mes_suspend_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
+	mes_suspend_gang_pkt.header.opcode = MES_SCH_API_SUSPEND;
+	mes_suspend_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+	mes_suspend_gang_pkt.suspend_all_gangs = input->suspend_all_gangs;
+	mes_suspend_gang_pkt.gang_context_addr = input->gang_context_addr;
+	mes_suspend_gang_pkt.suspend_fence_addr = input->suspend_fence_addr;
+	mes_suspend_gang_pkt.suspend_fence_value = input->suspend_fence_value;
+
+	return mes_v11_0_submit_pkt_and_poll_completion(mes,
+			&mes_suspend_gang_pkt, sizeof(mes_suspend_gang_pkt),
+			offsetof(union MESAPI__SUSPEND, api_status));
 }
 
 static int mes_v11_0_resume_gang(struct amdgpu_mes *mes,
 				 struct mes_resume_gang_input *input)
 {
-	return 0;
+	union MESAPI__RESUME mes_resume_gang_pkt;
+
+	memset(&mes_resume_gang_pkt, 0, sizeof(mes_resume_gang_pkt));
+
+	mes_resume_gang_pkt.header.type = MES_API_TYPE_SCHEDULER;
+	mes_resume_gang_pkt.header.opcode = MES_SCH_API_RESUME;
+	mes_resume_gang_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
+
+	mes_resume_gang_pkt.resume_all_gangs = input->resume_all_gangs;
+	mes_resume_gang_pkt.gang_context_addr = input->gang_context_addr;
+
+	return mes_v11_0_submit_pkt_and_poll_completion(mes,
+			&mes_resume_gang_pkt, sizeof(mes_resume_gang_pkt),
+			offsetof(union MESAPI__RESUME, api_status));
 }
 
 static int mes_v11_0_query_sched_status(struct amdgpu_mes *mes)
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES
  2024-08-16 18:01 [PATCHv3 1/3] drm/amdgpu: Implement MES Suspend and Resume APIs for GFX11 Mukul Joshi
@ 2024-08-16 18:01 ` Mukul Joshi
  2024-08-16 20:04   ` Kasiviswanathan, Harish
  2024-08-16 23:08   ` Felix Kuehling
  2024-08-16 18:01 ` [PATCHv3 3/3] drm/amdkfd: Update BadOpcode Interrupt handling " Mukul Joshi
  1 sibling, 2 replies; 6+ messages in thread
From: Mukul Joshi @ 2024-08-16 18:01 UTC (permalink / raw)
  To: amd-gfx; +Cc: Felix.Kuehling, alexander.deucher, Mukul Joshi

MEC FW expects MES to unmap all queues when a VM fault is observed
on a queue and to resume them once the affected process is terminated.
Use the MES Suspend and Resume APIs to achieve this.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
v1->v2:
- Add MES FW version check.
- Separate out the kfd_dqm_evict_pasid into another function.
- Use amdgpu_mes_suspend/amdgpu_mes_resume to suspend/resume queues.

v2->v3:
- Use down_read_trylock/up_read instead of dqm->is_hws_hang.
- Increase eviction count if the process is already evicted in
  kfd_dqm_evict_pasid_mes to make sure the process stays evicted.

 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 87 ++++++++++++++++++-
 1 file changed, 85 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f6e211070299..0ca933d2099c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -319,6 +319,46 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)
 	return retval;
 }
 
+static int suspend_all_queues_mes(struct device_queue_manager *dqm)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+	int r = 0;
+
+	if (!down_read_trylock(&adev->reset_domain->sem))
+		return -EIO;
+
+	r = amdgpu_mes_suspend(adev);
+	up_read(&adev->reset_domain->sem);
+
+	if (r) {
+		dev_err(adev->dev, "failed to suspend gangs from MES\n");
+		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
+		kfd_hws_hang(dqm);
+	}
+
+	return r;
+}
+
+static int resume_all_queues_mes(struct device_queue_manager *dqm)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+	int r = 0;
+
+	if (!down_read_trylock(&adev->reset_domain->sem))
+		return -EIO;
+
+	r = amdgpu_mes_resume(adev);
+	up_read(&adev->reset_domain->sem);
+
+	if (r) {
+		dev_err(adev->dev, "failed to resume gangs from MES\n");
+		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
+		kfd_hws_hang(dqm);
+	}
+
+	return r;
+}
+
 static void increment_queue_count(struct device_queue_manager *dqm,
 				  struct qcm_process_device *qpd,
 				  struct queue *q)
@@ -2835,6 +2875,44 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
 	kfree(dqm);
 }
 
+static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
+				   struct qcm_process_device *qpd)
+{
+	struct device *dev = dqm->dev->adev->dev;
+	int ret = 0;
+
+	/* Check if process is already evicted */
+	dqm_lock(dqm);
+	if (qpd->evicted) {
+		/* Increment the evicted count to make sure the
+		 * process stays evicted until it is terminated.
+		 */
+		qpd->evicted++;
+		dqm_unlock(dqm);
+		goto out;
+	}
+	dqm_unlock(dqm);
+
+	ret = suspend_all_queues_mes(dqm);
+	if (ret) {
+		dev_err(dev, "Suspending all queues failed");
+		goto out;
+	}
+
+	ret = dqm->ops.evict_process_queues(dqm, qpd);
+	if (ret) {
+		dev_err(dev, "Evicting process queues failed");
+		goto out;
+	}
+
+	ret = resume_all_queues_mes(dqm);
+	if (ret)
+		dev_err(dev, "Resuming all queues failed");
+
+out:
+	return ret;
+}
+
 int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
 {
 	struct kfd_process_device *pdd;
@@ -2845,8 +2923,13 @@ int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
 		return -EINVAL;
 	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
 	pdd = kfd_get_process_device_data(dqm->dev, p);
-	if (pdd)
-		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+	if (pdd) {
+		if (dqm->dev->kfd->shared_resources.enable_mes)
+			ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
+		else
+			ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+	}
+
 	kfd_unref_process(p);
 
 	return ret;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCHv3 3/3] drm/amdkfd: Update BadOpcode Interrupt handling with MES
  2024-08-16 18:01 [PATCHv3 1/3] drm/amdgpu: Implement MES Suspend and Resume APIs for GFX11 Mukul Joshi
  2024-08-16 18:01 ` [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES Mukul Joshi
@ 2024-08-16 18:01 ` Mukul Joshi
  2024-08-16 23:09   ` Felix Kuehling
  1 sibling, 1 reply; 6+ messages in thread
From: Mukul Joshi @ 2024-08-16 18:01 UTC (permalink / raw)
  To: amd-gfx
  Cc: Felix.Kuehling, alexander.deucher, Mukul Joshi,
	Harish Kasiviswanathan

Based on the recommendation of MEC FW, update BadOpcode interrupt
handling when using the MES scheduler: unmap all queues, remove the
queue that got the interrupt from scheduling, and remap the rest of
the queues. This is done to prevent the case where unmapping of the
bad queue can fail, thereby causing a GPU reset.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Acked-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
v1->v2:
- No change.

v2->v3:
- No change.

 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 51 +++++++++++++++++++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  9 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
 3 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 0ca933d2099c..d7db33f378e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2875,6 +2875,57 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
 	kfree(dqm);
 }
 
+int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+	struct device_queue_manager *dqm = knode->dqm;
+	struct device *dev = dqm->dev->adev->dev;
+	struct qcm_process_device *qpd;
+	struct queue *q = NULL;
+	int ret = 0;
+
+	if (!p)
+		return -EINVAL;
+
+	dqm_lock(dqm);
+
+	pdd = kfd_get_process_device_data(dqm->dev, p);
+	if (pdd) {
+		qpd = &pdd->qpd;
+
+		list_for_each_entry(q, &qpd->queues_list, list) {
+			if (q->doorbell_id == doorbell_id && q->properties.is_active) {
+				ret = suspend_all_queues_mes(dqm);
+				if (ret) {
+					dev_err(dev, "Suspending all queues failed");
+					goto out;
+				}
+
+				q->properties.is_evicted = true;
+				q->properties.is_active = false;
+				decrement_queue_count(dqm, qpd, q);
+
+				ret = remove_queue_mes(dqm, q, qpd);
+				if (ret) {
+					dev_err(dev, "Removing bad queue failed");
+					goto out;
+				}
+
+				ret = resume_all_queues_mes(dqm);
+				if (ret)
+					dev_err(dev, "Resuming all queues failed");
+
+				break;
+			}
+		}
+	}
+
+out:
+	dqm_unlock(dqm);
+	return ret;
+}
+
 static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
 				   struct qcm_process_device *qpd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index f524a55eee11..b3f988b275a8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -330,11 +330,14 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
 		if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
 			kfd_signal_event_interrupt(pasid, context_id0, 32);
 		else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
-			 KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)))
-			kfd_set_dbg_ev_from_interrupt(dev, pasid,
-				KFD_CTXID0_DOORBELL_ID(context_id0),
+			 KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) {
+			u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0);
+
+			kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id,
 				KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
 				NULL, 0);
+			kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id);
+		}
 
 		/* SDMA */
 		else if (source_id == SOC21_INTSRC_SDMA_TRAP)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f7c12d4f0abb..7bba6bed2f48 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1324,6 +1324,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev,
 					enum kfd_queue_type type);
 void kernel_queue_uninit(struct kernel_queue *kq);
 int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid);
+int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id);
 
 /* Process Queue Manager */
 struct process_queue_node {
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* RE: [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES
  2024-08-16 18:01 ` [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES Mukul Joshi
@ 2024-08-16 20:04   ` Kasiviswanathan, Harish
  2024-08-16 23:08   ` Felix Kuehling
  1 sibling, 0 replies; 6+ messages in thread
From: Kasiviswanathan, Harish @ 2024-08-16 20:04 UTC (permalink / raw)
  To: Joshi, Mukul, amd-gfx@lists.freedesktop.org
  Cc: Kuehling, Felix, Deucher, Alexander, Joshi, Mukul

[AMD Official Use Only - AMD Internal Distribution Only]

This series reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Mukul Joshi
Sent: Friday, August 16, 2024 2:02 PM
To: amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix <Felix.Kuehling@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; Joshi, Mukul <Mukul.Joshi@amd.com>
Subject: [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES

MEC FW expects MES to unmap all queues when a VM fault is observed
on a queue and then resumed once the affected process is terminated.
Use the MES Suspend and Resume APIs to achieve this.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
v1->v2:
- Add MES FW version check.
- Separate out the kfd_dqm_evict_pasid into another function.
- Use amdgpu_mes_suspend/amdgpu_mes_resume to suspend/resume queues.

v2->v3:
- Use down_read_trylock/up_read instead of dqm->is_hws_hang.
- Increase eviction count if the process is already evicted in
  kfd_dqm_evict_pasid_mes to make sure the process stays evicted.

 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 87 ++++++++++++++++++-
 1 file changed, 85 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f6e211070299..0ca933d2099c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -319,6 +319,46 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)
        return retval;
 }

+static int suspend_all_queues_mes(struct device_queue_manager *dqm)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+       int r = 0;
+
+       if (!down_read_trylock(&adev->reset_domain->sem))
+               return -EIO;
+
+       r = amdgpu_mes_suspend(adev);
+       up_read(&adev->reset_domain->sem);
+
+       if (r) {
+               dev_err(adev->dev, "failed to suspend gangs from MES\n");
+               dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
+               kfd_hws_hang(dqm);
+       }
+
+       return r;
+}
+
+static int resume_all_queues_mes(struct device_queue_manager *dqm)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+       int r = 0;
+
+       if (!down_read_trylock(&adev->reset_domain->sem))
+               return -EIO;
+
+       r = amdgpu_mes_resume(adev);
+       up_read(&adev->reset_domain->sem);
+
+       if (r) {
+               dev_err(adev->dev, "failed to resume gangs from MES\n");
+               dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
+               kfd_hws_hang(dqm);
+       }
+
+       return r;
+}
+
 static void increment_queue_count(struct device_queue_manager *dqm,
                                  struct qcm_process_device *qpd,
                                  struct queue *q)
@@ -2835,6 +2875,44 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
        kfree(dqm);
 }

+static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
+                                  struct qcm_process_device *qpd)
+{
+       struct device *dev = dqm->dev->adev->dev;
+       int ret = 0;
+
+       /* Check if process is already evicted */
+       dqm_lock(dqm);
+       if (qpd->evicted) {
+               /* Increment the evicted count to make sure the
+                * process stays evicted before its terminated.
+                */
+               qpd->evicted++;
+               dqm_unlock(dqm);
+               goto out;
+       }
+       dqm_unlock(dqm);
+
+       ret = suspend_all_queues_mes(dqm);
+       if (ret) {
+               dev_err(dev, "Suspending all queues failed");
+               goto out;
+       }
+
+       ret = dqm->ops.evict_process_queues(dqm, qpd);
+       if (ret) {
+               dev_err(dev, "Evicting process queues failed");
+               goto out;
+       }
+
+       ret = resume_all_queues_mes(dqm);
+       if (ret)
+               dev_err(dev, "Resuming all queues failed");
+
+out:
+       return ret;
+}
+
 int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
 {
        struct kfd_process_device *pdd;
@@ -2845,8 +2923,13 @@ int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
                return -EINVAL;
        WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
        pdd = kfd_get_process_device_data(dqm->dev, p);
-       if (pdd)
-               ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+       if (pdd) {
+               if (dqm->dev->kfd->shared_resources.enable_mes)
+                       ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
+               else
+                       ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+       }
+
        kfd_unref_process(p);

        return ret;
--
2.35.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES
  2024-08-16 18:01 ` [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES Mukul Joshi
  2024-08-16 20:04   ` Kasiviswanathan, Harish
@ 2024-08-16 23:08   ` Felix Kuehling
  1 sibling, 0 replies; 6+ messages in thread
From: Felix Kuehling @ 2024-08-16 23:08 UTC (permalink / raw)
  To: Mukul Joshi, amd-gfx; +Cc: alexander.deucher


On 2024-08-16 14:01, Mukul Joshi wrote:
> MEC FW expects MES to unmap all queues when a VM fault is observed
> on a queue and then resumed once the affected process is terminated.
> Use the MES Suspend and Resume APIs to achieve this.
>
> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> Acked-by: Alex Deucher <alexander.deucher@amd.com>

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>


> ---
> v1->v2:
> - Add MES FW version check.
> - Separate out the kfd_dqm_evict_pasid into another function.
> - Use amdgpu_mes_suspend/amdgpu_mes_resume to suspend/resume queues.
>
> v2->v3:
> - Use down_read_trylock/up_read instead of dqm->is_hws_hang.
> - Increase eviction count if the process is already evicted in
>    kfd_dqm_evict_pasid_mes to make sure the process stays evicted.
>
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 87 ++++++++++++++++++-
>   1 file changed, 85 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f6e211070299..0ca933d2099c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -319,6 +319,46 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)
>   	return retval;
>   }
>   
> +static int suspend_all_queues_mes(struct device_queue_manager *dqm)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> +	int r = 0;
> +
> +	if (!down_read_trylock(&adev->reset_domain->sem))
> +		return -EIO;
> +
> +	r = amdgpu_mes_suspend(adev);
> +	up_read(&adev->reset_domain->sem);
> +
> +	if (r) {
> +		dev_err(adev->dev, "failed to suspend gangs from MES\n");
> +		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
> +		kfd_hws_hang(dqm);
> +	}
> +
> +	return r;
> +}
> +
> +static int resume_all_queues_mes(struct device_queue_manager *dqm)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> +	int r = 0;
> +
> +	if (!down_read_trylock(&adev->reset_domain->sem))
> +		return -EIO;
> +
> +	r = amdgpu_mes_resume(adev);
> +	up_read(&adev->reset_domain->sem);
> +
> +	if (r) {
> +		dev_err(adev->dev, "failed to resume gangs from MES\n");
> +		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
> +		kfd_hws_hang(dqm);
> +	}
> +
> +	return r;
> +}
> +
>   static void increment_queue_count(struct device_queue_manager *dqm,
>   				  struct qcm_process_device *qpd,
>   				  struct queue *q)
> @@ -2835,6 +2875,44 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
>   	kfree(dqm);
>   }
>   
> +static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
> +				   struct qcm_process_device *qpd)
> +{
> +	struct device *dev = dqm->dev->adev->dev;
> +	int ret = 0;
> +
> +	/* Check if process is already evicted */
> +	dqm_lock(dqm);
> +	if (qpd->evicted) {
> +		/* Increment the evicted count to make sure the
> +		 * process stays evicted before its terminated.
> +		 */
> +		qpd->evicted++;
> +		dqm_unlock(dqm);
> +		goto out;
> +	}
> +	dqm_unlock(dqm);
> +
> +	ret = suspend_all_queues_mes(dqm);
> +	if (ret) {
> +		dev_err(dev, "Suspending all queues failed");
> +		goto out;
> +	}
> +
> +	ret = dqm->ops.evict_process_queues(dqm, qpd);
> +	if (ret) {
> +		dev_err(dev, "Evicting process queues failed");
> +		goto out;
> +	}
> +
> +	ret = resume_all_queues_mes(dqm);
> +	if (ret)
> +		dev_err(dev, "Resuming all queues failed");
> +
> +out:
> +	return ret;
> +}
> +
>   int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
>   {
>   	struct kfd_process_device *pdd;
> @@ -2845,8 +2923,13 @@ int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
>   		return -EINVAL;
>   	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>   	pdd = kfd_get_process_device_data(dqm->dev, p);
> -	if (pdd)
> -		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> +	if (pdd) {
> +		if (dqm->dev->kfd->shared_resources.enable_mes)
> +			ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
> +		else
> +			ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> +	}
> +
>   	kfd_unref_process(p);
>   
>   	return ret;

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCHv3 3/3] drm/amdkfd: Update BadOpcode Interrupt handling with MES
  2024-08-16 18:01 ` [PATCHv3 3/3] drm/amdkfd: Update BadOpcode Interrupt handling " Mukul Joshi
@ 2024-08-16 23:09   ` Felix Kuehling
  0 siblings, 0 replies; 6+ messages in thread
From: Felix Kuehling @ 2024-08-16 23:09 UTC (permalink / raw)
  To: Mukul Joshi, amd-gfx; +Cc: alexander.deucher, Harish Kasiviswanathan


On 2024-08-16 14:01, Mukul Joshi wrote:
> Based on the recommendation of MEC FW, update BadOpcode interrupt
> handling by unmapping all queues, removing the queue that got the
> interrupt from scheduling and remapping rest of the queues back when
> using MES scheduler. This is done to prevent the case where unmapping
> of the bad queue can fail thereby causing a GPU reset.
>
> Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
> Acked-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
> Acked-by: Alex Deucher <alexander.deucher@amd.com>

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>


> ---
> v1->v2:
> - No change.
>
> v2->v3:
> - No change.
>
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 51 +++++++++++++++++++
>   .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  9 ++--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
>   3 files changed, 58 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 0ca933d2099c..d7db33f378e2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -2875,6 +2875,57 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
>   	kfree(dqm);
>   }
>   
> +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id)
> +{
> +	struct kfd_process_device *pdd;
> +	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
> +	struct device_queue_manager *dqm = knode->dqm;
> +	struct device *dev = dqm->dev->adev->dev;
> +	struct qcm_process_device *qpd;
> +	struct queue *q = NULL;
> +	int ret = 0;
> +
> +	if (!p)
> +		return -EINVAL;
> +
> +	dqm_lock(dqm);
> +
> +	pdd = kfd_get_process_device_data(dqm->dev, p);
> +	if (pdd) {
> +		qpd = &pdd->qpd;
> +
> +		list_for_each_entry(q, &qpd->queues_list, list) {
> +			if (q->doorbell_id == doorbell_id && q->properties.is_active) {
> +				ret = suspend_all_queues_mes(dqm);
> +				if (ret) {
> +					dev_err(dev, "Suspending all queues failed");
> +					goto out;
> +				}
> +
> +				q->properties.is_evicted = true;
> +				q->properties.is_active = false;
> +				decrement_queue_count(dqm, qpd, q);
> +
> +				ret = remove_queue_mes(dqm, q, qpd);
> +				if (ret) {
> +					dev_err(dev, "Removing bad queue failed");
> +					goto out;
> +				}
> +
> +				ret = resume_all_queues_mes(dqm);
> +				if (ret)
> +					dev_err(dev, "Resuming all queues failed");
> +
> +				break;
> +			}
> +		}
> +	}
> +
> +out:
> +	dqm_unlock(dqm);
> +	return ret;
> +}
> +
>   static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
>   				   struct qcm_process_device *qpd)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
> index f524a55eee11..b3f988b275a8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
> @@ -330,11 +330,14 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
>   		if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
>   			kfd_signal_event_interrupt(pasid, context_id0, 32);
>   		else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
> -			 KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)))
> -			kfd_set_dbg_ev_from_interrupt(dev, pasid,
> -				KFD_CTXID0_DOORBELL_ID(context_id0),
> +			 KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) {
> +			u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0);
> +
> +			kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id,
>   				KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
>   				NULL, 0);
> +			kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id);
> +		}
>   
>   		/* SDMA */
>   		else if (source_id == SOC21_INTSRC_SDMA_TRAP)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f7c12d4f0abb..7bba6bed2f48 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1324,6 +1324,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev,
>   					enum kfd_queue_type type);
>   void kernel_queue_uninit(struct kernel_queue *kq);
>   int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid);
> +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id);
>   
>   /* Process Queue Manager */
>   struct process_queue_node {

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-08-16 23:09 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-16 18:01 [PATCHv3 1/3] drm/amdgpu: Implement MES Suspend and Resume APIs for GFX11 Mukul Joshi
2024-08-16 18:01 ` [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES Mukul Joshi
2024-08-16 20:04   ` Kasiviswanathan, Harish
2024-08-16 23:08   ` Felix Kuehling
2024-08-16 18:01 ` [PATCHv3 3/3] drm/amdkfd: Update BadOpcode Interrupt handling " Mukul Joshi
2024-08-16 23:09   ` Felix Kuehling

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox