AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
@ 2019-05-31 21:19 Zeng, Oak
       [not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Zeng, Oak @ 2019-05-31 21:19 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhao, Yong, Kuehling, Felix, Freehill, Chris, Zeng, Oak,
	Liu, Alex

SDMA queue allocation requires the dqm lock as it modify
the global dqm members. Introduce functions to allocate/deallocate
in locked/unlocked circumstance.

Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 46 ++++++++++++++++------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ece35c7..1f707bb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q);
+static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
+				struct queue *q);
 
 static void kfd_process_hw_exception(struct work_struct *work);
 
@@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 		deallocate_hqd(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue_locked(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		dqm->xgmi_sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue_locked(dqm, q);
 	} else {
 		pr_debug("q->properties.type %d is invalid\n",
 				q->properties.type);
@@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		if (dqm->sdma_bitmap == 0)
 			return -ENOMEM;
+		dqm_lock(dqm);
 		bit = __ffs64(dqm->sdma_bitmap);
 		dqm->sdma_bitmap &= ~(1ULL << bit);
+		dqm_unlock(dqm);
 		q->sdma_id = bit;
 		q->properties.sdma_engine_id = q->sdma_id %
 				get_num_sdma_engines(dqm);
@@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		if (dqm->xgmi_sdma_bitmap == 0)
 			return -ENOMEM;
+		dqm_lock(dqm);
 		bit = __ffs64(dqm->xgmi_sdma_bitmap);
 		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+		dqm_unlock(dqm);
 		q->sdma_id = bit;
 		/* sdma_engine_id is sdma id including
 		 * both PCIe-optimized SDMAs and XGMI-
@@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 	return 0;
 }
 
+static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
+				struct queue *q)
+{
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+		if (q->sdma_id >= get_num_sdma_queues(dqm))
+			return;
+		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
+			return;
+		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+	}
+}
+
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q)
 {
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		if (q->sdma_id >= get_num_sdma_queues(dqm))
 			return;
+		dqm_lock(dqm);
 		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+		dqm_unlock(dqm);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
 			return;
+		dqm_lock(dqm);
 		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+		dqm_unlock(dqm);
 	}
 }
 
@@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 
 	retval = allocate_doorbell(qpd, q);
 	if (retval)
-		goto out_deallocate_sdma_queue;
+		goto out_deallocate_sdma_queue_locked;
 
 	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
 	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
@@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
 	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 out_deallocate_doorbell:
 	deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue:
-	deallocate_sdma_queue(dqm, q);
+out_deallocate_sdma_queue_locked:
+	deallocate_sdma_queue_locked(dqm, q);
 
 	return retval;
 }
@@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 	retval = allocate_doorbell(qpd, q);
 	if (retval)
-		goto out_deallocate_sdma_queue;
+		goto out_deallocate_sdma_queue_locked;
 
 	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
 			q->properties.type)];
@@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 out_deallocate_doorbell:
 	deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue:
+out_deallocate_sdma_queue_locked:
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
 		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
 		deallocate_sdma_queue(dqm, q);
@@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue_locked(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		dqm->xgmi_sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q);
+		deallocate_sdma_queue_locked(dqm, q);
 	}
 
 	list_del(&q->list);
@@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	list_for_each_entry(q, &qpd->queues_list, list) {
 		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 			dqm->sdma_queue_count--;
-			deallocate_sdma_queue(dqm, q);
+			deallocate_sdma_queue_locked(dqm, q);
 		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 			dqm->xgmi_sdma_queue_count--;
-			deallocate_sdma_queue(dqm, q);
+			deallocate_sdma_queue_locked(dqm, q);
 		}
 
 		if (q->properties.is_active)
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues
       [not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-31 21:19   ` Zeng, Oak
  2019-05-31 21:19   ` [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-05-31 21:19 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhao, Yong, Kuehling, Felix, Freehill, Chris, Zeng, Oak,
	Liu, Alex

Don't do the same for compute queues

Change-Id: Id5f743ca10c2b761590bfe18cab2f802d3c04d2d
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 1f707bb..74944ec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1230,7 +1230,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 		q->properties.is_evicted = (q->properties.queue_size > 0 &&
 					    q->properties.queue_percent > 0 &&
 					    q->properties.queue_address != 0);
-	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
 	q->properties.tba_addr = qpd->tba_addr;
 	q->properties.tma_addr = qpd->tma_addr;
 	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch
       [not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2019-05-31 21:19   ` [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
@ 2019-05-31 21:19   ` Zeng, Oak
       [not found]     ` <1559337538-14249-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2019-05-31 21:19   ` [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
  2019-05-31 21:31   ` [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Kuehling, Felix
  3 siblings, 1 reply; 8+ messages in thread
From: Zeng, Oak @ 2019-05-31 21:19 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhao, Yong, Kuehling, Felix, Freehill, Chris, Zeng, Oak,
	Liu, Alex

This is prepare work to fix a circular lock dependency.
No logic change

Change-Id: I4e0ee918260e7780de972dd71f4ce787b4f6dde9
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 172 ++++++++-------------
 1 file changed, 62 insertions(+), 110 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 74944ec..aac7f7b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -42,10 +42,6 @@
 static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
 					unsigned int pasid, unsigned int vmid);
 
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd);
-
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
 				uint32_t filter_param);
@@ -55,15 +51,16 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 
 static int map_queues_cpsch(struct device_queue_manager *dqm);
 
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd);
-
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q);
 static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
 				struct queue *q);
 
+static inline void deallocate_hqd(struct device_queue_manager *dqm,
+				struct queue *q);
+static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
+static int allocate_sdma_queue(struct device_queue_manager *dqm,
+				struct queue *q);
 static void kfd_process_hw_exception(struct work_struct *work);
 
 static inline
@@ -272,6 +269,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd)
 {
 	int retval;
+	struct mqd_manager *mqd_mgr;
 
 	print_queue(q);
 
@@ -302,18 +300,51 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	q->properties.tba_addr = qpd->tba_addr;
 	q->properties.tma_addr = qpd->tma_addr;
 
+	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+			q->properties.type)];
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
-		retval = create_compute_queue_nocpsch(dqm, q, qpd);
-	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
-			q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
-		retval = create_sdma_queue_nocpsch(dqm, q, qpd);
-	else
-		retval = -EINVAL;
+	{
+		retval = allocate_hqd(dqm, q);
+		if (retval)
+			goto deallocate_vmid;
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		retval = allocate_sdma_queue(dqm, q);
+		if (retval)
+			goto deallocate_vmid;
+		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+	}
 
-	if (retval) {
-		if (list_empty(&qpd->queues_list))
-			deallocate_vmid(dqm, qpd, q);
-		goto out_unlock;
+	retval = allocate_doorbell(qpd, q);
+	if (retval)
+		goto out_deallocate_hqd;
+
+	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
+				&q->gart_mqd_addr, &q->properties);
+	if (retval)
+		goto out_deallocate_doorbell;
+
+	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
+			q->pipe, q->queue);
+
+	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+		dqm->dev->kfd2kgd->set_scratch_backing_va(
+			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
+
+	if (q->properties.is_active) {
+
+		if (WARN(q->process->mm != current->mm,
+					"should only run in user thread"))
+			retval = -EFAULT;
+		else
+			/* TODO for sdma queue load mqd was called
+			 * unconditionally in original code, with mm_struct set
+			 * to NULL. I am not sure whether we need to keep
+			 * original logic here */
+			retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
+					&q->properties, current->mm);
+		if (retval)
+			goto out_uninit_mqd;
 	}
 
 	list_add(&q->list, &qpd->queues_list);
@@ -334,6 +365,19 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
 
+out_uninit_mqd:
+	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+out_deallocate_doorbell:
+	deallocate_doorbell(qpd, q);
+out_deallocate_hqd:
+	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+		deallocate_hqd(dqm, q);
+	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		deallocate_sdma_queue_locked(dqm, q);
+deallocate_vmid:
+	if (list_empty(&qpd->queues_list))
+		deallocate_vmid(dqm, qpd, q);
 out_unlock:
 	dqm_unlock(dqm);
 	return retval;
@@ -379,58 +423,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
 	dqm->allocated_queues[q->pipe] |= (1 << q->queue);
 }
 
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd)
-{
-	struct mqd_manager *mqd_mgr;
-	int retval;
-
-	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
-
-	retval = allocate_hqd(dqm, q);
-	if (retval)
-		return retval;
-
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_hqd;
-
-	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
-				&q->gart_mqd_addr, &q->properties);
-	if (retval)
-		goto out_deallocate_doorbell;
-
-	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
-			q->pipe, q->queue);
-
-	dqm->dev->kfd2kgd->set_scratch_backing_va(
-			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
-
-	if (!q->properties.is_active)
-		return 0;
-
-	if (WARN(q->process->mm != current->mm,
-		 "should only run in user thread"))
-		retval = -EFAULT;
-	else
-		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
-					   &q->properties, current->mm);
-	if (retval)
-		goto out_uninit_mqd;
-
-	return 0;
-
-out_uninit_mqd:
-	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
-out_deallocate_hqd:
-	deallocate_hqd(dqm, q);
-
-	return retval;
-}
-
 /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
  * to avoid asynchronized access
  */
@@ -991,46 +983,6 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 	}
 }
 
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd)
-{
-	struct mqd_manager *mqd_mgr;
-	int retval;
-
-	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA];
-
-	retval = allocate_sdma_queue(dqm, q);
-	if (retval)
-		return retval;
-
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_sdma_queue_locked;
-
-	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
-	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
-				&q->gart_mqd_addr, &q->properties);
-	if (retval)
-		goto out_deallocate_doorbell;
-
-	retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
-				NULL);
-	if (retval)
-		goto out_uninit_mqd;
-
-	return 0;
-
-out_uninit_mqd:
-	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue_locked:
-	deallocate_sdma_queue_locked(dqm, q);
-
-	return retval;
-}
-
 /*
  * Device Queue Manager implementation for cp scheduler
  */
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency
       [not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
  2019-05-31 21:19   ` [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
  2019-05-31 21:19   ` [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
@ 2019-05-31 21:19   ` Zeng, Oak
  2019-05-31 21:31   ` [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Kuehling, Felix
  3 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-05-31 21:19 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhao, Yong, Kuehling, Felix, Freehill, Chris, Zeng, Oak,
	Liu, Alex

The idea to break the circular lock dependency is
to move init_mqd out of lock protection of dqm lock
in callstack #1 below. There is no need to.

[   59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0

[  513.604034] ======================================================
[  513.604205] WARNING: possible circular locking dependency detected
[  513.604375] 4.18.0-kfd-root #2 Tainted: G        W
[  513.604530] ------------------------------------------------------
[  513.604699] kswapd0/611 is trying to acquire lock:
[  513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.605150]
               but task is already holding lock:
[  513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
[  513.605540]
               which lock already depends on the new lock.

[  513.605747]
               the existing dependency chain (in reverse order) is:
[  513.605944]
               -> #4 (&anon_vma->rwsem){++++}:
[  513.606106]        __vma_adjust+0x147/0x7f0
[  513.606231]        __split_vma+0x179/0x190
[  513.606353]        mprotect_fixup+0x217/0x260
[  513.606553]        do_mprotect_pkey+0x211/0x380
[  513.606752]        __x64_sys_mprotect+0x1b/0x20
[  513.606954]        do_syscall_64+0x50/0x1a0
[  513.607149]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  513.607380]
               -> #3 (&mapping->i_mmap_rwsem){++++}:
[  513.607678]        rmap_walk_file+0x1f0/0x280
[  513.607887]        page_referenced+0xdd/0x180
[  513.608081]        shrink_page_list+0x853/0xcb0
[  513.608279]        shrink_inactive_list+0x33b/0x700
[  513.608483]        shrink_node_memcg+0x37a/0x7f0
[  513.608682]        shrink_node+0xd8/0x490
[  513.608869]        balance_pgdat+0x18b/0x3b0
[  513.609062]        kswapd+0x203/0x5c0
[  513.609241]        kthread+0x100/0x140
[  513.609420]        ret_from_fork+0x24/0x30
[  513.609607]
               -> #2 (fs_reclaim){+.+.}:
[  513.609883]        kmem_cache_alloc_trace+0x34/0x2e0
[  513.610093]        reservation_object_reserve_shared+0x139/0x300
[  513.610326]        ttm_bo_init_reserved+0x291/0x480 [ttm]
[  513.610567]        amdgpu_bo_do_create+0x1d2/0x650 [amdgpu]
[  513.610811]        amdgpu_bo_create+0x40/0x1f0 [amdgpu]
[  513.611041]        amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu]
[  513.611290]        amdgpu_bo_create_kernel+0x12/0x70 [amdgpu]
[  513.611584]        amdgpu_ttm_init+0x2cb/0x560 [amdgpu]
[  513.611823]        gmc_v9_0_sw_init+0x400/0x750 [amdgpu]
[  513.612491]        amdgpu_device_init+0x14eb/0x1990 [amdgpu]
[  513.612730]        amdgpu_driver_load_kms+0x78/0x290 [amdgpu]
[  513.612958]        drm_dev_register+0x111/0x1a0
[  513.613171]        amdgpu_pci_probe+0x11c/0x1e0 [amdgpu]
[  513.613389]        local_pci_probe+0x3f/0x90
[  513.613581]        pci_device_probe+0x102/0x1c0
[  513.613779]        driver_probe_device+0x2a7/0x480
[  513.613984]        __driver_attach+0x10a/0x110
[  513.614179]        bus_for_each_dev+0x67/0xc0
[  513.614372]        bus_add_driver+0x1eb/0x260
[  513.614565]        driver_register+0x5b/0xe0
[  513.614756]        do_one_initcall+0xac/0x357
[  513.614952]        do_init_module+0x5b/0x213
[  513.615145]        load_module+0x2542/0x2d30
[  513.615337]        __do_sys_finit_module+0xd2/0x100
[  513.615541]        do_syscall_64+0x50/0x1a0
[  513.615731]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  513.615963]
               -> #1 (reservation_ww_class_mutex){+.+.}:
[  513.616293]        amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu]
[  513.616554]        init_mqd+0x223/0x260 [amdgpu]
[  513.616779]        create_queue_nocpsch+0x4d9/0x600 [amdgpu]
[  513.617031]        pqm_create_queue+0x37c/0x520 [amdgpu]
[  513.617270]        kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu]
[  513.617522]        kfd_ioctl+0x202/0x350 [amdgpu]
[  513.617724]        do_vfs_ioctl+0x9f/0x6c0
[  513.617914]        ksys_ioctl+0x66/0x70
[  513.618095]        __x64_sys_ioctl+0x16/0x20
[  513.618286]        do_syscall_64+0x50/0x1a0
[  513.618476]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  513.618695]
               -> #0 (&dqm->lock_hidden){+.+.}:
[  513.618984]        __mutex_lock+0x98/0x970
[  513.619197]        evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.619459]        kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
[  513.619710]        kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
[  513.620103]        amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
[  513.620363]        amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
[  513.620614]        __mmu_notifier_invalidate_range_start+0x70/0xb0
[  513.620851]        try_to_unmap_one+0x7fc/0x8f0
[  513.621049]        rmap_walk_anon+0x121/0x290
[  513.621242]        try_to_unmap+0x93/0xf0
[  513.621428]        shrink_page_list+0x606/0xcb0
[  513.621625]        shrink_inactive_list+0x33b/0x700
[  513.621835]        shrink_node_memcg+0x37a/0x7f0
[  513.622034]        shrink_node+0xd8/0x490
[  513.622219]        balance_pgdat+0x18b/0x3b0
[  513.622410]        kswapd+0x203/0x5c0
[  513.622589]        kthread+0x100/0x140
[  513.622769]        ret_from_fork+0x24/0x30
[  513.622957]
               other info that might help us debug this:

[  513.623354] Chain exists of:
                 &dqm->lock_hidden --> &mapping->i_mmap_rwsem --> &anon_vma->rwsem

[  513.623900]  Possible unsafe locking scenario:

[  513.624189]        CPU0                    CPU1
[  513.624397]        ----                    ----
[  513.624594]   lock(&anon_vma->rwsem);
[  513.624771]                                lock(&mapping->i_mmap_rwsem);
[  513.625020]                                lock(&anon_vma->rwsem);
[  513.625253]   lock(&dqm->lock_hidden);
[  513.625433]
                *** DEADLOCK ***

[  513.625783] 3 locks held by kswapd0/611:
[  513.625967]  #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
[  513.626309]  #1: 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
[  513.626671]  #2: 0000000067b5cd12 (srcu){....}, at: __mmu_notifier_invalidate_range_start+0x5/0xb0
[  513.627037]
               stack backtrace:
[  513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G        W         4.18.0-kfd-root #2
[  513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[  513.627990] Call Trace:
[  513.628143]  dump_stack+0x7c/0xbb
[  513.628315]  print_circular_bug.isra.37+0x21b/0x228
[  513.628581]  __lock_acquire+0xf7d/0x1470
[  513.628782]  ? unwind_next_frame+0x6c/0x4f0
[  513.628974]  ? lock_acquire+0xec/0x1e0
[  513.629154]  lock_acquire+0xec/0x1e0
[  513.629357]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.629587]  __mutex_lock+0x98/0x970
[  513.629790]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.630047]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.630309]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.630562]  evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[  513.630816]  kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
[  513.631057]  kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
[  513.631288]  amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
[  513.631536]  amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
[  513.632076]  __mmu_notifier_invalidate_range_start+0x70/0xb0
[  513.632299]  try_to_unmap_one+0x7fc/0x8f0
[  513.632487]  ? page_lock_anon_vma_read+0x68/0x250
[  513.632690]  rmap_walk_anon+0x121/0x290
[  513.632875]  try_to_unmap+0x93/0xf0
[  513.633050]  ? page_remove_rmap+0x330/0x330
[  513.633239]  ? rcu_read_unlock+0x60/0x60
[  513.633422]  ? page_get_anon_vma+0x160/0x160
[  513.633613]  shrink_page_list+0x606/0xcb0
[  513.633800]  shrink_inactive_list+0x33b/0x700
[  513.633997]  shrink_node_memcg+0x37a/0x7f0
[  513.634186]  ? shrink_node+0xd8/0x490
[  513.634363]  shrink_node+0xd8/0x490
[  513.634537]  balance_pgdat+0x18b/0x3b0
[  513.634718]  kswapd+0x203/0x5c0
[  513.634887]  ? wait_woken+0xb0/0xb0
[  513.635062]  kthread+0x100/0x140
[  513.635231]  ? balance_pgdat+0x3b0/0x3b0
[  513.635414]  ? kthread_delayed_work_timer_fn+0x80/0x80
[  513.635626]  ret_from_fork+0x24/0x30
[  513.636042] Evicting PASID 32768 queues
[  513.936236] Restoring PASID 32768 queues
[  524.708912] Evicting PASID 32768 queues
[  524.999875] Restoring PASID 32768 queues

Change-Id: Ib40b5fb8d7c6572888a0c879a315b382eb948647
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index aac7f7b..4808777 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -273,8 +273,6 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 
 	print_queue(q);
 
-	dqm_lock(dqm);
-
 	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
 		pr_warn("Can't create new usermode queue because %d queues were already created\n",
 				dqm->total_queue_count);
@@ -347,6 +345,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 			goto out_uninit_mqd;
 	}
 
+	dqm_lock(dqm);
 	list_add(&q->list, &qpd->queues_list);
 	qpd->queue_count++;
 	if (q->properties.is_active)
@@ -365,6 +364,9 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
 
+out_unlock:
+	dqm_unlock(dqm);
+	return retval;
 out_uninit_mqd:
 	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 out_deallocate_doorbell:
@@ -378,8 +380,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 deallocate_vmid:
 	if (list_empty(&qpd->queues_list))
 		deallocate_vmid(dqm, qpd, q);
-out_unlock:
-	dqm_unlock(dqm);
+
 	return retval;
 }
 
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
       [not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2019-05-31 21:19   ` [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
@ 2019-05-31 21:31   ` Kuehling, Felix
       [not found]     ` <f0e92b32-bd0a-aca4-c7dd-83f8e97b9a6f-5C7GfCeVMHo@public.gmane.org>
  3 siblings, 1 reply; 8+ messages in thread
From: Kuehling, Felix @ 2019-05-31 21:31 UTC (permalink / raw)
  To: Zeng, Oak,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhao, Yong, Freehill, Chris, Liu, Alex

On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
> SDMA queue allocation requires the dqm lock as it modify
> the global dqm members. Introduce functions to allocate/deallocate
> in locked/unlocked circumstance.
>
> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 46 ++++++++++++++++------
>   1 file changed, 35 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ece35c7..1f707bb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>   
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q);
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> +				struct queue *q);
>   
>   static void kfd_process_hw_exception(struct work_struct *work);
>   
> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   		deallocate_hqd(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		dqm->sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm->xgmi_sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else {
>   		pr_debug("q->properties.type %d is invalid\n",
>   				q->properties.type);
> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		if (dqm->sdma_bitmap == 0)
>   			return -ENOMEM;
> +		dqm_lock(dqm);
Doesn't this cause a locking error where you try to take the same lock 
twice in this call tree:

create_queue_nocpsch (takes DQM lock)
     -> create_sdma_queue_nocpsch
         -> allocate_sdma_queue (takes DQM lock again)


>   		bit = __ffs64(dqm->sdma_bitmap);
>   		dqm->sdma_bitmap &= ~(1ULL << bit);
> +		dqm_unlock(dqm);
>   		q->sdma_id = bit;
>   		q->properties.sdma_engine_id = q->sdma_id %
>   				get_num_sdma_engines(dqm);
> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		if (dqm->xgmi_sdma_bitmap == 0)
>   			return -ENOMEM;
> +		dqm_lock(dqm);
>   		bit = __ffs64(dqm->xgmi_sdma_bitmap);
>   		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> +		dqm_unlock(dqm);
>   		q->sdma_id = bit;
>   		/* sdma_engine_id is sdma id including
>   		 * both PCIe-optimized SDMAs and XGMI-
> @@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   	return 0;
>   }
>   
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> +				struct queue *q)
> +{
> +	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> +		if (q->sdma_id >= get_num_sdma_queues(dqm))
> +			return;
> +		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> +	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> +		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
> +			return;
> +		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> +	}
> +}
> +
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q)
>   {
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		if (q->sdma_id >= get_num_sdma_queues(dqm))
>   			return;
> +		dqm_lock(dqm);
>   		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> +		dqm_unlock(dqm);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>   			return;
> +		dqm_lock(dqm);
>   		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> +		dqm_unlock(dqm);
>   	}
>   }

You could minimize code duplication by defining deallocate_sdma_queue as 
a simple wrapper:

static void deallocate_sdma_queue(struct device_queue_manager *dqm, 
struct queue *q)
{
         dqm_lock(dqm);
         deallocate_sdma_queue_locked(dqm, q);
         dqm_unlock(dqm);
}


>   
> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>   
>   	retval = allocate_doorbell(qpd, q);
>   	if (retval)
> -		goto out_deallocate_sdma_queue;
> +		goto out_deallocate_sdma_queue_locked;
>   
>   	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>   	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> @@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>   	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>   out_deallocate_doorbell:
>   	deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue:
> -	deallocate_sdma_queue(dqm, q);
> +out_deallocate_sdma_queue_locked:
> +	deallocate_sdma_queue_locked(dqm, q);
>   
>   	return retval;
>   }
> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   
>   	retval = allocate_doorbell(qpd, q);
>   	if (retval)
> -		goto out_deallocate_sdma_queue;
> +		goto out_deallocate_sdma_queue_locked;
>   
>   	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   			q->properties.type)];
> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>   
>   out_deallocate_doorbell:
>   	deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue:
> +out_deallocate_sdma_queue_locked:

Why are you renaming this label? You don't hold the DQM lock when you 
get here.


>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>   		deallocate_sdma_queue(dqm, q);
> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		dqm->sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm->xgmi_sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	}
>   
>   	list_del(&q->list);
> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	list_for_each_entry(q, &qpd->queues_list, list) {
>   		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   			dqm->sdma_queue_count--;
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue_locked(dqm, q);
>   		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   			dqm->xgmi_sdma_queue_count--;
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue_locked(dqm, q);
>   		}
>   
>   		if (q->properties.is_active)
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch
       [not found]     ` <1559337538-14249-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-31 21:40       ` Kuehling, Felix
  0 siblings, 0 replies; 8+ messages in thread
From: Kuehling, Felix @ 2019-05-31 21:40 UTC (permalink / raw)
  To: Zeng, Oak,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhao, Yong, Freehill, Chris, Liu, Alex


On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
> This is prepare work to fix a circular lock dependency.
> No logic change
>
> Change-Id: I4e0ee918260e7780de972dd71f4ce787b4f6dde9
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 172 ++++++++-------------
>   1 file changed, 62 insertions(+), 110 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 74944ec..aac7f7b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -42,10 +42,6 @@
>   static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
>   					unsigned int pasid, unsigned int vmid);
>   
> -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
> -					struct queue *q,
> -					struct qcm_process_device *qpd);
> -
>   static int execute_queues_cpsch(struct device_queue_manager *dqm,
>   				enum kfd_unmap_queues_filter filter,
>   				uint32_t filter_param);
> @@ -55,15 +51,16 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>   
>   static int map_queues_cpsch(struct device_queue_manager *dqm);
>   
> -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
> -					struct queue *q,
> -					struct qcm_process_device *qpd);
> -
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q);
>   static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>   				struct queue *q);
>   
> +static inline void deallocate_hqd(struct device_queue_manager *dqm,
> +				struct queue *q);
> +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
> +static int allocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct queue *q);
>   static void kfd_process_hw_exception(struct work_struct *work);
>   
>   static inline
> @@ -272,6 +269,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   				struct qcm_process_device *qpd)
>   {
>   	int retval;
> +	struct mqd_manager *mqd_mgr;

Reverse Christmas tree ... Put the longer declaration first. It tends to 
make code more readable.


>   
>   	print_queue(q);
>   
> @@ -302,18 +300,51 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	q->properties.tba_addr = qpd->tba_addr;
>   	q->properties.tma_addr = qpd->tma_addr;
>   
> +	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> +			q->properties.type)];
>   	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> -		retval = create_compute_queue_nocpsch(dqm, q, qpd);
> -	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> -			q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> -		retval = create_sdma_queue_nocpsch(dqm, q, qpd);
> -	else
> -		retval = -EINVAL;
> +	{

The open brace should be on the same line as the "if". Please run 
check_patch.pl, it should find those coding style violations. This 
convenient command checks the top 4 commits on your local branch without 
even having to generate the patch files first: "scripts/checkpatch.pl -g 
HEAD~4..HEAD".


> +		retval = allocate_hqd(dqm, q);
> +		if (retval)
> +			goto deallocate_vmid;
> +	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> +		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> +		retval = allocate_sdma_queue(dqm, q);
> +		if (retval)
> +			goto deallocate_vmid;
> +		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> +	}
>   
> -	if (retval) {
> -		if (list_empty(&qpd->queues_list))
> -			deallocate_vmid(dqm, qpd, q);
> -		goto out_unlock;
> +	retval = allocate_doorbell(qpd, q);
> +	if (retval)
> +		goto out_deallocate_hqd;
> +
> +	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> +				&q->gart_mqd_addr, &q->properties);
> +	if (retval)
> +		goto out_deallocate_doorbell;
> +
> +	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
> +			q->pipe, q->queue);
> +
> +	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> +		dqm->dev->kfd2kgd->set_scratch_backing_va(
> +			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
> +
> +	if (q->properties.is_active) {
> +
> +		if (WARN(q->process->mm != current->mm,
> +					"should only run in user thread"))
> +			retval = -EFAULT;
> +		else
> +			/* TODO for sdma queue load mqd was called
> +			 * unconditionally in original code, with mm_struct set
> +			 * to NULL. I am not sure whether we need to keep
> +			 * original logic here */
> +			retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
> +					&q->properties, current->mm);
> +		if (retval)
> +			goto out_uninit_mqd;
>   	}
>   
>   	list_add(&q->list, &qpd->queues_list);
> @@ -334,6 +365,19 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>   	pr_debug("Total of %d queues are accountable so far\n",
>   			dqm->total_queue_count);
>   
> +out_uninit_mqd:
> +	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +out_deallocate_doorbell:
> +	deallocate_doorbell(qpd, q);
> +out_deallocate_hqd:
> +	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> +		deallocate_hqd(dqm, q);
> +	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> +		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> +		deallocate_sdma_queue_locked(dqm, q);
> +deallocate_vmid:
> +	if (list_empty(&qpd->queues_list))
> +		deallocate_vmid(dqm, qpd, q);
>   out_unlock:
>   	dqm_unlock(dqm);
>   	return retval;

This looks like you always deallocate doorbells, SDMA queues etc., even 
when queue creation was successful. Seems wrong.

Regards,
   Felix


> @@ -379,58 +423,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
>   	dqm->allocated_queues[q->pipe] |= (1 << q->queue);
>   }
>   
> -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
> -					struct queue *q,
> -					struct qcm_process_device *qpd)
> -{
> -	struct mqd_manager *mqd_mgr;
> -	int retval;
> -
> -	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
> -
> -	retval = allocate_hqd(dqm, q);
> -	if (retval)
> -		return retval;
> -
> -	retval = allocate_doorbell(qpd, q);
> -	if (retval)
> -		goto out_deallocate_hqd;
> -
> -	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> -				&q->gart_mqd_addr, &q->properties);
> -	if (retval)
> -		goto out_deallocate_doorbell;
> -
> -	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
> -			q->pipe, q->queue);
> -
> -	dqm->dev->kfd2kgd->set_scratch_backing_va(
> -			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
> -
> -	if (!q->properties.is_active)
> -		return 0;
> -
> -	if (WARN(q->process->mm != current->mm,
> -		 "should only run in user thread"))
> -		retval = -EFAULT;
> -	else
> -		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
> -					   &q->properties, current->mm);
> -	if (retval)
> -		goto out_uninit_mqd;
> -
> -	return 0;
> -
> -out_uninit_mqd:
> -	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -out_deallocate_doorbell:
> -	deallocate_doorbell(qpd, q);
> -out_deallocate_hqd:
> -	deallocate_hqd(dqm, q);
> -
> -	return retval;
> -}
> -
>   /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
>    * to avoid asynchronized access
>    */
> @@ -991,46 +983,6 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   	}
>   }
>   
> -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
> -					struct queue *q,
> -					struct qcm_process_device *qpd)
> -{
> -	struct mqd_manager *mqd_mgr;
> -	int retval;
> -
> -	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA];
> -
> -	retval = allocate_sdma_queue(dqm, q);
> -	if (retval)
> -		return retval;
> -
> -	retval = allocate_doorbell(qpd, q);
> -	if (retval)
> -		goto out_deallocate_sdma_queue_locked;
> -
> -	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> -	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> -				&q->gart_mqd_addr, &q->properties);
> -	if (retval)
> -		goto out_deallocate_doorbell;
> -
> -	retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
> -				NULL);
> -	if (retval)
> -		goto out_uninit_mqd;
> -
> -	return 0;
> -
> -out_uninit_mqd:
> -	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -out_deallocate_doorbell:
> -	deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue_locked:
> -	deallocate_sdma_queue_locked(dqm, q);
> -
> -	return retval;
> -}
> -
>   /*
>    * Device Queue Manager implementation for cp scheduler
>    */
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
       [not found]     ` <f0e92b32-bd0a-aca4-c7dd-83f8e97b9a6f-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-31 21:50       ` Kuehling, Felix
       [not found]         ` <d00bae2c-8c2f-b8b7-a69a-7205fd01a55a-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Kuehling, Felix @ 2019-05-31 21:50 UTC (permalink / raw)
  To: Zeng, Oak,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhao, Yong, Liu, Alex, Freehill, Chris

On 2019-05-31 5:31 p.m., Kuehling, Felix wrote:
> On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
>> SDMA queue allocation requires the dqm lock as it modify
>> the global dqm members. Introduce functions to allocate/deallocate
>> in locked/unlocked circumstance.
>>
>> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
>> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>> ---
>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 46 ++++++++++++++++------
>>    1 file changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index ece35c7..1f707bb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>    
>>    static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>    				struct queue *q);
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> +				struct queue *q);
>>    
>>    static void kfd_process_hw_exception(struct work_struct *work);
>>    
>> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>    		deallocate_hqd(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		dqm->sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		dqm->xgmi_sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else {
>>    		pr_debug("q->properties.type %d is invalid\n",
>>    				q->properties.type);
>> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		if (dqm->sdma_bitmap == 0)
>>    			return -ENOMEM;
>> +		dqm_lock(dqm);
> Doesn't this cause a locking error where you try to take the same lock
> twice in this call tree:
>
> create_queue_nocpsch (takes DQM lock)
>       -> create_sdma_queue_nocpsch
>           -> allocate_sdma_queue (takes DQM lock again)
>
BTW, I think you actually caught a bug here. In the create_queue_cpsch 
path we failed to lock the DQM for SDMA queue allocation. In 
create_queue_nocpsch it was not a problem because we took the DQM lock 
earlier (which is itself a problem you're working on fixing).

However, now you're making the problem worse for the nocpsch case, at 
least until patch 4, which moves the DQM locking in 
create_queue_nocpsch. Maybe this change should come after patch 4 so you 
don't end up with a worse problem in the middle of your patch series.

Also, I think you're doing the locking unnecessarily fine grained. It's 
probably enough to take the DQM lock once at the start of 
allocate_sdma_queue, and drop it once in the end. No need to duplicate 
this in the two if branches.

Regards,
   Felix

>>    		bit = __ffs64(dqm->sdma_bitmap);
>>    		dqm->sdma_bitmap &= ~(1ULL << bit);
>> +		dqm_unlock(dqm);
>>    		q->sdma_id = bit;
>>    		q->properties.sdma_engine_id = q->sdma_id %
>>    				get_num_sdma_engines(dqm);
>> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		if (dqm->xgmi_sdma_bitmap == 0)
>>    			return -ENOMEM;
>> +		dqm_lock(dqm);
>>    		bit = __ffs64(dqm->xgmi_sdma_bitmap);
>>    		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>> +		dqm_unlock(dqm);
>>    		q->sdma_id = bit;
>>    		/* sdma_engine_id is sdma id including
>>    		 * both PCIe-optimized SDMAs and XGMI-
>> @@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	return 0;
>>    }
>>    
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> +				struct queue *q)
>> +{
>> +	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> +		if (q->sdma_id >= get_num_sdma_queues(dqm))
>> +			return;
>> +		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> +	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> +		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> +			return;
>> +		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> +	}
>> +}
>> +
>>    static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>    				struct queue *q)
>>    {
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		if (q->sdma_id >= get_num_sdma_queues(dqm))
>>    			return;
>> +		dqm_lock(dqm);
>>    		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> +		dqm_unlock(dqm);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>>    			return;
>> +		dqm_lock(dqm);
>>    		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> +		dqm_unlock(dqm);
>>    	}
>>    }
> You could minimize code duplication by defining deallocate_sdma_queue as
> a simple wrapper:
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q)
> {
>           dqm_lock(dqm);
>           deallocate_sdma_queue_locked(dqm, q);
>           dqm_unlock(dqm);
> }
>
>
>>    
>> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>    
>>    	retval = allocate_doorbell(qpd, q);
>>    	if (retval)
>> -		goto out_deallocate_sdma_queue;
>> +		goto out_deallocate_sdma_queue_locked;
>>    
>>    	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>>    	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
>> @@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>    	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>    out_deallocate_doorbell:
>>    	deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> -	deallocate_sdma_queue(dqm, q);
>> +out_deallocate_sdma_queue_locked:
>> +	deallocate_sdma_queue_locked(dqm, q);
>>    
>>    	return retval;
>>    }
>> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>>    
>>    	retval = allocate_doorbell(qpd, q);
>>    	if (retval)
>> -		goto out_deallocate_sdma_queue;
>> +		goto out_deallocate_sdma_queue_locked;
>>    
>>    	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>    			q->properties.type)];
>> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>>    
>>    out_deallocate_doorbell:
>>    	deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> +out_deallocate_sdma_queue_locked:
> Why are you renaming this label? You don't hold the DQM lock when you
> get here.
>
>
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>    		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>    		deallocate_sdma_queue(dqm, q);
>> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>    
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		dqm->sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		dqm->xgmi_sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	}
>>    
>>    	list_del(&q->list);
>> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>>    	list_for_each_entry(q, &qpd->queues_list, list) {
>>    		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    			dqm->sdma_queue_count--;
>> -			deallocate_sdma_queue(dqm, q);
>> +			deallocate_sdma_queue_locked(dqm, q);
>>    		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    			dqm->xgmi_sdma_queue_count--;
>> -			deallocate_sdma_queue(dqm, q);
>> +			deallocate_sdma_queue_locked(dqm, q);
>>    		}
>>    
>>    		if (q->properties.is_active)
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
       [not found]         ` <d00bae2c-8c2f-b8b7-a69a-7205fd01a55a-5C7GfCeVMHo@public.gmane.org>
@ 2019-06-03 16:05           ` Zeng, Oak
  0 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-06-03 16:05 UTC (permalink / raw)
  To: Kuehling, Felix,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
  Cc: Zhao, Yong, Liu, Alex, Freehill, Chris

Hi Felix,

See comment inline [Oak]

Regards,
Oak

-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling@amd.com> 
Sent: Friday, May 31, 2019 5:50 PM
To: Zeng, Oak <Oak.Zeng@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhao, Yong <Yong.Zhao@amd.com>; Freehill, Chris <Chris.Freehill@amd.com>; Liu, Alex <Alex.Liu@amd.com>
Subject: Re: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition

On 2019-05-31 5:31 p.m., Kuehling, Felix wrote:
> On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
>> SDMA queue allocation requires the dqm lock as it modify the global 
>> dqm members. Introduce functions to allocate/deallocate in 
>> locked/unlocked circumstance.
>>
>> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
>> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>> ---
>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 46 ++++++++++++++++------
>>    1 file changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index ece35c7..1f707bb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct 
>> device_queue_manager *dqm,
>>    
>>    static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>    				struct queue *q);
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> +				struct queue *q);
>>    
>>    static void kfd_process_hw_exception(struct work_struct *work);
>>    
>> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>    		deallocate_hqd(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		dqm->sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		dqm->xgmi_sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else {
>>    		pr_debug("q->properties.type %d is invalid\n",
>>    				q->properties.type);
>> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		if (dqm->sdma_bitmap == 0)
>>    			return -ENOMEM;
>> +		dqm_lock(dqm);
> Doesn't this cause a locking error where you try to take the same lock 
> twice in this call tree:
>
> create_queue_nocpsch (takes DQM lock)
>       -> create_sdma_queue_nocpsch
>           -> allocate_sdma_queue (takes DQM lock again)
>
BTW, I think you actually caught a bug here. In the create_queue_cpsch path we failed to lock the DQM for SDMA queue allocation. In create_queue_nocpsch it was not a problem because we took the DQM lock earlier (which is itself a problem you're working on fixing).

However, now you're making the problem worse for the nocpsch case, at least until patch 4, which moves the DQM locking in create_queue_nocpsch. Maybe this change should come after patch 4 so you don't end up with a worse problem in the middle of your patch series.
[Oak]: I realized the same issue when I wrote this patch...I will have to create function allocate_sdma_queue_locked/unlocked to fix the issue properly. But I will have to delete allocate_sdma_queue_unlocked after patch 4 because it won't be used any more after patch 4.

It is a good idea to do this patch after patch 4. I will re-submit the patches series, also fixing other issues you mentioned.

Also, I think you're doing the locking unnecessarily fine grained. It's probably enough to take the DQM lock once at the start of allocate_sdma_queue, and drop it once in the end. No need to duplicate this in the two if branches.
[Oak] Agreed.

Regards,
   Felix

>>    		bit = __ffs64(dqm->sdma_bitmap);
>>    		dqm->sdma_bitmap &= ~(1ULL << bit);
>> +		dqm_unlock(dqm);
>>    		q->sdma_id = bit;
>>    		q->properties.sdma_engine_id = q->sdma_id %
>>    				get_num_sdma_engines(dqm);
>> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		if (dqm->xgmi_sdma_bitmap == 0)
>>    			return -ENOMEM;
>> +		dqm_lock(dqm);
>>    		bit = __ffs64(dqm->xgmi_sdma_bitmap);
>>    		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>> +		dqm_unlock(dqm);
>>    		q->sdma_id = bit;
>>    		/* sdma_engine_id is sdma id including
>>    		 * both PCIe-optimized SDMAs and XGMI- @@ -953,17 +959,35 @@ 
>> static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	return 0;
>>    }
>>    
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> +				struct queue *q)
>> +{
>> +	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> +		if (q->sdma_id >= get_num_sdma_queues(dqm))
>> +			return;
>> +		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> +	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> +		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> +			return;
>> +		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> +	}
>> +}
>> +
>>    static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>    				struct queue *q)
>>    {
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		if (q->sdma_id >= get_num_sdma_queues(dqm))
>>    			return;
>> +		dqm_lock(dqm);
>>    		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> +		dqm_unlock(dqm);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>>    			return;
>> +		dqm_lock(dqm);
>>    		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> +		dqm_unlock(dqm);
>>    	}
>>    }
> You could minimize code duplication by defining deallocate_sdma_queue 
> as a simple wrapper:
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm, 
> struct queue *q) {
>           dqm_lock(dqm);
>           deallocate_sdma_queue_locked(dqm, q);
>           dqm_unlock(dqm);
> }
>
>
>>    
>> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct 
>> device_queue_manager *dqm,
>>    
>>    	retval = allocate_doorbell(qpd, q);
>>    	if (retval)
>> -		goto out_deallocate_sdma_queue;
>> +		goto out_deallocate_sdma_queue_locked;
>>    
>>    	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>>    	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, @@ 
>> -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>    	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>    out_deallocate_doorbell:
>>    	deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> -	deallocate_sdma_queue(dqm, q);
>> +out_deallocate_sdma_queue_locked:
>> +	deallocate_sdma_queue_locked(dqm, q);
>>    
>>    	return retval;
>>    }
>> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct 
>> device_queue_manager *dqm, struct queue *q,
>>    
>>    	retval = allocate_doorbell(qpd, q);
>>    	if (retval)
>> -		goto out_deallocate_sdma_queue;
>> +		goto out_deallocate_sdma_queue_locked;
>>    
>>    	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>    			q->properties.type)];
>> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct 
>> device_queue_manager *dqm, struct queue *q,
>>    
>>    out_deallocate_doorbell:
>>    	deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> +out_deallocate_sdma_queue_locked:
> Why are you renaming this label? You don't hold the DQM lock when you 
> get here.
>
>
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>    		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>    		deallocate_sdma_queue(dqm, q);
>> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct 
>> device_queue_manager *dqm,
>>    
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		dqm->sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		dqm->xgmi_sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	}
>>    
>>    	list_del(&q->list);
>> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>>    	list_for_each_entry(q, &qpd->queues_list, list) {
>>    		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    			dqm->sdma_queue_count--;
>> -			deallocate_sdma_queue(dqm, q);
>> +			deallocate_sdma_queue_locked(dqm, q);
>>    		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    			dqm->xgmi_sdma_queue_count--;
>> -			deallocate_sdma_queue(dqm, q);
>> +			deallocate_sdma_queue_locked(dqm, q);
>>    		}
>>    
>>    		if (q->properties.is_active)
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-06-03 16:05 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-05-31 21:19 [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak
     [not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:19   ` [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
2019-05-31 21:19   ` [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
     [not found]     ` <1559337538-14249-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:40       ` Kuehling, Felix
2019-05-31 21:19   ` [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
2019-05-31 21:31   ` [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Kuehling, Felix
     [not found]     ` <f0e92b32-bd0a-aca4-c7dd-83f8e97b9a6f-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:50       ` Kuehling, Felix
     [not found]         ` <d00bae2c-8c2f-b8b7-a69a-7205fd01a55a-5C7GfCeVMHo@public.gmane.org>
2019-06-03 16:05           ` Zeng, Oak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox