* [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
@ 2019-05-31 21:19 Zeng, Oak
[not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Zeng, Oak @ 2019-05-31 21:19 UTC (permalink / raw)
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhao, Yong, Kuehling, Felix, Freehill, Chris, Zeng, Oak,
Liu, Alex
SDMA queue allocation requires the dqm lock as it modify
the global dqm members. Introduce functions to allocate/deallocate
in locked/unlocked circumstance.
Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 46 ++++++++++++++++------
1 file changed, 35 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index ece35c7..1f707bb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q);
+static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
+ struct queue *q);
static void kfd_process_hw_exception(struct work_struct *work);
@@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
deallocate_hqd(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
- deallocate_sdma_queue(dqm, q);
+ deallocate_sdma_queue_locked(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
dqm->xgmi_sdma_queue_count--;
- deallocate_sdma_queue(dqm, q);
+ deallocate_sdma_queue_locked(dqm, q);
} else {
pr_debug("q->properties.type %d is invalid\n",
q->properties.type);
@@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
if (dqm->sdma_bitmap == 0)
return -ENOMEM;
+ dqm_lock(dqm);
bit = __ffs64(dqm->sdma_bitmap);
dqm->sdma_bitmap &= ~(1ULL << bit);
+ dqm_unlock(dqm);
q->sdma_id = bit;
q->properties.sdma_engine_id = q->sdma_id %
get_num_sdma_engines(dqm);
@@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
if (dqm->xgmi_sdma_bitmap == 0)
return -ENOMEM;
+ dqm_lock(dqm);
bit = __ffs64(dqm->xgmi_sdma_bitmap);
dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+ dqm_unlock(dqm);
q->sdma_id = bit;
/* sdma_engine_id is sdma id including
* both PCIe-optimized SDMAs and XGMI-
@@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
return 0;
}
+static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
+ struct queue *q)
+{
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+ if (q->sdma_id >= get_num_sdma_queues(dqm))
+ return;
+ dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
+ return;
+ dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+ }
+}
+
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q)
{
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
if (q->sdma_id >= get_num_sdma_queues(dqm))
return;
+ dqm_lock(dqm);
dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+ dqm_unlock(dqm);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
return;
+ dqm_lock(dqm);
dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+ dqm_unlock(dqm);
}
}
@@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
retval = allocate_doorbell(qpd, q);
if (retval)
- goto out_deallocate_sdma_queue;
+ goto out_deallocate_sdma_queue_locked;
dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
@@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue:
- deallocate_sdma_queue(dqm, q);
+out_deallocate_sdma_queue_locked:
+ deallocate_sdma_queue_locked(dqm, q);
return retval;
}
@@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
retval = allocate_doorbell(qpd, q);
if (retval)
- goto out_deallocate_sdma_queue;
+ goto out_deallocate_sdma_queue_locked;
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
@@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue:
+out_deallocate_sdma_queue_locked:
if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
deallocate_sdma_queue(dqm, q);
@@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
- deallocate_sdma_queue(dqm, q);
+ deallocate_sdma_queue_locked(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
dqm->xgmi_sdma_queue_count--;
- deallocate_sdma_queue(dqm, q);
+ deallocate_sdma_queue_locked(dqm, q);
}
list_del(&q->list);
@@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
list_for_each_entry(q, &qpd->queues_list, list) {
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
dqm->sdma_queue_count--;
- deallocate_sdma_queue(dqm, q);
+ deallocate_sdma_queue_locked(dqm, q);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
dqm->xgmi_sdma_queue_count--;
- deallocate_sdma_queue(dqm, q);
+ deallocate_sdma_queue_locked(dqm, q);
}
if (q->properties.is_active)
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues
[not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-31 21:19 ` Zeng, Oak
2019-05-31 21:19 ` [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
` (2 subsequent siblings)
3 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-05-31 21:19 UTC (permalink / raw)
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhao, Yong, Kuehling, Felix, Freehill, Chris, Zeng, Oak,
Liu, Alex
Don't do the same for compute queues
Change-Id: Id5f743ca10c2b761590bfe18cab2f802d3c04d2d
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 1f707bb..74944ec 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1230,7 +1230,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
q->properties.is_evicted = (q->properties.queue_size > 0 &&
q->properties.queue_percent > 0 &&
q->properties.queue_address != 0);
- dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+ dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch
[not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:19 ` [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
@ 2019-05-31 21:19 ` Zeng, Oak
[not found] ` <1559337538-14249-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:19 ` [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
2019-05-31 21:31 ` [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Kuehling, Felix
3 siblings, 1 reply; 8+ messages in thread
From: Zeng, Oak @ 2019-05-31 21:19 UTC (permalink / raw)
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhao, Yong, Kuehling, Felix, Freehill, Chris, Zeng, Oak,
Liu, Alex
This is prepare work to fix a circular lock dependency.
No logic change
Change-Id: I4e0ee918260e7780de972dd71f4ce787b4f6dde9
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 172 ++++++++-------------
1 file changed, 62 insertions(+), 110 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 74944ec..aac7f7b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -42,10 +42,6 @@
static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
unsigned int pasid, unsigned int vmid);
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
- struct queue *q,
- struct qcm_process_device *qpd);
-
static int execute_queues_cpsch(struct device_queue_manager *dqm,
enum kfd_unmap_queues_filter filter,
uint32_t filter_param);
@@ -55,15 +51,16 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
static int map_queues_cpsch(struct device_queue_manager *dqm);
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
- struct queue *q,
- struct qcm_process_device *qpd);
-
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q);
static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
struct queue *q);
+static inline void deallocate_hqd(struct device_queue_manager *dqm,
+ struct queue *q);
+static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
+static int allocate_sdma_queue(struct device_queue_manager *dqm,
+ struct queue *q);
static void kfd_process_hw_exception(struct work_struct *work);
static inline
@@ -272,6 +269,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
int retval;
+ struct mqd_manager *mqd_mgr;
print_queue(q);
@@ -302,18 +300,51 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
- retval = create_compute_queue_nocpsch(dqm, q, qpd);
- else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
- q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
- retval = create_sdma_queue_nocpsch(dqm, q, qpd);
- else
- retval = -EINVAL;
+ {
+ retval = allocate_hqd(dqm, q);
+ if (retval)
+ goto deallocate_vmid;
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+ retval = allocate_sdma_queue(dqm, q);
+ if (retval)
+ goto deallocate_vmid;
+ dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+ }
- if (retval) {
- if (list_empty(&qpd->queues_list))
- deallocate_vmid(dqm, qpd, q);
- goto out_unlock;
+ retval = allocate_doorbell(qpd, q);
+ if (retval)
+ goto out_deallocate_hqd;
+
+ retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (retval)
+ goto out_deallocate_doorbell;
+
+ pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
+ q->pipe, q->queue);
+
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+ dqm->dev->kfd2kgd->set_scratch_backing_va(
+ dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
+
+ if (q->properties.is_active) {
+
+ if (WARN(q->process->mm != current->mm,
+ "should only run in user thread"))
+ retval = -EFAULT;
+ else
+ /* TODO for sdma queue load mqd was called
+ * unconditionally in original code, with mm_struct set
+ * to NULL. I am not sure whether we need to keep
+ * original logic here */
+ retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
+ &q->properties, current->mm);
+ if (retval)
+ goto out_uninit_mqd;
}
list_add(&q->list, &qpd->queues_list);
@@ -334,6 +365,19 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
+out_uninit_mqd:
+ mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+out_deallocate_doorbell:
+ deallocate_doorbell(qpd, q);
+out_deallocate_hqd:
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+ deallocate_hqd(dqm, q);
+ else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+ deallocate_sdma_queue_locked(dqm, q);
+deallocate_vmid:
+ if (list_empty(&qpd->queues_list))
+ deallocate_vmid(dqm, qpd, q);
out_unlock:
dqm_unlock(dqm);
return retval;
@@ -379,58 +423,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
dqm->allocated_queues[q->pipe] |= (1 << q->queue);
}
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
- struct queue *q,
- struct qcm_process_device *qpd)
-{
- struct mqd_manager *mqd_mgr;
- int retval;
-
- mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
-
- retval = allocate_hqd(dqm, q);
- if (retval)
- return retval;
-
- retval = allocate_doorbell(qpd, q);
- if (retval)
- goto out_deallocate_hqd;
-
- retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
- &q->gart_mqd_addr, &q->properties);
- if (retval)
- goto out_deallocate_doorbell;
-
- pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
- q->pipe, q->queue);
-
- dqm->dev->kfd2kgd->set_scratch_backing_va(
- dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
-
- if (!q->properties.is_active)
- return 0;
-
- if (WARN(q->process->mm != current->mm,
- "should only run in user thread"))
- retval = -EFAULT;
- else
- retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
- &q->properties, current->mm);
- if (retval)
- goto out_uninit_mqd;
-
- return 0;
-
-out_uninit_mqd:
- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
- deallocate_doorbell(qpd, q);
-out_deallocate_hqd:
- deallocate_hqd(dqm, q);
-
- return retval;
-}
-
/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
* to avoid asynchronized access
*/
@@ -991,46 +983,6 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
}
}
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
- struct queue *q,
- struct qcm_process_device *qpd)
-{
- struct mqd_manager *mqd_mgr;
- int retval;
-
- mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA];
-
- retval = allocate_sdma_queue(dqm, q);
- if (retval)
- return retval;
-
- retval = allocate_doorbell(qpd, q);
- if (retval)
- goto out_deallocate_sdma_queue_locked;
-
- dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
- retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
- &q->gart_mqd_addr, &q->properties);
- if (retval)
- goto out_deallocate_doorbell;
-
- retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
- NULL);
- if (retval)
- goto out_uninit_mqd;
-
- return 0;
-
-out_uninit_mqd:
- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
- deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue_locked:
- deallocate_sdma_queue_locked(dqm, q);
-
- return retval;
-}
-
/*
* Device Queue Manager implementation for cp scheduler
*/
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency
[not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:19 ` [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
2019-05-31 21:19 ` [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
@ 2019-05-31 21:19 ` Zeng, Oak
2019-05-31 21:31 ` [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Kuehling, Felix
3 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-05-31 21:19 UTC (permalink / raw)
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhao, Yong, Kuehling, Felix, Freehill, Chris, Zeng, Oak,
Liu, Alex
The idea to break the circular lock dependency is
to move init_mqd out of lock protection of dqm lock
in callstack #1 below. There is no need to.
[ 59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0
[ 513.604034] ======================================================
[ 513.604205] WARNING: possible circular locking dependency detected
[ 513.604375] 4.18.0-kfd-root #2 Tainted: G W
[ 513.604530] ------------------------------------------------------
[ 513.604699] kswapd0/611 is trying to acquire lock:
[ 513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[ 513.605150]
but task is already holding lock:
[ 513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
[ 513.605540]
which lock already depends on the new lock.
[ 513.605747]
the existing dependency chain (in reverse order) is:
[ 513.605944]
-> #4 (&anon_vma->rwsem){++++}:
[ 513.606106] __vma_adjust+0x147/0x7f0
[ 513.606231] __split_vma+0x179/0x190
[ 513.606353] mprotect_fixup+0x217/0x260
[ 513.606553] do_mprotect_pkey+0x211/0x380
[ 513.606752] __x64_sys_mprotect+0x1b/0x20
[ 513.606954] do_syscall_64+0x50/0x1a0
[ 513.607149] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 513.607380]
-> #3 (&mapping->i_mmap_rwsem){++++}:
[ 513.607678] rmap_walk_file+0x1f0/0x280
[ 513.607887] page_referenced+0xdd/0x180
[ 513.608081] shrink_page_list+0x853/0xcb0
[ 513.608279] shrink_inactive_list+0x33b/0x700
[ 513.608483] shrink_node_memcg+0x37a/0x7f0
[ 513.608682] shrink_node+0xd8/0x490
[ 513.608869] balance_pgdat+0x18b/0x3b0
[ 513.609062] kswapd+0x203/0x5c0
[ 513.609241] kthread+0x100/0x140
[ 513.609420] ret_from_fork+0x24/0x30
[ 513.609607]
-> #2 (fs_reclaim){+.+.}:
[ 513.609883] kmem_cache_alloc_trace+0x34/0x2e0
[ 513.610093] reservation_object_reserve_shared+0x139/0x300
[ 513.610326] ttm_bo_init_reserved+0x291/0x480 [ttm]
[ 513.610567] amdgpu_bo_do_create+0x1d2/0x650 [amdgpu]
[ 513.610811] amdgpu_bo_create+0x40/0x1f0 [amdgpu]
[ 513.611041] amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu]
[ 513.611290] amdgpu_bo_create_kernel+0x12/0x70 [amdgpu]
[ 513.611584] amdgpu_ttm_init+0x2cb/0x560 [amdgpu]
[ 513.611823] gmc_v9_0_sw_init+0x400/0x750 [amdgpu]
[ 513.612491] amdgpu_device_init+0x14eb/0x1990 [amdgpu]
[ 513.612730] amdgpu_driver_load_kms+0x78/0x290 [amdgpu]
[ 513.612958] drm_dev_register+0x111/0x1a0
[ 513.613171] amdgpu_pci_probe+0x11c/0x1e0 [amdgpu]
[ 513.613389] local_pci_probe+0x3f/0x90
[ 513.613581] pci_device_probe+0x102/0x1c0
[ 513.613779] driver_probe_device+0x2a7/0x480
[ 513.613984] __driver_attach+0x10a/0x110
[ 513.614179] bus_for_each_dev+0x67/0xc0
[ 513.614372] bus_add_driver+0x1eb/0x260
[ 513.614565] driver_register+0x5b/0xe0
[ 513.614756] do_one_initcall+0xac/0x357
[ 513.614952] do_init_module+0x5b/0x213
[ 513.615145] load_module+0x2542/0x2d30
[ 513.615337] __do_sys_finit_module+0xd2/0x100
[ 513.615541] do_syscall_64+0x50/0x1a0
[ 513.615731] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 513.615963]
-> #1 (reservation_ww_class_mutex){+.+.}:
[ 513.616293] amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu]
[ 513.616554] init_mqd+0x223/0x260 [amdgpu]
[ 513.616779] create_queue_nocpsch+0x4d9/0x600 [amdgpu]
[ 513.617031] pqm_create_queue+0x37c/0x520 [amdgpu]
[ 513.617270] kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu]
[ 513.617522] kfd_ioctl+0x202/0x350 [amdgpu]
[ 513.617724] do_vfs_ioctl+0x9f/0x6c0
[ 513.617914] ksys_ioctl+0x66/0x70
[ 513.618095] __x64_sys_ioctl+0x16/0x20
[ 513.618286] do_syscall_64+0x50/0x1a0
[ 513.618476] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 513.618695]
-> #0 (&dqm->lock_hidden){+.+.}:
[ 513.618984] __mutex_lock+0x98/0x970
[ 513.619197] evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[ 513.619459] kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
[ 513.619710] kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
[ 513.620103] amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
[ 513.620363] amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
[ 513.620614] __mmu_notifier_invalidate_range_start+0x70/0xb0
[ 513.620851] try_to_unmap_one+0x7fc/0x8f0
[ 513.621049] rmap_walk_anon+0x121/0x290
[ 513.621242] try_to_unmap+0x93/0xf0
[ 513.621428] shrink_page_list+0x606/0xcb0
[ 513.621625] shrink_inactive_list+0x33b/0x700
[ 513.621835] shrink_node_memcg+0x37a/0x7f0
[ 513.622034] shrink_node+0xd8/0x490
[ 513.622219] balance_pgdat+0x18b/0x3b0
[ 513.622410] kswapd+0x203/0x5c0
[ 513.622589] kthread+0x100/0x140
[ 513.622769] ret_from_fork+0x24/0x30
[ 513.622957]
other info that might help us debug this:
[ 513.623354] Chain exists of:
&dqm->lock_hidden --> &mapping->i_mmap_rwsem --> &anon_vma->rwsem
[ 513.623900] Possible unsafe locking scenario:
[ 513.624189] CPU0 CPU1
[ 513.624397] ---- ----
[ 513.624594] lock(&anon_vma->rwsem);
[ 513.624771] lock(&mapping->i_mmap_rwsem);
[ 513.625020] lock(&anon_vma->rwsem);
[ 513.625253] lock(&dqm->lock_hidden);
[ 513.625433]
*** DEADLOCK ***
[ 513.625783] 3 locks held by kswapd0/611:
[ 513.625967] #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
[ 513.626309] #1: 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
[ 513.626671] #2: 0000000067b5cd12 (srcu){....}, at: __mmu_notifier_invalidate_range_start+0x5/0xb0
[ 513.627037]
stack backtrace:
[ 513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G W 4.18.0-kfd-root #2
[ 513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[ 513.627990] Call Trace:
[ 513.628143] dump_stack+0x7c/0xbb
[ 513.628315] print_circular_bug.isra.37+0x21b/0x228
[ 513.628581] __lock_acquire+0xf7d/0x1470
[ 513.628782] ? unwind_next_frame+0x6c/0x4f0
[ 513.628974] ? lock_acquire+0xec/0x1e0
[ 513.629154] lock_acquire+0xec/0x1e0
[ 513.629357] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[ 513.629587] __mutex_lock+0x98/0x970
[ 513.629790] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[ 513.630047] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[ 513.630309] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[ 513.630562] evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
[ 513.630816] kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
[ 513.631057] kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
[ 513.631288] amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
[ 513.631536] amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
[ 513.632076] __mmu_notifier_invalidate_range_start+0x70/0xb0
[ 513.632299] try_to_unmap_one+0x7fc/0x8f0
[ 513.632487] ? page_lock_anon_vma_read+0x68/0x250
[ 513.632690] rmap_walk_anon+0x121/0x290
[ 513.632875] try_to_unmap+0x93/0xf0
[ 513.633050] ? page_remove_rmap+0x330/0x330
[ 513.633239] ? rcu_read_unlock+0x60/0x60
[ 513.633422] ? page_get_anon_vma+0x160/0x160
[ 513.633613] shrink_page_list+0x606/0xcb0
[ 513.633800] shrink_inactive_list+0x33b/0x700
[ 513.633997] shrink_node_memcg+0x37a/0x7f0
[ 513.634186] ? shrink_node+0xd8/0x490
[ 513.634363] shrink_node+0xd8/0x490
[ 513.634537] balance_pgdat+0x18b/0x3b0
[ 513.634718] kswapd+0x203/0x5c0
[ 513.634887] ? wait_woken+0xb0/0xb0
[ 513.635062] kthread+0x100/0x140
[ 513.635231] ? balance_pgdat+0x3b0/0x3b0
[ 513.635414] ? kthread_delayed_work_timer_fn+0x80/0x80
[ 513.635626] ret_from_fork+0x24/0x30
[ 513.636042] Evicting PASID 32768 queues
[ 513.936236] Restoring PASID 32768 queues
[ 524.708912] Evicting PASID 32768 queues
[ 524.999875] Restoring PASID 32768 queues
Change-Id: Ib40b5fb8d7c6572888a0c879a315b382eb948647
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index aac7f7b..4808777 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -273,8 +273,6 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
print_queue(q);
- dqm_lock(dqm);
-
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new usermode queue because %d queues were already created\n",
dqm->total_queue_count);
@@ -347,6 +345,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
goto out_uninit_mqd;
}
+ dqm_lock(dqm);
list_add(&q->list, &qpd->queues_list);
qpd->queue_count++;
if (q->properties.is_active)
@@ -365,6 +364,9 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
+out_unlock:
+ dqm_unlock(dqm);
+ return retval;
out_uninit_mqd:
mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
out_deallocate_doorbell:
@@ -378,8 +380,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
deallocate_vmid:
if (list_empty(&qpd->queues_list))
deallocate_vmid(dqm, qpd, q);
-out_unlock:
- dqm_unlock(dqm);
+
return retval;
}
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
[not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
` (2 preceding siblings ...)
2019-05-31 21:19 ` [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
@ 2019-05-31 21:31 ` Kuehling, Felix
[not found] ` <f0e92b32-bd0a-aca4-c7dd-83f8e97b9a6f-5C7GfCeVMHo@public.gmane.org>
3 siblings, 1 reply; 8+ messages in thread
From: Kuehling, Felix @ 2019-05-31 21:31 UTC (permalink / raw)
To: Zeng, Oak,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhao, Yong, Freehill, Chris, Liu, Alex
On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
> SDMA queue allocation requires the dqm lock as it modify
> the global dqm members. Introduce functions to allocate/deallocate
> in locked/unlocked circumstance.
>
> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 46 ++++++++++++++++------
> 1 file changed, 35 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ece35c7..1f707bb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q);
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> + struct queue *q);
>
> static void kfd_process_hw_exception(struct work_struct *work);
>
> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
> deallocate_hqd(dqm, q);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> dqm->sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> dqm->xgmi_sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> } else {
> pr_debug("q->properties.type %d is invalid\n",
> q->properties.type);
> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> if (dqm->sdma_bitmap == 0)
> return -ENOMEM;
> + dqm_lock(dqm);
Doesn't this cause a locking error where you try to take the same lock
twice in this call tree:
create_queue_nocpsch (takes DQM lock)
-> create_sdma_queue_nocpsch
-> allocate_sdma_queue (takes DQM lock again)
> bit = __ffs64(dqm->sdma_bitmap);
> dqm->sdma_bitmap &= ~(1ULL << bit);
> + dqm_unlock(dqm);
> q->sdma_id = bit;
> q->properties.sdma_engine_id = q->sdma_id %
> get_num_sdma_engines(dqm);
> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> if (dqm->xgmi_sdma_bitmap == 0)
> return -ENOMEM;
> + dqm_lock(dqm);
> bit = __ffs64(dqm->xgmi_sdma_bitmap);
> dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
> + dqm_unlock(dqm);
> q->sdma_id = bit;
> /* sdma_engine_id is sdma id including
> * both PCIe-optimized SDMAs and XGMI-
> @@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
> return 0;
> }
>
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> + struct queue *q)
> +{
> + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> + if (q->sdma_id >= get_num_sdma_queues(dqm))
> + return;
> + dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> + if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
> + return;
> + dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> + }
> +}
> +
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q)
> {
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> if (q->sdma_id >= get_num_sdma_queues(dqm))
> return;
> + dqm_lock(dqm);
> dqm->sdma_bitmap |= (1ULL << q->sdma_id);
> + dqm_unlock(dqm);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
> return;
> + dqm_lock(dqm);
> dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
> + dqm_unlock(dqm);
> }
> }
You could minimize code duplication by defining deallocate_sdma_queue as
a simple wrapper:
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q)
{
dqm_lock(dqm);
deallocate_sdma_queue_locked(dqm, q);
dqm_unlock(dqm);
}
>
> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>
> retval = allocate_doorbell(qpd, q);
> if (retval)
> - goto out_deallocate_sdma_queue;
> + goto out_deallocate_sdma_queue_locked;
>
> dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> @@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
> mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> out_deallocate_doorbell:
> deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue:
> - deallocate_sdma_queue(dqm, q);
> +out_deallocate_sdma_queue_locked:
> + deallocate_sdma_queue_locked(dqm, q);
>
> return retval;
> }
> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>
> retval = allocate_doorbell(qpd, q);
> if (retval)
> - goto out_deallocate_sdma_queue;
> + goto out_deallocate_sdma_queue_locked;
>
> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> q->properties.type)];
> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>
> out_deallocate_doorbell:
> deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue:
> +out_deallocate_sdma_queue_locked:
Why are you renaming this label? You don't hold the DQM lock when you
get here.
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> deallocate_sdma_queue(dqm, q);
> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> dqm->sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> dqm->xgmi_sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> }
>
> list_del(&q->list);
> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> list_for_each_entry(q, &qpd->queues_list, list) {
> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> dqm->sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> dqm->xgmi_sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
> }
>
> if (q->properties.is_active)
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch
[not found] ` <1559337538-14249-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-31 21:40 ` Kuehling, Felix
0 siblings, 0 replies; 8+ messages in thread
From: Kuehling, Felix @ 2019-05-31 21:40 UTC (permalink / raw)
To: Zeng, Oak,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhao, Yong, Freehill, Chris, Liu, Alex
On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
> This is prepare work to fix a circular lock dependency.
> No logic change
>
> Change-Id: I4e0ee918260e7780de972dd71f4ce787b4f6dde9
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 172 ++++++++-------------
> 1 file changed, 62 insertions(+), 110 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 74944ec..aac7f7b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -42,10 +42,6 @@
> static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
> unsigned int pasid, unsigned int vmid);
>
> -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
> - struct queue *q,
> - struct qcm_process_device *qpd);
> -
> static int execute_queues_cpsch(struct device_queue_manager *dqm,
> enum kfd_unmap_queues_filter filter,
> uint32_t filter_param);
> @@ -55,15 +51,16 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
>
> static int map_queues_cpsch(struct device_queue_manager *dqm);
>
> -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
> - struct queue *q,
> - struct qcm_process_device *qpd);
> -
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q);
> static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> struct queue *q);
>
> +static inline void deallocate_hqd(struct device_queue_manager *dqm,
> + struct queue *q);
> +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
> +static int allocate_sdma_queue(struct device_queue_manager *dqm,
> + struct queue *q);
> static void kfd_process_hw_exception(struct work_struct *work);
>
> static inline
> @@ -272,6 +269,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> struct qcm_process_device *qpd)
> {
> int retval;
> + struct mqd_manager *mqd_mgr;
Reverse Christmas tree ... Put the longer declaration first. It tends to
make code more readable.
>
> print_queue(q);
>
> @@ -302,18 +300,51 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> q->properties.tba_addr = qpd->tba_addr;
> q->properties.tma_addr = qpd->tma_addr;
>
> + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> + q->properties.type)];
> if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> - retval = create_compute_queue_nocpsch(dqm, q, qpd);
> - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> - q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> - retval = create_sdma_queue_nocpsch(dqm, q, qpd);
> - else
> - retval = -EINVAL;
> + {
The open brace should be on the same line as the "if". Please run
check_patch.pl, it should find those coding style violations. This
convenient command checks the top 4 commits on your local branch without
even having to generate the patch files first: "scripts/checkpatch.pl -g
HEAD~4..HEAD".
> + retval = allocate_hqd(dqm, q);
> + if (retval)
> + goto deallocate_vmid;
> + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> + retval = allocate_sdma_queue(dqm, q);
> + if (retval)
> + goto deallocate_vmid;
> + dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> + }
>
> - if (retval) {
> - if (list_empty(&qpd->queues_list))
> - deallocate_vmid(dqm, qpd, q);
> - goto out_unlock;
> + retval = allocate_doorbell(qpd, q);
> + if (retval)
> + goto out_deallocate_hqd;
> +
> + retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> + &q->gart_mqd_addr, &q->properties);
> + if (retval)
> + goto out_deallocate_doorbell;
> +
> + pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
> + q->pipe, q->queue);
> +
> + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> + dqm->dev->kfd2kgd->set_scratch_backing_va(
> + dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
> +
> + if (q->properties.is_active) {
> +
> + if (WARN(q->process->mm != current->mm,
> + "should only run in user thread"))
> + retval = -EFAULT;
> + else
> + /* TODO for sdma queue load mqd was called
> + * unconditionally in original code, with mm_struct set
> + * to NULL. I am not sure whether we need to keep
> + * original logic here */
> + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
> + &q->properties, current->mm);
> + if (retval)
> + goto out_uninit_mqd;
> }
>
> list_add(&q->list, &qpd->queues_list);
> @@ -334,6 +365,19 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> pr_debug("Total of %d queues are accountable so far\n",
> dqm->total_queue_count);
>
> +out_uninit_mqd:
> + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +out_deallocate_doorbell:
> + deallocate_doorbell(qpd, q);
> +out_deallocate_hqd:
> + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> + deallocate_hqd(dqm, q);
> + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
> + deallocate_sdma_queue_locked(dqm, q);
> +deallocate_vmid:
> + if (list_empty(&qpd->queues_list))
> + deallocate_vmid(dqm, qpd, q);
> out_unlock:
> dqm_unlock(dqm);
> return retval;
This looks like you always deallocate doorbells, SDMA queues etc., even
when queue creation was successful. Seems wrong.
Regards,
Felix
> @@ -379,58 +423,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
> dqm->allocated_queues[q->pipe] |= (1 << q->queue);
> }
>
> -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
> - struct queue *q,
> - struct qcm_process_device *qpd)
> -{
> - struct mqd_manager *mqd_mgr;
> - int retval;
> -
> - mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
> -
> - retval = allocate_hqd(dqm, q);
> - if (retval)
> - return retval;
> -
> - retval = allocate_doorbell(qpd, q);
> - if (retval)
> - goto out_deallocate_hqd;
> -
> - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> - &q->gart_mqd_addr, &q->properties);
> - if (retval)
> - goto out_deallocate_doorbell;
> -
> - pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
> - q->pipe, q->queue);
> -
> - dqm->dev->kfd2kgd->set_scratch_backing_va(
> - dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
> -
> - if (!q->properties.is_active)
> - return 0;
> -
> - if (WARN(q->process->mm != current->mm,
> - "should only run in user thread"))
> - retval = -EFAULT;
> - else
> - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
> - &q->properties, current->mm);
> - if (retval)
> - goto out_uninit_mqd;
> -
> - return 0;
> -
> -out_uninit_mqd:
> - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -out_deallocate_doorbell:
> - deallocate_doorbell(qpd, q);
> -out_deallocate_hqd:
> - deallocate_hqd(dqm, q);
> -
> - return retval;
> -}
> -
> /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
> * to avoid asynchronized access
> */
> @@ -991,46 +983,6 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> }
> }
>
> -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
> - struct queue *q,
> - struct qcm_process_device *qpd)
> -{
> - struct mqd_manager *mqd_mgr;
> - int retval;
> -
> - mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA];
> -
> - retval = allocate_sdma_queue(dqm, q);
> - if (retval)
> - return retval;
> -
> - retval = allocate_doorbell(qpd, q);
> - if (retval)
> - goto out_deallocate_sdma_queue_locked;
> -
> - dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
> - &q->gart_mqd_addr, &q->properties);
> - if (retval)
> - goto out_deallocate_doorbell;
> -
> - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
> - NULL);
> - if (retval)
> - goto out_uninit_mqd;
> -
> - return 0;
> -
> -out_uninit_mqd:
> - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -out_deallocate_doorbell:
> - deallocate_doorbell(qpd, q);
> -out_deallocate_sdma_queue_locked:
> - deallocate_sdma_queue_locked(dqm, q);
> -
> - return retval;
> -}
> -
> /*
> * Device Queue Manager implementation for cp scheduler
> */
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
[not found] ` <f0e92b32-bd0a-aca4-c7dd-83f8e97b9a6f-5C7GfCeVMHo@public.gmane.org>
@ 2019-05-31 21:50 ` Kuehling, Felix
[not found] ` <d00bae2c-8c2f-b8b7-a69a-7205fd01a55a-5C7GfCeVMHo@public.gmane.org>
0 siblings, 1 reply; 8+ messages in thread
From: Kuehling, Felix @ 2019-05-31 21:50 UTC (permalink / raw)
To: Zeng, Oak,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhao, Yong, Liu, Alex, Freehill, Chris
On 2019-05-31 5:31 p.m., Kuehling, Felix wrote:
> On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
>> SDMA queue allocation requires the dqm lock as it modify
>> the global dqm members. Introduce functions to allocate/deallocate
>> in locked/unlocked circumstance.
>>
>> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
>> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>> ---
>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 46 ++++++++++++++++------
>> 1 file changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index ece35c7..1f707bb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>
>> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>> struct queue *q);
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> + struct queue *q);
>>
>> static void kfd_process_hw_exception(struct work_struct *work);
>>
>> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>> deallocate_hqd(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else {
>> pr_debug("q->properties.type %d is invalid\n",
>> q->properties.type);
>> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> if (dqm->sdma_bitmap == 0)
>> return -ENOMEM;
>> + dqm_lock(dqm);
> Doesn't this cause a locking error where you try to take the same lock
> twice in this call tree:
>
> create_queue_nocpsch (takes DQM lock)
> -> create_sdma_queue_nocpsch
> -> allocate_sdma_queue (takes DQM lock again)
>
BTW, I think you actually caught a bug here. In the create_queue_cpsch
path we failed to lock the DQM for SDMA queue allocation. In
create_queue_nocpsch it was not a problem because we took the DQM lock
earlier (which is itself a problem you're working on fixing).
However, now you're making the problem worse for the nocpsch case, at
least until patch 4, which moves the DQM locking in
create_queue_nocpsch. Maybe this change should come after patch 4 so you
don't end up with a worse problem in the middle of your patch series.
Also, I think you're doing the locking unnecessarily fine grained. It's
probably enough to take the DQM lock once at the start of
allocate_sdma_queue, and drop it once in the end. No need to duplicate
this in the two if branches.
Regards,
Felix
>> bit = __ffs64(dqm->sdma_bitmap);
>> dqm->sdma_bitmap &= ~(1ULL << bit);
>> + dqm_unlock(dqm);
>> q->sdma_id = bit;
>> q->properties.sdma_engine_id = q->sdma_id %
>> get_num_sdma_engines(dqm);
>> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> if (dqm->xgmi_sdma_bitmap == 0)
>> return -ENOMEM;
>> + dqm_lock(dqm);
>> bit = __ffs64(dqm->xgmi_sdma_bitmap);
>> dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>> + dqm_unlock(dqm);
>> q->sdma_id = bit;
>> /* sdma_engine_id is sdma id including
>> * both PCIe-optimized SDMAs and XGMI-
>> @@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> return 0;
>> }
>>
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> + struct queue *q)
>> +{
>> + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> + if (q->sdma_id >= get_num_sdma_queues(dqm))
>> + return;
>> + dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> + if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> + return;
>> + dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> + }
>> +}
>> +
>> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>> struct queue *q)
>> {
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> if (q->sdma_id >= get_num_sdma_queues(dqm))
>> return;
>> + dqm_lock(dqm);
>> dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> + dqm_unlock(dqm);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> return;
>> + dqm_lock(dqm);
>> dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> + dqm_unlock(dqm);
>> }
>> }
> You could minimize code duplication by defining deallocate_sdma_queue as
> a simple wrapper:
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q)
> {
> dqm_lock(dqm);
> deallocate_sdma_queue_locked(dqm, q);
> dqm_unlock(dqm);
> }
>
>
>>
>> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>
>> retval = allocate_doorbell(qpd, q);
>> if (retval)
>> - goto out_deallocate_sdma_queue;
>> + goto out_deallocate_sdma_queue_locked;
>>
>> dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>> retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
>> @@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>> mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>> out_deallocate_doorbell:
>> deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> - deallocate_sdma_queue(dqm, q);
>> +out_deallocate_sdma_queue_locked:
>> + deallocate_sdma_queue_locked(dqm, q);
>>
>> return retval;
>> }
>> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>>
>> retval = allocate_doorbell(qpd, q);
>> if (retval)
>> - goto out_deallocate_sdma_queue;
>> + goto out_deallocate_sdma_queue_locked;
>>
>> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>> q->properties.type)];
>> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>>
>> out_deallocate_doorbell:
>> deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> +out_deallocate_sdma_queue_locked:
> Why are you renaming this label? You don't hold the DQM lock when you
> get here.
>
>
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>> deallocate_sdma_queue(dqm, q);
>> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> }
>>
>> list_del(&q->list);
>> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>> list_for_each_entry(q, &qpd->queues_list, list) {
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> }
>>
>> if (q->properties.is_active)
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
[not found] ` <d00bae2c-8c2f-b8b7-a69a-7205fd01a55a-5C7GfCeVMHo@public.gmane.org>
@ 2019-06-03 16:05 ` Zeng, Oak
0 siblings, 0 replies; 8+ messages in thread
From: Zeng, Oak @ 2019-06-03 16:05 UTC (permalink / raw)
To: Kuehling, Felix,
amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Zhao, Yong, Liu, Alex, Freehill, Chris
Hi Felix,
See comment inline [Oak]
Regards,
Oak
-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling@amd.com>
Sent: Friday, May 31, 2019 5:50 PM
To: Zeng, Oak <Oak.Zeng@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhao, Yong <Yong.Zhao@amd.com>; Freehill, Chris <Chris.Freehill@amd.com>; Liu, Alex <Alex.Liu@amd.com>
Subject: Re: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
On 2019-05-31 5:31 p.m., Kuehling, Felix wrote:
> On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
>> SDMA queue allocation requires the dqm lock as it modify the global
>> dqm members. Introduce functions to allocate/deallocate in
>> locked/unlocked circumstance.
>>
>> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
>> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>> ---
>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 46 ++++++++++++++++------
>> 1 file changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index ece35c7..1f707bb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct
>> device_queue_manager *dqm,
>>
>> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>> struct queue *q);
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> + struct queue *q);
>>
>> static void kfd_process_hw_exception(struct work_struct *work);
>>
>> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>> deallocate_hqd(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else {
>> pr_debug("q->properties.type %d is invalid\n",
>> q->properties.type);
>> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> if (dqm->sdma_bitmap == 0)
>> return -ENOMEM;
>> + dqm_lock(dqm);
> Doesn't this cause a locking error where you try to take the same lock
> twice in this call tree:
>
> create_queue_nocpsch (takes DQM lock)
> -> create_sdma_queue_nocpsch
> -> allocate_sdma_queue (takes DQM lock again)
>
BTW, I think you actually caught a bug here. In the create_queue_cpsch path we failed to lock the DQM for SDMA queue allocation. In create_queue_nocpsch it was not a problem because we took the DQM lock earlier (which is itself a problem you're working on fixing).
However, now you're making the problem worse for the nocpsch case, at least until patch 4, which moves the DQM locking in create_queue_nocpsch. Maybe this change should come after patch 4 so you don't end up with a worse problem in the middle of your patch series.
[Oak]: I realized the same issue when I wrote this patch...I will have to create function allocate_sdma_queue_locked/unlocked to fix the issue properly. But I will have to delete allocate_sdma_queue_unlocked after patch 4 because it won't be used any more after patch 4.
It is a good idea to do this patch after patch 4. I will re-submit the patches series, also fixing other issues you mentioned.
Also, I think you're doing the locking unnecessarily fine grained. It's probably enough to take the DQM lock once at the start of allocate_sdma_queue, and drop it once in the end. No need to duplicate this in the two if branches.
[Oak] Agreed.
Regards,
Felix
>> bit = __ffs64(dqm->sdma_bitmap);
>> dqm->sdma_bitmap &= ~(1ULL << bit);
>> + dqm_unlock(dqm);
>> q->sdma_id = bit;
>> q->properties.sdma_engine_id = q->sdma_id %
>> get_num_sdma_engines(dqm);
>> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> if (dqm->xgmi_sdma_bitmap == 0)
>> return -ENOMEM;
>> + dqm_lock(dqm);
>> bit = __ffs64(dqm->xgmi_sdma_bitmap);
>> dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>> + dqm_unlock(dqm);
>> q->sdma_id = bit;
>> /* sdma_engine_id is sdma id including
>> * both PCIe-optimized SDMAs and XGMI- @@ -953,17 +959,35 @@
>> static int allocate_sdma_queue(struct device_queue_manager *dqm,
>> return 0;
>> }
>>
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> + struct queue *q)
>> +{
>> + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> + if (q->sdma_id >= get_num_sdma_queues(dqm))
>> + return;
>> + dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> + if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> + return;
>> + dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> + }
>> +}
>> +
>> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>> struct queue *q)
>> {
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> if (q->sdma_id >= get_num_sdma_queues(dqm))
>> return;
>> + dqm_lock(dqm);
>> dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> + dqm_unlock(dqm);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> return;
>> + dqm_lock(dqm);
>> dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> + dqm_unlock(dqm);
>> }
>> }
> You could minimize code duplication by defining deallocate_sdma_queue
> as a simple wrapper:
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q) {
> dqm_lock(dqm);
> deallocate_sdma_queue_locked(dqm, q);
> dqm_unlock(dqm);
> }
>
>
>>
>> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct
>> device_queue_manager *dqm,
>>
>> retval = allocate_doorbell(qpd, q);
>> if (retval)
>> - goto out_deallocate_sdma_queue;
>> + goto out_deallocate_sdma_queue_locked;
>>
>> dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>> retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, @@
>> -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>> mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>> out_deallocate_doorbell:
>> deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> - deallocate_sdma_queue(dqm, q);
>> +out_deallocate_sdma_queue_locked:
>> + deallocate_sdma_queue_locked(dqm, q);
>>
>> return retval;
>> }
>> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct
>> device_queue_manager *dqm, struct queue *q,
>>
>> retval = allocate_doorbell(qpd, q);
>> if (retval)
>> - goto out_deallocate_sdma_queue;
>> + goto out_deallocate_sdma_queue_locked;
>>
>> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>> q->properties.type)];
>> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct
>> device_queue_manager *dqm, struct queue *q,
>>
>> out_deallocate_doorbell:
>> deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> +out_deallocate_sdma_queue_locked:
> Why are you renaming this label? You don't hold the DQM lock when you
> get here.
>
>
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>> q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>> deallocate_sdma_queue(dqm, q);
>> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct
>> device_queue_manager *dqm,
>>
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> }
>>
>> list_del(&q->list);
>> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>> list_for_each_entry(q, &qpd->queues_list, list) {
>> if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> dqm->sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> dqm->xgmi_sdma_queue_count--;
>> - deallocate_sdma_queue(dqm, q);
>> + deallocate_sdma_queue_locked(dqm, q);
>> }
>>
>> if (q->properties.is_active)
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2019-06-03 16:05 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-05-31 21:19 [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak
[not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:19 ` [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
2019-05-31 21:19 ` [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
[not found] ` <1559337538-14249-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:40 ` Kuehling, Felix
2019-05-31 21:19 ` [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
2019-05-31 21:31 ` [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Kuehling, Felix
[not found] ` <f0e92b32-bd0a-aca4-c7dd-83f8e97b9a6f-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:50 ` Kuehling, Felix
[not found] ` <d00bae2c-8c2f-b8b7-a69a-7205fd01a55a-5C7GfCeVMHo@public.gmane.org>
2019-06-03 16:05 ` Zeng, Oak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox