AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Kuehling, Felix" <Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
To: "Zeng, Oak" <Oak.Zeng-5C7GfCeVMHo@public.gmane.org>,
	"amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org"
	<amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Cc: "Zhao, Yong" <Yong.Zhao-5C7GfCeVMHo@public.gmane.org>,
	"Liu, Alex" <Alex.Liu-5C7GfCeVMHo@public.gmane.org>,
	"Freehill, Chris" <Chris.Freehill-5C7GfCeVMHo@public.gmane.org>
Subject: Re: [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition
Date: Fri, 31 May 2019 21:50:01 +0000	[thread overview]
Message-ID: <d00bae2c-8c2f-b8b7-a69a-7205fd01a55a@amd.com> (raw)
In-Reply-To: <f0e92b32-bd0a-aca4-c7dd-83f8e97b9a6f-5C7GfCeVMHo@public.gmane.org>

On 2019-05-31 5:31 p.m., Kuehling, Felix wrote:
> On 2019-05-31 5:19 p.m., Zeng, Oak wrote:
>> SDMA queue allocation requires the dqm lock as it modifies
>> the global dqm members. Introduce functions to allocate/deallocate
>> in locked/unlocked circumstances.
>>
>> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
>> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
>> ---
>>    .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 46 ++++++++++++++++------
>>    1 file changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index ece35c7..1f707bb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>    
>>    static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>    				struct queue *q);
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> +				struct queue *q);
>>    
>>    static void kfd_process_hw_exception(struct work_struct *work);
>>    
>> @@ -446,10 +448,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>    		deallocate_hqd(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		dqm->sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		dqm->xgmi_sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else {
>>    		pr_debug("q->properties.type %d is invalid\n",
>>    				q->properties.type);
>> @@ -922,8 +924,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		if (dqm->sdma_bitmap == 0)
>>    			return -ENOMEM;
>> +		dqm_lock(dqm);
> Doesn't this cause a locking error where you try to take the same lock
> twice in this call tree:
>
> create_queue_nocpsch (takes DQM lock)
>       -> create_sdma_queue_nocpsch
>           -> allocate_sdma_queue (takes DQM lock again)
>
BTW, I think you actually caught a bug here. In the create_queue_cpsch 
path we failed to lock the DQM for SDMA queue allocation. In 
create_queue_nocpsch it was not a problem because we took the DQM lock 
earlier (which is itself a problem you're working on fixing).

However, now you're making the problem worse for the nocpsch case, at 
least until patch 4, which moves the DQM locking in 
create_queue_nocpsch. Maybe this change should come after patch 4 so you 
don't end up with a worse problem in the middle of your patch series.

Also, I think you're making the locking unnecessarily fine-grained. It's
probably enough to take the DQM lock once at the start of
allocate_sdma_queue, and drop it once at the end. No need to duplicate
this in the two if branches.

Regards,
   Felix

>>    		bit = __ffs64(dqm->sdma_bitmap);
>>    		dqm->sdma_bitmap &= ~(1ULL << bit);
>> +		dqm_unlock(dqm);
>>    		q->sdma_id = bit;
>>    		q->properties.sdma_engine_id = q->sdma_id %
>>    				get_num_sdma_engines(dqm);
>> @@ -932,8 +936,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		if (dqm->xgmi_sdma_bitmap == 0)
>>    			return -ENOMEM;
>> +		dqm_lock(dqm);
>>    		bit = __ffs64(dqm->xgmi_sdma_bitmap);
>>    		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>> +		dqm_unlock(dqm);
>>    		q->sdma_id = bit;
>>    		/* sdma_engine_id is sdma id including
>>    		 * both PCIe-optimized SDMAs and XGMI-
>> @@ -953,17 +959,35 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>>    	return 0;
>>    }
>>    
>> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>> +				struct queue *q)
>> +{
>> +	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>> +		if (q->sdma_id >= get_num_sdma_queues(dqm))
>> +			return;
>> +		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> +	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>> +		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>> +			return;
>> +		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> +	}
>> +}
>> +
>>    static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>>    				struct queue *q)
>>    {
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		if (q->sdma_id >= get_num_sdma_queues(dqm))
>>    			return;
>> +		dqm_lock(dqm);
>>    		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
>> +		dqm_unlock(dqm);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
>>    			return;
>> +		dqm_lock(dqm);
>>    		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
>> +		dqm_unlock(dqm);
>>    	}
>>    }
> You could minimize code duplication by defining deallocate_sdma_queue as
> a simple wrapper:
>
> static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> struct queue *q)
> {
>           dqm_lock(dqm);
>           deallocate_sdma_queue_locked(dqm, q);
>           dqm_unlock(dqm);
> }
>
>
>>    
>> @@ -982,7 +1006,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>    
>>    	retval = allocate_doorbell(qpd, q);
>>    	if (retval)
>> -		goto out_deallocate_sdma_queue;
>> +		goto out_deallocate_sdma_queue_locked;
>>    
>>    	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>>    	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
>> @@ -1001,8 +1025,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
>>    	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>    out_deallocate_doorbell:
>>    	deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> -	deallocate_sdma_queue(dqm, q);
>> +out_deallocate_sdma_queue_locked:
>> +	deallocate_sdma_queue_locked(dqm, q);
>>    
>>    	return retval;
>>    }
>> @@ -1194,7 +1218,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>>    
>>    	retval = allocate_doorbell(qpd, q);
>>    	if (retval)
>> -		goto out_deallocate_sdma_queue;
>> +		goto out_deallocate_sdma_queue_locked;
>>    
>>    	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>    			q->properties.type)];
>> @@ -1242,7 +1266,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
>>    
>>    out_deallocate_doorbell:
>>    	deallocate_doorbell(qpd, q);
>> -out_deallocate_sdma_queue:
>> +out_deallocate_sdma_queue_locked:
> Why are you renaming this label? You don't hold the DQM lock when you
> get here.
>
>
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>>    		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>>    		deallocate_sdma_queue(dqm, q);
>> @@ -1396,10 +1420,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>    
>>    	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    		dqm->sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    		dqm->xgmi_sdma_queue_count--;
>> -		deallocate_sdma_queue(dqm, q);
>> +		deallocate_sdma_queue_locked(dqm, q);
>>    	}
>>    
>>    	list_del(&q->list);
>> @@ -1625,10 +1649,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>>    	list_for_each_entry(q, &qpd->queues_list, list) {
>>    		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>>    			dqm->sdma_queue_count--;
>> -			deallocate_sdma_queue(dqm, q);
>> +			deallocate_sdma_queue_locked(dqm, q);
>>    		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>>    			dqm->xgmi_sdma_queue_count--;
>> -			deallocate_sdma_queue(dqm, q);
>> +			deallocate_sdma_queue_locked(dqm, q);
>>    		}
>>    
>>    		if (q->properties.is_active)
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2019-05-31 21:50 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-31 21:19 [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak
     [not found] ` <1559337538-14249-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:19   ` [PATCH 2/4] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
2019-05-31 21:19   ` [PATCH 3/4] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
     [not found]     ` <1559337538-14249-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:40       ` Kuehling, Felix
2019-05-31 21:19   ` [PATCH 4/4] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
2019-05-31 21:31   ` [PATCH 1/4] drm/amdkfd: Fix sdma queue allocate race condition Kuehling, Felix
     [not found]     ` <f0e92b32-bd0a-aca4-c7dd-83f8e97b9a6f-5C7GfCeVMHo@public.gmane.org>
2019-05-31 21:50       ` Kuehling, Felix [this message]
     [not found]         ` <d00bae2c-8c2f-b8b7-a69a-7205fd01a55a-5C7GfCeVMHo@public.gmane.org>
2019-06-03 16:05           ` Zeng, Oak

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d00bae2c-8c2f-b8b7-a69a-7205fd01a55a@amd.com \
    --to=felix.kuehling-5c7gfcevmho@public.gmane.org \
    --cc=Alex.Liu-5C7GfCeVMHo@public.gmane.org \
    --cc=Chris.Freehill-5C7GfCeVMHo@public.gmane.org \
    --cc=Oak.Zeng-5C7GfCeVMHo@public.gmane.org \
    --cc=Yong.Zhao-5C7GfCeVMHo@public.gmane.org \
    --cc=amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox