AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Kuehling, Felix" <Felix.Kuehling-5C7GfCeVMHo@public.gmane.org>
To: "Zeng, Oak" <Oak.Zeng-5C7GfCeVMHo@public.gmane.org>,
	"amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org"
	<amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org>
Cc: "Zhao, Yong" <Yong.Zhao-5C7GfCeVMHo@public.gmane.org>,
	"Freehill, Chris" <Chris.Freehill-5C7GfCeVMHo@public.gmane.org>,
	"Liu, Alex" <Alex.Liu-5C7GfCeVMHo@public.gmane.org>
Subject: Re: [PATCH 6/6] drm/amdkfd: Fix sdma queue allocate race condition
Date: Wed, 5 Jun 2019 22:24:42 +0000	[thread overview]
Message-ID: <a77d6d10-23b5-5a0d-734f-ba6918cb6b37@amd.com> (raw)
In-Reply-To: <1559750793-16608-6-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>

I think the simpler way to fix this, is to restructure 
create_queue_cpsch similar to the nocpsch version where we allocate the 
MQD early and take the DQM lock right after that. That way you don't 
need locked and unlocked variants of allocate_sdma_queue and 
deallocate_sdma_queue.

Regards,
   Felix


On 2019-06-05 12:06 p.m., Zeng, Oak wrote:
> SDMA queue allocation requires the dqm lock as it modifies
> the global dqm members. Introduce functions to allocate/deallocate
> in locked/unlocked circumstances.
>
> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
> Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 34 ++++++++++++++++------
>   1 file changed, 25 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 6b1a2ee..52e4ede 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -53,6 +53,8 @@ static int map_queues_cpsch(struct device_queue_manager *dqm);
>   
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   				struct queue *q);
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> +				struct queue *q);
>   
>   static inline void deallocate_hqd(struct device_queue_manager *dqm,
>   				struct queue *q);
> @@ -434,10 +436,10 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>   		deallocate_hqd(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		dqm->sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm->xgmi_sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else {
>   		pr_debug("q->properties.type %d is invalid\n",
>   				q->properties.type);
> @@ -914,9 +916,12 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   {
>   	int bit;
>   
> +	dqm_lock(dqm);
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> -		if (dqm->sdma_bitmap == 0)
> +		if (dqm->sdma_bitmap == 0) {
> +			dqm_unlock(dqm);
>   			return -ENOMEM;
> +		}
>   		bit = __ffs64(dqm->sdma_bitmap);
>   		dqm->sdma_bitmap &= ~(1ULL << bit);
>   		q->sdma_id = bit;
> @@ -925,8 +930,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   		q->properties.sdma_queue_id = q->sdma_id /
>   				get_num_sdma_engines(dqm);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> -		if (dqm->xgmi_sdma_bitmap == 0)
> +		if (dqm->xgmi_sdma_bitmap == 0) {
> +			dqm_unlock(dqm);
>   			return -ENOMEM;
> +		}
>   		bit = __ffs64(dqm->xgmi_sdma_bitmap);
>   		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>   		q->sdma_id = bit;
> @@ -942,13 +949,14 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
>   				get_num_xgmi_sdma_engines(dqm);
>   	}
>   
> +	dqm_unlock(dqm);
>   	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
>   	pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
>   
>   	return 0;
>   }
>   
> -static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>   				struct queue *q)
>   {
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> @@ -962,6 +970,14 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   	}
>   }
>   
> +static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +				struct queue *q)
> +{
> +	dqm_lock(dqm);
> +	deallocate_sdma_queue_locked(dqm, q);
> +	dqm_unlock(dqm);
> +}
> +
>   /*
>    * Device Queue Manager implementation for cp scheduler
>    */
> @@ -1356,10 +1372,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   
>   	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   		dqm->sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   		dqm->xgmi_sdma_queue_count--;
> -		deallocate_sdma_queue(dqm, q);
> +		deallocate_sdma_queue_locked(dqm, q);
>   	}
>   
>   	list_del(&q->list);
> @@ -1585,10 +1601,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	list_for_each_entry(q, &qpd->queues_list, list) {
>   		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   			dqm->sdma_queue_count--;
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue_locked(dqm, q);
>   		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   			dqm->xgmi_sdma_queue_count--;
> -			deallocate_sdma_queue(dqm, q);
> +			deallocate_sdma_queue_locked(dqm, q);
>   		}
>   
>   		if (q->properties.is_active)
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2019-06-05 22:24 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-05 16:06 [PATCH 1/6] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
     [not found] ` <1559750793-16608-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-05 16:06   ` [PATCH 2/6] drm/amdkfd: Only load sdma mqd when queue is active Zeng, Oak
2019-06-05 16:06   ` [PATCH 3/6] drm/amdkfd: Refactor create_queue_nocpsch Zeng, Oak
     [not found]     ` <1559750793-16608-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-05 22:19       ` Kuehling, Felix
2019-06-05 16:06   ` [PATCH 4/6] drm/amdkfd: Separate mqd allocation and initialization Zeng, Oak
     [not found]     ` <1559750793-16608-4-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-05 22:18       ` Kuehling, Felix
2019-06-05 16:06   ` [PATCH 5/6] drm/amdkfd: Fix a circular lock dependency Zeng, Oak
     [not found]     ` <1559750793-16608-5-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-05 22:20       ` Kuehling, Felix
2019-06-05 16:06   ` [PATCH 6/6] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak
     [not found]     ` <1559750793-16608-6-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-05 22:24       ` Kuehling, Felix [this message]
2019-06-05 16:08   ` [PATCH 1/6] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
  -- strict thread matches above, loose matches on Subject: below --
2019-06-06 18:25 [PATCH 4/6] drm/amdkfd: Separate mqd allocation and initialization Zeng, Oak
     [not found] ` <1559845507-3052-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-06 18:25   ` [PATCH 6/6] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak
     [not found]     ` <1559845507-3052-3-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-06 20:17       ` Kuehling, Felix
2019-06-04  2:52 [PATCH 1/6] drm/amdkfd: Only initialize sdma vm for sdma queues Zeng, Oak
     [not found] ` <1559616755-13116-1-git-send-email-Oak.Zeng-5C7GfCeVMHo@public.gmane.org>
2019-06-04  2:52   ` [PATCH 6/6] drm/amdkfd: Fix sdma queue allocate race condition Zeng, Oak

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a77d6d10-23b5-5a0d-734f-ba6918cb6b37@amd.com \
    --to=felix.kuehling-5c7gfcevmho@public.gmane.org \
    --cc=Alex.Liu-5C7GfCeVMHo@public.gmane.org \
    --cc=Chris.Freehill-5C7GfCeVMHo@public.gmane.org \
    --cc=Oak.Zeng-5C7GfCeVMHo@public.gmane.org \
    --cc=Yong.Zhao-5C7GfCeVMHo@public.gmane.org \
    --cc=amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox