[PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)

amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
@ 2025-08-13 13:45 Alex Deucher
  2025-08-13 16:31 ` Wu, David
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-13 13:45 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alex Deucher, Sathishkumar S

If there are multiple instances of the VCN running,
we may end up switching the video profile while another
instance is active because we only take into account
the current instance's submissions.  Look at all
outstanding fences for the video profile.

v2: drop early exit in begin_use()
v3: handle possible race between begin_use() work handler

Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile handling")
Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40 ++++++++++++-------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 9a76e11d1c184..593c1ddf8819b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -415,19 +415,25 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 	struct amdgpu_vcn_inst *vcn_inst =
 		container_of(work, struct amdgpu_vcn_inst, idle_work.work);
 	struct amdgpu_device *adev = vcn_inst->adev;
-	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
-	unsigned int i = vcn_inst->inst, j;
+	unsigned int total_fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
+	unsigned int i, j;
 	int r = 0;
 
-	if (adev->vcn.harvest_config & (1 << i))
+	if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
 		return;
 
-	for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
-		fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
+	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+		struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
+
+		for (j = 0; j < v->num_enc_rings; ++j)
+			fence[i] += amdgpu_fence_count_emitted(&v->ring_enc[j]);
+		fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
+		total_fences += fence[i];
+	}
 
 	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
 	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
-	    !adev->vcn.inst[i].using_unified_queue) {
+	    !vcn_inst->using_unified_queue) {
 		struct dpg_pause_state new_state;
 
 		if (fence[i] ||
@@ -436,18 +442,18 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 		else
 			new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
 
-		adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
+		vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
 	}
 
-	fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
-	fences += fence[i];
-
-	if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
+	if (!fence[vcn_inst->inst] && !atomic_read(&vcn_inst->total_submission_cnt)) {
+		/* This is specific to this instance */
 		mutex_lock(&vcn_inst->vcn_pg_lock);
 		vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
 		mutex_unlock(&vcn_inst->vcn_pg_lock);
 		mutex_lock(&adev->vcn.workload_profile_mutex);
-		if (adev->vcn.workload_profile_active) {
+		/* This is global and depends on all VCN instances */
+		if (adev->vcn.workload_profile_active && !total_fences &&
+		    !atomic_read(&adev->vcn.total_submission_cnt)) {
 			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
 							    false);
 			if (r)
@@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 	int r = 0;
 
 	atomic_inc(&vcn_inst->total_submission_cnt);
+	atomic_inc(&adev->vcn.total_submission_cnt);
 
 	cancel_delayed_work_sync(&vcn_inst->idle_work);
 
-	/* We can safely return early here because we've cancelled the
-	 * the delayed work so there is no one else to set it to false
-	 * and we don't care if someone else sets it to true.
-	 */
-	if (adev->vcn.workload_profile_active)
-		goto pg_lock;
-
 	mutex_lock(&adev->vcn.workload_profile_mutex);
 	if (!adev->vcn.workload_profile_active) {
 		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
@@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 	}
 	mutex_unlock(&adev->vcn.workload_profile_mutex);
 
-pg_lock:
 	mutex_lock(&vcn_inst->vcn_pg_lock);
 	vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
 
@@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
 		atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
 
 	atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
+	atomic_dec(&ring->adev->vcn.total_submission_cnt);
 
 	schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
 			      VCN_IDLE_TIMEOUT);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index b3fb1d0e43fc9..febc3ce8641ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -352,6 +352,7 @@ struct amdgpu_vcn {
 
 	uint16_t inst_mask;
 	uint8_t	num_inst_per_aid;
+	atomic_t		total_submission_cnt;
 
 	/* IP reg dump */
 	uint32_t		*ip_dump;
-- 
2.50.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 13:45 [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3) Alex Deucher
@ 2025-08-13 16:31 ` Wu, David
  2025-08-13 16:51   ` Alex Deucher
  0 siblings, 1 reply; 36+ messages in thread
From: Wu, David @ 2025-08-13 16:31 UTC (permalink / raw)
  To: Alex Deucher, amd-gfx; +Cc: Sathishkumar S

Hi Alex,

The addition of  total_submission_cnt should work - in that
it is unlikely to have a context switch right after the begin_use().
The suggestion of moving it inside the lock (which I prefer in case someone
adds more before the lock and not reviewed thoroughly)
  - up to you to decide.

Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>

Thanks,
David
On 8/13/2025 9:45 AM, Alex Deucher wrote:
> If there are multiple instances of the VCN running,
> we may end up switching the video profile while another
> instance is active because we only take into account
> the current instance's submissions.  Look at all
> outstanding fences for the video profile.
>
> v2: drop early exit in begin_use()
> v3: handle possible race between begin_use() work handler
>
> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile handling")
> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40 ++++++++++++-------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>   2 files changed, 21 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index 9a76e11d1c184..593c1ddf8819b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -415,19 +415,25 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>   	struct amdgpu_vcn_inst *vcn_inst =
>   		container_of(work, struct amdgpu_vcn_inst, idle_work.work);
>   	struct amdgpu_device *adev = vcn_inst->adev;
> -	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> -	unsigned int i = vcn_inst->inst, j;
> +	unsigned int total_fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> +	unsigned int i, j;
>   	int r = 0;
>   
> -	if (adev->vcn.harvest_config & (1 << i))
> +	if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>   		return;
>   
> -	for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> -		fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> +	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> +		struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> +
> +		for (j = 0; j < v->num_enc_rings; ++j)
> +			fence[i] += amdgpu_fence_count_emitted(&v->ring_enc[j]);
> +		fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> +		total_fences += fence[i];
> +	}
>   
>   	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
>   	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> -	    !adev->vcn.inst[i].using_unified_queue) {
> +	    !vcn_inst->using_unified_queue) {
>   		struct dpg_pause_state new_state;
>   
>   		if (fence[i] ||
> @@ -436,18 +442,18 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>   		else
>   			new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>   
> -		adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> +		vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>   	}
>   
> -	fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> -	fences += fence[i];
> -
> -	if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> +	if (!fence[vcn_inst->inst] && !atomic_read(&vcn_inst->total_submission_cnt)) {
> +		/* This is specific to this instance */
>   		mutex_lock(&vcn_inst->vcn_pg_lock);
>   		vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>   		mutex_unlock(&vcn_inst->vcn_pg_lock);
>   		mutex_lock(&adev->vcn.workload_profile_mutex);
> -		if (adev->vcn.workload_profile_active) {
> +		/* This is global and depends on all VCN instances */
> +		if (adev->vcn.workload_profile_active && !total_fences &&
> +		    !atomic_read(&adev->vcn.total_submission_cnt)) {
>   			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>   							    false);
>   			if (r)
> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>   	int r = 0;
>   
>   	atomic_inc(&vcn_inst->total_submission_cnt);
> +	atomic_inc(&adev->vcn.total_submission_cnt);
move this addition down inside the mutex lock
>   	cancel_delayed_work_sync(&vcn_inst->idle_work);
>   
> -	/* We can safely return early here because we've cancelled the
> -	 * the delayed work so there is no one else to set it to false
> -	 * and we don't care if someone else sets it to true.
> -	 */
> -	if (adev->vcn.workload_profile_active)
> -		goto pg_lock;
> -
>   	mutex_lock(&adev->vcn.workload_profile_mutex);
move to here:
                atomic_inc(&adev->vcn.total_submission_cnt);
I think this should work for multiple instances.

David
>   	if (!adev->vcn.workload_profile_active) {
>   		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>   	}
>   	mutex_unlock(&adev->vcn.workload_profile_mutex);
>   
> -pg_lock:
>   	mutex_lock(&vcn_inst->vcn_pg_lock);
>   	vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>   
> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
>   		atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>   
>   	atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> +	atomic_dec(&ring->adev->vcn.total_submission_cnt);
>   
>   	schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>   			      VCN_IDLE_TIMEOUT);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> index b3fb1d0e43fc9..febc3ce8641ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>   
>   	uint16_t inst_mask;
>   	uint8_t	num_inst_per_aid;
> +	atomic_t		total_submission_cnt;
>   
>   	/* IP reg dump */
>   	uint32_t		*ip_dump;


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 16:31 ` Wu, David
@ 2025-08-13 16:51   ` Alex Deucher
  2025-08-13 18:16     ` Wu, David
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-13 16:51 UTC (permalink / raw)
  To: Wu, David; +Cc: Alex Deucher, amd-gfx, Sathishkumar S

On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>
> Hi Alex,
>
> The addition of  total_submission_cnt should work - in that
> it is unlikely to have a context switch right after the begin_use().
> The suggestion of moving it inside the lock (which I prefer in case someone
> adds more before the lock and not reviewed thoroughly)
>   - up to you to decide.
>
> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>
> Thanks,
> David
> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> > If there are multiple instances of the VCN running,
> > we may end up switching the video profile while another
> > instance is active because we only take into account
> > the current instance's submissions.  Look at all
> > outstanding fences for the video profile.
> >
> > v2: drop early exit in begin_use()
> > v3: handle possible race between begin_use() work handler
> >
> > Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile handling")
> > Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> > Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40 ++++++++++++-------------
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >   2 files changed, 21 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> > index 9a76e11d1c184..593c1ddf8819b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> > @@ -415,19 +415,25 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >       struct amdgpu_vcn_inst *vcn_inst =
> >               container_of(work, struct amdgpu_vcn_inst, idle_work.work);
> >       struct amdgpu_device *adev = vcn_inst->adev;
> > -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> > -     unsigned int i = vcn_inst->inst, j;
> > +     unsigned int total_fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> > +     unsigned int i, j;
> >       int r = 0;
> >
> > -     if (adev->vcn.harvest_config & (1 << i))
> > +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
> >               return;
> >
> > -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> > -             fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> > +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> > +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> > +
> > +             for (j = 0; j < v->num_enc_rings; ++j)
> > +                     fence[i] += amdgpu_fence_count_emitted(&v->ring_enc[j]);
> > +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> > +             total_fences += fence[i];
> > +     }
> >
> >       /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
> >       if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> > -         !adev->vcn.inst[i].using_unified_queue) {
> > +         !vcn_inst->using_unified_queue) {
> >               struct dpg_pause_state new_state;
> >
> >               if (fence[i] ||
> > @@ -436,18 +442,18 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >               else
> >                       new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
> >
> > -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> > +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
> >       }
> >
> > -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> > -     fences += fence[i];
> > -
> > -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> > +     if (!fence[vcn_inst->inst] && !atomic_read(&vcn_inst->total_submission_cnt)) {
> > +             /* This is specific to this instance */
> >               mutex_lock(&vcn_inst->vcn_pg_lock);
> >               vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >               mutex_unlock(&vcn_inst->vcn_pg_lock);
> >               mutex_lock(&adev->vcn.workload_profile_mutex);
> > -             if (adev->vcn.workload_profile_active) {
> > +             /* This is global and depends on all VCN instances */
> > +             if (adev->vcn.workload_profile_active && !total_fences &&
> > +                 !atomic_read(&adev->vcn.total_submission_cnt)) {
> >                       r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
> >                                                           false);
> >                       if (r)
> > @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
> >       int r = 0;
> >
> >       atomic_inc(&vcn_inst->total_submission_cnt);
> > +     atomic_inc(&adev->vcn.total_submission_cnt);
> move this addition down inside the mutex lock
> >       cancel_delayed_work_sync(&vcn_inst->idle_work);
> >
> > -     /* We can safely return early here because we've cancelled the
> > -      * the delayed work so there is no one else to set it to false
> > -      * and we don't care if someone else sets it to true.
> > -      */
> > -     if (adev->vcn.workload_profile_active)
> > -             goto pg_lock;
> > -
> >       mutex_lock(&adev->vcn.workload_profile_mutex);
> move to here:
>                 atomic_inc(&adev->vcn.total_submission_cnt);
> I think this should work for multiple instances.

Why does this need to be protected by the mutex?

Alex

>
> David
> >       if (!adev->vcn.workload_profile_active) {
> >               r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
> > @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
> >       }
> >       mutex_unlock(&adev->vcn.workload_profile_mutex);
> >
> > -pg_lock:
> >       mutex_lock(&vcn_inst->vcn_pg_lock);
> >       vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >
> > @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
> >               atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
> >
> >       atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> > +     atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >
> >       schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >                             VCN_IDLE_TIMEOUT);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> > index b3fb1d0e43fc9..febc3ce8641ff 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> > @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >
> >       uint16_t inst_mask;
> >       uint8_t num_inst_per_aid;
> > +     atomic_t                total_submission_cnt;
> >
> >       /* IP reg dump */
> >       uint32_t                *ip_dump;
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 16:51   ` Alex Deucher
@ 2025-08-13 18:16     ` Wu, David
  2025-08-13 18:23       ` Sundararaju, Sathishkumar
  0 siblings, 1 reply; 36+ messages in thread
From: Wu, David @ 2025-08-13 18:16 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Alex Deucher, amd-gfx, Sathishkumar S

On 8/13/2025 12:51 PM, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>> Hi Alex,
>>
>> The addition of  total_submission_cnt should work - in that
>> it is unlikely to have a context switch right after the begin_use().
>> The suggestion of moving it inside the lock (which I prefer in case someone
>> adds more before the lock and not reviewed thoroughly)
>>    - up to you to decide.
>>
>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>
>> Thanks,
>> David
>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>> If there are multiple instances of the VCN running,
>>> we may end up switching the video profile while another
>>> instance is active because we only take into account
>>> the current instance's submissions.  Look at all
>>> outstanding fences for the video profile.
>>>
>>> v2: drop early exit in begin_use()
>>> v3: handle possible race between begin_use() work handler
>>>
>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile handling")
>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40 ++++++++++++-------------
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>    2 files changed, 21 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>> @@ -415,19 +415,25 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>        struct amdgpu_vcn_inst *vcn_inst =
>>>                container_of(work, struct amdgpu_vcn_inst, idle_work.work);
>>>        struct amdgpu_device *adev = vcn_inst->adev;
>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>> -     unsigned int i = vcn_inst->inst, j;
>>> +     unsigned int total_fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>> +     unsigned int i, j;
>>>        int r = 0;
>>>
>>> -     if (adev->vcn.harvest_config & (1 << i))
>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>                return;
>>>
>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>> -             fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>> +
>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>> +                     fence[i] += amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>> +             total_fences += fence[i];
>>> +     }
>>>
>>>        /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
>>>        if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>> +         !vcn_inst->using_unified_queue) {
>>>                struct dpg_pause_state new_state;
>>>
>>>                if (fence[i] ||
>>> @@ -436,18 +442,18 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>                else
>>>                        new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>
>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>        }
>>>
>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>> -     fences += fence[i];
>>> -
>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>> +     if (!fence[vcn_inst->inst] && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>> +             /* This is specific to this instance */
>>>                mutex_lock(&vcn_inst->vcn_pg_lock);
>>>                vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>                mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>                mutex_lock(&adev->vcn.workload_profile_mutex);
>>> -             if (adev->vcn.workload_profile_active) {
>>> +             /* This is global and depends on all VCN instances */
>>> +             if (adev->vcn.workload_profile_active && !total_fences &&
>>> +                 !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>                        r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>>>                                                            false);
>>>                        if (r)
>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>        int r = 0;
>>>
>>>        atomic_inc(&vcn_inst->total_submission_cnt);
>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>> move this addition down inside the mutex lock
>>>        cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>
>>> -     /* We can safely return early here because we've cancelled the
>>> -      * the delayed work so there is no one else to set it to false
>>> -      * and we don't care if someone else sets it to true.
>>> -      */
>>> -     if (adev->vcn.workload_profile_active)
>>> -             goto pg_lock;
>>> -
>>>        mutex_lock(&adev->vcn.workload_profile_mutex);
>> move to here:
>>                  atomic_inc(&adev->vcn.total_submission_cnt);
>> I think this should work for multiple instances.
> Why does this need to be protected by the mutex?
hmm.. OK - no need and it is actually better before the mutex.
David
> Alex
>
>> David
>>>        if (!adev->vcn.workload_profile_active) {
>>>                r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>        }
>>>        mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>
>>> -pg_lock:
>>>        mutex_lock(&vcn_inst->vcn_pg_lock);
>>>        vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>
>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
>>>                atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>
>>>        atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>> +     atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>
>>>        schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>                              VCN_IDLE_TIMEOUT);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>
>>>        uint16_t inst_mask;
>>>        uint8_t num_inst_per_aid;
>>> +     atomic_t                total_submission_cnt;
>>>
>>>        /* IP reg dump */
>>>        uint32_t                *ip_dump;


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 18:16     ` Wu, David
@ 2025-08-13 18:23       ` Sundararaju, Sathishkumar
  2025-08-13 20:05         ` Alex Deucher
  0 siblings, 1 reply; 36+ messages in thread
From: Sundararaju, Sathishkumar @ 2025-08-13 18:23 UTC (permalink / raw)
  To: Wu, David, Alex Deucher; +Cc: Alex Deucher, amd-gfx

Hi Alex, Hi David,

I see David's concern but his suggestion yet wont solve the problem, 
neither the current form , reason :-

The emitted fence count and total submission count are fast transients 
which frequently become 0 in between video decodes (between jobs) even 
with the atomics and locks there can be a switch of video power profile, 
in the current form of patch that window is minimized, but still can 
happen if stress tested. But power state of any instance becoming zero 
can be a sure shot indication of break in a video decode, the mistake in 
my patch was using per instance mutex, I should have used a common 
global mutex, then that covers the situation David is trying to bring out.

Using one global vcn.pg_lock for idle and begin_use and using flags to 
track power state could help us totally avoid this situation.

Regards,

Sathish

On 8/13/2025 11:46 PM, Wu, David wrote:
> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>> Hi Alex,
>>>
>>> The addition of  total_submission_cnt should work - in that
>>> it is unlikely to have a context switch right after the begin_use().
>>> The suggestion of moving it inside the lock (which I prefer in case 
>>> someone
>>> adds more before the lock and not reviewed thoroughly)
>>>    - up to you to decide.
>>>
>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>
>>> Thanks,
>>> David
>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>> If there are multiple instances of the VCN running,
>>>> we may end up switching the video profile while another
>>>> instance is active because we only take into account
>>>> the current instance's submissions.  Look at all
>>>> outstanding fences for the video profile.
>>>>
>>>> v2: drop early exit in begin_use()
>>>> v3: handle possible race between begin_use() work handler
>>>>
>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile 
>>>> handling")
>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40 
>>>> ++++++++++++-------------
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>    2 files changed, 21 insertions(+), 20 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>> @@ -415,19 +415,25 @@ static void 
>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>        struct amdgpu_vcn_inst *vcn_inst =
>>>>                container_of(work, struct amdgpu_vcn_inst, 
>>>> idle_work.work);
>>>>        struct amdgpu_device *adev = vcn_inst->adev;
>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>> -     unsigned int i = vcn_inst->inst, j;
>>>> +     unsigned int total_fences = 0, 
>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>> +     unsigned int i, j;
>>>>        int r = 0;
>>>>
>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>                return;
>>>>
>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>> -             fence[i] += 
>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>> +
>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>> +                     fence[i] += 
>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>> +             total_fences += fence[i];
>>>> +     }
>>>>
>>>>        /* Only set DPG pause for VCN3 or below, VCN4 and above will 
>>>> be handled by FW */
>>>>        if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>> +         !vcn_inst->using_unified_queue) {
>>>>                struct dpg_pause_state new_state;
>>>>
>>>>                if (fence[i] ||
>>>> @@ -436,18 +442,18 @@ static void 
>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>                else
>>>>                        new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>>
>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>        }
>>>>
>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>> -     fences += fence[i];
>>>> -
>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>> +     if (!fence[vcn_inst->inst] && 
>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>> +             /* This is specific to this instance */
>>>>                mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>                vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>                mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>> -             if (adev->vcn.workload_profile_active) {
>>>> +             /* This is global and depends on all VCN instances */
>>>> +             if (adev->vcn.workload_profile_active && 
>>>> !total_fences &&
>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>                        r = amdgpu_dpm_switch_power_profile(adev, 
>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>> false);
>>>>                        if (r)
>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct 
>>>> amdgpu_ring *ring)
>>>>        int r = 0;
>>>>
>>>>        atomic_inc(&vcn_inst->total_submission_cnt);
>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>> move this addition down inside the mutex lock
>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>
>>>> -     /* We can safely return early here because we've cancelled the
>>>> -      * the delayed work so there is no one else to set it to false
>>>> -      * and we don't care if someone else sets it to true.
>>>> -      */
>>>> -     if (adev->vcn.workload_profile_active)
>>>> -             goto pg_lock;
>>>> -
>>>>        mutex_lock(&adev->vcn.workload_profile_mutex);
>>> move to here:
>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>> I think this should work for multiple instances.
>> Why does this need to be protected by the mutex?
> hmm.. OK - no need and it is actually better before the mutex.
> David
>> Alex
>>
>>> David
>>>>        if (!adev->vcn.workload_profile_active) {
>>>>                r = amdgpu_dpm_switch_power_profile(adev, 
>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct 
>>>> amdgpu_ring *ring)
>>>>        }
>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>
>>>> -pg_lock:
>>>>        mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>        vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>>
>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring 
>>>> *ring)
>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>>
>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>
>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>                              VCN_IDLE_TIMEOUT);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>
>>>>        uint16_t inst_mask;
>>>>        uint8_t num_inst_per_aid;
>>>> +     atomic_t                total_submission_cnt;
>>>>
>>>>        /* IP reg dump */
>>>>        uint32_t                *ip_dump;
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 18:23       ` Sundararaju, Sathishkumar
@ 2025-08-13 20:05         ` Alex Deucher
  2025-08-13 20:58           ` Sundararaju, Sathishkumar
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-13 20:05 UTC (permalink / raw)
  To: Sundararaju, Sathishkumar; +Cc: Wu, David, Alex Deucher, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 8861 bytes --]

On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
<sathishkumar.sundararaju@amd.com> wrote:
>
> Hi Alex, Hi David,
>
> I see David's concern but his suggestion yet wont solve the problem,
> neither the current form , reason :-
>
> The emitted fence count and total submission count are fast transients
> which frequently become 0 in between video decodes (between jobs) even
> with the atomics and locks there can be a switch of video power profile,
> in the current form of patch that window is minimized, but still can
> happen if stress tested. But power state of any instance becoming zero

Can you explain how this can happen?  I'm not seeing it.

If it is possible, maybe it would be easier to just split the profile
and powergating into separate handlers.  The profile one would be
global and the powergating one would be per instance.  See the
attached patches.

Alex

> can be a sure shot indication of break in a video decode, the mistake in
> my patch was using per instance mutex, I should have used a common
> global mutex, then that covers the situation David is trying to bring out.
>
> Using one global vcn.pg_lock for idle and begin_use and using flags to
> track power state could help us totally avoid this situation.
>
> Regards,
>
> Sathish
>
> On 8/13/2025 11:46 PM, Wu, David wrote:
> > On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
> >>> Hi Alex,
> >>>
> >>> The addition of  total_submission_cnt should work - in that
> >>> it is unlikely to have a context switch right after the begin_use().
> >>> The suggestion of moving it inside the lock (which I prefer in case
> >>> someone
> >>> adds more before the lock and not reviewed thoroughly)
> >>>    - up to you to decide.
> >>>
> >>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>
> >>> Thanks,
> >>> David
> >>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>> If there are multiple instances of the VCN running,
> >>>> we may end up switching the video profile while another
> >>>> instance is active because we only take into account
> >>>> the current instance's submissions.  Look at all
> >>>> outstanding fences for the video profile.
> >>>>
> >>>> v2: drop early exit in begin_use()
> >>>> v3: handle possible race between begin_use() work handler
> >>>>
> >>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
> >>>> handling")
> >>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> >>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>> ---
> >>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>> ++++++++++++-------------
> >>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>    2 files changed, 21 insertions(+), 20 deletions(-)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>> @@ -415,19 +415,25 @@ static void
> >>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>        struct amdgpu_vcn_inst *vcn_inst =
> >>>>                container_of(work, struct amdgpu_vcn_inst,
> >>>> idle_work.work);
> >>>>        struct amdgpu_device *adev = vcn_inst->adev;
> >>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>> -     unsigned int i = vcn_inst->inst, j;
> >>>> +     unsigned int total_fences = 0,
> >>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>> +     unsigned int i, j;
> >>>>        int r = 0;
> >>>>
> >>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
> >>>>                return;
> >>>>
> >>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>> -             fence[i] +=
> >>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> >>>> +
> >>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>> +                     fence[i] +=
> >>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> >>>> +             total_fences += fence[i];
> >>>> +     }
> >>>>
> >>>>        /* Only set DPG pause for VCN3 or below, VCN4 and above will
> >>>> be handled by FW */
> >>>>        if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>> +         !vcn_inst->using_unified_queue) {
> >>>>                struct dpg_pause_state new_state;
> >>>>
> >>>>                if (fence[i] ||
> >>>> @@ -436,18 +442,18 @@ static void
> >>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>                else
> >>>>                        new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
> >>>>
> >>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
> >>>>        }
> >>>>
> >>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>> -     fences += fence[i];
> >>>> -
> >>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>> +     if (!fence[vcn_inst->inst] &&
> >>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>> +             /* This is specific to this instance */
> >>>>                mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>                vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>                mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>> -             if (adev->vcn.workload_profile_active) {
> >>>> +             /* This is global and depends on all VCN instances */
> >>>> +             if (adev->vcn.workload_profile_active &&
> >>>> !total_fences &&
> >>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>                        r = amdgpu_dpm_switch_power_profile(adev,
> >>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>> false);
> >>>>                        if (r)
> >>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>> amdgpu_ring *ring)
> >>>>        int r = 0;
> >>>>
> >>>>        atomic_inc(&vcn_inst->total_submission_cnt);
> >>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
> >>> move this addition down inside the mutex lock
> >>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>
> >>>> -     /* We can safely return early here because we've cancelled the
> >>>> -      * the delayed work so there is no one else to set it to false
> >>>> -      * and we don't care if someone else sets it to true.
> >>>> -      */
> >>>> -     if (adev->vcn.workload_profile_active)
> >>>> -             goto pg_lock;
> >>>> -
> >>>>        mutex_lock(&adev->vcn.workload_profile_mutex);
> >>> move to here:
> >>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>> I think this should work for multiple instances.
> >> Why does this need to be protected by the mutex?
> > hmm.. OK - no need and it is actually better before the mutex.
> > David
> >> Alex
> >>
> >>> David
> >>>>        if (!adev->vcn.workload_profile_active) {
> >>>>                r = amdgpu_dpm_switch_power_profile(adev,
> >>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>> amdgpu_ring *ring)
> >>>>        }
> >>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>
> >>>> -pg_lock:
> >>>>        mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>        vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >>>>
> >>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
> >>>> *ring)
> >>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
> >>>>
> >>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> >>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>
> >>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>                              VCN_IDLE_TIMEOUT);
> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>
> >>>>        uint16_t inst_mask;
> >>>>        uint8_t num_inst_per_aid;
> >>>> +     atomic_t                total_submission_cnt;
> >>>>
> >>>>        /* IP reg dump */
> >>>>        uint32_t                *ip_dump;
> >

[-- Attachment #2: 0001-drm-amdgpu-vcn-fix-video-profile-race-condition-v4.patch --]
[-- Type: text/x-patch, Size: 13485 bytes --]

From 8599112cfc94e8bcf472db38627fd9b3261b107a Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Tue, 12 Aug 2025 11:38:09 -0400
Subject: [PATCH 1/2] drm/amdgpu/vcn: fix video profile race condition (v4)

If there are multiple instances of the VCN running,
we may end up switching the video profile while another
instance is active because we only take into account
the current instance's submissions.  Split the into
two work handlers, one for global state and one for
per instance state.

v2: drop early exit in begin_use()
v3: handle possible race between begin_use() work handler
v4: split into two handlers, one global and one per instance

Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile handling")
Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 56 +++++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  4 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c   |  3 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c   |  2 +
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  3 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  3 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c |  3 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  3 ++
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c |  3 ++
 9 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index fd8ebf4b5a824..ebf730f8ace43 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -91,7 +91,7 @@ MODULE_FIRMWARE(FIRMWARE_VCN4_0_6_1);
 MODULE_FIRMWARE(FIRMWARE_VCN5_0_0);
 MODULE_FIRMWARE(FIRMWARE_VCN5_0_1);
 
-static void amdgpu_vcn_idle_work_handler(struct work_struct *work);
+static void amdgpu_vcn_instance_work_handler(struct work_struct *work);
 static void amdgpu_vcn_reg_dump_fini(struct amdgpu_device *adev);
 
 int amdgpu_vcn_early_init(struct amdgpu_device *adev, int i)
@@ -137,7 +137,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev, int i)
 	mutex_init(&adev->vcn.inst[i].vcn_pg_lock);
 	mutex_init(&adev->vcn.inst[i].engine_reset_mutex);
 	atomic_set(&adev->vcn.inst[i].total_submission_cnt, 0);
-	INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work, amdgpu_vcn_idle_work_handler);
+	INIT_DELAYED_WORK(&adev->vcn.inst[i].idle_work, amdgpu_vcn_instance_work_handler);
 	atomic_set(&adev->vcn.inst[i].dpg_enc_submission_cnt, 0);
 	if ((adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) &&
 	    (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG))
@@ -408,43 +408,61 @@ int amdgpu_vcn_resume(struct amdgpu_device *adev, int i)
 	return 0;
 }
 
-static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
+static void amdgpu_vcn_instance_work_handler(struct work_struct *work)
 {
 	struct amdgpu_vcn_inst *vcn_inst =
 		container_of(work, struct amdgpu_vcn_inst, idle_work.work);
 	struct amdgpu_device *adev = vcn_inst->adev;
-	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
-	unsigned int i = vcn_inst->inst, j;
-	int r = 0;
+	unsigned int fences = 0;
+	unsigned int i;
 
-	if (adev->vcn.harvest_config & (1 << i))
+	if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
 		return;
 
-	for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
-		fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
+	for (i = 0; i < vcn_inst->num_enc_rings; ++i)
+		fences += amdgpu_fence_count_emitted(&vcn_inst->ring_enc[i]);
+	fences += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
 
 	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
 	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
-	    !adev->vcn.inst[i].using_unified_queue) {
+	    !vcn_inst->using_unified_queue) {
 		struct dpg_pause_state new_state;
 
-		if (fence[i] ||
+		if (fences ||
 		    unlikely(atomic_read(&vcn_inst->dpg_enc_submission_cnt)))
 			new_state.fw_based = VCN_DPG_STATE__PAUSE;
 		else
 			new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
 
-		adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
+		vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
 	}
 
-	fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
-	fences += fence[i];
-
 	if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
 		mutex_lock(&vcn_inst->vcn_pg_lock);
 		vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
 		mutex_unlock(&vcn_inst->vcn_pg_lock);
-		mutex_lock(&adev->vcn.workload_profile_mutex);
+	} else {
+		schedule_delayed_work(&vcn_inst->idle_work, VCN_IDLE_TIMEOUT);
+	}
+}
+
+void amdgpu_vcn_global_work_handler(struct work_struct *work)
+{
+	struct amdgpu_device *adev =
+		container_of(work, struct amdgpu_device, vcn.global_work.work);
+	unsigned int total_fences = 0;
+	unsigned int i, j;
+	int r = 0;
+
+	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+		struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
+
+		for (j = 0; j < v->num_enc_rings; ++j)
+			total_fences += amdgpu_fence_count_emitted(&v->ring_enc[j]);
+		total_fences += amdgpu_fence_count_emitted(&v->ring_dec);
+	}
+
+	if (!total_fences && !atomic_read(&adev->vcn.total_submission_cnt)) {
 		if (adev->vcn.workload_profile_active) {
 			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
 							    false);
@@ -454,7 +472,7 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 		}
 		mutex_unlock(&adev->vcn.workload_profile_mutex);
 	} else {
-		schedule_delayed_work(&vcn_inst->idle_work, VCN_IDLE_TIMEOUT);
+		schedule_delayed_work(&adev->vcn.global_work, VCN_IDLE_TIMEOUT);
 	}
 }
 
@@ -464,8 +482,10 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 	struct amdgpu_vcn_inst *vcn_inst = &adev->vcn.inst[ring->me];
 	int r = 0;
 
+	atomic_inc(&adev->vcn.total_submission_cnt);
 	atomic_inc(&vcn_inst->total_submission_cnt);
 
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 	cancel_delayed_work_sync(&vcn_inst->idle_work);
 
 	/* We can safely return early here because we've cancelled the
@@ -525,8 +545,10 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
 	    !adev->vcn.inst[ring->me].using_unified_queue)
 		atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
 
+	atomic_dec(&ring->adev->vcn.total_submission_cnt);
 	atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
 
+	schedule_delayed_work(&adev->vcn.global_work, VCN_IDLE_TIMEOUT);
 	schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
 			      VCN_IDLE_TIMEOUT);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index b3fb1d0e43fc9..70d97a2ac7719 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -352,6 +352,8 @@ struct amdgpu_vcn {
 
 	uint16_t inst_mask;
 	uint8_t	num_inst_per_aid;
+	atomic_t		total_submission_cnt;
+	struct delayed_work	global_work;
 
 	/* IP reg dump */
 	uint32_t		*ip_dump;
@@ -565,4 +567,6 @@ int amdgpu_vcn_reg_dump_init(struct amdgpu_device *adev,
 			     const struct amdgpu_hwip_reg_entry *reg, u32 count);
 void amdgpu_vcn_dump_ip_state(struct amdgpu_ip_block *ip_block);
 void amdgpu_vcn_print_ip_state(struct amdgpu_ip_block *ip_block, struct drm_printer *p);
+void amdgpu_vcn_global_work_handler(struct work_struct *work);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index b115137ab2d69..068bb17c51c20 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -139,6 +139,8 @@ static int vcn_v2_0_sw_init(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	volatile struct amdgpu_fw_shared *fw_shared;
 
+	INIT_DELAYED_WORK(&adev->vcn.global_work, amdgpu_vcn_global_work_handler);
+
 	/* VCN DEC TRAP */
 	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
 			      VCN_2_0__SRCID__UVD_SYSTEM_MESSAGE_INTERRUPT,
@@ -322,6 +324,7 @@ static int vcn_v2_0_hw_fini(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_vcn_inst *vinst = adev->vcn.inst;
 
 	cancel_delayed_work_sync(&vinst->idle_work);
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 
 	if ((adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) ||
 	    (vinst->cur_state != AMD_PG_STATE_GATE &&
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index 95173156f956a..280025a3a0cb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
@@ -177,6 +177,7 @@ static int vcn_v3_0_sw_init(struct amdgpu_ip_block *ip_block)
 	int vcn_doorbell_index = 0;
 	struct amdgpu_device *adev = ip_block->adev;
 
+	INIT_DELAYED_WORK(&adev->vcn.global_work, amdgpu_vcn_global_work_handler);
 	/*
 	 * Note: doorbell assignment is fixed for SRIOV multiple VCN engines
 	 * Formula:
@@ -444,6 +445,7 @@ static int vcn_v3_0_hw_fini(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i;
 
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index ae9d408e50c37..38010f42bcee9 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -184,6 +184,8 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i, r;
 
+	INIT_DELAYED_WORK(&adev->vcn.global_work, amdgpu_vcn_global_work_handler);
+
 	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
 		if (adev->vcn.harvest_config & (1 << i))
 			continue;
@@ -370,6 +372,7 @@ static int vcn_v4_0_hw_fini(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i;
 
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index d18aca3d663b9..0669df8d87314 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -161,6 +161,8 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_ring *ring;
 	int i, r, vcn_inst;
 
+	INIT_DELAYED_WORK(&adev->vcn.global_work, amdgpu_vcn_global_work_handler);
+
 	/* VCN DEC TRAP */
 	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
 		VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
@@ -373,6 +375,7 @@ static int vcn_v4_0_3_hw_fini(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i;
 
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
index 75c884a8f556b..65b34f9bee5a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
@@ -148,6 +148,8 @@ static int vcn_v4_0_5_sw_init(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i, r;
 
+	INIT_DELAYED_WORK(&adev->vcn.global_work, amdgpu_vcn_global_work_handler);
+
 	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
 		volatile struct amdgpu_vcn4_fw_shared *fw_shared;
 
@@ -320,6 +322,7 @@ static int vcn_v4_0_5_hw_fini(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i;
 
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
index 455f829b8bb99..882da3591ad4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
@@ -128,6 +128,8 @@ static int vcn_v5_0_0_sw_init(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i, r;
 
+	INIT_DELAYED_WORK(&adev->vcn.global_work, amdgpu_vcn_global_work_handler);
+
 	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
 		volatile struct amdgpu_vcn5_fw_shared *fw_shared;
 
@@ -283,6 +285,7 @@ static int vcn_v5_0_0_hw_fini(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i;
 
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
index 7cb21e2b4eb0e..213b24b1d0cc2 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
@@ -142,6 +142,8 @@ static int vcn_v5_0_1_sw_init(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_ring *ring;
 	int i, r, vcn_inst;
 
+	INIT_DELAYED_WORK(&adev->vcn.global_work, amdgpu_vcn_global_work_handler);
+
 	/* VCN UNIFIED TRAP */
 	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
 		VCN_5_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
@@ -318,6 +320,7 @@ static int vcn_v5_0_1_hw_fini(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i;
 
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[i];
 
-- 
2.50.1


[-- Attachment #3: 0002-drm-amdgpu-vcn2.5-switch-to-the-global-work-handler.patch --]
[-- Type: text/x-patch, Size: 4195 bytes --]

From c67e8ec11ed1a80bdd48889edf2f0542ce7c39a5 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Wed, 13 Aug 2025 16:03:20 -0400
Subject: [PATCH 2/2] drm/amdgpu/vcn2.5: switch to the global work handler

All of the idle work on vcn2.5 is global, so use the
new global work handler rather than abusing a single
instance of the per instance work handler.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 28 ++++++++++++---------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index 3a7c137a83efb..75158f41b8461 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -109,11 +109,10 @@ static int amdgpu_ih_clientid_vcns[] = {
 	SOC15_IH_CLIENTID_VCN1
 };
 
-static void vcn_v2_5_idle_work_handler(struct work_struct *work)
+static void vcn_v2_5_global_work_handler(struct work_struct *work)
 {
-	struct amdgpu_vcn_inst *vcn_inst =
-		container_of(work, struct amdgpu_vcn_inst, idle_work.work);
-	struct amdgpu_device *adev = vcn_inst->adev;
+	struct amdgpu_device *adev =
+		container_of(work, struct amdgpu_device, vcn.global_work.work);
 	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
 	unsigned int i, j;
 	int r = 0;
@@ -146,7 +145,7 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
 
 	}
 
-	if (!fences && !atomic_read(&adev->vcn.inst[0].total_submission_cnt)) {
+	if (!fences && !atomic_read(&adev->vcn.total_submission_cnt)) {
 		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 						       AMD_PG_STATE_GATE);
 		mutex_lock(&adev->vcn.workload_profile_mutex);
@@ -159,7 +158,7 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
 		}
 		mutex_unlock(&adev->vcn.workload_profile_mutex);
 	} else {
-		schedule_delayed_work(&adev->vcn.inst[0].idle_work, VCN_IDLE_TIMEOUT);
+		schedule_delayed_work(&adev->vcn.global_work, VCN_IDLE_TIMEOUT);
 	}
 }
 
@@ -169,9 +168,9 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
 	struct amdgpu_vcn_inst *v = &adev->vcn.inst[ring->me];
 	int r = 0;
 
-	atomic_inc(&adev->vcn.inst[0].total_submission_cnt);
+	atomic_inc(&adev->vcn.total_submission_cnt);
 
-	cancel_delayed_work_sync(&adev->vcn.inst[0].idle_work);
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 
 	/* We can safely return early here because we've cancelled the
 	 * the delayed work so there is no one else to set it to false
@@ -230,10 +229,9 @@ static void vcn_v2_5_ring_end_use(struct amdgpu_ring *ring)
 	    !adev->vcn.inst[ring->me].using_unified_queue)
 		atomic_dec(&adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
 
-	atomic_dec(&adev->vcn.inst[0].total_submission_cnt);
+	atomic_dec(&adev->vcn.total_submission_cnt);
 
-	schedule_delayed_work(&adev->vcn.inst[0].idle_work,
-			      VCN_IDLE_TIMEOUT);
+	schedule_delayed_work(&adev->vcn.global_work, VCN_IDLE_TIMEOUT);
 }
 
 /**
@@ -299,6 +297,8 @@ static int vcn_v2_5_sw_init(struct amdgpu_ip_block *ip_block)
 	int i, j, r;
 	struct amdgpu_device *adev = ip_block->adev;
 
+	INIT_DELAYED_WORK(&adev->vcn.global_work, vcn_v2_5_global_work_handler);
+
 	for (j = 0; j < adev->vcn.num_vcn_inst; j++) {
 		volatile struct amdgpu_fw_shared *fw_shared;
 
@@ -328,9 +328,6 @@ static int vcn_v2_5_sw_init(struct amdgpu_ip_block *ip_block)
 		if (r)
 			return r;
 
-		/* Override the work func */
-		adev->vcn.inst[j].idle_work.work.func = vcn_v2_5_idle_work_handler;
-
 		amdgpu_vcn_setup_ucode(adev, j);
 
 		r = amdgpu_vcn_resume(adev, j);
@@ -533,14 +530,13 @@ static int vcn_v2_5_hw_fini(struct amdgpu_ip_block *ip_block)
 	struct amdgpu_device *adev = ip_block->adev;
 	int i;
 
+	cancel_delayed_work_sync(&adev->vcn.global_work);
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *vinst = &adev->vcn.inst[i];
 
 		if (adev->vcn.harvest_config & (1 << i))
 			continue;
 
-		cancel_delayed_work_sync(&vinst->idle_work);
-
 		if ((adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) ||
 		    (vinst->cur_state != AMD_PG_STATE_GATE &&
 		     RREG32_SOC15(VCN, i, mmUVD_STATUS)))
-- 
2.50.1


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 20:05         ` Alex Deucher
@ 2025-08-13 20:58           ` Sundararaju, Sathishkumar
  2025-08-13 21:03             ` Alex Deucher
  0 siblings, 1 reply; 36+ messages in thread
From: Sundararaju, Sathishkumar @ 2025-08-13 20:58 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Wu, David, Alex Deucher, amd-gfx


On 8/14/2025 1:35 AM, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> <sathishkumar.sundararaju@amd.com> wrote:
>> Hi Alex, Hi David,
>>
>> I see David's concern but his suggestion yet wont solve the problem,
>> neither the current form , reason :-
>>
>> The emitted fence count and total submission count are fast transients
>> which frequently become 0 in between video decodes (between jobs) even
>> with the atomics and locks there can be a switch of video power profile,
>> in the current form of patch that window is minimized, but still can
>> happen if stress tested. But power state of any instance becoming zero
> Can you explain how this can happen?  I'm not seeing it.
Consider this situation, inst0 and inst1 actively decoding, inst0 decode 
completes, delayed idle work starts.
inst0 idle handler can read 0 total fences and 0 total submission count, 
even if inst1 is actively decoding,
that's between the jobs,
  - as begin_use increaments vcn.total_submission_cnt and end_use 
decreaments vcn.total_submission_cnt that can be 0.
  - if outstanding fences are cleared and no new emitted fence, between 
jobs , can be 0.
  - both of the above conditions do not mean video decode is complete on 
inst1, it is actively decoding.

Whereas if instances are powered off we are sure idle time is past and 
it is powered off, no possible way of
active video decode, when all instances are off we can safely assume no 
active decode and global lock protects
it against new begin_use on any instance. But the only distant concern 
is global common locks w.r.t perf, but we
are already having a global workprofile mutex , so there shouldn't be 
any drop in perf, with just one single
global lock for all instances.

Just sending out a patch with this fix, will leave it to you to decide 
the right method. If you think outstanding total fences
can never be 0 during decode, then your previous version (v3) itself is 
good, there is no real benefit of splitting the handlers as such.

Regards,
Sathish
>
> If it is possible, maybe it would be easier to just split the profile
> and powergating into separate handlers.  The profile one would be
> global and the powergating one would be per instance.  See the
> attached patches.
>
> Alex
>
>> can be a sure shot indication of break in a video decode, the mistake in
>> my patch was using per instance mutex, I should have used a common
>> global mutex, then that covers the situation David is trying to bring out.
>>
>> Using one global vcn.pg_lock for idle and begin_use and using flags to
>> track power state could help us totally avoid this situation.
>>
>> Regards,
>>
>> Sathish
>>
>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>> Hi Alex,
>>>>>
>>>>> The addition of  total_submission_cnt should work - in that
>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>> The suggestion of moving it inside the lock (which I prefer in case
>>>>> someone
>>>>> adds more before the lock and not reviewed thoroughly)
>>>>>     - up to you to decide.
>>>>>
>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>
>>>>> Thanks,
>>>>> David
>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>> If there are multiple instances of the VCN running,
>>>>>> we may end up switching the video profile while another
>>>>>> instance is active because we only take into account
>>>>>> the current instance's submissions.  Look at all
>>>>>> outstanding fences for the video profile.
>>>>>>
>>>>>> v2: drop early exit in begin_use()
>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>
>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>> handling")
>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>> ---
>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>> ++++++++++++-------------
>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>     2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>         struct amdgpu_vcn_inst *vcn_inst =
>>>>>>                 container_of(work, struct amdgpu_vcn_inst,
>>>>>> idle_work.work);
>>>>>>         struct amdgpu_device *adev = vcn_inst->adev;
>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>> +     unsigned int total_fences = 0,
>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>> +     unsigned int i, j;
>>>>>>         int r = 0;
>>>>>>
>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>                 return;
>>>>>>
>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>> -             fence[i] +=
>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>> +
>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>> +                     fence[i] +=
>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>> +             total_fences += fence[i];
>>>>>> +     }
>>>>>>
>>>>>>         /* Only set DPG pause for VCN3 or below, VCN4 and above will
>>>>>> be handled by FW */
>>>>>>         if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>                 struct dpg_pause_state new_state;
>>>>>>
>>>>>>                 if (fence[i] ||
>>>>>> @@ -436,18 +442,18 @@ static void
>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>                 else
>>>>>>                         new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>>>>
>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>         }
>>>>>>
>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>> -     fences += fence[i];
>>>>>> -
>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>> +             /* This is specific to this instance */
>>>>>>                 mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>                 vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>                 mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>> !total_fences &&
>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>                         r = amdgpu_dpm_switch_power_profile(adev,
>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>> false);
>>>>>>                         if (r)
>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>> amdgpu_ring *ring)
>>>>>>         int r = 0;
>>>>>>
>>>>>>         atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>> move this addition down inside the mutex lock
>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>
>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>> -      */
>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>> -             goto pg_lock;
>>>>>> -
>>>>>>         mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>> move to here:
>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>> I think this should work for multiple instances.
>>>> Why does this need to be protected by the mutex?
>>> hmm.. OK - no need and it is actually better before the mutex.
>>> David
>>>> Alex
>>>>
>>>>> David
>>>>>>         if (!adev->vcn.workload_profile_active) {
>>>>>>                 r = amdgpu_dpm_switch_power_profile(adev,
>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>> amdgpu_ring *ring)
>>>>>>         }
>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>
>>>>>> -pg_lock:
>>>>>>         mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>         vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>>>>
>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
>>>>>> *ring)
>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>>>>
>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>
>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>                               VCN_IDLE_TIMEOUT);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>
>>>>>>         uint16_t inst_mask;
>>>>>>         uint8_t num_inst_per_aid;
>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>
>>>>>>         /* IP reg dump */
>>>>>>         uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 20:58           ` Sundararaju, Sathishkumar
@ 2025-08-13 21:03             ` Alex Deucher
  2025-08-13 21:16               ` Sundararaju, Sathishkumar
  2025-08-13 21:47               ` Wu, David
  0 siblings, 2 replies; 36+ messages in thread
From: Alex Deucher @ 2025-08-13 21:03 UTC (permalink / raw)
  To: Sundararaju, Sathishkumar; +Cc: Wu, David, Alex Deucher, amd-gfx

On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
<sathishkumar.sundararaju@amd.com> wrote:
>
>
> On 8/14/2025 1:35 AM, Alex Deucher wrote:
> > On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> > <sathishkumar.sundararaju@amd.com> wrote:
> >> Hi Alex, Hi David,
> >>
> >> I see David's concern but his suggestion yet wont solve the problem,
> >> neither the current form , reason :-
> >>
> >> The emitted fence count and total submission count are fast transients
> >> which frequently become 0 in between video decodes (between jobs) even
> >> with the atomics and locks there can be a switch of video power profile,
> >> in the current form of patch that window is minimized, but still can
> >> happen if stress tested. But power state of any instance becoming zero
> > Can you explain how this can happen?  I'm not seeing it.
> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
> completes, delayed idle work starts.
> inst0 idle handler can read 0 total fences and 0 total submission count,
> even if inst1 is actively decoding,
> that's between the jobs,
>   - as begin_use increaments vcn.total_submission_cnt and end_use
> decreaments vcn.total_submission_cnt that can be 0.
>   - if outstanding fences are cleared and no new emitted fence, between
> jobs , can be 0.
>   - both of the above conditions do not mean video decode is complete on
> inst1, it is actively decoding.

How can there be active decoding without an outstanding fence?  In
that case, total_fences (fences from both instances) would be non-0.

Alex

>
> Whereas if instances are powered off we are sure idle time is past and
> it is powered off, no possible way of
> active video decode, when all instances are off we can safely assume no
> active decode and global lock protects
> it against new begin_use on any instance. But the only distant concern
> is global common locks w.r.t perf, but we
> are already having a global workprofile mutex , so there shouldn't be
> any drop in perf, with just one single
> global lock for all instances.
>
> Just sending out a patch with this fix, will leave it to you to decide
> the right method. If you think outstanding total fences
> can never be 0 during decode, then your previous version (v3) itself is
> good, there is no real benefit of splitting the handlers as such.
>
> Regards,
> Sathish
> >
> > If it is possible, maybe it would be easier to just split the profile
> > and powergating into separate handlers.  The profile one would be
> > global and the powergating one would be per instance.  See the
> > attached patches.
> >
> > Alex
> >
> >> can be a sure shot indication of break in a video decode, the mistake in
> >> my patch was using per instance mutex, I should have used a common
> >> global mutex, then that covers the situation David is trying to bring out.
> >>
> >> Using one global vcn.pg_lock for idle and begin_use and using flags to
> >> track power state could help us totally avoid this situation.
> >>
> >> Regards,
> >>
> >> Sathish
> >>
> >> On 8/13/2025 11:46 PM, Wu, David wrote:
> >>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
> >>>>> Hi Alex,
> >>>>>
> >>>>> The addition of  total_submission_cnt should work - in that
> >>>>> it is unlikely to have a context switch right after the begin_use().
> >>>>> The suggestion of moving it inside the lock (which I prefer in case
> >>>>> someone
> >>>>> adds more before the lock and not reviewed thoroughly)
> >>>>>     - up to you to decide.
> >>>>>
> >>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>>>
> >>>>> Thanks,
> >>>>> David
> >>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>>>> If there are multiple instances of the VCN running,
> >>>>>> we may end up switching the video profile while another
> >>>>>> instance is active because we only take into account
> >>>>>> the current instance's submissions.  Look at all
> >>>>>> outstanding fences for the video profile.
> >>>>>>
> >>>>>> v2: drop early exit in begin_use()
> >>>>>> v3: handle possible race between begin_use() work handler
> >>>>>>
> >>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
> >>>>>> handling")
> >>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> >>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>> ---
> >>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>>>> ++++++++++++-------------
> >>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>>>     2 files changed, 21 insertions(+), 20 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>> @@ -415,19 +415,25 @@ static void
> >>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>         struct amdgpu_vcn_inst *vcn_inst =
> >>>>>>                 container_of(work, struct amdgpu_vcn_inst,
> >>>>>> idle_work.work);
> >>>>>>         struct amdgpu_device *adev = vcn_inst->adev;
> >>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>> -     unsigned int i = vcn_inst->inst, j;
> >>>>>> +     unsigned int total_fences = 0,
> >>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>> +     unsigned int i, j;
> >>>>>>         int r = 0;
> >>>>>>
> >>>>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
> >>>>>>                 return;
> >>>>>>
> >>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>>>> -             fence[i] +=
> >>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> >>>>>> +
> >>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>>>> +                     fence[i] +=
> >>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> >>>>>> +             total_fences += fence[i];
> >>>>>> +     }
> >>>>>>
> >>>>>>         /* Only set DPG pause for VCN3 or below, VCN4 and above will
> >>>>>> be handled by FW */
> >>>>>>         if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>>>> +         !vcn_inst->using_unified_queue) {
> >>>>>>                 struct dpg_pause_state new_state;
> >>>>>>
> >>>>>>                 if (fence[i] ||
> >>>>>> @@ -436,18 +442,18 @@ static void
> >>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>                 else
> >>>>>>                         new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
> >>>>>>
> >>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>         }
> >>>>>>
> >>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>>>> -     fences += fence[i];
> >>>>>> -
> >>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>> +     if (!fence[vcn_inst->inst] &&
> >>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>> +             /* This is specific to this instance */
> >>>>>>                 mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>                 vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>>>                 mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>> -             if (adev->vcn.workload_profile_active) {
> >>>>>> +             /* This is global and depends on all VCN instances */
> >>>>>> +             if (adev->vcn.workload_profile_active &&
> >>>>>> !total_fences &&
> >>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>>>                         r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>> false);
> >>>>>>                         if (r)
> >>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>> amdgpu_ring *ring)
> >>>>>>         int r = 0;
> >>>>>>
> >>>>>>         atomic_inc(&vcn_inst->total_submission_cnt);
> >>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>> move this addition down inside the mutex lock
> >>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>>>
> >>>>>> -     /* We can safely return early here because we've cancelled the
> >>>>>> -      * the delayed work so there is no one else to set it to false
> >>>>>> -      * and we don't care if someone else sets it to true.
> >>>>>> -      */
> >>>>>> -     if (adev->vcn.workload_profile_active)
> >>>>>> -             goto pg_lock;
> >>>>>> -
> >>>>>>         mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>> move to here:
> >>>>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>> I think this should work for multiple instances.
> >>>> Why does this need to be protected by the mutex?
> >>> hmm.. OK - no need and it is actually better before the mutex.
> >>> David
> >>>> Alex
> >>>>
> >>>>> David
> >>>>>>         if (!adev->vcn.workload_profile_active) {
> >>>>>>                 r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>> amdgpu_ring *ring)
> >>>>>>         }
> >>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>>>
> >>>>>> -pg_lock:
> >>>>>>         mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>         vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >>>>>>
> >>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
> >>>>>> *ring)
> >>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
> >>>>>>
> >>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> >>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>>>
> >>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>>>                               VCN_IDLE_TIMEOUT);
> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>>>
> >>>>>>         uint16_t inst_mask;
> >>>>>>         uint8_t num_inst_per_aid;
> >>>>>> +     atomic_t                total_submission_cnt;
> >>>>>>
> >>>>>>         /* IP reg dump */
> >>>>>>         uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 21:03             ` Alex Deucher
@ 2025-08-13 21:16               ` Sundararaju, Sathishkumar
  2025-08-13 22:08                 ` Alex Deucher
  2025-08-13 21:47               ` Wu, David
  1 sibling, 1 reply; 36+ messages in thread
From: Sundararaju, Sathishkumar @ 2025-08-13 21:16 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Wu, David, Alex Deucher, amd-gfx


On 8/14/2025 2:33 AM, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> <sathishkumar.sundararaju@amd.com> wrote:
>>
>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>> Hi Alex, Hi David,
>>>>
>>>> I see David's concern but his suggestion yet wont solve the problem,
>>>> neither the current form , reason :-
>>>>
>>>> The emitted fence count and total submission count are fast transients
>>>> which frequently become 0 in between video decodes (between jobs) even
>>>> with the atomics and locks there can be a switch of video power profile,
>>>> in the current form of patch that window is minimized, but still can
>>>> happen if stress tested. But power state of any instance becoming zero
>>> Can you explain how this can happen?  I'm not seeing it.
>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
>> completes, delayed idle work starts.
>> inst0 idle handler can read 0 total fences and 0 total submission count,
>> even if inst1 is actively decoding,
>> that's between the jobs,
>>    - as begin_use increaments vcn.total_submission_cnt and end_use
>> decreaments vcn.total_submission_cnt that can be 0.
>>    - if outstanding fences are cleared and no new emitted fence, between
>> jobs , can be 0.
>>    - both of the above conditions do not mean video decode is complete on
>> inst1, it is actively decoding.
> How can there be active decoding without an outstanding fence?  In
> that case, total_fences (fences from both instances) would be non-0.

I mean on inst1 the job scheduled is already complete, so 0 outstanding 
fences, newer job is yet to be scheduled

and commited to ring (inst1) , this doesn't mean decode has stopped on 
inst1 right (I am saying if timing of inst0 idle work coincides with this),

Or am I wrong in assuming this ? Can't this ever happen ? Please correct 
my understanding here.

Regards,

Sathish

>
> Alex
>
>> Whereas if instances are powered off we are sure idle time is past and
>> it is powered off, no possible way of
>> active video decode, when all instances are off we can safely assume no
>> active decode and global lock protects
>> it against new begin_use on any instance. But the only distant concern
>> is global common locks w.r.t perf, but we
>> are already having a global workprofile mutex , so there shouldn't be
>> any drop in perf, with just one single
>> global lock for all instances.
>>
>> Just sending out a patch with this fix, will leave it to you to decide
>> the right method. If you think outstanding total fences
>> can never be 0 during decode, then your previous version (v3) itself is
>> good, there is no real benefit of splitting the handlers as such.
>>
>> Regards,
>> Sathish
>>> If it is possible, maybe it would be easier to just split the profile
>>> and powergating into separate handlers.  The profile one would be
>>> global and the powergating one would be per instance.  See the
>>> attached patches.
>>>
>>> Alex
>>>
>>>> can be a sure shot indication of break in a video decode, the mistake in
>>>> my patch was using per instance mutex, I should have used a common
>>>> global mutex, then that covers the situation David is trying to bring out.
>>>>
>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
>>>> track power state could help us totally avoid this situation.
>>>>
>>>> Regards,
>>>>
>>>> Sathish
>>>>
>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>> Hi Alex,
>>>>>>>
>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
>>>>>>> someone
>>>>>>> adds more before the lock and not reviewed thoroughly)
>>>>>>>      - up to you to decide.
>>>>>>>
>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>
>>>>>>> Thanks,
>>>>>>> David
>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>> If there are multiple instances of the VCN running,
>>>>>>>> we may end up switching the video profile while another
>>>>>>>> instance is active because we only take into account
>>>>>>>> the current instance's submissions.  Look at all
>>>>>>>> outstanding fences for the video profile.
>>>>>>>>
>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>
>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>>>> handling")
>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>> ++++++++++++-------------
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>      2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>          struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>                  container_of(work, struct amdgpu_vcn_inst,
>>>>>>>> idle_work.work);
>>>>>>>>          struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>> +     unsigned int i, j;
>>>>>>>>          int r = 0;
>>>>>>>>
>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>                  return;
>>>>>>>>
>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>> -             fence[i] +=
>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>>>> +
>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>> +                     fence[i] +=
>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>> +             total_fences += fence[i];
>>>>>>>> +     }
>>>>>>>>
>>>>>>>>          /* Only set DPG pause for VCN3 or below, VCN4 and above will
>>>>>>>> be handled by FW */
>>>>>>>>          if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>                  struct dpg_pause_state new_state;
>>>>>>>>
>>>>>>>>                  if (fence[i] ||
>>>>>>>> @@ -436,18 +442,18 @@ static void
>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>                  else
>>>>>>>>                          new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>>>>>>
>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>          }
>>>>>>>>
>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>> -     fences += fence[i];
>>>>>>>> -
>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>                  mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>                  vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>                  mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>> !total_fences &&
>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>                          r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>> false);
>>>>>>>>                          if (r)
>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>> amdgpu_ring *ring)
>>>>>>>>          int r = 0;
>>>>>>>>
>>>>>>>>          atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>> move this addition down inside the mutex lock
>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>
>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>> -      */
>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>> -             goto pg_lock;
>>>>>>>> -
>>>>>>>>          mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>> move to here:
>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>> I think this should work for multiple instances.
>>>>>> Why does this need to be protected by the mutex?
>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>> David
>>>>>> Alex
>>>>>>
>>>>>>> David
>>>>>>>>          if (!adev->vcn.workload_profile_active) {
>>>>>>>>                  r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>> amdgpu_ring *ring)
>>>>>>>>          }
>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>
>>>>>>>> -pg_lock:
>>>>>>>>          mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>          vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>>>>>>
>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
>>>>>>>> *ring)
>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>>>>>>
>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>
>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>                                VCN_IDLE_TIMEOUT);
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>
>>>>>>>>          uint16_t inst_mask;
>>>>>>>>          uint8_t num_inst_per_aid;
>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>
>>>>>>>>          /* IP reg dump */
>>>>>>>>          uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 21:16               ` Sundararaju, Sathishkumar
@ 2025-08-13 22:08                 ` Alex Deucher
  2025-08-13 23:13                   ` Sundararaju, Sathishkumar
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-13 22:08 UTC (permalink / raw)
  To: Sundararaju, Sathishkumar; +Cc: Wu, David, Alex Deucher, amd-gfx

On Wed, Aug 13, 2025 at 5:16 PM Sundararaju, Sathishkumar
<sathishkumar.sundararaju@amd.com> wrote:
>
>
> On 8/14/2025 2:33 AM, Alex Deucher wrote:
> > On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> > <sathishkumar.sundararaju@amd.com> wrote:
> >>
> >> On 8/14/2025 1:35 AM, Alex Deucher wrote:
> >>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> >>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>> Hi Alex, Hi David,
> >>>>
> >>>> I see David's concern but his suggestion yet wont solve the problem,
> >>>> neither the current form , reason :-
> >>>>
> >>>> The emitted fence count and total submission count are fast transients
> >>>> which frequently become 0 in between video decodes (between jobs) even
> >>>> with the atomics and locks there can be a switch of video power profile,
> >>>> in the current form of patch that window is minimized, but still can
> >>>> happen if stress tested. But power state of any instance becoming zero
> >>> Can you explain how this can happen?  I'm not seeing it.
> >> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
> >> completes, delayed idle work starts.
> >> inst0 idle handler can read 0 total fences and 0 total submission count,
> >> even if inst1 is actively decoding,
> >> that's between the jobs,
> >>    - as begin_use increaments vcn.total_submission_cnt and end_use
> >> decreaments vcn.total_submission_cnt that can be 0.
> >>    - if outstanding fences are cleared and no new emitted fence, between
> >> jobs , can be 0.
> >>    - both of the above conditions do not mean video decode is complete on
> >> inst1, it is actively decoding.
> > How can there be active decoding without an outstanding fence?  In
> > that case, total_fences (fences from both instances) would be non-0.
>
> I mean on inst1 the job scheduled is already complete, so 0 outstanding
> fences, newer job is yet to be scheduled
>
> and commited to ring (inst1) , this doesn't mean decode has stopped on
> inst1 right (I am saying if timing of inst0 idle work coincides with this),
>
> Or am I wrong in assuming this ? Can't this ever happen ? Please correct
> my understanding here.

The flow looks like:

begin_use(inst)
emit_fence(inst)
end_use(inst)

...later
fence signals
...later
work handler

In begin_use we increment the global and per instance submission.
This protects the power gating and profile until end_use.  In end use
we decrement the submissions because we don't need to protect anything
any more as we have the fence that was submitted via the ring.  That
fence won't signal until the job is complete.  For power gating, we
only care about the submission count and fences for that instance, for
the profile, we care about submission count and fences all instances.
Once the fences have signalled, the outstanding fences will be 0 and
there won't be any active work.

Alex

>
> Regards,
>
> Sathish
>
> >
> > Alex
> >
> >> Whereas if instances are powered off we are sure idle time is past and
> >> it is powered off, no possible way of
> >> active video decode, when all instances are off we can safely assume no
> >> active decode and global lock protects
> >> it against new begin_use on any instance. But the only distant concern
> >> is global common locks w.r.t perf, but we
> >> are already having a global workprofile mutex , so there shouldn't be
> >> any drop in perf, with just one single
> >> global lock for all instances.
> >>
> >> Just sending out a patch with this fix, will leave it to you to decide
> >> the right method. If you think outstanding total fences
> >> can never be 0 during decode, then your previous version (v3) itself is
> >> good, there is no real benefit of splitting the handlers as such.
> >>
> >> Regards,
> >> Sathish
> >>> If it is possible, maybe it would be easier to just split the profile
> >>> and powergating into separate handlers.  The profile one would be
> >>> global and the powergating one would be per instance.  See the
> >>> attached patches.
> >>>
> >>> Alex
> >>>
> >>>> can be a sure shot indication of break in a video decode, the mistake in
> >>>> my patch was using per instance mutex, I should have used a common
> >>>> global mutex, then that covers the situation David is trying to bring out.
> >>>>
> >>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
> >>>> track power state could help us totally avoid this situation.
> >>>>
> >>>> Regards,
> >>>>
> >>>> Sathish
> >>>>
> >>>> On 8/13/2025 11:46 PM, Wu, David wrote:
> >>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
> >>>>>>> Hi Alex,
> >>>>>>>
> >>>>>>> The addition of  total_submission_cnt should work - in that
> >>>>>>> it is unlikely to have a context switch right after the begin_use().
> >>>>>>> The suggestion of moving it inside the lock (which I prefer in case
> >>>>>>> someone
> >>>>>>> adds more before the lock and not reviewed thoroughly)
> >>>>>>>      - up to you to decide.
> >>>>>>>
> >>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>>>>>
> >>>>>>> Thanks,
> >>>>>>> David
> >>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>>>>>> If there are multiple instances of the VCN running,
> >>>>>>>> we may end up switching the video profile while another
> >>>>>>>> instance is active because we only take into account
> >>>>>>>> the current instance's submissions.  Look at all
> >>>>>>>> outstanding fences for the video profile.
> >>>>>>>>
> >>>>>>>> v2: drop early exit in begin_use()
> >>>>>>>> v3: handle possible race between begin_use() work handler
> >>>>>>>>
> >>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
> >>>>>>>> handling")
> >>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> >>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>>>> ---
> >>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>>>>>> ++++++++++++-------------
> >>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>>>>>      2 files changed, 21 insertions(+), 20 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>> @@ -415,19 +415,25 @@ static void
> >>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>          struct amdgpu_vcn_inst *vcn_inst =
> >>>>>>>>                  container_of(work, struct amdgpu_vcn_inst,
> >>>>>>>> idle_work.work);
> >>>>>>>>          struct amdgpu_device *adev = vcn_inst->adev;
> >>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>> -     unsigned int i = vcn_inst->inst, j;
> >>>>>>>> +     unsigned int total_fences = 0,
> >>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>> +     unsigned int i, j;
> >>>>>>>>          int r = 0;
> >>>>>>>>
> >>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
> >>>>>>>>                  return;
> >>>>>>>>
> >>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>>>>>> -             fence[i] +=
> >>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> >>>>>>>> +
> >>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>>>>>> +                     fence[i] +=
> >>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> >>>>>>>> +             total_fences += fence[i];
> >>>>>>>> +     }
> >>>>>>>>
> >>>>>>>>          /* Only set DPG pause for VCN3 or below, VCN4 and above will
> >>>>>>>> be handled by FW */
> >>>>>>>>          if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>>>>>> +         !vcn_inst->using_unified_queue) {
> >>>>>>>>                  struct dpg_pause_state new_state;
> >>>>>>>>
> >>>>>>>>                  if (fence[i] ||
> >>>>>>>> @@ -436,18 +442,18 @@ static void
> >>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>                  else
> >>>>>>>>                          new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
> >>>>>>>>
> >>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>          }
> >>>>>>>>
> >>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>>>>>> -     fences += fence[i];
> >>>>>>>> -
> >>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>> +     if (!fence[vcn_inst->inst] &&
> >>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>> +             /* This is specific to this instance */
> >>>>>>>>                  mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>                  vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>>>>>                  mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>> -             if (adev->vcn.workload_profile_active) {
> >>>>>>>> +             /* This is global and depends on all VCN instances */
> >>>>>>>> +             if (adev->vcn.workload_profile_active &&
> >>>>>>>> !total_fences &&
> >>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>>>>>                          r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>> false);
> >>>>>>>>                          if (r)
> >>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>> amdgpu_ring *ring)
> >>>>>>>>          int r = 0;
> >>>>>>>>
> >>>>>>>>          atomic_inc(&vcn_inst->total_submission_cnt);
> >>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>> move this addition down inside the mutex lock
> >>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>>>>>
> >>>>>>>> -     /* We can safely return early here because we've cancelled the
> >>>>>>>> -      * the delayed work so there is no one else to set it to false
> >>>>>>>> -      * and we don't care if someone else sets it to true.
> >>>>>>>> -      */
> >>>>>>>> -     if (adev->vcn.workload_profile_active)
> >>>>>>>> -             goto pg_lock;
> >>>>>>>> -
> >>>>>>>>          mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>> move to here:
> >>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>> I think this should work for multiple instances.
> >>>>>> Why does this need to be protected by the mutex?
> >>>>> hmm.. OK - no need and it is actually better before the mutex.
> >>>>> David
> >>>>>> Alex
> >>>>>>
> >>>>>>> David
> >>>>>>>>          if (!adev->vcn.workload_profile_active) {
> >>>>>>>>                  r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>> amdgpu_ring *ring)
> >>>>>>>>          }
> >>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>
> >>>>>>>> -pg_lock:
> >>>>>>>>          mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>          vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >>>>>>>>
> >>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
> >>>>>>>> *ring)
> >>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
> >>>>>>>>
> >>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> >>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>>>>>
> >>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>>>>>                                VCN_IDLE_TIMEOUT);
> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>>>>>
> >>>>>>>>          uint16_t inst_mask;
> >>>>>>>>          uint8_t num_inst_per_aid;
> >>>>>>>> +     atomic_t                total_submission_cnt;
> >>>>>>>>
> >>>>>>>>          /* IP reg dump */
> >>>>>>>>          uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 22:08                 ` Alex Deucher
@ 2025-08-13 23:13                   ` Sundararaju, Sathishkumar
  2025-08-14  8:41                     ` Lazar, Lijo
  0 siblings, 1 reply; 36+ messages in thread
From: Sundararaju, Sathishkumar @ 2025-08-13 23:13 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Wu, David, Alex Deucher, amd-gfx


On 8/14/2025 3:38 AM, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 5:16 PM Sundararaju, Sathishkumar
> <sathishkumar.sundararaju@amd.com> wrote:
>>
>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> Hi Alex, Hi David,
>>>>>>
>>>>>> I see David's concern but his suggestion yet wont solve the problem,
>>>>>> neither the current form , reason :-
>>>>>>
>>>>>> The emitted fence count and total submission count are fast transients
>>>>>> which frequently become 0 in between video decodes (between jobs) even
>>>>>> with the atomics and locks there can be a switch of video power profile,
>>>>>> in the current form of patch that window is minimized, but still can
>>>>>> happen if stress tested. But power state of any instance becoming zero
>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
>>>> completes, delayed idle work starts.
>>>> inst0 idle handler can read 0 total fences and 0 total submission count,
>>>> even if inst1 is actively decoding,
>>>> that's between the jobs,
>>>>     - as begin_use increaments vcn.total_submission_cnt and end_use
>>>> decreaments vcn.total_submission_cnt that can be 0.
>>>>     - if outstanding fences are cleared and no new emitted fence, between
>>>> jobs , can be 0.
>>>>     - both of the above conditions do not mean video decode is complete on
>>>> inst1, it is actively decoding.
>>> How can there be active decoding without an outstanding fence?  In
>>> that case, total_fences (fences from both instances) would be non-0.
>> I mean on inst1 the job scheduled is already complete, so 0 outstanding
>> fences, newer job is yet to be scheduled
>>
>> and commited to ring (inst1) , this doesn't mean decode has stopped on
>> inst1 right (I am saying if timing of inst0 idle work coincides with this),
>>
>> Or am I wrong in assuming this ? Can't this ever happen ? Please correct
>> my understanding here.
> The flow looks like:
>
> begin_use(inst)
> emit_fence(inst)
> end_use(inst)
>
> ...later
> fence signals
> ...later
> work handler
>
> In begin_use we increment the global and per instance submission.
> This protects the power gating and profile until end_use.  In end use
> we decrement the submissions because we don't need to protect anything
> any more as we have the fence that was submitted via the ring.  That
> fence won't signal until the job is complete.

Is a next begin_use always guaranteed to be run before current job fence 
signals ?

if not then both total submission and total fence are zero , example 
delayed job/packet submissions

from user space, or next job schedule happens after current job fence 
signals.

if this is never possible then (v3) is perfect.

Regards,

Sathish

> For power gating, we
> only care about the submission count and fences for that instance, for
> the profile, we care about submission count and fences all instances.
> Once the fences have signalled, the outstanding fences will be 0 and
> there won't be any active work.
>
> Alex
>
>> Regards,
>>
>> Sathish
>>
>>> Alex
>>>
>>>> Whereas if instances are powered off we are sure idle time is past and
>>>> it is powered off, no possible way of
>>>> active video decode, when all instances are off we can safely assume no
>>>> active decode and global lock protects
>>>> it against new begin_use on any instance. But the only distant concern
>>>> is global common locks w.r.t perf, but we
>>>> are already having a global workprofile mutex , so there shouldn't be
>>>> any drop in perf, with just one single
>>>> global lock for all instances.
>>>>
>>>> Just sending out a patch with this fix, will leave it to you to decide
>>>> the right method. If you think outstanding total fences
>>>> can never be 0 during decode, then your previous version (v3) itself is
>>>> good, there is no real benefit of splitting the handlers as such.
>>>>
>>>> Regards,
>>>> Sathish
>>>>> If it is possible, maybe it would be easier to just split the profile
>>>>> and powergating into separate handlers.  The profile one would be
>>>>> global and the powergating one would be per instance.  See the
>>>>> attached patches.
>>>>>
>>>>> Alex
>>>>>
>>>>>> can be a sure shot indication of break in a video decode, the mistake in
>>>>>> my patch was using per instance mutex, I should have used a common
>>>>>> global mutex, then that covers the situation David is trying to bring out.
>>>>>>
>>>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
>>>>>> track power state could help us totally avoid this situation.
>>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> Sathish
>>>>>>
>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>> Hi Alex,
>>>>>>>>>
>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
>>>>>>>>> someone
>>>>>>>>> adds more before the lock and not reviewed thoroughly)
>>>>>>>>>       - up to you to decide.
>>>>>>>>>
>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> David
>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>> If there are multiple instances of the VCN running,
>>>>>>>>>> we may end up switching the video profile while another
>>>>>>>>>> instance is active because we only take into account
>>>>>>>>>> the current instance's submissions.  Look at all
>>>>>>>>>> outstanding fences for the video profile.
>>>>>>>>>>
>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>
>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>>>>>> handling")
>>>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>> ---
>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>       2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>           struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>                   container_of(work, struct amdgpu_vcn_inst,
>>>>>>>>>> idle_work.work);
>>>>>>>>>>           struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>           int r = 0;
>>>>>>>>>>
>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>                   return;
>>>>>>>>>>
>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>> -             fence[i] +=
>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>>>>>> +
>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>> +     }
>>>>>>>>>>
>>>>>>>>>>           /* Only set DPG pause for VCN3 or below, VCN4 and above will
>>>>>>>>>> be handled by FW */
>>>>>>>>>>           if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>                   struct dpg_pause_state new_state;
>>>>>>>>>>
>>>>>>>>>>                   if (fence[i] ||
>>>>>>>>>> @@ -436,18 +442,18 @@ static void
>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>                   else
>>>>>>>>>>                           new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>
>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>           }
>>>>>>>>>>
>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>> -
>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>                   mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>                   vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>                   mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>> !total_fences &&
>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>                           r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>> false);
>>>>>>>>>>                           if (r)
>>>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>           int r = 0;
>>>>>>>>>>
>>>>>>>>>>           atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>
>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>> -      */
>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>> -
>>>>>>>>>>           mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>> move to here:
>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>> I think this should work for multiple instances.
>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>> David
>>>>>>>> Alex
>>>>>>>>
>>>>>>>>> David
>>>>>>>>>>           if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>                   r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>           }
>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>
>>>>>>>>>> -pg_lock:
>>>>>>>>>>           mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>           vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>>>>>>>>
>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
>>>>>>>>>> *ring)
>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>>>>>>>>
>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>
>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>                                 VCN_IDLE_TIMEOUT);
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>
>>>>>>>>>>           uint16_t inst_mask;
>>>>>>>>>>           uint8_t num_inst_per_aid;
>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>
>>>>>>>>>>           /* IP reg dump */
>>>>>>>>>>           uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 23:13                   ` Sundararaju, Sathishkumar
@ 2025-08-14  8:41                     ` Lazar, Lijo
  2025-08-14  9:11                       ` Sundararaju, Sathishkumar
  0 siblings, 1 reply; 36+ messages in thread
From: Lazar, Lijo @ 2025-08-14  8:41 UTC (permalink / raw)
  To: Sundararaju, Sathishkumar, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

[-- Attachment #1: Type: text/plain, Size: 13943 bytes --]

[Public]

We already have a per instance power state that can be tracked. What about something like attached?

Thanks,
Lijo
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Sundararaju, Sathishkumar
Sent: Thursday, August 14, 2025 4:43 AM
To: Alex Deucher <alexdeucher@gmail.com>
Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)


On 8/14/2025 3:38 AM, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 5:16 PM Sundararaju, Sathishkumar
> <sathishkumar.sundararaju@amd.com> wrote:
>>
>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> Hi Alex, Hi David,
>>>>>>
>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>> problem, neither the current form , reason :-
>>>>>>
>>>>>> The emitted fence count and total submission count are fast
>>>>>> transients which frequently become 0 in between video decodes
>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>> switch of video power profile, in the current form of patch that
>>>>>> window is minimized, but still can happen if stress tested. But
>>>>>> power state of any instance becoming zero
>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>> Consider this situation, inst0 and inst1 actively decoding, inst0
>>>> decode completes, delayed idle work starts.
>>>> inst0 idle handler can read 0 total fences and 0 total submission
>>>> count, even if inst1 is actively decoding, that's between the jobs,
>>>>     - as begin_use increaments vcn.total_submission_cnt and end_use
>>>> decreaments vcn.total_submission_cnt that can be 0.
>>>>     - if outstanding fences are cleared and no new emitted fence,
>>>> between jobs , can be 0.
>>>>     - both of the above conditions do not mean video decode is
>>>> complete on inst1, it is actively decoding.
>>> How can there be active decoding without an outstanding fence?  In
>>> that case, total_fences (fences from both instances) would be non-0.
>> I mean on inst1 the job scheduled is already complete, so 0
>> outstanding fences, newer job is yet to be scheduled
>>
>> and commited to ring (inst1) , this doesn't mean decode has stopped
>> on
>> inst1 right (I am saying if timing of inst0 idle work coincides with
>> this),
>>
>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>> correct my understanding here.
> The flow looks like:
>
> begin_use(inst)
> emit_fence(inst)
> end_use(inst)
>
> ...later
> fence signals
> ...later
> work handler
>
> In begin_use we increment the global and per instance submission.
> This protects the power gating and profile until end_use.  In end use
> we decrement the submissions because we don't need to protect anything
> any more as we have the fence that was submitted via the ring.  That
> fence won't signal until the job is complete.

Is a next begin_use always guaranteed to be run before current job fence signals ?

if not then both total submission and total fence are zero , example delayed job/packet submissions

from user space, or next job schedule happens after current job fence signals.

if this is never possible then (v3) is perfect.

Regards,

Sathish

> For power gating, we
> only care about the submission count and fences for that instance, for
> the profile, we care about submission count and fences all instances.
> Once the fences have signalled, the outstanding fences will be 0 and
> there won't be any active work.
>
> Alex
>
>> Regards,
>>
>> Sathish
>>
>>> Alex
>>>
>>>> Whereas if instances are powered off we are sure idle time is past
>>>> and it is powered off, no possible way of active video decode, when
>>>> all instances are off we can safely assume no active decode and
>>>> global lock protects it against new begin_use on any instance. But
>>>> the only distant concern is global common locks w.r.t perf, but we
>>>> are already having a global workprofile mutex , so there shouldn't
>>>> be any drop in perf, with just one single global lock for all
>>>> instances.
>>>>
>>>> Just sending out a patch with this fix, will leave it to you to
>>>> decide the right method. If you think outstanding total fences can
>>>> never be 0 during decode, then your previous version (v3) itself is
>>>> good, there is no real benefit of splitting the handlers as such.
>>>>
>>>> Regards,
>>>> Sathish
>>>>> If it is possible, maybe it would be easier to just split the
>>>>> profile and powergating into separate handlers.  The profile one
>>>>> would be global and the powergating one would be per instance.
>>>>> See the attached patches.
>>>>>
>>>>> Alex
>>>>>
>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>> mistake in my patch was using per instance mutex, I should have
>>>>>> used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>
>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> Sathish
>>>>>>
>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>> Hi Alex,
>>>>>>>>>
>>>>>>>>> The addition of  total_submission_cnt should work - in that it
>>>>>>>>> is unlikely to have a context switch right after the begin_use().
>>>>>>>>> The suggestion of moving it inside the lock (which I prefer in
>>>>>>>>> case someone adds more before the lock and not reviewed
>>>>>>>>> thoroughly)
>>>>>>>>>       - up to you to decide.
>>>>>>>>>
>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> David
>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>> end up switching the video profile while another instance is
>>>>>>>>>> active because we only take into account the current
>>>>>>>>>> instance's submissions.  Look at all outstanding fences for
>>>>>>>>>> the video profile.
>>>>>>>>>>
>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>
>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>>>>>> handling")
>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>> ---
>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>       2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>           struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>                   container_of(work, struct amdgpu_vcn_inst,
>>>>>>>>>> idle_work.work);
>>>>>>>>>>           struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>           int r = 0;
>>>>>>>>>>
>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>                   return;
>>>>>>>>>>
>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>> -             fence[i] +=
>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>>>>>> +
>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>> +     }
>>>>>>>>>>
>>>>>>>>>>           /* Only set DPG pause for VCN3 or below, VCN4 and
>>>>>>>>>> above will be handled by FW */
>>>>>>>>>>           if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>                   struct dpg_pause_state new_state;
>>>>>>>>>>
>>>>>>>>>>                   if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct work_struct
>>>>>>>>>> *work)
>>>>>>>>>>                   else
>>>>>>>>>>                           new_state.fw_based =
>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>
>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>           }
>>>>>>>>>>
>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>> -
>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>                   mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>                   vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>                   mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>> !total_fences &&
>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>                           r =
>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>> false);
>>>>>>>>>>                           if (r) @@ -467,16 +473,10 @@ void
>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>           int r = 0;
>>>>>>>>>>
>>>>>>>>>>           atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>
>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>> -      */
>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>> -
>>>>>>>>>>           mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>> move to here:
>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>> I think this should work for multiple instances.
>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>> David
>>>>>>>> Alex
>>>>>>>>
>>>>>>>>> David
>>>>>>>>>>           if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>                   r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>           }
>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>
>>>>>>>>>> -pg_lock:
>>>>>>>>>>           mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>           vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>
>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>> amdgpu_ring
>>>>>>>>>> *ring)
>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission
>>>>>>>>>> _cnt);
>>>>>>>>>>
>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_c
>>>>>>>>>> nt);
>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>
>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>                                 VCN_IDLE_TIMEOUT); diff --git
>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>
>>>>>>>>>>           uint16_t inst_mask;
>>>>>>>>>>           uint8_t num_inst_per_aid;
>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>
>>>>>>>>>>           /* IP reg dump */
>>>>>>>>>>           uint32_t                *ip_dump;

[-- Attachment #2: 0001-drm-amdgpu-Check-vcn-state-before-profile-switch.patch --]
[-- Type: application/octet-stream, Size: 8167 bytes --]

From 0730094e8c2fd7134fed38a1d0aee74c456358d2 Mon Sep 17 00:00:00 2001
From: Lijo Lazar <lijo.lazar@amd.com>
Date: Thu, 14 Aug 2025 13:52:50 +0530
Subject: [PATCH] drm/amdgpu: Check vcn state before profile switch

Before switching video profile, check power state of all VCN instances.
Before submission, power state is changed and a check is done for
vidoe profile. If already active, it skips switching; otherwise, it
switches to video profile.  When it gets to idle state, power state of
the current instance is set to gated. After that a check is done if all
instances are in gated state. If so, video profile is disabled.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 83 ++++++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  5 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 27 +-------
 3 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 9a76e11d1c18..0727314e4198 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -410,6 +410,57 @@ int amdgpu_vcn_resume(struct amdgpu_device *adev, int i)
 	return 0;
 }
 
+int amdgpu_vcn_get_profile(struct amdgpu_device *adev)
+{
+	int r;
+
+	if (adev->vcn.workload_profile_active)
+		return 0;
+
+	mutex_lock(&adev->vcn.workload_profile_mutex);
+
+	r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
+					    true);
+	if (r)
+		dev_warn(adev->dev,
+			 "(%d) failed to enable video power profile mode\n", r);
+	else
+		adev->vcn.workload_profile_active = true;
+	mutex_unlock(&adev->vcn.workload_profile_mutex);
+
+	return r;
+}
+
+int amdgpu_vcn_put_profile(struct amdgpu_device *adev)
+{
+	bool pg = true;
+	int r, i;
+
+	mutex_lock(&adev->vcn.workload_profile_mutex);
+	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
+		if (adev->vcn.inst[i].cur_state != AMD_PG_STATE_GATE) {
+			pg = false;
+			break;
+		}
+	}
+
+	if (pg) {
+		r = amdgpu_dpm_switch_power_profile(
+			adev, PP_SMC_POWER_PROFILE_VIDEO, false);
+		if (r)
+			dev_warn(
+				adev->dev,
+				"(%d) failed to disable video power profile mode\n",
+				r);
+		else
+			adev->vcn.workload_profile_active = false;
+	}
+
+	mutex_unlock(&adev->vcn.workload_profile_mutex);
+
+	return r;
+}
+
 static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 {
 	struct amdgpu_vcn_inst *vcn_inst =
@@ -417,7 +468,6 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 	struct amdgpu_device *adev = vcn_inst->adev;
 	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
 	unsigned int i = vcn_inst->inst, j;
-	int r = 0;
 
 	if (adev->vcn.harvest_config & (1 << i))
 		return;
@@ -446,15 +496,8 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 		mutex_lock(&vcn_inst->vcn_pg_lock);
 		vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
 		mutex_unlock(&vcn_inst->vcn_pg_lock);
-		mutex_lock(&adev->vcn.workload_profile_mutex);
-		if (adev->vcn.workload_profile_active) {
-			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-							    false);
-			if (r)
-				dev_warn(adev->dev, "(%d) failed to disable video power profile mode\n", r);
-			adev->vcn.workload_profile_active = false;
-		}
-		mutex_unlock(&adev->vcn.workload_profile_mutex);
+		amdgpu_vcn_put_profile(adev);
+
 	} else {
 		schedule_delayed_work(&vcn_inst->idle_work, VCN_IDLE_TIMEOUT);
 	}
@@ -464,32 +507,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_vcn_inst *vcn_inst = &adev->vcn.inst[ring->me];
-	int r = 0;
 
 	atomic_inc(&vcn_inst->total_submission_cnt);
 
 	cancel_delayed_work_sync(&vcn_inst->idle_work);
 
-	/* We can safely return early here because we've cancelled the
-	 * the delayed work so there is no one else to set it to false
-	 * and we don't care if someone else sets it to true.
-	 */
-	if (adev->vcn.workload_profile_active)
-		goto pg_lock;
-
-	mutex_lock(&adev->vcn.workload_profile_mutex);
-	if (!adev->vcn.workload_profile_active) {
-		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-						    true);
-		if (r)
-			dev_warn(adev->dev, "(%d) failed to switch to video power profile mode\n", r);
-		adev->vcn.workload_profile_active = true;
-	}
-	mutex_unlock(&adev->vcn.workload_profile_mutex);
-
-pg_lock:
 	mutex_lock(&vcn_inst->vcn_pg_lock);
 	vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
+	amdgpu_vcn_get_profile(adev);
 
 	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
 	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index b3fb1d0e43fc..c4dec986926f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -362,7 +362,7 @@ struct amdgpu_vcn {
 	bool			per_inst_fw;
 	unsigned		fw_version;
 
-	bool			workload_profile_active;
+	bool workload_profile_active;
 	struct mutex            workload_profile_mutex;
 	u32 reg_count;
 	const struct amdgpu_hwip_reg_entry *reg_list;
@@ -565,4 +565,7 @@ int amdgpu_vcn_reg_dump_init(struct amdgpu_device *adev,
 			     const struct amdgpu_hwip_reg_entry *reg, u32 count);
 void amdgpu_vcn_dump_ip_state(struct amdgpu_ip_block *ip_block);
 void amdgpu_vcn_print_ip_state(struct amdgpu_ip_block *ip_block, struct drm_printer *p);
+int amdgpu_vcn_get_profile(struct amdgpu_device *adev);
+int amdgpu_vcn_put_profile(struct amdgpu_device *adev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index 3a7c137a83ef..26ed74101d27 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -116,7 +116,6 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
 	struct amdgpu_device *adev = vcn_inst->adev;
 	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
 	unsigned int i, j;
-	int r = 0;
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
@@ -149,15 +148,7 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
 	if (!fences && !atomic_read(&adev->vcn.inst[0].total_submission_cnt)) {
 		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 						       AMD_PG_STATE_GATE);
-		mutex_lock(&adev->vcn.workload_profile_mutex);
-		if (adev->vcn.workload_profile_active) {
-			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-							    false);
-			if (r)
-				dev_warn(adev->dev, "(%d) failed to disable video power profile mode\n", r);
-			adev->vcn.workload_profile_active = false;
-		}
-		mutex_unlock(&adev->vcn.workload_profile_mutex);
+		amdgpu_vcn_put_profile(adev);
 	} else {
 		schedule_delayed_work(&adev->vcn.inst[0].idle_work, VCN_IDLE_TIMEOUT);
 	}
@@ -167,7 +158,6 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_vcn_inst *v = &adev->vcn.inst[ring->me];
-	int r = 0;
 
 	atomic_inc(&adev->vcn.inst[0].total_submission_cnt);
 
@@ -177,23 +167,10 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
 	 * the delayed work so there is no one else to set it to false
 	 * and we don't care if someone else sets it to true.
 	 */
-	if (adev->vcn.workload_profile_active)
-		goto pg_lock;
-
-	mutex_lock(&adev->vcn.workload_profile_mutex);
-	if (!adev->vcn.workload_profile_active) {
-		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-						    true);
-		if (r)
-			dev_warn(adev->dev, "(%d) failed to switch to video power profile mode\n", r);
-		adev->vcn.workload_profile_active = true;
-	}
-	mutex_unlock(&adev->vcn.workload_profile_mutex);
-
-pg_lock:
 	mutex_lock(&adev->vcn.inst[0].vcn_pg_lock);
 	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 					       AMD_PG_STATE_UNGATE);
+	amdgpu_vcn_get_profile(adev);
 
 	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
 	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14  8:41                     ` Lazar, Lijo
@ 2025-08-14  9:11                       ` Sundararaju, Sathishkumar
  2025-08-14  9:24                         ` Lazar, Lijo
  0 siblings, 1 reply; 36+ messages in thread
From: Sundararaju, Sathishkumar @ 2025-08-14  9:11 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

Hi Lijo,

On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
> [Public]
>
> We already have a per instance power state that can be tracked. What about something like attached?

This also has concurrent access of the power state , 
vcn.inst[i].cur_state is not protected by workload_profile_mutex

every where, it can still change while you are holding 
workload_profile_mutex and checking it.

Regards,

Sathish

>
> Thanks,
> Lijo
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Sundararaju, Sathishkumar
> Sent: Thursday, August 14, 2025 4:43 AM
> To: Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
>
>
> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>> On Wed, Aug 13, 2025 at 5:16 PM Sundararaju, Sathishkumar
>> <sathishkumar.sundararaju@amd.com> wrote:
>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>> Hi Alex, Hi David,
>>>>>>>
>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>> problem, neither the current form , reason :-
>>>>>>>
>>>>>>> The emitted fence count and total submission count are fast
>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>> switch of video power profile, in the current form of patch that
>>>>>>> window is minimized, but still can happen if stress tested. But
>>>>>>> power state of any instance becoming zero
>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0
>>>>> decode completes, delayed idle work starts.
>>>>> inst0 idle handler can read 0 total fences and 0 total submission
>>>>> count, even if inst1 is actively decoding, that's between the jobs,
>>>>>      - as begin_use increaments vcn.total_submission_cnt and end_use
>>>>> decreaments vcn.total_submission_cnt that can be 0.
>>>>>      - if outstanding fences are cleared and no new emitted fence,
>>>>> between jobs , can be 0.
>>>>>      - both of the above conditions do not mean video decode is
>>>>> complete on inst1, it is actively decoding.
>>>> How can there be active decoding without an outstanding fence?  In
>>>> that case, total_fences (fences from both instances) would be non-0.
>>> I mean on inst1 the job scheduled is already complete, so 0
>>> outstanding fences, newer job is yet to be scheduled
>>>
>>> and commited to ring (inst1) , this doesn't mean decode has stopped
>>> on
>>> inst1 right (I am saying if timing of inst0 idle work coincides with
>>> this),
>>>
>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>> correct my understanding here.
>> The flow looks like:
>>
>> begin_use(inst)
>> emit_fence(inst)
>> end_use(inst)
>>
>> ...later
>> fence signals
>> ...later
>> work handler
>>
>> In begin_use we increment the global and per instance submission.
>> This protects the power gating and profile until end_use.  In end use
>> we decrement the submissions because we don't need to protect anything
>> any more as we have the fence that was submitted via the ring.  That
>> fence won't signal until the job is complete.
> Is a next begin_use always guaranteed to be run before current job fence signals ?
>
> if not then both total submission and total fence are zero , example delayed job/packet submissions
>
> from user space, or next job schedule happens after current job fence signals.
>
> if this is never possible then (v3) is perfect.
>
> Regards,
>
> Sathish
>
>> For power gating, we
>> only care about the submission count and fences for that instance, for
>> the profile, we care about submission count and fences all instances.
>> Once the fences have signalled, the outstanding fences will be 0 and
>> there won't be any active work.
>>
>> Alex
>>
>>> Regards,
>>>
>>> Sathish
>>>
>>>> Alex
>>>>
>>>>> Whereas if instances are powered off we are sure idle time is past
>>>>> and it is powered off, no possible way of active video decode, when
>>>>> all instances are off we can safely assume no active decode and
>>>>> global lock protects it against new begin_use on any instance. But
>>>>> the only distant concern is global common locks w.r.t perf, but we
>>>>> are already having a global workprofile mutex , so there shouldn't
>>>>> be any drop in perf, with just one single global lock for all
>>>>> instances.
>>>>>
>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>> decide the right method. If you think outstanding total fences can
>>>>> never be 0 during decode, then your previous version (v3) itself is
>>>>> good, there is no real benefit of splitting the handlers as such.
>>>>>
>>>>> Regards,
>>>>> Sathish
>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>> profile and powergating into separate handlers.  The profile one
>>>>>> would be global and the powergating one would be per instance.
>>>>>> See the attached patches.
>>>>>>
>>>>>> Alex
>>>>>>
>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>> mistake in my patch was using per instance mutex, I should have
>>>>>>> used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>
>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>
>>>>>>> Regards,
>>>>>>>
>>>>>>> Sathish
>>>>>>>
>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>> Hi Alex,
>>>>>>>>>>
>>>>>>>>>> The addition of  total_submission_cnt should work - in that it
>>>>>>>>>> is unlikely to have a context switch right after the begin_use().
>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer in
>>>>>>>>>> case someone adds more before the lock and not reviewed
>>>>>>>>>> thoroughly)
>>>>>>>>>>        - up to you to decide.
>>>>>>>>>>
>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>
>>>>>>>>>> Thanks,
>>>>>>>>>> David
>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>> end up switching the video profile while another instance is
>>>>>>>>>>> active because we only take into account the current
>>>>>>>>>>> instance's submissions.  Look at all outstanding fences for
>>>>>>>>>>> the video profile.
>>>>>>>>>>>
>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>
>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>>>>>>> handling")
>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>> ---
>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>        2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>            struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>                    container_of(work, struct amdgpu_vcn_inst,
>>>>>>>>>>> idle_work.work);
>>>>>>>>>>>            struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>            int r = 0;
>>>>>>>>>>>
>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>                    return;
>>>>>>>>>>>
>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>>>>>>> +
>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>> +     }
>>>>>>>>>>>
>>>>>>>>>>>            /* Only set DPG pause for VCN3 or below, VCN4 and
>>>>>>>>>>> above will be handled by FW */
>>>>>>>>>>>            if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>                    struct dpg_pause_state new_state;
>>>>>>>>>>>
>>>>>>>>>>>                    if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct work_struct
>>>>>>>>>>> *work)
>>>>>>>>>>>                    else
>>>>>>>>>>>                            new_state.fw_based =
>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>
>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>            }
>>>>>>>>>>>
>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>> -
>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>                    mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>                    vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>                    mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>> !total_fences &&
>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>                            r =
>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>> false);
>>>>>>>>>>>                            if (r) @@ -467,16 +473,10 @@ void
>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>            int r = 0;
>>>>>>>>>>>
>>>>>>>>>>>            atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>
>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>> -      */
>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>> -
>>>>>>>>>>>            mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>> move to here:
>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>> David
>>>>>>>>> Alex
>>>>>>>>>
>>>>>>>>>> David
>>>>>>>>>>>            if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>                    r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>>            }
>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>
>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>            mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>            vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>
>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>> *ring)
>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission
>>>>>>>>>>> _cnt);
>>>>>>>>>>>
>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_c
>>>>>>>>>>> nt);
>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>
>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>                                  VCN_IDLE_TIMEOUT); diff --git
>>>>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>
>>>>>>>>>>>            uint16_t inst_mask;
>>>>>>>>>>>            uint8_t num_inst_per_aid;
>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>
>>>>>>>>>>>            /* IP reg dump */
>>>>>>>>>>>            uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14  9:11                       ` Sundararaju, Sathishkumar
@ 2025-08-14  9:24                         ` Lazar, Lijo
  2025-08-14  9:46                           ` Lazar, Lijo
  0 siblings, 1 reply; 36+ messages in thread
From: Lazar, Lijo @ 2025-08-14  9:24 UTC (permalink / raw)
  To: Sundararaju, Sathishkumar, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

[Public]

That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.

Thanks,
Lijo
-----Original Message-----
From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
Sent: Thursday, August 14, 2025 2:41 PM
To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)

Hi Lijo,

On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
> [Public]
>
> We already have a per instance power state that can be tracked. What about something like attached?

This also has concurrent access of the power state , vcn.inst[i].cur_state is not protected by workload_profile_mutex

every where, it can still change while you are holding workload_profile_mutex and checking it.

Regards,

Sathish

>
> Thanks,
> Lijo
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Sundararaju, Sathishkumar
> Sent: Thursday, August 14, 2025 4:43 AM
> To: Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> (v3)
>
>
> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>> <sathishkumar.sundararaju@amd.com> wrote:
>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>> Hi Alex, Hi David,
>>>>>>>
>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>> problem, neither the current form , reason :-
>>>>>>>
>>>>>>> The emitted fence count and total submission count are fast
>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>> switch of video power profile, in the current form of patch that
>>>>>>> window is minimized, but still can happen if stress tested. But
>>>>>>> power state of any instance becoming zero
>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0
>>>>> decode completes, delayed idle work starts.
>>>>> inst0 idle handler can read 0 total fences and 0 total submission
>>>>> count, even if inst1 is actively decoding, that's between the jobs,
>>>>>      - as begin_use increaments vcn.total_submission_cnt and
>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>      - if outstanding fences are cleared and no new emitted fence,
>>>>> between jobs , can be 0.
>>>>>      - both of the above conditions do not mean video decode is
>>>>> complete on inst1, it is actively decoding.
>>>> How can there be active decoding without an outstanding fence?  In
>>>> that case, total_fences (fences from both instances) would be non-0.
>>> I mean on inst1 the job scheduled is already complete, so 0
>>> outstanding fences, newer job is yet to be scheduled
>>>
>>> and commited to ring (inst1) , this doesn't mean decode has stopped
>>> on
>>> inst1 right (I am saying if timing of inst0 idle work coincides with
>>> this),
>>>
>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>> correct my understanding here.
>> The flow looks like:
>>
>> begin_use(inst)
>> emit_fence(inst)
>> end_use(inst)
>>
>> ...later
>> fence signals
>> ...later
>> work handler
>>
>> In begin_use we increment the global and per instance submission.
>> This protects the power gating and profile until end_use.  In end use
>> we decrement the submissions because we don't need to protect
>> anything any more as we have the fence that was submitted via the
>> ring.  That fence won't signal until the job is complete.
> Is a next begin_use always guaranteed to be run before current job fence signals ?
>
> if not then both total submission and total fence are zero , example
> delayed job/packet submissions
>
> from user space, or next job schedule happens after current job fence signals.
>
> if this is never possible then (v3) is perfect.
>
> Regards,
>
> Sathish
>
>> For power gating, we
>> only care about the submission count and fences for that instance,
>> for the profile, we care about submission count and fences all instances.
>> Once the fences have signalled, the outstanding fences will be 0 and
>> there won't be any active work.
>>
>> Alex
>>
>>> Regards,
>>>
>>> Sathish
>>>
>>>> Alex
>>>>
>>>>> Whereas if instances are powered off we are sure idle time is past
>>>>> and it is powered off, no possible way of active video decode,
>>>>> when all instances are off we can safely assume no active decode
>>>>> and global lock protects it against new begin_use on any instance.
>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>> but we are already having a global workprofile mutex , so there
>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>> for all instances.
>>>>>
>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>> decide the right method. If you think outstanding total fences can
>>>>> never be 0 during decode, then your previous version (v3) itself
>>>>> is good, there is no real benefit of splitting the handlers as such.
>>>>>
>>>>> Regards,
>>>>> Sathish
>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>> profile and powergating into separate handlers.  The profile one
>>>>>> would be global and the powergating one would be per instance.
>>>>>> See the attached patches.
>>>>>>
>>>>>> Alex
>>>>>>
>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>> mistake in my patch was using per instance mutex, I should have
>>>>>>> used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>
>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>
>>>>>>> Regards,
>>>>>>>
>>>>>>> Sathish
>>>>>>>
>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>> Hi Alex,
>>>>>>>>>>
>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>> thoroughly)
>>>>>>>>>>        - up to you to decide.
>>>>>>>>>>
>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>
>>>>>>>>>> Thanks,
>>>>>>>>>> David
>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>> end up switching the video profile while another instance is
>>>>>>>>>>> active because we only take into account the current
>>>>>>>>>>> instance's submissions.  Look at all outstanding fences for
>>>>>>>>>>> the video profile.
>>>>>>>>>>>
>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>
>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>> profile
>>>>>>>>>>> handling")
>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>> ---
>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>        2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>            struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>                    container_of(work, struct
>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>            struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>            int r = 0;
>>>>>>>>>>>
>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>                    return;
>>>>>>>>>>>
>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>> +
>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>> +     }
>>>>>>>>>>>
>>>>>>>>>>>            /* Only set DPG pause for VCN3 or below, VCN4 and
>>>>>>>>>>> above will be handled by FW */
>>>>>>>>>>>            if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>                    struct dpg_pause_state new_state;
>>>>>>>>>>>
>>>>>>>>>>>                    if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct work_struct
>>>>>>>>>>> *work)
>>>>>>>>>>>                    else
>>>>>>>>>>>                            new_state.fw_based =
>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>
>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>> + &new_state);
>>>>>>>>>>>            }
>>>>>>>>>>>
>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>> -
>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>                    mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>                    vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>                    mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>> !total_fences &&
>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>                            r =
>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>> false);
>>>>>>>>>>>                            if (r) @@ -467,16 +473,10 @@ void
>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>            int r = 0;
>>>>>>>>>>>
>>>>>>>>>>>            atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>
>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>> -      */
>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>> -
>>>>>>>>>>>            mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>> move to here:
>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>> David
>>>>>>>>> Alex
>>>>>>>>>
>>>>>>>>>> David
>>>>>>>>>>>            if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>                    r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>>            }
>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>
>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>            mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>            vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>
>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>> *ring)
>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submissio
>>>>>>>>>>> n
>>>>>>>>>>> _cnt);
>>>>>>>>>>>
>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_
>>>>>>>>>>> c
>>>>>>>>>>> nt);
>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>
>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>                                  VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>
>>>>>>>>>>>            uint16_t inst_mask;
>>>>>>>>>>>            uint8_t num_inst_per_aid;
>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>
>>>>>>>>>>>            /* IP reg dump */
>>>>>>>>>>>            uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14  9:24                         ` Lazar, Lijo
@ 2025-08-14  9:46                           ` Lazar, Lijo
  2025-08-14 11:52                             ` Sundararaju, Sathishkumar
  0 siblings, 1 reply; 36+ messages in thread
From: Lazar, Lijo @ 2025-08-14  9:46 UTC (permalink / raw)
  To: Lazar, Lijo, Sundararaju, Sathishkumar, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

[-- Attachment #1: Type: text/plain, Size: 16000 bytes --]

[Public]

I see your point now. Attached should work, I guess. Is the concern more about having to take the lock for every begin?

Thanks,
Lijo

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lazar, Lijo
Sent: Thursday, August 14, 2025 2:55 PM
To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>; Alex Deucher <alexdeucher@gmail.com>
Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)

[Public]

That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.

Thanks,
Lijo
-----Original Message-----
From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
Sent: Thursday, August 14, 2025 2:41 PM
To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)

Hi Lijo,

On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
> [Public]
>
> We already have a per instance power state that can be tracked. What about something like attached?

This also has concurrent access of the power state , vcn.inst[i].cur_state is not protected by workload_profile_mutex

every where, it can still change while you are holding workload_profile_mutex and checking it.

Regards,

Sathish

>
> Thanks,
> Lijo
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Sundararaju, Sathishkumar
> Sent: Thursday, August 14, 2025 4:43 AM
> To: Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> (v3)
>
>
> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>> <sathishkumar.sundararaju@amd.com> wrote:
>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>> Hi Alex, Hi David,
>>>>>>>
>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>> problem, neither the current form , reason :-
>>>>>>>
>>>>>>> The emitted fence count and total submission count are fast
>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>> switch of video power profile, in the current form of patch that
>>>>>>> window is minimized, but still can happen if stress tested. But
>>>>>>> power state of any instance becoming zero
>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0
>>>>> decode completes, delayed idle work starts.
>>>>> inst0 idle handler can read 0 total fences and 0 total submission
>>>>> count, even if inst1 is actively decoding, that's between the jobs,
>>>>>      - as begin_use increaments vcn.total_submission_cnt and
>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>      - if outstanding fences are cleared and no new emitted fence,
>>>>> between jobs , can be 0.
>>>>>      - both of the above conditions do not mean video decode is
>>>>> complete on inst1, it is actively decoding.
>>>> How can there be active decoding without an outstanding fence?  In
>>>> that case, total_fences (fences from both instances) would be non-0.
>>> I mean on inst1 the job scheduled is already complete, so 0
>>> outstanding fences, newer job is yet to be scheduled
>>>
>>> and commited to ring (inst1) , this doesn't mean decode has stopped
>>> on
>>> inst1 right (I am saying if timing of inst0 idle work coincides with
>>> this),
>>>
>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>> correct my understanding here.
>> The flow looks like:
>>
>> begin_use(inst)
>> emit_fence(inst)
>> end_use(inst)
>>
>> ...later
>> fence signals
>> ...later
>> work handler
>>
>> In begin_use we increment the global and per instance submission.
>> This protects the power gating and profile until end_use.  In end use
>> we decrement the submissions because we don't need to protect
>> anything any more as we have the fence that was submitted via the
>> ring.  That fence won't signal until the job is complete.
> Is a next begin_use always guaranteed to be run before current job fence signals ?
>
> if not then both total submission and total fence are zero , example
> delayed job/packet submissions
>
> from user space, or next job schedule happens after current job fence signals.
>
> if this is never possible then (v3) is perfect.
>
> Regards,
>
> Sathish
>
>> For power gating, we
>> only care about the submission count and fences for that instance,
>> for the profile, we care about submission count and fences all instances.
>> Once the fences have signalled, the outstanding fences will be 0 and
>> there won't be any active work.
>>
>> Alex
>>
>>> Regards,
>>>
>>> Sathish
>>>
>>>> Alex
>>>>
>>>>> Whereas if instances are powered off we are sure idle time is past
>>>>> and it is powered off, no possible way of active video decode,
>>>>> when all instances are off we can safely assume no active decode
>>>>> and global lock protects it against new begin_use on any instance.
>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>> but we are already having a global workprofile mutex , so there
>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>> for all instances.
>>>>>
>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>> decide the right method. If you think outstanding total fences can
>>>>> never be 0 during decode, then your previous version (v3) itself
>>>>> is good, there is no real benefit of splitting the handlers as such.
>>>>>
>>>>> Regards,
>>>>> Sathish
>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>> profile and powergating into separate handlers.  The profile one
>>>>>> would be global and the powergating one would be per instance.
>>>>>> See the attached patches.
>>>>>>
>>>>>> Alex
>>>>>>
>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>> mistake in my patch was using per instance mutex, I should have
>>>>>>> used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>
>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>
>>>>>>> Regards,
>>>>>>>
>>>>>>> Sathish
>>>>>>>
>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>> Hi Alex,
>>>>>>>>>>
>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>> thoroughly)
>>>>>>>>>>        - up to you to decide.
>>>>>>>>>>
>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>
>>>>>>>>>> Thanks,
>>>>>>>>>> David
>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>> end up switching the video profile while another instance is
>>>>>>>>>>> active because we only take into account the current
>>>>>>>>>>> instance's submissions.  Look at all outstanding fences for
>>>>>>>>>>> the video profile.
>>>>>>>>>>>
>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>
>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>> profile
>>>>>>>>>>> handling")
>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>> ---
>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>        2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>            struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>                    container_of(work, struct
>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>            struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>            int r = 0;
>>>>>>>>>>>
>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>                    return;
>>>>>>>>>>>
>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>> +
>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>> +     }
>>>>>>>>>>>
>>>>>>>>>>>            /* Only set DPG pause for VCN3 or below, VCN4 and
>>>>>>>>>>> above will be handled by FW */
>>>>>>>>>>>            if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>                    struct dpg_pause_state new_state;
>>>>>>>>>>>
>>>>>>>>>>>                    if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct work_struct
>>>>>>>>>>> *work)
>>>>>>>>>>>                    else
>>>>>>>>>>>                            new_state.fw_based =
>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>
>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>> + &new_state);
>>>>>>>>>>>            }
>>>>>>>>>>>
>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>> -
>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>                    mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>                    vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>                    mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>> !total_fences &&
>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>                            r =
>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>> false);
>>>>>>>>>>>                            if (r) @@ -467,16 +473,10 @@ void
>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>            int r = 0;
>>>>>>>>>>>
>>>>>>>>>>>            atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>
>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>> -      */
>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>> -
>>>>>>>>>>>            mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>> move to here:
>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>> David
>>>>>>>>> Alex
>>>>>>>>>
>>>>>>>>>> David
>>>>>>>>>>>            if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>                    r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>>            }
>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>
>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>            mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>            vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>
>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>> *ring)
>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submissio
>>>>>>>>>>> n
>>>>>>>>>>> _cnt);
>>>>>>>>>>>
>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_
>>>>>>>>>>> c
>>>>>>>>>>> nt);
>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>
>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>                                  VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>
>>>>>>>>>>>            uint16_t inst_mask;
>>>>>>>>>>>            uint8_t num_inst_per_aid;
>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>
>>>>>>>>>>>            /* IP reg dump */
>>>>>>>>>>>            uint32_t                *ip_dump;

[-- Attachment #2: 0001-drm-amdgpu-Check-vcn-state-before-profile-switch.patch --]
[-- Type: application/octet-stream, Size: 8225 bytes --]

From 55d853b2e43b62acb83463777b0d89918eab995b Mon Sep 17 00:00:00 2001
From: Lijo Lazar <lijo.lazar@amd.com>
Date: Thu, 14 Aug 2025 13:52:50 +0530
Subject: [PATCH] drm/amdgpu: Check vcn state before profile switch

Before switching video profile, check power state of all VCN instances.
Before submission, power state is changed and a check is done for
vidoe profile. If already active, it skips switching; otherwise, it
switches to video profile.  When it gets to idle state, power state of
the current instance is set to gated. After that a check is done if all
instances are in gated state. If so, video profile is disabled.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 85 ++++++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  5 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 27 +-------
 3 files changed, 62 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 9a76e11d1c18..8ab399ed9b20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -410,6 +410,59 @@ int amdgpu_vcn_resume(struct amdgpu_device *adev, int i)
 	return 0;
 }
 
+int amdgpu_vcn_get_profile(struct amdgpu_device *adev)
+{
+	int r;
+
+
+	mutex_lock(&adev->vcn.workload_profile_mutex);
+
+	if (adev->vcn.workload_profile_active) {
+		mutex_unlock(&adev->vcn.workload_profile_mutex);
+		return 0;
+	}
+	r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
+					    true);
+	if (r)
+		dev_warn(adev->dev,
+			 "(%d) failed to enable video power profile mode\n", r);
+	else
+		adev->vcn.workload_profile_active = true;
+	mutex_unlock(&adev->vcn.workload_profile_mutex);
+
+	return r;
+}
+
+int amdgpu_vcn_put_profile(struct amdgpu_device *adev)
+{
+	bool pg = true;
+	int r, i;
+
+	mutex_lock(&adev->vcn.workload_profile_mutex);
+	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
+		if (adev->vcn.inst[i].cur_state != AMD_PG_STATE_GATE) {
+			pg = false;
+			break;
+		}
+	}
+
+	if (pg) {
+		r = amdgpu_dpm_switch_power_profile(
+			adev, PP_SMC_POWER_PROFILE_VIDEO, false);
+		if (r)
+			dev_warn(
+				adev->dev,
+				"(%d) failed to disable video power profile mode\n",
+				r);
+		else
+			adev->vcn.workload_profile_active = false;
+	}
+
+	mutex_unlock(&adev->vcn.workload_profile_mutex);
+
+	return r;
+}
+
 static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 {
 	struct amdgpu_vcn_inst *vcn_inst =
@@ -417,7 +470,6 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 	struct amdgpu_device *adev = vcn_inst->adev;
 	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
 	unsigned int i = vcn_inst->inst, j;
-	int r = 0;
 
 	if (adev->vcn.harvest_config & (1 << i))
 		return;
@@ -446,15 +498,8 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 		mutex_lock(&vcn_inst->vcn_pg_lock);
 		vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
 		mutex_unlock(&vcn_inst->vcn_pg_lock);
-		mutex_lock(&adev->vcn.workload_profile_mutex);
-		if (adev->vcn.workload_profile_active) {
-			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-							    false);
-			if (r)
-				dev_warn(adev->dev, "(%d) failed to disable video power profile mode\n", r);
-			adev->vcn.workload_profile_active = false;
-		}
-		mutex_unlock(&adev->vcn.workload_profile_mutex);
+		amdgpu_vcn_put_profile(adev);
+
 	} else {
 		schedule_delayed_work(&vcn_inst->idle_work, VCN_IDLE_TIMEOUT);
 	}
@@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_vcn_inst *vcn_inst = &adev->vcn.inst[ring->me];
-	int r = 0;
 
 	atomic_inc(&vcn_inst->total_submission_cnt);
 
 	cancel_delayed_work_sync(&vcn_inst->idle_work);
 
-	/* We can safely return early here because we've cancelled the
-	 * the delayed work so there is no one else to set it to false
-	 * and we don't care if someone else sets it to true.
-	 */
-	if (adev->vcn.workload_profile_active)
-		goto pg_lock;
-
-	mutex_lock(&adev->vcn.workload_profile_mutex);
-	if (!adev->vcn.workload_profile_active) {
-		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-						    true);
-		if (r)
-			dev_warn(adev->dev, "(%d) failed to switch to video power profile mode\n", r);
-		adev->vcn.workload_profile_active = true;
-	}
-	mutex_unlock(&adev->vcn.workload_profile_mutex);
-
-pg_lock:
 	mutex_lock(&vcn_inst->vcn_pg_lock);
 	vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
+	amdgpu_vcn_get_profile(adev);
 
 	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
 	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index b3fb1d0e43fc..c4dec986926f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -362,7 +362,7 @@ struct amdgpu_vcn {
 	bool			per_inst_fw;
 	unsigned		fw_version;
 
-	bool			workload_profile_active;
+	bool workload_profile_active;
 	struct mutex            workload_profile_mutex;
 	u32 reg_count;
 	const struct amdgpu_hwip_reg_entry *reg_list;
@@ -565,4 +565,7 @@ int amdgpu_vcn_reg_dump_init(struct amdgpu_device *adev,
 			     const struct amdgpu_hwip_reg_entry *reg, u32 count);
 void amdgpu_vcn_dump_ip_state(struct amdgpu_ip_block *ip_block);
 void amdgpu_vcn_print_ip_state(struct amdgpu_ip_block *ip_block, struct drm_printer *p);
+int amdgpu_vcn_get_profile(struct amdgpu_device *adev);
+int amdgpu_vcn_put_profile(struct amdgpu_device *adev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index 3a7c137a83ef..26ed74101d27 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -116,7 +116,6 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
 	struct amdgpu_device *adev = vcn_inst->adev;
 	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
 	unsigned int i, j;
-	int r = 0;
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
@@ -149,15 +148,7 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
 	if (!fences && !atomic_read(&adev->vcn.inst[0].total_submission_cnt)) {
 		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 						       AMD_PG_STATE_GATE);
-		mutex_lock(&adev->vcn.workload_profile_mutex);
-		if (adev->vcn.workload_profile_active) {
-			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-							    false);
-			if (r)
-				dev_warn(adev->dev, "(%d) failed to disable video power profile mode\n", r);
-			adev->vcn.workload_profile_active = false;
-		}
-		mutex_unlock(&adev->vcn.workload_profile_mutex);
+		amdgpu_vcn_put_profile(adev);
 	} else {
 		schedule_delayed_work(&adev->vcn.inst[0].idle_work, VCN_IDLE_TIMEOUT);
 	}
@@ -167,7 +158,6 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_vcn_inst *v = &adev->vcn.inst[ring->me];
-	int r = 0;
 
 	atomic_inc(&adev->vcn.inst[0].total_submission_cnt);
 
@@ -177,23 +167,10 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
 	 * the delayed work so there is no one else to set it to false
 	 * and we don't care if someone else sets it to true.
 	 */
-	if (adev->vcn.workload_profile_active)
-		goto pg_lock;
-
-	mutex_lock(&adev->vcn.workload_profile_mutex);
-	if (!adev->vcn.workload_profile_active) {
-		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-						    true);
-		if (r)
-			dev_warn(adev->dev, "(%d) failed to switch to video power profile mode\n", r);
-		adev->vcn.workload_profile_active = true;
-	}
-	mutex_unlock(&adev->vcn.workload_profile_mutex);
-
-pg_lock:
 	mutex_lock(&adev->vcn.inst[0].vcn_pg_lock);
 	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 					       AMD_PG_STATE_UNGATE);
+	amdgpu_vcn_get_profile(adev);
 
 	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
 	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14  9:46                           ` Lazar, Lijo
@ 2025-08-14 11:52                             ` Sundararaju, Sathishkumar
  2025-08-14 12:03                               ` Lazar, Lijo
  0 siblings, 1 reply; 36+ messages in thread
From: Sundararaju, Sathishkumar @ 2025-08-14 11:52 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org


On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
> [Public]
>
> I see your point now. Attached should work, I guess. Is the concern more about having to take the lock for every begin?

This is closer,  but the thing is, IMO we shouldn't have to use 2 locks 
and go into nested locking, we can do with just one global lock.

Power_state of each instance, and global workload_profile_active are 
inter-related, they need to be guarded together,

nested could work , but why nested if single lock is enough ? nested 
complicates it.

Regards,

Sathish

>
> Thanks,
> Lijo
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Lazar, Lijo
> Sent: Thursday, August 14, 2025 2:55 PM
> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>; Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
>
> [Public]
>
> That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.
>
> Thanks,
> Lijo
> -----Original Message-----
> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> Sent: Thursday, August 14, 2025 2:41 PM
> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
>
> Hi Lijo,
>
> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
>> [Public]
>>
>> We already have a per instance power state that can be tracked. What about something like attached?
> This also has concurrent access of the power state , vcn.inst[i].cur_state is not protected by workload_profile_mutex
>
> every where, it can still change while you are holding workload_profile_mutex and checking it.
>
> Regards,
>
> Sathish
>
>> Thanks,
>> Lijo
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Sundararaju, Sathishkumar
>> Sent: Thursday, August 14, 2025 4:43 AM
>> To: Alex Deucher <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> (v3)
>>
>>
>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>> Hi Alex, Hi David,
>>>>>>>>
>>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>>> problem, neither the current form , reason :-
>>>>>>>>
>>>>>>>> The emitted fence count and total submission count are fast
>>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>>> switch of video power profile, in the current form of patch that
>>>>>>>> window is minimized, but still can happen if stress tested. But
>>>>>>>> power state of any instance becoming zero
>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0
>>>>>> decode completes, delayed idle work starts.
>>>>>> inst0 idle handler can read 0 total fences and 0 total submission
>>>>>> count, even if inst1 is actively decoding, that's between the jobs,
>>>>>>       - as begin_use increaments vcn.total_submission_cnt and
>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>>       - if outstanding fences are cleared and no new emitted fence,
>>>>>> between jobs , can be 0.
>>>>>>       - both of the above conditions do not mean video decode is
>>>>>> complete on inst1, it is actively decoding.
>>>>> How can there be active decoding without an outstanding fence?  In
>>>>> that case, total_fences (fences from both instances) would be non-0.
>>>> I mean on inst1 the job scheduled is already complete, so 0
>>>> outstanding fences, newer job is yet to be scheduled
>>>>
>>>> and commited to ring (inst1) , this doesn't mean decode has stopped
>>>> on
>>>> inst1 right (I am saying if timing of inst0 idle work coincides with
>>>> this),
>>>>
>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>>> correct my understanding here.
>>> The flow looks like:
>>>
>>> begin_use(inst)
>>> emit_fence(inst)
>>> end_use(inst)
>>>
>>> ...later
>>> fence signals
>>> ...later
>>> work handler
>>>
>>> In begin_use we increment the global and per instance submission.
>>> This protects the power gating and profile until end_use.  In end use
>>> we decrement the submissions because we don't need to protect
>>> anything any more as we have the fence that was submitted via the
>>> ring.  That fence won't signal until the job is complete.
>> Is a next begin_use always guaranteed to be run before current job fence signals ?
>>
>> if not then both total submission and total fence are zero , example
>> delayed job/packet submissions
>>
>> from user space, or next job schedule happens after current job fence signals.
>>
>> if this is never possible then (v3) is perfect.
>>
>> Regards,
>>
>> Sathish
>>
>>> For power gating, we
>>> only care about the submission count and fences for that instance,
>>> for the profile, we care about submission count and fences all instances.
>>> Once the fences have signalled, the outstanding fences will be 0 and
>>> there won't be any active work.
>>>
>>> Alex
>>>
>>>> Regards,
>>>>
>>>> Sathish
>>>>
>>>>> Alex
>>>>>
>>>>>> Whereas if instances are powered off we are sure idle time is past
>>>>>> and it is powered off, no possible way of active video decode,
>>>>>> when all instances are off we can safely assume no active decode
>>>>>> and global lock protects it against new begin_use on any instance.
>>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>>> but we are already having a global workprofile mutex , so there
>>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>>> for all instances.
>>>>>>
>>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>>> decide the right method. If you think outstanding total fences can
>>>>>> never be 0 during decode, then your previous version (v3) itself
>>>>>> is good, there is no real benefit of splitting the handlers as such.
>>>>>>
>>>>>> Regards,
>>>>>> Sathish
>>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>>> profile and powergating into separate handlers.  The profile one
>>>>>>> would be global and the powergating one would be per instance.
>>>>>>> See the attached patches.
>>>>>>>
>>>>>>> Alex
>>>>>>>
>>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>>> mistake in my patch was using per instance mutex, I should have
>>>>>>>> used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>>
>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>>
>>>>>>>> Sathish
>>>>>>>>
>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>
>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>>> thoroughly)
>>>>>>>>>>>         - up to you to decide.
>>>>>>>>>>>
>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>
>>>>>>>>>>> Thanks,
>>>>>>>>>>> David
>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>>> end up switching the video profile while another instance is
>>>>>>>>>>>> active because we only take into account the current
>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences for
>>>>>>>>>>>> the video profile.
>>>>>>>>>>>>
>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>
>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>>> profile
>>>>>>>>>>>> handling")
>>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>         2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>             struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>                     container_of(work, struct
>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>>             struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>             int r = 0;
>>>>>>>>>>>>
>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>>                     return;
>>>>>>>>>>>>
>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>>> +
>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>> +     }
>>>>>>>>>>>>
>>>>>>>>>>>>             /* Only set DPG pause for VCN3 or below, VCN4 and
>>>>>>>>>>>> above will be handled by FW */
>>>>>>>>>>>>             if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>                     struct dpg_pause_state new_state;
>>>>>>>>>>>>
>>>>>>>>>>>>                     if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct work_struct
>>>>>>>>>>>> *work)
>>>>>>>>>>>>                     else
>>>>>>>>>>>>                             new_state.fw_based =
>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>
>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>>> + &new_state);
>>>>>>>>>>>>             }
>>>>>>>>>>>>
>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>> -
>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>                     mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>                     vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>                     mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>                             r =
>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>>> false);
>>>>>>>>>>>>                             if (r) @@ -467,16 +473,10 @@ void
>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>             int r = 0;
>>>>>>>>>>>>
>>>>>>>>>>>>             atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>
>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>> -      */
>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>> -
>>>>>>>>>>>>             mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>> move to here:
>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>> David
>>>>>>>>>> Alex
>>>>>>>>>>
>>>>>>>>>>> David
>>>>>>>>>>>>             if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>                     r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>>>             }
>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>
>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>             mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>             vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>
>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>>> *ring)
>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submissio
>>>>>>>>>>>> n
>>>>>>>>>>>> _cnt);
>>>>>>>>>>>>
>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_
>>>>>>>>>>>> c
>>>>>>>>>>>> nt);
>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>
>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>>                                   VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>
>>>>>>>>>>>>             uint16_t inst_mask;
>>>>>>>>>>>>             uint8_t num_inst_per_aid;
>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>>
>>>>>>>>>>>>             /* IP reg dump */
>>>>>>>>>>>>             uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 11:52                             ` Sundararaju, Sathishkumar
@ 2025-08-14 12:03                               ` Lazar, Lijo
  2025-08-14 12:48                                 ` Sundararaju, Sathishkumar
  0 siblings, 1 reply; 36+ messages in thread
From: Lazar, Lijo @ 2025-08-14 12:03 UTC (permalink / raw)
  To: Sundararaju, Sathishkumar, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

[Public]

There is no need for nested lock. It only needs to follow the order
        set instance power_state
        set profile (this takes a global lock and hence instance power state will be visible to whichever thread that gets the work profile lock).

You are seeing nested lock just because I added the code just after power state setting.

Thanks,
Lijo

-----Original Message-----
From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
Sent: Thursday, August 14, 2025 5:23 PM
To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)


On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
> [Public]
>
> I see your point now. Attached should work, I guess. Is the concern more about having to take the lock for every begin?

This is closer,  but the thing is, IMO we shouldn't have to use 2 locks and go into nested locking, we can do with just one global lock.

Power_state of each instance, and global workload_profile_active are inter-related, they need to be guarded together,

nested could work , but why nested if single lock is enough ? nested complicates it.

Regards,

Sathish

>
> Thanks,
> Lijo
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Lazar, Lijo
> Sent: Thursday, August 14, 2025 2:55 PM
> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>; Alex
> Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> (v3)
>
> [Public]
>
> That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.
>
> Thanks,
> Lijo
> -----Original Message-----
> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> Sent: Thursday, August 14, 2025 2:41 PM
> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
> <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> (v3)
>
> Hi Lijo,
>
> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
>> [Public]
>>
>> We already have a per instance power state that can be tracked. What about something like attached?
> This also has concurrent access of the power state ,
> vcn.inst[i].cur_state is not protected by workload_profile_mutex
>
> every where, it can still change while you are holding workload_profile_mutex and checking it.
>
> Regards,
>
> Sathish
>
>> Thanks,
>> Lijo
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Sundararaju, Sathishkumar
>> Sent: Thursday, August 14, 2025 4:43 AM
>> To: Alex Deucher <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> (v3)
>>
>>
>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>> Hi Alex, Hi David,
>>>>>>>>
>>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>>> problem, neither the current form , reason :-
>>>>>>>>
>>>>>>>> The emitted fence count and total submission count are fast
>>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>>> switch of video power profile, in the current form of patch
>>>>>>>> that window is minimized, but still can happen if stress
>>>>>>>> tested. But power state of any instance becoming zero
>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0
>>>>>> decode completes, delayed idle work starts.
>>>>>> inst0 idle handler can read 0 total fences and 0 total submission
>>>>>> count, even if inst1 is actively decoding, that's between the jobs,
>>>>>>       - as begin_use increaments vcn.total_submission_cnt and
>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>>       - if outstanding fences are cleared and no new emitted
>>>>>> fence, between jobs , can be 0.
>>>>>>       - both of the above conditions do not mean video decode is
>>>>>> complete on inst1, it is actively decoding.
>>>>> How can there be active decoding without an outstanding fence?  In
>>>>> that case, total_fences (fences from both instances) would be non-0.
>>>> I mean on inst1 the job scheduled is already complete, so 0
>>>> outstanding fences, newer job is yet to be scheduled
>>>>
>>>> and commited to ring (inst1) , this doesn't mean decode has stopped
>>>> on
>>>> inst1 right (I am saying if timing of inst0 idle work coincides
>>>> with this),
>>>>
>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>>> correct my understanding here.
>>> The flow looks like:
>>>
>>> begin_use(inst)
>>> emit_fence(inst)
>>> end_use(inst)
>>>
>>> ...later
>>> fence signals
>>> ...later
>>> work handler
>>>
>>> In begin_use we increment the global and per instance submission.
>>> This protects the power gating and profile until end_use.  In end
>>> use we decrement the submissions because we don't need to protect
>>> anything any more as we have the fence that was submitted via the
>>> ring.  That fence won't signal until the job is complete.
>> Is a next begin_use always guaranteed to be run before current job fence signals ?
>>
>> if not then both total submission and total fence are zero , example
>> delayed job/packet submissions
>>
>> from user space, or next job schedule happens after current job fence signals.
>>
>> if this is never possible then (v3) is perfect.
>>
>> Regards,
>>
>> Sathish
>>
>>> For power gating, we
>>> only care about the submission count and fences for that instance,
>>> for the profile, we care about submission count and fences all instances.
>>> Once the fences have signalled, the outstanding fences will be 0 and
>>> there won't be any active work.
>>>
>>> Alex
>>>
>>>> Regards,
>>>>
>>>> Sathish
>>>>
>>>>> Alex
>>>>>
>>>>>> Whereas if instances are powered off we are sure idle time is
>>>>>> past and it is powered off, no possible way of active video
>>>>>> decode, when all instances are off we can safely assume no active
>>>>>> decode and global lock protects it against new begin_use on any instance.
>>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>>> but we are already having a global workprofile mutex , so there
>>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>>> for all instances.
>>>>>>
>>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>>> decide the right method. If you think outstanding total fences
>>>>>> can never be 0 during decode, then your previous version (v3)
>>>>>> itself is good, there is no real benefit of splitting the handlers as such.
>>>>>>
>>>>>> Regards,
>>>>>> Sathish
>>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>>> profile and powergating into separate handlers.  The profile one
>>>>>>> would be global and the powergating one would be per instance.
>>>>>>> See the attached patches.
>>>>>>>
>>>>>>> Alex
>>>>>>>
>>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>>> mistake in my patch was using per instance mutex, I should have
>>>>>>>> used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>>
>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>>
>>>>>>>> Sathish
>>>>>>>>
>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>
>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>>> thoroughly)
>>>>>>>>>>>         - up to you to decide.
>>>>>>>>>>>
>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>
>>>>>>>>>>> Thanks,
>>>>>>>>>>> David
>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>>> end up switching the video profile while another instance
>>>>>>>>>>>> is active because we only take into account the current
>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences for
>>>>>>>>>>>> the video profile.
>>>>>>>>>>>>
>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>
>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>>> profile
>>>>>>>>>>>> handling")
>>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>         2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>             struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>                     container_of(work, struct
>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>>             struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>             int r = 0;
>>>>>>>>>>>>
>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>>                     return;
>>>>>>>>>>>>
>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>>> +
>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>> +     }
>>>>>>>>>>>>
>>>>>>>>>>>>             /* Only set DPG pause for VCN3 or below, VCN4
>>>>>>>>>>>> and above will be handled by FW */
>>>>>>>>>>>>             if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>                     struct dpg_pause_state new_state;
>>>>>>>>>>>>
>>>>>>>>>>>>                     if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct work_struct
>>>>>>>>>>>> *work)
>>>>>>>>>>>>                     else
>>>>>>>>>>>>                             new_state.fw_based =
>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>
>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>>> + &new_state);
>>>>>>>>>>>>             }
>>>>>>>>>>>>
>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>> -
>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>                     mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>                     vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>                     mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>                             r =
>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
>>>>>>>>>>>>                             if (r) @@ -467,16 +473,10 @@
>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>             int r = 0;
>>>>>>>>>>>>
>>>>>>>>>>>>             atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>
>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>> -      */
>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>> -
>>>>>>>>>>>>             mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>> move to here:
>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>> David
>>>>>>>>>> Alex
>>>>>>>>>>
>>>>>>>>>>> David
>>>>>>>>>>>>             if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>                     r =
>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>             }
>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>
>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>             mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>             vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>
>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>>> *ring)
>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submissi
>>>>>>>>>>>> o
>>>>>>>>>>>> n
>>>>>>>>>>>> _cnt);
>>>>>>>>>>>>
>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission
>>>>>>>>>>>> _
>>>>>>>>>>>> c
>>>>>>>>>>>> nt);
>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>
>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>>                                   VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>
>>>>>>>>>>>>             uint16_t inst_mask;
>>>>>>>>>>>>             uint8_t num_inst_per_aid;
>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>>
>>>>>>>>>>>>             /* IP reg dump */
>>>>>>>>>>>>             uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 12:03                               ` Lazar, Lijo
@ 2025-08-14 12:48                                 ` Sundararaju, Sathishkumar
  2025-08-14 12:54                                   ` Lazar, Lijo
  2025-08-14 17:06                                   ` Sundararaju, Sathishkumar
  0 siblings, 2 replies; 36+ messages in thread
From: Sundararaju, Sathishkumar @ 2025-08-14 12:48 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org


On 8/14/2025 5:33 PM, Lazar, Lijo wrote:
> [Public]
>
> There is no need for nested lock. It only needs to follow the order
>          set instance power_state
>          set profile (this takes a global lock and hence instance power state will be visible to whichever thread that gets the work profile lock).
>
> You are seeing nested lock just because I added the code just after power state setting.

Pasting your code from the file for ref :

@@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring 
*ring)

-pg_lock:

      mutex_lock(&vcn_inst->vcn_pg_lock);
      vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);

+   amdgpu_vcn_get_profile(adev);

vcn_pg_lock isn't  released here yet right ? And in-case you intend to 
only order the locks, then still there is an un-necessary OFF followed 
by ON, but yes that is acceptable,

May be you want to move that vcn_pg_lock after amdgpu_vcn_get_profile to 
protect concurrent dpg_state access in begin_use.

The concern is, this patch access power_state that is protected by some 
other mutex lock hoping it reflects right values also when holding 
powerprofile_lock.

Or

Have shared a patch with global workload_profile_mutex that simplifies 
this handling, and renamed pg_lock -> dpg_lock  and used

that only for dpg_state changes per instance.

Regards,

Sathish

>
> Thanks,
> Lijo
>
> -----Original Message-----
> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> Sent: Thursday, August 14, 2025 5:23 PM
> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
>
>
> On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
>> [Public]
>>
>> I see your point now. Attached should work, I guess. Is the concern more about having to take the lock for every begin?
> This is closer,  but the thing is, IMO we shouldn't have to use 2 locks and go into nested locking, we can do with just one global lock.
>
> Power_state of each instance, and global workload_profile_active are inter-related, they need to be guarded together,
>
> nested could work , but why nested if single lock is enough ? nested complicates it.
>
> Regards,
>
> Sathish
>
>> Thanks,
>> Lijo
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Lazar, Lijo
>> Sent: Thursday, August 14, 2025 2:55 PM
>> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>; Alex
>> Deucher <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> (v3)
>>
>> [Public]
>>
>> That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.
>>
>> Thanks,
>> Lijo
>> -----Original Message-----
>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>> Sent: Thursday, August 14, 2025 2:41 PM
>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>> <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> (v3)
>>
>> Hi Lijo,
>>
>> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
>>> [Public]
>>>
>>> We already have a per instance power state that can be tracked. What about something like attached?
>> This also has concurrent access of the power state ,
>> vcn.inst[i].cur_state is not protected by workload_profile_mutex
>>
>> every where, it can still change while you are holding workload_profile_mutex and checking it.
>>
>> Regards,
>>
>> Sathish
>>
>>> Thanks,
>>> Lijo
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Sundararaju, Sathishkumar
>>> Sent: Thursday, August 14, 2025 4:43 AM
>>> To: Alex Deucher <alexdeucher@gmail.com>
>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>>> (v3)
>>>
>>>
>>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>>> Hi Alex, Hi David,
>>>>>>>>>
>>>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>>>> problem, neither the current form , reason :-
>>>>>>>>>
>>>>>>>>> The emitted fence count and total submission count are fast
>>>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>>>> switch of video power profile, in the current form of patch
>>>>>>>>> that window is minimized, but still can happen if stress
>>>>>>>>> tested. But power state of any instance becoming zero
>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0
>>>>>>> decode completes, delayed idle work starts.
>>>>>>> inst0 idle handler can read 0 total fences and 0 total submission
>>>>>>> count, even if inst1 is actively decoding, that's between the jobs,
>>>>>>>        - as begin_use increaments vcn.total_submission_cnt and
>>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>>>        - if outstanding fences are cleared and no new emitted
>>>>>>> fence, between jobs , can be 0.
>>>>>>>        - both of the above conditions do not mean video decode is
>>>>>>> complete on inst1, it is actively decoding.
>>>>>> How can there be active decoding without an outstanding fence?  In
>>>>>> that case, total_fences (fences from both instances) would be non-0.
>>>>> I mean on inst1 the job scheduled is already complete, so 0
>>>>> outstanding fences, newer job is yet to be scheduled
>>>>>
>>>>> and commited to ring (inst1) , this doesn't mean decode has stopped
>>>>> on
>>>>> inst1 right (I am saying if timing of inst0 idle work coincides
>>>>> with this),
>>>>>
>>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>>>> correct my understanding here.
>>>> The flow looks like:
>>>>
>>>> begin_use(inst)
>>>> emit_fence(inst)
>>>> end_use(inst)
>>>>
>>>> ...later
>>>> fence signals
>>>> ...later
>>>> work handler
>>>>
>>>> In begin_use we increment the global and per instance submission.
>>>> This protects the power gating and profile until end_use.  In end
>>>> use we decrement the submissions because we don't need to protect
>>>> anything any more as we have the fence that was submitted via the
>>>> ring.  That fence won't signal until the job is complete.
>>> Is a next begin_use always guaranteed to be run before current job fence signals ?
>>>
>>> if not then both total submission and total fence are zero , example
>>> delayed job/packet submissions
>>>
>>> from user space, or next job schedule happens after current job fence signals.
>>>
>>> if this is never possible then (v3) is perfect.
>>>
>>> Regards,
>>>
>>> Sathish
>>>
>>>> For power gating, we
>>>> only care about the submission count and fences for that instance,
>>>> for the profile, we care about submission count and fences all instances.
>>>> Once the fences have signalled, the outstanding fences will be 0 and
>>>> there won't be any active work.
>>>>
>>>> Alex
>>>>
>>>>> Regards,
>>>>>
>>>>> Sathish
>>>>>
>>>>>> Alex
>>>>>>
>>>>>>> Whereas if instances are powered off we are sure idle time is
>>>>>>> past and it is powered off, no possible way of active video
>>>>>>> decode, when all instances are off we can safely assume no active
>>>>>>> decode and global lock protects it against new begin_use on any instance.
>>>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>>>> but we are already having a global workprofile mutex , so there
>>>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>>>> for all instances.
>>>>>>>
>>>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>>>> decide the right method. If you think outstanding total fences
>>>>>>> can never be 0 during decode, then your previous version (v3)
>>>>>>> itself is good, there is no real benefit of splitting the handlers as such.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Sathish
>>>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>>>> profile and powergating into separate handlers.  The profile one
>>>>>>>> would be global and the powergating one would be per instance.
>>>>>>>> See the attached patches.
>>>>>>>>
>>>>>>>> Alex
>>>>>>>>
>>>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>>>> mistake in my patch was using per instance mutex, I should have
>>>>>>>>> used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>>>
>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>>
>>>>>>>>> Sathish
>>>>>>>>>
>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>>
>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>>>> thoroughly)
>>>>>>>>>>>>          - up to you to decide.
>>>>>>>>>>>>
>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>>
>>>>>>>>>>>> Thanks,
>>>>>>>>>>>> David
>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>>>> end up switching the video profile while another instance
>>>>>>>>>>>>> is active because we only take into account the current
>>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences for
>>>>>>>>>>>>> the video profile.
>>>>>>>>>>>>>
>>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>>
>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>>>> profile
>>>>>>>>>>>>> handling")
>>>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>>          2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>>              struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>>                      container_of(work, struct
>>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>>>              struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>>              int r = 0;
>>>>>>>>>>>>>
>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>>>                      return;
>>>>>>>>>>>>>
>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>>> +     }
>>>>>>>>>>>>>
>>>>>>>>>>>>>              /* Only set DPG pause for VCN3 or below, VCN4
>>>>>>>>>>>>> and above will be handled by FW */
>>>>>>>>>>>>>              if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>>                      struct dpg_pause_state new_state;
>>>>>>>>>>>>>
>>>>>>>>>>>>>                      if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct work_struct
>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>                      else
>>>>>>>>>>>>>                              new_state.fw_based =
>>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>>
>>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>>>> + &new_state);
>>>>>>>>>>>>>              }
>>>>>>>>>>>>>
>>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>>> -
>>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>>                      mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>                      vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>>                      mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>>                              r =
>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
>>>>>>>>>>>>>                              if (r) @@ -467,16 +473,10 @@
>>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>              int r = 0;
>>>>>>>>>>>>>
>>>>>>>>>>>>>              atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>>
>>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>>> -      */
>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>>> -
>>>>>>>>>>>>>              mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>> move to here:
>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>>> David
>>>>>>>>>>> Alex
>>>>>>>>>>>
>>>>>>>>>>>> David
>>>>>>>>>>>>>              if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>                      r =
>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
>>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>              }
>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>
>>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>>              mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>              vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>>
>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>>>> *ring)
>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submissi
>>>>>>>>>>>>> o
>>>>>>>>>>>>> n
>>>>>>>>>>>>> _cnt);
>>>>>>>>>>>>>
>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission
>>>>>>>>>>>>> _
>>>>>>>>>>>>> c
>>>>>>>>>>>>> nt);
>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>>
>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>>>                                    VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>>
>>>>>>>>>>>>>              uint16_t inst_mask;
>>>>>>>>>>>>>              uint8_t num_inst_per_aid;
>>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>>>
>>>>>>>>>>>>>              /* IP reg dump */
>>>>>>>>>>>>>              uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 12:48                                 ` Sundararaju, Sathishkumar
@ 2025-08-14 12:54                                   ` Lazar, Lijo
  2025-08-14 17:44                                     ` David Wu
  2025-08-14 17:06                                   ` Sundararaju, Sathishkumar
  1 sibling, 1 reply; 36+ messages in thread
From: Lazar, Lijo @ 2025-08-14 12:54 UTC (permalink / raw)
  To: Sundararaju, Sathishkumar, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

[-- Attachment #1: Type: text/plain, Size: 20968 bytes --]

[Public]

The request profile can be moved outside the pg_lock in begin_use as in the attached patch. It needs  set power state -> set profile order.

This is the premise -

Let's say there are two threads, begin_use thread and idle_work threads. begin_use and idle_work will need the workprofile mutex to request a profile.

Case 1) Idle thread gets the lock first.
        a) Idle thread sees vinst power state as PG_UNGATE, no harm done. It exits without requesting power profile change. begin_use thread gets the lock next, it sees profile as active and continues.
        b) Idle thread sees vinst power state as PG_GATE, it will make workprofile_active to false and exit. Now when begin_use thread gets the mutex next, it's guaranteed to see the workprofile_active as false, hence it will request the profile.

Case 2) begin_use thread gets the lock first.
        a) Workload profile is active, hence it doesn't do anything and exits. The change made by begin_use thread to vinst power state (state = on) will now be visible to idle thread which gets the lock next. It will do nothing and exit.
        b) Workload profile is inactive, hence it requests a profile change. Again, the change made by begin_use thread to vinst power state will now be visible to idle thread which gets the lock next. It will do nothing and exit.

Thanks,
Lijo

-----Original Message-----
From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
Sent: Thursday, August 14, 2025 6:18 PM
To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)


On 8/14/2025 5:33 PM, Lazar, Lijo wrote:
> [Public]
>
> There is no need for nested lock. It only needs to follow the order
>          set instance power_state
>          set profile (this takes a global lock and hence instance power state will be visible to whichever thread that gets the work profile lock).
>
> You are seeing nested lock just because I added the code just after power state setting.

Pasting your code from the file for ref :

@@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring
*ring)

-pg_lock:

      mutex_lock(&vcn_inst->vcn_pg_lock);
      vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);

+   amdgpu_vcn_get_profile(adev);

vcn_pg_lock isn't  released here yet right ? And in-case you intend to only order the locks, then still there is an un-necessary OFF followed by ON, but yes that is acceptable,

May be you want to move that vcn_pg_lock after amdgpu_vcn_get_profile to protect concurrent dpg_state access in begin_use.

The concern is, this patch access power_state that is protected by some other mutex lock hoping it reflects right values also when holding powerprofile_lock.

Or

Have shared a patch with global workload_profile_mutex that simplifies this handling, and renamed pg_lock -> dpg_lock  and used

that only for dpg_state changes per instance.

Regards,

Sathish

>
> Thanks,
> Lijo
>
> -----Original Message-----
> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> Sent: Thursday, August 14, 2025 5:23 PM
> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
> <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> (v3)
>
>
> On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
>> [Public]
>>
>> I see your point now. Attached should work, I guess. Is the concern more about having to take the lock for every begin?
> This is closer,  but the thing is, IMO we shouldn't have to use 2 locks and go into nested locking, we can do with just one global lock.
>
> Power_state of each instance, and global workload_profile_active are
> inter-related, they need to be guarded together,
>
> nested could work , but why nested if single lock is enough ? nested complicates it.
>
> Regards,
>
> Sathish
>
>> Thanks,
>> Lijo
>>
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Lazar, Lijo
>> Sent: Thursday, August 14, 2025 2:55 PM
>> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>;
>> Alex Deucher <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> (v3)
>>
>> [Public]
>>
>> That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.
>>
>> Thanks,
>> Lijo
>> -----Original Message-----
>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>> Sent: Thursday, August 14, 2025 2:41 PM
>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>> <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> (v3)
>>
>> Hi Lijo,
>>
>> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
>>> [Public]
>>>
>>> We already have a per instance power state that can be tracked. What about something like attached?
>> This also has concurrent access of the power state ,
>> vcn.inst[i].cur_state is not protected by workload_profile_mutex
>>
>> every where, it can still change while you are holding workload_profile_mutex and checking it.
>>
>> Regards,
>>
>> Sathish
>>
>>> Thanks,
>>> Lijo
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Sundararaju, Sathishkumar
>>> Sent: Thursday, August 14, 2025 4:43 AM
>>> To: Alex Deucher <alexdeucher@gmail.com>
>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race
>>> condition
>>> (v3)
>>>
>>>
>>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>>> Hi Alex, Hi David,
>>>>>>>>>
>>>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>>>> problem, neither the current form , reason :-
>>>>>>>>>
>>>>>>>>> The emitted fence count and total submission count are fast
>>>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>>>> switch of video power profile, in the current form of patch
>>>>>>>>> that window is minimized, but still can happen if stress
>>>>>>>>> tested. But power state of any instance becoming zero
>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>>> Consider this situation, inst0 and inst1 actively decoding,
>>>>>>> inst0 decode completes, delayed idle work starts.
>>>>>>> inst0 idle handler can read 0 total fences and 0 total
>>>>>>> submission count, even if inst1 is actively decoding, that's between the jobs,
>>>>>>>        - as begin_use increaments vcn.total_submission_cnt and
>>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>>>        - if outstanding fences are cleared and no new emitted
>>>>>>> fence, between jobs , can be 0.
>>>>>>>        - both of the above conditions do not mean video decode
>>>>>>> is complete on inst1, it is actively decoding.
>>>>>> How can there be active decoding without an outstanding fence?
>>>>>> In that case, total_fences (fences from both instances) would be non-0.
>>>>> I mean on inst1 the job scheduled is already complete, so 0
>>>>> outstanding fences, newer job is yet to be scheduled
>>>>>
>>>>> and commited to ring (inst1) , this doesn't mean decode has
>>>>> stopped on
>>>>> inst1 right (I am saying if timing of inst0 idle work coincides
>>>>> with this),
>>>>>
>>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>>>> correct my understanding here.
>>>> The flow looks like:
>>>>
>>>> begin_use(inst)
>>>> emit_fence(inst)
>>>> end_use(inst)
>>>>
>>>> ...later
>>>> fence signals
>>>> ...later
>>>> work handler
>>>>
>>>> In begin_use we increment the global and per instance submission.
>>>> This protects the power gating and profile until end_use.  In end
>>>> use we decrement the submissions because we don't need to protect
>>>> anything any more as we have the fence that was submitted via the
>>>> ring.  That fence won't signal until the job is complete.
>>> Is a next begin_use always guaranteed to be run before current job fence signals ?
>>>
>>> if not then both total submission and total fence are zero , example
>>> delayed job/packet submissions
>>>
>>> from user space, or next job schedule happens after current job fence signals.
>>>
>>> if this is never possible then (v3) is perfect.
>>>
>>> Regards,
>>>
>>> Sathish
>>>
>>>> For power gating, we
>>>> only care about the submission count and fences for that instance,
>>>> for the profile, we care about submission count and fences all instances.
>>>> Once the fences have signalled, the outstanding fences will be 0
>>>> and there won't be any active work.
>>>>
>>>> Alex
>>>>
>>>>> Regards,
>>>>>
>>>>> Sathish
>>>>>
>>>>>> Alex
>>>>>>
>>>>>>> Whereas if instances are powered off we are sure idle time is
>>>>>>> past and it is powered off, no possible way of active video
>>>>>>> decode, when all instances are off we can safely assume no
>>>>>>> active decode and global lock protects it against new begin_use on any instance.
>>>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>>>> but we are already having a global workprofile mutex , so there
>>>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>>>> for all instances.
>>>>>>>
>>>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>>>> decide the right method. If you think outstanding total fences
>>>>>>> can never be 0 during decode, then your previous version (v3)
>>>>>>> itself is good, there is no real benefit of splitting the handlers as such.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Sathish
>>>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>>>> profile and powergating into separate handlers.  The profile
>>>>>>>> one would be global and the powergating one would be per instance.
>>>>>>>> See the attached patches.
>>>>>>>>
>>>>>>>> Alex
>>>>>>>>
>>>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>>>> mistake in my patch was using per instance mutex, I should
>>>>>>>>> have used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>>>
>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>>
>>>>>>>>> Sathish
>>>>>>>>>
>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>>
>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>>>> thoroughly)
>>>>>>>>>>>>          - up to you to decide.
>>>>>>>>>>>>
>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>>
>>>>>>>>>>>> Thanks,
>>>>>>>>>>>> David
>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>>>> end up switching the video profile while another instance
>>>>>>>>>>>>> is active because we only take into account the current
>>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences
>>>>>>>>>>>>> for the video profile.
>>>>>>>>>>>>>
>>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>>
>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>>>> profile
>>>>>>>>>>>>> handling")
>>>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>>          2 files changed, 21 insertions(+), 20
>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>>              struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>>                      container_of(work, struct
>>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>>>              struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>>              int r = 0;
>>>>>>>>>>>>>
>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 <<
>>>>>>>>>>>>> + vcn_inst->inst))
>>>>>>>>>>>>>                      return;
>>>>>>>>>>>>>
>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>>> +     }
>>>>>>>>>>>>>
>>>>>>>>>>>>>              /* Only set DPG pause for VCN3 or below, VCN4
>>>>>>>>>>>>> and above will be handled by FW */
>>>>>>>>>>>>>              if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>>                      struct dpg_pause_state new_state;
>>>>>>>>>>>>>
>>>>>>>>>>>>>                      if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct
>>>>>>>>>>>>> work_struct
>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>                      else
>>>>>>>>>>>>>                              new_state.fw_based =
>>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>>
>>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>>>> + &new_state);
>>>>>>>>>>>>>              }
>>>>>>>>>>>>>
>>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>>> -
>>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>>                      mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>                      vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>>                      mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>>                              r =
>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
>>>>>>>>>>>>>                              if (r) @@ -467,16 +473,10 @@
>>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>              int r = 0;
>>>>>>>>>>>>>
>>>>>>>>>>>>>              atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>>
>>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>>> -      */
>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>>> -
>>>>>>>>>>>>>
>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>> move to here:
>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>>> David
>>>>>>>>>>> Alex
>>>>>>>>>>>
>>>>>>>>>>>> David
>>>>>>>>>>>>>              if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>                      r =
>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
>>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>              }
>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>
>>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>>              mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>              vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>>
>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>>>> *ring)
>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submiss
>>>>>>>>>>>>> i
>>>>>>>>>>>>> o
>>>>>>>>>>>>> n
>>>>>>>>>>>>> _cnt);
>>>>>>>>>>>>>
>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submissio
>>>>>>>>>>>>> n
>>>>>>>>>>>>> _
>>>>>>>>>>>>> c
>>>>>>>>>>>>> nt);
>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>>
>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>>>                                    VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>>
>>>>>>>>>>>>>              uint16_t inst_mask;
>>>>>>>>>>>>>              uint8_t num_inst_per_aid;
>>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>>>
>>>>>>>>>>>>>              /* IP reg dump */
>>>>>>>>>>>>>              uint32_t                *ip_dump;

[-- Attachment #2: 0001-drm-amdgpu-Check-vcn-state-before-profile-switch.patch --]
[-- Type: application/octet-stream, Size: 8428 bytes --]

From a556a9802bfa3cb95c0a94a6771380d680c13e1c Mon Sep 17 00:00:00 2001
From: Lijo Lazar <lijo.lazar@amd.com>
Date: Thu, 14 Aug 2025 13:52:50 +0530
Subject: [PATCH] drm/amdgpu: Check vcn state before profile switch

Before switching video profile, check power state of all VCN instances.
Before submission, power state is changed and a check is done for
vidoe profile. If already active, it skips switching; otherwise, it
switches to video profile.  When it gets to idle state, power state of
the current instance is set to gated. After that a check is done if all
instances are in gated state. If so, video profile is disabled.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 85 ++++++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  5 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 27 +-------
 3 files changed, 62 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 9a76e11d1c18..ad0811551314 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -410,6 +410,59 @@ int amdgpu_vcn_resume(struct amdgpu_device *adev, int i)
 	return 0;
 }
 
+int amdgpu_vcn_get_profile(struct amdgpu_device *adev)
+{
+	int r;
+
+
+	mutex_lock(&adev->vcn.workload_profile_mutex);
+
+	if (adev->vcn.workload_profile_active) {
+		mutex_unlock(&adev->vcn.workload_profile_mutex);
+		return 0;
+	}
+	r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
+					    true);
+	if (r)
+		dev_warn(adev->dev,
+			 "(%d) failed to enable video power profile mode\n", r);
+	else
+		adev->vcn.workload_profile_active = true;
+	mutex_unlock(&adev->vcn.workload_profile_mutex);
+
+	return r;
+}
+
+int amdgpu_vcn_put_profile(struct amdgpu_device *adev)
+{
+	bool pg = true;
+	int r, i;
+
+	mutex_lock(&adev->vcn.workload_profile_mutex);
+	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
+		if (adev->vcn.inst[i].cur_state != AMD_PG_STATE_GATE) {
+			pg = false;
+			break;
+		}
+	}
+
+	if (pg) {
+		r = amdgpu_dpm_switch_power_profile(
+			adev, PP_SMC_POWER_PROFILE_VIDEO, false);
+		if (r)
+			dev_warn(
+				adev->dev,
+				"(%d) failed to disable video power profile mode\n",
+				r);
+		else
+			adev->vcn.workload_profile_active = false;
+	}
+
+	mutex_unlock(&adev->vcn.workload_profile_mutex);
+
+	return r;
+}
+
 static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 {
 	struct amdgpu_vcn_inst *vcn_inst =
@@ -417,7 +470,6 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 	struct amdgpu_device *adev = vcn_inst->adev;
 	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
 	unsigned int i = vcn_inst->inst, j;
-	int r = 0;
 
 	if (adev->vcn.harvest_config & (1 << i))
 		return;
@@ -446,15 +498,8 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 		mutex_lock(&vcn_inst->vcn_pg_lock);
 		vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
 		mutex_unlock(&vcn_inst->vcn_pg_lock);
-		mutex_lock(&adev->vcn.workload_profile_mutex);
-		if (adev->vcn.workload_profile_active) {
-			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-							    false);
-			if (r)
-				dev_warn(adev->dev, "(%d) failed to disable video power profile mode\n", r);
-			adev->vcn.workload_profile_active = false;
-		}
-		mutex_unlock(&adev->vcn.workload_profile_mutex);
+		amdgpu_vcn_put_profile(adev);
+
 	} else {
 		schedule_delayed_work(&vcn_inst->idle_work, VCN_IDLE_TIMEOUT);
 	}
@@ -464,30 +509,11 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_vcn_inst *vcn_inst = &adev->vcn.inst[ring->me];
-	int r = 0;
 
 	atomic_inc(&vcn_inst->total_submission_cnt);
 
 	cancel_delayed_work_sync(&vcn_inst->idle_work);
 
-	/* We can safely return early here because we've cancelled the
-	 * the delayed work so there is no one else to set it to false
-	 * and we don't care if someone else sets it to true.
-	 */
-	if (adev->vcn.workload_profile_active)
-		goto pg_lock;
-
-	mutex_lock(&adev->vcn.workload_profile_mutex);
-	if (!adev->vcn.workload_profile_active) {
-		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-						    true);
-		if (r)
-			dev_warn(adev->dev, "(%d) failed to switch to video power profile mode\n", r);
-		adev->vcn.workload_profile_active = true;
-	}
-	mutex_unlock(&adev->vcn.workload_profile_mutex);
-
-pg_lock:
 	mutex_lock(&vcn_inst->vcn_pg_lock);
 	vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
 
@@ -515,6 +541,7 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 		vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
 	}
 	mutex_unlock(&vcn_inst->vcn_pg_lock);
+	amdgpu_vcn_get_profile(adev);
 }
 
 void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index b3fb1d0e43fc..c4dec986926f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -362,7 +362,7 @@ struct amdgpu_vcn {
 	bool			per_inst_fw;
 	unsigned		fw_version;
 
-	bool			workload_profile_active;
+	bool workload_profile_active;
 	struct mutex            workload_profile_mutex;
 	u32 reg_count;
 	const struct amdgpu_hwip_reg_entry *reg_list;
@@ -565,4 +565,7 @@ int amdgpu_vcn_reg_dump_init(struct amdgpu_device *adev,
 			     const struct amdgpu_hwip_reg_entry *reg, u32 count);
 void amdgpu_vcn_dump_ip_state(struct amdgpu_ip_block *ip_block);
 void amdgpu_vcn_print_ip_state(struct amdgpu_ip_block *ip_block, struct drm_printer *p);
+int amdgpu_vcn_get_profile(struct amdgpu_device *adev);
+int amdgpu_vcn_put_profile(struct amdgpu_device *adev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index 3a7c137a83ef..904b94bc8693 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -116,7 +116,6 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
 	struct amdgpu_device *adev = vcn_inst->adev;
 	unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
 	unsigned int i, j;
-	int r = 0;
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
 		struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
@@ -149,15 +148,7 @@ static void vcn_v2_5_idle_work_handler(struct work_struct *work)
 	if (!fences && !atomic_read(&adev->vcn.inst[0].total_submission_cnt)) {
 		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 						       AMD_PG_STATE_GATE);
-		mutex_lock(&adev->vcn.workload_profile_mutex);
-		if (adev->vcn.workload_profile_active) {
-			r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-							    false);
-			if (r)
-				dev_warn(adev->dev, "(%d) failed to disable video power profile mode\n", r);
-			adev->vcn.workload_profile_active = false;
-		}
-		mutex_unlock(&adev->vcn.workload_profile_mutex);
+		amdgpu_vcn_put_profile(adev);
 	} else {
 		schedule_delayed_work(&adev->vcn.inst[0].idle_work, VCN_IDLE_TIMEOUT);
 	}
@@ -167,7 +158,6 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_vcn_inst *v = &adev->vcn.inst[ring->me];
-	int r = 0;
 
 	atomic_inc(&adev->vcn.inst[0].total_submission_cnt);
 
@@ -177,20 +167,6 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
 	 * the delayed work so there is no one else to set it to false
 	 * and we don't care if someone else sets it to true.
 	 */
-	if (adev->vcn.workload_profile_active)
-		goto pg_lock;
-
-	mutex_lock(&adev->vcn.workload_profile_mutex);
-	if (!adev->vcn.workload_profile_active) {
-		r = amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_VIDEO,
-						    true);
-		if (r)
-			dev_warn(adev->dev, "(%d) failed to switch to video power profile mode\n", r);
-		adev->vcn.workload_profile_active = true;
-	}
-	mutex_unlock(&adev->vcn.workload_profile_mutex);
-
-pg_lock:
 	mutex_lock(&adev->vcn.inst[0].vcn_pg_lock);
 	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 					       AMD_PG_STATE_UNGATE);
@@ -218,6 +194,7 @@ static void vcn_v2_5_ring_begin_use(struct amdgpu_ring *ring)
 		v->pause_dpg_mode(v, &new_state);
 	}
 	mutex_unlock(&adev->vcn.inst[0].vcn_pg_lock);
+	amdgpu_vcn_get_profile(adev);
 }
 
 static void vcn_v2_5_ring_end_use(struct amdgpu_ring *ring)
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 12:54                                   ` Lazar, Lijo
@ 2025-08-14 17:44                                     ` David Wu
  2025-08-14 18:06                                       ` Lazar, Lijo
  0 siblings, 1 reply; 36+ messages in thread
From: David Wu @ 2025-08-14 17:44 UTC (permalink / raw)
  To: Lazar, Lijo, Sundararaju, Sathishkumar, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

amdgpu_vcn_idle_work_handler():
     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
----------- could it be possible a context switch here to 
amdgpu_vcn_ring_begin_use()?
  if it could then AMD_PG_STATE_GATE will be set by mistake.

David

On 2025-08-14 08:54, Lazar, Lijo wrote:
> [Public]
>
> The request profile can be moved outside the pg_lock in begin_use as in the attached patch. It needs  set power state -> set profile order.
>
> This is the premise -
>
> Let's say there are two threads, begin_use thread and idle_work threads. begin_use and idle_work will need the workprofile mutex to request a profile.
>
> Case 1) Idle thread gets the lock first.
>          a) Idle thread sees vinst power state as PG_UNGATE, no harm done. It exits without requesting power profile change. begin_use thread gets the lock next, it sees profile as active and continues.
>          b) Idle thread sees vinst power state as PG_GATE, it will make workprofile_active to false and exit. Now when begin_use thread gets the mutex next, it's guaranteed to see the workprofile_active as false, hence it will request the profile.
>
> Case 2) begin_use thread gets the lock first.
>          a) Workload profile is active, hence it doesn't do anything and exits. The change made by begin_use thread to vinst power state (state = on) will now be visible to idle thread which gets the lock next. It will do nothing and exit.
>          b) Workload profile is inactive, hence it requests a profile change. Again, the change made by begin_use thread to vinst power state will now be visible to idle thread which gets the lock next. It will do nothing and exit.
>
> Thanks,
> Lijo
>
> -----Original Message-----
> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> Sent: Thursday, August 14, 2025 6:18 PM
> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
>
>
> On 8/14/2025 5:33 PM, Lazar, Lijo wrote:
>> [Public]
>>
>> There is no need for nested lock. It only needs to follow the order
>>           set instance power_state
>>           set profile (this takes a global lock and hence instance power state will be visible to whichever thread that gets the work profile lock).
>>
>> You are seeing nested lock just because I added the code just after power state setting.
> Pasting your code from the file for ref :
>
> @@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring
> *ring)
>
> -pg_lock:
>
>        mutex_lock(&vcn_inst->vcn_pg_lock);
>        vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>
> +   amdgpu_vcn_get_profile(adev);
>
> vcn_pg_lock isn't  released here yet right ? And in-case you intend to only order the locks, then still there is an un-necessary OFF followed by ON, but yes that is acceptable,
>
> May be you want to move that vcn_pg_lock after amdgpu_vcn_get_profile to protect concurrent dpg_state access in begin_use.
>
> The concern is, this patch access power_state that is protected by some other mutex lock hoping it reflects right values also when holding powerprofile_lock.
>
> Or
>
> Have shared a patch with global workload_profile_mutex that simplifies this handling, and renamed pg_lock -> dpg_lock  and used
>
> that only for dpg_state changes per instance.
>
> Regards,
>
> Sathish
>
>> Thanks,
>> Lijo
>>
>> -----Original Message-----
>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>> Sent: Thursday, August 14, 2025 5:23 PM
>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>> <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> (v3)
>>
>>
>> On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
>>> [Public]
>>>
>>> I see your point now. Attached should work, I guess. Is the concern more about having to take the lock for every begin?
>> This is closer,  but the thing is, IMO we shouldn't have to use 2 locks and go into nested locking, we can do with just one global lock.
>>
>> Power_state of each instance, and global workload_profile_active are
>> inter-related, they need to be guarded together,
>>
>> nested could work , but why nested if single lock is enough ? nested complicates it.
>>
>> Regards,
>>
>> Sathish
>>
>>> Thanks,
>>> Lijo
>>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Lazar, Lijo
>>> Sent: Thursday, August 14, 2025 2:55 PM
>>> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>;
>>> Alex Deucher <alexdeucher@gmail.com>
>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>>> (v3)
>>>
>>> [Public]
>>>
>>> That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.
>>>
>>> Thanks,
>>> Lijo
>>> -----Original Message-----
>>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>>> Sent: Thursday, August 14, 2025 2:41 PM
>>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>>> <alexdeucher@gmail.com>
>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>>> (v3)
>>>
>>> Hi Lijo,
>>>
>>> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
>>>> [Public]
>>>>
>>>> We already have a per instance power state that can be tracked. What about something like attached?
>>> This also has concurrent access of the power state ,
>>> vcn.inst[i].cur_state is not protected by workload_profile_mutex
>>>
>>> every where, it can still change while you are holding workload_profile_mutex and checking it.
>>>
>>> Regards,
>>>
>>> Sathish
>>>
>>>> Thanks,
>>>> Lijo
>>>> -----Original Message-----
>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>> Sundararaju, Sathishkumar
>>>> Sent: Thursday, August 14, 2025 4:43 AM
>>>> To: Alex Deucher <alexdeucher@gmail.com>
>>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race
>>>> condition
>>>> (v3)
>>>>
>>>>
>>>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>>>> Hi Alex, Hi David,
>>>>>>>>>>
>>>>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>>>>> problem, neither the current form , reason :-
>>>>>>>>>>
>>>>>>>>>> The emitted fence count and total submission count are fast
>>>>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>>>>> switch of video power profile, in the current form of patch
>>>>>>>>>> that window is minimized, but still can happen if stress
>>>>>>>>>> tested. But power state of any instance becoming zero
>>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>>>> Consider this situation, inst0 and inst1 actively decoding,
>>>>>>>> inst0 decode completes, delayed idle work starts.
>>>>>>>> inst0 idle handler can read 0 total fences and 0 total
>>>>>>>> submission count, even if inst1 is actively decoding, that's between the jobs,
>>>>>>>>         - as begin_use increaments vcn.total_submission_cnt and
>>>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>>>>         - if outstanding fences are cleared and no new emitted
>>>>>>>> fence, between jobs , can be 0.
>>>>>>>>         - both of the above conditions do not mean video decode
>>>>>>>> is complete on inst1, it is actively decoding.
>>>>>>> How can there be active decoding without an outstanding fence?
>>>>>>> In that case, total_fences (fences from both instances) would be non-0.
>>>>>> I mean on inst1 the job scheduled is already complete, so 0
>>>>>> outstanding fences, newer job is yet to be scheduled
>>>>>>
>>>>>> and commited to ring (inst1) , this doesn't mean decode has
>>>>>> stopped on
>>>>>> inst1 right (I am saying if timing of inst0 idle work coincides
>>>>>> with this),
>>>>>>
>>>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>>>>> correct my understanding here.
>>>>> The flow looks like:
>>>>>
>>>>> begin_use(inst)
>>>>> emit_fence(inst)
>>>>> end_use(inst)
>>>>>
>>>>> ...later
>>>>> fence signals
>>>>> ...later
>>>>> work handler
>>>>>
>>>>> In begin_use we increment the global and per instance submission.
>>>>> This protects the power gating and profile until end_use.  In end
>>>>> use we decrement the submissions because we don't need to protect
>>>>> anything any more as we have the fence that was submitted via the
>>>>> ring.  That fence won't signal until the job is complete.
>>>> Is a next begin_use always guaranteed to be run before current job fence signals ?
>>>>
>>>> if not then both total submission and total fence are zero , example
>>>> delayed job/packet submissions
>>>>
>>>> from user space, or next job schedule happens after current job fence signals.
>>>>
>>>> if this is never possible then (v3) is perfect.
>>>>
>>>> Regards,
>>>>
>>>> Sathish
>>>>
>>>>> For power gating, we
>>>>> only care about the submission count and fences for that instance,
>>>>> for the profile, we care about submission count and fences all instances.
>>>>> Once the fences have signalled, the outstanding fences will be 0
>>>>> and there won't be any active work.
>>>>>
>>>>> Alex
>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> Sathish
>>>>>>
>>>>>>> Alex
>>>>>>>
>>>>>>>> Whereas if instances are powered off we are sure idle time is
>>>>>>>> past and it is powered off, no possible way of active video
>>>>>>>> decode, when all instances are off we can safely assume no
>>>>>>>> active decode and global lock protects it against new begin_use on any instance.
>>>>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>>>>> but we are already having a global workprofile mutex , so there
>>>>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>>>>> for all instances.
>>>>>>>>
>>>>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>>>>> decide the right method. If you think outstanding total fences
>>>>>>>> can never be 0 during decode, then your previous version (v3)
>>>>>>>> itself is good, there is no real benefit of splitting the handlers as such.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Sathish
>>>>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>>>>> profile and powergating into separate handlers.  The profile
>>>>>>>>> one would be global and the powergating one would be per instance.
>>>>>>>>> See the attached patches.
>>>>>>>>>
>>>>>>>>> Alex
>>>>>>>>>
>>>>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>>>>> mistake in my patch was using per instance mutex, I should
>>>>>>>>>> have used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>>>>
>>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>>
>>>>>>>>>> Sathish
>>>>>>>>>>
>>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>>>
>>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>>>>> thoroughly)
>>>>>>>>>>>>>           - up to you to decide.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>> David
>>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>>>>> end up switching the video profile while another instance
>>>>>>>>>>>>>> is active because we only take into account the current
>>>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences
>>>>>>>>>>>>>> for the video profile.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>>>>> profile
>>>>>>>>>>>>>> handling")
>>>>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>>>           2 files changed, 21 insertions(+), 20
>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>>>               struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>>>                       container_of(work, struct
>>>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>>>>               struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>>>               int r = 0;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 <<
>>>>>>>>>>>>>> + vcn_inst->inst))
>>>>>>>>>>>>>>                       return;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>>>> +     }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               /* Only set DPG pause for VCN3 or below, VCN4
>>>>>>>>>>>>>> and above will be handled by FW */
>>>>>>>>>>>>>>               if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>>>                       struct dpg_pause_state new_state;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>                       if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct
>>>>>>>>>>>>>> work_struct
>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>                       else
>>>>>>>>>>>>>>                               new_state.fw_based =
>>>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>>>>> + &new_state);
>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>>>                       mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>>                       vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>>>                       mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>>>                               r =
>>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
>>>>>>>>>>>>>>                               if (r) @@ -467,16 +473,10 @@
>>>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>>               int r = 0;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>>>> -      */
>>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>> move to here:
>>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>>>> David
>>>>>>>>>>>> Alex
>>>>>>>>>>>>
>>>>>>>>>>>>> David
>>>>>>>>>>>>>>               if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>>                       r =
>>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
>>>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>>>               mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>>               vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>>>>> *ring)
>>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submiss
>>>>>>>>>>>>>> i
>>>>>>>>>>>>>> o
>>>>>>>>>>>>>> n
>>>>>>>>>>>>>> _cnt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submissio
>>>>>>>>>>>>>> n
>>>>>>>>>>>>>> _
>>>>>>>>>>>>>> c
>>>>>>>>>>>>>> nt);
>>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>>>>                                     VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               uint16_t inst_mask;
>>>>>>>>>>>>>>               uint8_t num_inst_per_aid;
>>>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               /* IP reg dump */
>>>>>>>>>>>>>>               uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 17:44                                     ` David Wu
@ 2025-08-14 18:06                                       ` Lazar, Lijo
  2025-08-14 18:42                                         ` David Wu
  0 siblings, 1 reply; 36+ messages in thread
From: Lazar, Lijo @ 2025-08-14 18:06 UTC (permalink / raw)
  To: Wu, David, Sundararaju, Sathishkumar, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

[-- Attachment #1: Type: text/plain, Size: 22857 bytes --]

[AMD Official Use Only - AMD Internal Distribution Only]

I didn't fully understand the question.

For the same instance, begin_thread will set the power state only after cancelling any idle worker for the instance. If idle worker is already under progress, then it needs to complete as that's a cancel_sync (it's the existing logic).

Basically, by the time begin_thread sets the PG state, no idle worker for the same vcn instance would be active. If it's about context switch to another vcn instance's begin_thread, I think that won't be a problem.

Thanks,
Lijo
________________________________
From: Wu, David <David.Wu3@amd.com>
Sent: Thursday, August 14, 2025 11:14:26 PM
To: Lazar, Lijo <Lijo.Lazar@amd.com>; Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>; Alex Deucher <alexdeucher@gmail.com>
Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)

amdgpu_vcn_idle_work_handler():
     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
----------- could it be possible a context switch here to
amdgpu_vcn_ring_begin_use()?
  if it could then AMD_PG_STATE_GATE will be set by mistake.

David

On 2025-08-14 08:54, Lazar, Lijo wrote:
> [Public]
>
> The request profile can be moved outside the pg_lock in begin_use as in the attached patch. It needs  set power state -> set profile order.
>
> This is the premise -
>
> Let's say there are two threads, begin_use thread and idle_work threads. begin_use and idle_work will need the workprofile mutex to request a profile.
>
> Case 1) Idle thread gets the lock first.
>          a) Idle thread sees vinst power state as PG_UNGATE, no harm done. It exits without requesting power profile change. begin_use thread gets the lock next, it sees profile as active and continues.
>          b) Idle thread sees vinst power state as PG_GATE, it will make workprofile_active to false and exit. Now when begin_use thread gets the mutex next, it's guaranteed to see the workprofile_active as false, hence it will request the profile.
>
> Case 2) begin_use thread gets the lock first.
>          a) Workload profile is active, hence it doesn't do anything and exits. The change made by begin_use thread to vinst power state (state = on) will now be visible to idle thread which gets the lock next. It will do nothing and exit.
>          b) Workload profile is inactive, hence it requests a profile change. Again, the change made by begin_use thread to vinst power state will now be visible to idle thread which gets the lock next. It will do nothing and exit.
>
> Thanks,
> Lijo
>
> -----Original Message-----
> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> Sent: Thursday, August 14, 2025 6:18 PM
> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
>
>
> On 8/14/2025 5:33 PM, Lazar, Lijo wrote:
>> [Public]
>>
>> There is no need for nested lock. It only needs to follow the order
>>           set instance power_state
>>           set profile (this takes a global lock and hence instance power state will be visible to whichever thread that gets the work profile lock).
>>
>> You are seeing nested lock just because I added the code just after power state setting.
> Pasting your code from the file for ref :
>
> @@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring
> *ring)
>
> -pg_lock:
>
>        mutex_lock(&vcn_inst->vcn_pg_lock);
>        vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>
> +   amdgpu_vcn_get_profile(adev);
>
> vcn_pg_lock isn't  released here yet right ? And in-case you intend to only order the locks, then still there is an un-necessary OFF followed by ON, but yes that is acceptable,
>
> May be you want to move that vcn_pg_lock after amdgpu_vcn_get_profile to protect concurrent dpg_state access in begin_use.
>
> The concern is, this patch access power_state that is protected by some other mutex lock hoping it reflects right values also when holding powerprofile_lock.
>
> Or
>
> Have shared a patch with global workload_profile_mutex that simplifies this handling, and renamed pg_lock -> dpg_lock  and used
>
> that only for dpg_state changes per instance.
>
> Regards,
>
> Sathish
>
>> Thanks,
>> Lijo
>>
>> -----Original Message-----
>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>> Sent: Thursday, August 14, 2025 5:23 PM
>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>> <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> (v3)
>>
>>
>> On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
>>> [Public]
>>>
>>> I see your point now. Attached should work, I guess. Is the concern more about having to take the lock for every begin?
>> This is closer,  but the thing is, IMO we shouldn't have to use 2 locks and go into nested locking, we can do with just one global lock.
>>
>> Power_state of each instance, and global workload_profile_active are
>> inter-related, they need to be guarded together,
>>
>> nested could work , but why nested if single lock is enough ? nested complicates it.
>>
>> Regards,
>>
>> Sathish
>>
>>> Thanks,
>>> Lijo
>>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Lazar, Lijo
>>> Sent: Thursday, August 14, 2025 2:55 PM
>>> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>;
>>> Alex Deucher <alexdeucher@gmail.com>
>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>>> (v3)
>>>
>>> [Public]
>>>
>>> That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.
>>>
>>> Thanks,
>>> Lijo
>>> -----Original Message-----
>>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>>> Sent: Thursday, August 14, 2025 2:41 PM
>>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>>> <alexdeucher@gmail.com>
>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>>> (v3)
>>>
>>> Hi Lijo,
>>>
>>> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
>>>> [Public]
>>>>
>>>> We already have a per instance power state that can be tracked. What about something like attached?
>>> This also has concurrent access of the power state ,
>>> vcn.inst[i].cur_state is not protected by workload_profile_mutex
>>>
>>> every where, it can still change while you are holding workload_profile_mutex and checking it.
>>>
>>> Regards,
>>>
>>> Sathish
>>>
>>>> Thanks,
>>>> Lijo
>>>> -----Original Message-----
>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>> Sundararaju, Sathishkumar
>>>> Sent: Thursday, August 14, 2025 4:43 AM
>>>> To: Alex Deucher <alexdeucher@gmail.com>
>>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race
>>>> condition
>>>> (v3)
>>>>
>>>>
>>>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>>>> Hi Alex, Hi David,
>>>>>>>>>>
>>>>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>>>>> problem, neither the current form , reason :-
>>>>>>>>>>
>>>>>>>>>> The emitted fence count and total submission count are fast
>>>>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>>>>> switch of video power profile, in the current form of patch
>>>>>>>>>> that window is minimized, but still can happen if stress
>>>>>>>>>> tested. But power state of any instance becoming zero
>>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>>>> Consider this situation, inst0 and inst1 actively decoding,
>>>>>>>> inst0 decode completes, delayed idle work starts.
>>>>>>>> inst0 idle handler can read 0 total fences and 0 total
>>>>>>>> submission count, even if inst1 is actively decoding, that's between the jobs,
>>>>>>>>         - as begin_use increaments vcn.total_submission_cnt and
>>>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>>>>         - if outstanding fences are cleared and no new emitted
>>>>>>>> fence, between jobs , can be 0.
>>>>>>>>         - both of the above conditions do not mean video decode
>>>>>>>> is complete on inst1, it is actively decoding.
>>>>>>> How can there be active decoding without an outstanding fence?
>>>>>>> In that case, total_fences (fences from both instances) would be non-0.
>>>>>> I mean on inst1 the job scheduled is already complete, so 0
>>>>>> outstanding fences, newer job is yet to be scheduled
>>>>>>
>>>>>> and commited to ring (inst1) , this doesn't mean decode has
>>>>>> stopped on
>>>>>> inst1 right (I am saying if timing of inst0 idle work coincides
>>>>>> with this),
>>>>>>
>>>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>>>>> correct my understanding here.
>>>>> The flow looks like:
>>>>>
>>>>> begin_use(inst)
>>>>> emit_fence(inst)
>>>>> end_use(inst)
>>>>>
>>>>> ...later
>>>>> fence signals
>>>>> ...later
>>>>> work handler
>>>>>
>>>>> In begin_use we increment the global and per instance submission.
>>>>> This protects the power gating and profile until end_use.  In end
>>>>> use we decrement the submissions because we don't need to protect
>>>>> anything any more as we have the fence that was submitted via the
>>>>> ring.  That fence won't signal until the job is complete.
>>>> Is a next begin_use always guaranteed to be run before current job fence signals ?
>>>>
>>>> if not then both total submission and total fence are zero , example
>>>> delayed job/packet submissions
>>>>
>>>> from user space, or next job schedule happens after current job fence signals.
>>>>
>>>> if this is never possible then (v3) is perfect.
>>>>
>>>> Regards,
>>>>
>>>> Sathish
>>>>
>>>>> For power gating, we
>>>>> only care about the submission count and fences for that instance,
>>>>> for the profile, we care about submission count and fences all instances.
>>>>> Once the fences have signalled, the outstanding fences will be 0
>>>>> and there won't be any active work.
>>>>>
>>>>> Alex
>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> Sathish
>>>>>>
>>>>>>> Alex
>>>>>>>
>>>>>>>> Whereas if instances are powered off we are sure idle time is
>>>>>>>> past and it is powered off, no possible way of active video
>>>>>>>> decode, when all instances are off we can safely assume no
>>>>>>>> active decode and global lock protects it against new begin_use on any instance.
>>>>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>>>>> but we are already having a global workprofile mutex , so there
>>>>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>>>>> for all instances.
>>>>>>>>
>>>>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>>>>> decide the right method. If you think outstanding total fences
>>>>>>>> can never be 0 during decode, then your previous version (v3)
>>>>>>>> itself is good, there is no real benefit of splitting the handlers as such.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Sathish
>>>>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>>>>> profile and powergating into separate handlers.  The profile
>>>>>>>>> one would be global and the powergating one would be per instance.
>>>>>>>>> See the attached patches.
>>>>>>>>>
>>>>>>>>> Alex
>>>>>>>>>
>>>>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>>>>> mistake in my patch was using per instance mutex, I should
>>>>>>>>>> have used a common global mutex, then that covers the situation David is trying to bring out.
>>>>>>>>>>
>>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>>>>> flags to track power state could help us totally avoid this situation.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>>
>>>>>>>>>> Sathish
>>>>>>>>>>
>>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>>>
>>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>>>>> thoroughly)
>>>>>>>>>>>>>           - up to you to decide.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>> David
>>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>>>>> end up switching the video profile while another instance
>>>>>>>>>>>>>> is active because we only take into account the current
>>>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences
>>>>>>>>>>>>>> for the video profile.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>>>>> profile
>>>>>>>>>>>>>> handling")
>>>>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>>>           2 files changed, 21 insertions(+), 20
>>>>>>>>>>>>>> deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>>>               struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>>>                       container_of(work, struct
>>>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>>>>               struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>>>               int r = 0;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 <<
>>>>>>>>>>>>>> + vcn_inst->inst))
>>>>>>>>>>>>>>                       return;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>>>> +     }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               /* Only set DPG pause for VCN3 or below, VCN4
>>>>>>>>>>>>>> and above will be handled by FW */
>>>>>>>>>>>>>>               if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>>>                       struct dpg_pause_state new_state;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>                       if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct
>>>>>>>>>>>>>> work_struct
>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>                       else
>>>>>>>>>>>>>>                               new_state.fw_based =
>>>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>>>>> + &new_state);
>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>>>                       mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>>                       vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>>>                       mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>>>                               r =
>>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
>>>>>>>>>>>>>>                               if (r) @@ -467,16 +473,10 @@
>>>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>>               int r = 0;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>>>> -      */
>>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>> move to here:
>>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>>>> David
>>>>>>>>>>>> Alex
>>>>>>>>>>>>
>>>>>>>>>>>>> David
>>>>>>>>>>>>>>               if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>>                       r =
>>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
>>>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>>               }
>>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>>>               mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>>               vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>>>>> *ring)
>>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submiss
>>>>>>>>>>>>>> i
>>>>>>>>>>>>>> o
>>>>>>>>>>>>>> n
>>>>>>>>>>>>>> _cnt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submissio
>>>>>>>>>>>>>> n
>>>>>>>>>>>>>> _
>>>>>>>>>>>>>> c
>>>>>>>>>>>>>> nt);
>>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>>>>                                     VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               uint16_t inst_mask;
>>>>>>>>>>>>>>               uint8_t num_inst_per_aid;
>>>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>               /* IP reg dump */
>>>>>>>>>>>>>>               uint32_t                *ip_dump;

[-- Attachment #2: Type: text/html, Size: 43143 bytes --]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 18:06                                       ` Lazar, Lijo
@ 2025-08-14 18:42                                         ` David Wu
  2025-08-18 19:22                                           ` David Wu
  0 siblings, 1 reply; 36+ messages in thread
From: David Wu @ 2025-08-14 18:42 UTC (permalink / raw)
  To: Lazar, Lijo, Wu, David, Sundararaju, Sathishkumar, Alex Deucher
  Cc: Deucher, Alexander, amd-gfx@lists.freedesktop.org

[-- Attachment #1: Type: text/plain, Size: 24113 bytes --]

hmm.. it is my concern for the same instance. but I got it now. Your 
patch is good.
Thanks,
David
On 2025-08-14 14:06, Lazar, Lijo wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> I didn't fully understand the question.
>
> For the same instance, begin_thread will set the power state only 
> after cancelling any idle worker for the instance. If idle worker is 
> already under progress, then it needs to complete as that's a 
> cancel_sync (it's the existing logic).
>
> Basically, by the time begin_thread sets the PG state, no idle worker 
> for the same vcn instance would be active. If it's about context 
> switch to another vcn instance's begin_thread, I think that won't be a 
> problem.
>
> Thanks,
> Lijo
> ------------------------------------------------------------------------
> *From:* Wu, David <David.Wu3@amd.com>
> *Sent:* Thursday, August 14, 2025 11:14:26 PM
> *To:* Lazar, Lijo <Lijo.Lazar@amd.com>; Sundararaju, Sathishkumar 
> <Sathishkumar.Sundararaju@amd.com>; Alex Deucher <alexdeucher@gmail.com>
> *Cc:* Wu, David <David.Wu3@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org 
> <amd-gfx@lists.freedesktop.org>
> *Subject:* Re: [PATCH] drm/amdgpu/vcn: fix video profile race 
> condition (v3)
> amdgpu_vcn_idle_work_handler():
>      if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> ----------- could it be possible a context switch here to
> amdgpu_vcn_ring_begin_use()?
>   if it could then AMD_PG_STATE_GATE will be set by mistake.
>
> David
>
> On 2025-08-14 08:54, Lazar, Lijo wrote:
> > [Public]
> >
> > The request profile can be moved outside the pg_lock in begin_use as 
> in the attached patch. It needs set power state -> set profile order.
> >
> > This is the premise -
> >
> > Let's say there are two threads, begin_use thread and idle_work 
> threads. begin_use and idle_work will need the workprofile mutex to 
> request a profile.
> >
> > Case 1) Idle thread gets the lock first.
> >          a) Idle thread sees vinst power state as PG_UNGATE, no harm 
> done. It exits without requesting power profile change. begin_use 
> thread gets the lock next, it sees profile as active and continues.
> >          b) Idle thread sees vinst power state as PG_GATE, it will 
> make workprofile_active to false and exit. Now when begin_use thread 
> gets the mutex next, it's guaranteed to see the workprofile_active as 
> false, hence it will request the profile.
> >
> > Case 2) begin_use thread gets the lock first.
> >          a) Workload profile is active, hence it doesn't do anything 
> and exits. The change made by begin_use thread to vinst power state 
> (state = on) will now be visible to idle thread which gets the lock 
> next. It will do nothing and exit.
> >          b) Workload profile is inactive, hence it requests a 
> profile change. Again, the change made by begin_use thread to vinst 
> power state will now be visible to idle thread which gets the lock 
> next. It will do nothing and exit.
> >
> > Thanks,
> > Lijo
> >
> > -----Original Message-----
> > From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> > Sent: Thursday, August 14, 2025 6:18 PM
> > To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher 
> <alexdeucher@gmail.com>
> > Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander 
> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> > Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race 
> condition (v3)
> >
> >
> > On 8/14/2025 5:33 PM, Lazar, Lijo wrote:
> >> [Public]
> >>
> >> There is no need for nested lock. It only needs to follow the order
> >>           set instance power_state
> >>           set profile (this takes a global lock and hence instance 
> power state will be visible to whichever thread that gets the work 
> profile lock).
> >>
> >> You are seeing nested lock just because I added the code just after 
> power state setting.
> > Pasting your code from the file for ref :
> >
> > @@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring
> > *ring)
> >
> > -pg_lock:
> >
> >        mutex_lock(&vcn_inst->vcn_pg_lock);
> >        vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >
> > +   amdgpu_vcn_get_profile(adev);
> >
> > vcn_pg_lock isn't  released here yet right ? And in-case you intend 
> to only order the locks, then still there is an un-necessary OFF 
> followed by ON, but yes that is acceptable,
> >
> > May be you want to move that vcn_pg_lock after 
> amdgpu_vcn_get_profile to protect concurrent dpg_state access in 
> begin_use.
> >
> > The concern is, this patch access power_state that is protected by 
> some other mutex lock hoping it reflects right values also when 
> holding powerprofile_lock.
> >
> > Or
> >
> > Have shared a patch with global workload_profile_mutex that 
> simplifies this handling, and renamed pg_lock -> dpg_lock  and used
> >
> > that only for dpg_state changes per instance.
> >
> > Regards,
> >
> > Sathish
> >
> >> Thanks,
> >> Lijo
> >>
> >> -----Original Message-----
> >> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> >> Sent: Thursday, August 14, 2025 5:23 PM
> >> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
> >> <alexdeucher@gmail.com>
> >> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> >> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> >> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> >> (v3)
> >>
> >>
> >> On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
> >>> [Public]
> >>>
> >>> I see your point now. Attached should work, I guess. Is the 
> concern more about having to take the lock for every begin?
> >> This is closer,  but the thing is, IMO we shouldn't have to use 2 
> locks and go into nested locking, we can do with just one global lock.
> >>
> >> Power_state of each instance, and global workload_profile_active are
> >> inter-related, they need to be guarded together,
> >>
> >> nested could work , but why nested if single lock is enough ? 
> nested complicates it.
> >>
> >> Regards,
> >>
> >> Sathish
> >>
> >>> Thanks,
> >>> Lijo
> >>>
> >>> -----Original Message-----
> >>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> >>> Lazar, Lijo
> >>> Sent: Thursday, August 14, 2025 2:55 PM
> >>> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>;
> >>> Alex Deucher <alexdeucher@gmail.com>
> >>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> >>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> >>> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> >>> (v3)
> >>>
> >>> [Public]
> >>>
> >>> That is not required I think. The power profile is set by an 
> instance *after* setting itself to power on. Also, it's switched back 
> after changing its power state to off.  If idle worker is run by 
> another instance, it won't be seeing the inst0 as power gated and 
> won't change power profile.
> >>>
> >>> Thanks,
> >>> Lijo
> >>> -----Original Message-----
> >>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> >>> Sent: Thursday, August 14, 2025 2:41 PM
> >>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
> >>> <alexdeucher@gmail.com>
> >>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> >>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> >>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> >>> (v3)
> >>>
> >>> Hi Lijo,
> >>>
> >>> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
> >>>> [Public]
> >>>>
> >>>> We already have a per instance power state that can be tracked. 
> What about something like attached?
> >>> This also has concurrent access of the power state ,
> >>> vcn.inst[i].cur_state is not protected by workload_profile_mutex
> >>>
> >>> every where, it can still change while you are holding 
> workload_profile_mutex and checking it.
> >>>
> >>> Regards,
> >>>
> >>> Sathish
> >>>
> >>>> Thanks,
> >>>> Lijo
> >>>> -----Original Message-----
> >>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> >>>> Sundararaju, Sathishkumar
> >>>> Sent: Thursday, August 14, 2025 4:43 AM
> >>>> To: Alex Deucher <alexdeucher@gmail.com>
> >>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> >>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> >>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race
> >>>> condition
> >>>> (v3)
> >>>>
> >>>>
> >>>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
> >>>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
> >>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
> >>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> >>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
> >>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> >>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>>>>>> Hi Alex, Hi David,
> >>>>>>>>>>
> >>>>>>>>>> I see David's concern but his suggestion yet wont solve the
> >>>>>>>>>> problem, neither the current form , reason :-
> >>>>>>>>>>
> >>>>>>>>>> The emitted fence count and total submission count are fast
> >>>>>>>>>> transients which frequently become 0 in between video decodes
> >>>>>>>>>> (between jobs) even with the atomics and locks there can be a
> >>>>>>>>>> switch of video power profile, in the current form of patch
> >>>>>>>>>> that window is minimized, but still can happen if stress
> >>>>>>>>>> tested. But power state of any instance becoming zero
> >>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
> >>>>>>>> Consider this situation, inst0 and inst1 actively decoding,
> >>>>>>>> inst0 decode completes, delayed idle work starts.
> >>>>>>>> inst0 idle handler can read 0 total fences and 0 total
> >>>>>>>> submission count, even if inst1 is actively decoding, that's 
> between the jobs,
> >>>>>>>>         - as begin_use increaments vcn.total_submission_cnt and
> >>>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
> >>>>>>>>         - if outstanding fences are cleared and no new emitted
> >>>>>>>> fence, between jobs , can be 0.
> >>>>>>>>         - both of the above conditions do not mean video decode
> >>>>>>>> is complete on inst1, it is actively decoding.
> >>>>>>> How can there be active decoding without an outstanding fence?
> >>>>>>> In that case, total_fences (fences from both instances) would 
> be non-0.
> >>>>>> I mean on inst1 the job scheduled is already complete, so 0
> >>>>>> outstanding fences, newer job is yet to be scheduled
> >>>>>>
> >>>>>> and commited to ring (inst1) , this doesn't mean decode has
> >>>>>> stopped on
> >>>>>> inst1 right (I am saying if timing of inst0 idle work coincides
> >>>>>> with this),
> >>>>>>
> >>>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
> >>>>>> correct my understanding here.
> >>>>> The flow looks like:
> >>>>>
> >>>>> begin_use(inst)
> >>>>> emit_fence(inst)
> >>>>> end_use(inst)
> >>>>>
> >>>>> ...later
> >>>>> fence signals
> >>>>> ...later
> >>>>> work handler
> >>>>>
> >>>>> In begin_use we increment the global and per instance submission.
> >>>>> This protects the power gating and profile until end_use.  In end
> >>>>> use we decrement the submissions because we don't need to protect
> >>>>> anything any more as we have the fence that was submitted via the
> >>>>> ring.  That fence won't signal until the job is complete.
> >>>> Is a next begin_use always guaranteed to be run before current 
> job fence signals ?
> >>>>
> >>>> if not then both total submission and total fence are zero , example
> >>>> delayed job/packet submissions
> >>>>
> >>>> from user space, or next job schedule happens after current job 
> fence signals.
> >>>>
> >>>> if this is never possible then (v3) is perfect.
> >>>>
> >>>> Regards,
> >>>>
> >>>> Sathish
> >>>>
> >>>>> For power gating, we
> >>>>> only care about the submission count and fences for that instance,
> >>>>> for the profile, we care about submission count and fences all 
> instances.
> >>>>> Once the fences have signalled, the outstanding fences will be 0
> >>>>> and there won't be any active work.
> >>>>>
> >>>>> Alex
> >>>>>
> >>>>>> Regards,
> >>>>>>
> >>>>>> Sathish
> >>>>>>
> >>>>>>> Alex
> >>>>>>>
> >>>>>>>> Whereas if instances are powered off we are sure idle time is
> >>>>>>>> past and it is powered off, no possible way of active video
> >>>>>>>> decode, when all instances are off we can safely assume no
> >>>>>>>> active decode and global lock protects it against new 
> begin_use on any instance.
> >>>>>>>> But the only distant concern is global common locks w.r.t perf,
> >>>>>>>> but we are already having a global workprofile mutex , so there
> >>>>>>>> shouldn't be any drop in perf, with just one single global lock
> >>>>>>>> for all instances.
> >>>>>>>>
> >>>>>>>> Just sending out a patch with this fix, will leave it to you to
> >>>>>>>> decide the right method. If you think outstanding total fences
> >>>>>>>> can never be 0 during decode, then your previous version (v3)
> >>>>>>>> itself is good, there is no real benefit of splitting the 
> handlers as such.
> >>>>>>>>
> >>>>>>>> Regards,
> >>>>>>>> Sathish
> >>>>>>>>> If it is possible, maybe it would be easier to just split the
> >>>>>>>>> profile and powergating into separate handlers.  The profile
> >>>>>>>>> one would be global and the powergating one would be per 
> instance.
> >>>>>>>>> See the attached patches.
> >>>>>>>>>
> >>>>>>>>> Alex
> >>>>>>>>>
> >>>>>>>>>> can be a sure shot indication of break in a video decode, the
> >>>>>>>>>> mistake in my patch was using per instance mutex, I should
> >>>>>>>>>> have used a common global mutex, then that covers the 
> situation David is trying to bring out.
> >>>>>>>>>>
> >>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
> >>>>>>>>>> flags to track power state could help us totally avoid this 
> situation.
> >>>>>>>>>>
> >>>>>>>>>> Regards,
> >>>>>>>>>>
> >>>>>>>>>> Sathish
> >>>>>>>>>>
> >>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
> >>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David 
> <davidwu2@amd.com> wrote:
> >>>>>>>>>>>>> Hi Alex,
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
> >>>>>>>>>>>>> it is unlikely to have a context switch right after the 
> begin_use().
> >>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
> >>>>>>>>>>>>> in case someone adds more before the lock and not reviewed
> >>>>>>>>>>>>> thoroughly)
> >>>>>>>>>>>>>           - up to you to decide.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Thanks,
> >>>>>>>>>>>>> David
> >>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
> >>>>>>>>>>>>>> end up switching the video profile while another instance
> >>>>>>>>>>>>>> is active because we only take into account the current
> >>>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences
> >>>>>>>>>>>>>> for the video profile.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> v2: drop early exit in begin_use()
> >>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
> >>>>>>>>>>>>>> profile
> >>>>>>>>>>>>>> handling")
> >>>>>>>>>>>>>> Reviewed-by: Sathishkumar S
> >>>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
> >>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>>>>>>>>>> ---
> >>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>>>>>>>>>>>> ++++++++++++-------------
> >>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>>>>>>>>>>>           2 files changed, 21 insertions(+), 20
> >>>>>>>>>>>>>> deletions(-)
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
> >>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>>>>>>>              struct amdgpu_vcn_inst *vcn_inst =
> >>>>>>>>>>>>>>                      container_of(work, struct
> >>>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
> >>>>>>>>>>>>>>              struct amdgpu_device *adev = vcn_inst->adev;
> >>>>>>>>>>>>>> -     unsigned int fences = 0, 
> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
> >>>>>>>>>>>>>> +     unsigned int total_fences = 0,
> >>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>>>>>> +     unsigned int i, j;
> >>>>>>>>>>>>>>              int r = 0;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 <<
> >>>>>>>>>>>>>> + vcn_inst->inst))
> >>>>>>>>>>>>>>                      return;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>>>>>>>>>>>> -             fence[i] +=
> >>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
> >>>>>>>>>>>>>> + &adev->vcn.inst[i];
> >>>>>>>>>>>>>> +
> >>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>>>>>>>>>>>> +                     fence[i] +=
> >>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>>>>>>>>>>>> +             fence[i] += 
> amdgpu_fence_count_emitted(&v->ring_dec);
> >>>>>>>>>>>>>> +             total_fences += fence[i];
> >>>>>>>>>>>>>> +     }
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>              /* Only set DPG pause for VCN3 or below, VCN4
> >>>>>>>>>>>>>> and above will be handled by FW */
> >>>>>>>>>>>>>>              if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
> >>>>>>>>>>>>>>                      struct dpg_pause_state new_state;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>                      if (fence[i] || @@ -436,18 +442,18 @@
> >>>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct
> >>>>>>>>>>>>>> work_struct
> >>>>>>>>>>>>>> *work)
> >>>>>>>>>>>>>>                      else
> >>>>>>>>>>>>>>                              new_state.fw_based =
> >>>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> - adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
> >>>>>>>>>>>>>> + &new_state);
> >>>>>>>>>>>>>>               }
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     fence[i] += 
> amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>>>>>>>>>>>> -     fences += fence[i];
> >>>>>>>>>>>>>> -
> >>>>>>>>>>>>>> -     if (!fences && 
> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
> >>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>>>>>> +             /* This is specific to this instance */
> >>>>>>>>>>>>>>                      mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>>                      vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>>>>>>>>>>>                      mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
> >>>>>>>>>>>>>> +             /* This is global and depends on all VCN 
> instances */
> >>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
> >>>>>>>>>>>>>> !total_fences &&
> >>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>>>>>>>>>>>                              r =
> >>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
> >>>>>>>>>>>>>>                              if (r) @@ -467,16 +473,10 @@
> >>>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
> >>>>>>>>>>>>>>              int r = 0;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>              atomic_inc(&vcn_inst->total_submission_cnt);
> >>>>>>>>>>>>>> + atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>> move this addition down inside the mutex lock
> >>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     /* We can safely return early here because we've 
> cancelled the
> >>>>>>>>>>>>>> -      * the delayed work so there is no one else to 
> set it to false
> >>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
> >>>>>>>>>>>>>> -      */
> >>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
> >>>>>>>>>>>>>> -             goto pg_lock;
> >>>>>>>>>>>>>> -
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>> move to here:
> >>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>> I think this should work for multiple instances.
> >>>>>>>>>>>> Why does this need to be protected by the mutex?
> >>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
> >>>>>>>>>>> David
> >>>>>>>>>>>> Alex
> >>>>>>>>>>>>
> >>>>>>>>>>>>> David
> >>>>>>>>>>>>>>              if (!adev->vcn.workload_profile_active) {
> >>>>>>>>>>>>>>                      r =
> >>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
> >>>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
> >>>>>>>>>>>>>>               }
> >>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -pg_lock:
> >>>>>>>>>>>>>>              mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>>              vcn_inst->set_pg_state(vcn_inst,
> >>>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
> >>>>>>>>>>>>>> amdgpu_ring
> >>>>>>>>>>>>>> *ring)
> >>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submiss
> >>>>>>>>>>>>>> i
> >>>>>>>>>>>>>> o
> >>>>>>>>>>>>>> n
> >>>>>>>>>>>>>> _cnt);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submissio
> >>>>>>>>>>>>>> n
> >>>>>>>>>>>>>> _
> >>>>>>>>>>>>>> c
> >>>>>>>>>>>>>> nt);
> >>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> 
> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>>>>>>>>>>>                                    VCN_IDLE_TIMEOUT); diff
> >>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>              uint16_t inst_mask;
> >>>>>>>>>>>>>>              uint8_t num_inst_per_aid;
> >>>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>              /* IP reg dump */
> >>>>>>>>>>>>>>              uint32_t                *ip_dump;

[-- Attachment #2: Type: text/html, Size: 59259 bytes --]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 18:42                                         ` David Wu
@ 2025-08-18 19:22                                           ` David Wu
  2025-08-18 19:31                                             ` Alex Deucher
  0 siblings, 1 reply; 36+ messages in thread
From: David Wu @ 2025-08-18 19:22 UTC (permalink / raw)
  To: Lazar, Lijo, Wu, David, Sundararaju, Sathishkumar, Alex Deucher
  Cc: Deucher, Alexander, amd-gfx@lists.freedesktop.org

[-- Attachment #1: Type: text/plain, Size: 24874 bytes --]

Hi Lijo and Alex,

I prefer Lijo's method as it should not cause unexpected power profile 
change for the active session.
Liji, could you make some adjustments for your patch such as removing 
extra blank line and function
declaration such as using "void" instead of "int" for the new functions 
(should they be static?)

Thanks,

David

On 2025-08-14 14:42, David Wu wrote:
> hmm.. it is my concern for the same instance. but I got it now. Your 
> patch is good.
> Thanks,
> David
> On 2025-08-14 14:06, Lazar, Lijo wrote:
>>
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>>
>> I didn't fully understand the question.
>>
>> For the same instance, begin_thread will set the power state only 
>> after cancelling any idle worker for the instance. If idle worker is 
>> already under progress, then it needs to complete as that's a 
>> cancel_sync (it's the existing logic).
>>
>> Basically, by the time begin_thread sets the PG state, no idle worker 
>> for the same vcn instance would be active. If it's about context 
>> switch to another vcn instance's begin_thread, I think that won't be 
>> a problem.
>>
>> Thanks,
>> Lijo
>> ------------------------------------------------------------------------
>> *From:* Wu, David <David.Wu3@amd.com>
>> *Sent:* Thursday, August 14, 2025 11:14:26 PM
>> *To:* Lazar, Lijo <Lijo.Lazar@amd.com>; Sundararaju, Sathishkumar 
>> <Sathishkumar.Sundararaju@amd.com>; Alex Deucher <alexdeucher@gmail.com>
>> *Cc:* Wu, David <David.Wu3@amd.com>; Deucher, Alexander 
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org 
>> <amd-gfx@lists.freedesktop.org>
>> *Subject:* Re: [PATCH] drm/amdgpu/vcn: fix video profile race 
>> condition (v3)
>> amdgpu_vcn_idle_work_handler():
>>      if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>> ----------- could it be possible a context switch here to
>> amdgpu_vcn_ring_begin_use()?
>>   if it could then AMD_PG_STATE_GATE will be set by mistake.
>>
>> David
>>
>> On 2025-08-14 08:54, Lazar, Lijo wrote:
>> > [Public]
>> >
>> > The request profile can be moved outside the pg_lock in begin_use 
>> as in the attached patch. It needs  set power state -> set profile order.
>> >
>> > This is the premise -
>> >
>> > Let's say there are two threads, begin_use thread and idle_work 
>> threads. begin_use and idle_work will need the workprofile mutex to 
>> request a profile.
>> >
>> > Case 1) Idle thread gets the lock first.
>> >          a) Idle thread sees vinst power state as PG_UNGATE, no 
>> harm done. It exits without requesting power profile change. 
>> begin_use thread gets the lock next, it sees profile as active and 
>> continues.
>> >          b) Idle thread sees vinst power state as PG_GATE, it will 
>> make workprofile_active to false and exit. Now when begin_use thread 
>> gets the mutex next, it's guaranteed to see the workprofile_active as 
>> false, hence it will request the profile.
>> >
>> > Case 2) begin_use thread gets the lock first.
>> >          a) Workload profile is active, hence it doesn't do 
>> anything and exits. The change made by begin_use thread to vinst 
>> power state (state = on) will now be visible to idle thread which 
>> gets the lock next. It will do nothing and exit.
>> >          b) Workload profile is inactive, hence it requests a 
>> profile change. Again, the change made by begin_use thread to vinst 
>> power state will now be visible to idle thread which gets the lock 
>> next. It will do nothing and exit.
>> >
>> > Thanks,
>> > Lijo
>> >
>> > -----Original Message-----
>> > From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>> > Sent: Thursday, August 14, 2025 6:18 PM
>> > To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher 
>> <alexdeucher@gmail.com>
>> > Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander 
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> > Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race 
>> condition (v3)
>> >
>> >
>> > On 8/14/2025 5:33 PM, Lazar, Lijo wrote:
>> >> [Public]
>> >>
>> >> There is no need for nested lock. It only needs to follow the order
>> >>           set instance power_state
>> >>           set profile (this takes a global lock and hence instance 
>> power state will be visible to whichever thread that gets the work 
>> profile lock).
>> >>
>> >> You are seeing nested lock just because I added the code just 
>> after power state setting.
>> > Pasting your code from the file for ref :
>> >
>> > @@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring
>> > *ring)
>> >
>> > -pg_lock:
>> >
>> >        mutex_lock(&vcn_inst->vcn_pg_lock);
>> >        vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>> >
>> > +   amdgpu_vcn_get_profile(adev);
>> >
>> > vcn_pg_lock isn't  released here yet right ? And in-case you intend 
>> to only order the locks, then still there is an un-necessary OFF 
>> followed by ON, but yes that is acceptable,
>> >
>> > May be you want to move that vcn_pg_lock after 
>> amdgpu_vcn_get_profile to protect concurrent dpg_state access in 
>> begin_use.
>> >
>> > The concern is, this patch access power_state that is protected by 
>> some other mutex lock hoping it reflects right values also when 
>> holding powerprofile_lock.
>> >
>> > Or
>> >
>> > Have shared a patch with global workload_profile_mutex that 
>> simplifies this handling, and renamed pg_lock -> dpg_lock  and used
>> >
>> > that only for dpg_state changes per instance.
>> >
>> > Regards,
>> >
>> > Sathish
>> >
>> >> Thanks,
>> >> Lijo
>> >>
>> >> -----Original Message-----
>> >> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>> >> Sent: Thursday, August 14, 2025 5:23 PM
>> >> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>> >> <alexdeucher@gmail.com>
>> >> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> >> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> >> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> >> (v3)
>> >>
>> >>
>> >> On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
>> >>> [Public]
>> >>>
>> >>> I see your point now. Attached should work, I guess. Is the 
>> concern more about having to take the lock for every begin?
>> >> This is closer,  but the thing is, IMO we shouldn't have to use 2 
>> locks and go into nested locking, we can do with just one global lock.
>> >>
>> >> Power_state of each instance, and global workload_profile_active are
>> >> inter-related, they need to be guarded together,
>> >>
>> >> nested could work , but why nested if single lock is enough ? 
>> nested complicates it.
>> >>
>> >> Regards,
>> >>
>> >> Sathish
>> >>
>> >>> Thanks,
>> >>> Lijo
>> >>>
>> >>> -----Original Message-----
>> >>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> >>> Lazar, Lijo
>> >>> Sent: Thursday, August 14, 2025 2:55 PM
>> >>> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>;
>> >>> Alex Deucher <alexdeucher@gmail.com>
>> >>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> >>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> >>> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> >>> (v3)
>> >>>
>> >>> [Public]
>> >>>
>> >>> That is not required I think. The power profile is set by an 
>> instance *after* setting itself to power on. Also, it's switched back 
>> after changing its power state to off.  If idle worker is run by 
>> another instance, it won't be seeing the inst0 as power gated and 
>> won't change power profile.
>> >>>
>> >>> Thanks,
>> >>> Lijo
>> >>> -----Original Message-----
>> >>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>> >>> Sent: Thursday, August 14, 2025 2:41 PM
>> >>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>> >>> <alexdeucher@gmail.com>
>> >>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> >>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> >>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>> >>> (v3)
>> >>>
>> >>> Hi Lijo,
>> >>>
>> >>> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
>> >>>> [Public]
>> >>>>
>> >>>> We already have a per instance power state that can be tracked. 
>> What about something like attached?
>> >>> This also has concurrent access of the power state ,
>> >>> vcn.inst[i].cur_state is not protected by workload_profile_mutex
>> >>>
>> >>> every where, it can still change while you are holding 
>> workload_profile_mutex and checking it.
>> >>>
>> >>> Regards,
>> >>>
>> >>> Sathish
>> >>>
>> >>>> Thanks,
>> >>>> Lijo
>> >>>> -----Original Message-----
>> >>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> >>>> Sundararaju, Sathishkumar
>> >>>> Sent: Thursday, August 14, 2025 4:43 AM
>> >>>> To: Alex Deucher <alexdeucher@gmail.com>
>> >>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>> >>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> >>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race
>> >>>> condition
>> >>>> (v3)
>> >>>>
>> >>>>
>> >>>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>> >>>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>> >>>>> <sathishkumar.sundararaju@amd.com> wrote:
>> >>>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>> >>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>> >>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>> >>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>> >>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>> >>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>> >>>>>>>>>> Hi Alex, Hi David,
>> >>>>>>>>>>
>> >>>>>>>>>> I see David's concern but his suggestion yet wont solve the
>> >>>>>>>>>> problem, neither the current form , reason :-
>> >>>>>>>>>>
>> >>>>>>>>>> The emitted fence count and total submission count are fast
>> >>>>>>>>>> transients which frequently become 0 in between video decodes
>> >>>>>>>>>> (between jobs) even with the atomics and locks there can be a
>> >>>>>>>>>> switch of video power profile, in the current form of patch
>> >>>>>>>>>> that window is minimized, but still can happen if stress
>> >>>>>>>>>> tested. But power state of any instance becoming zero
>> >>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>> >>>>>>>> Consider this situation, inst0 and inst1 actively decoding,
>> >>>>>>>> inst0 decode completes, delayed idle work starts.
>> >>>>>>>> inst0 idle handler can read 0 total fences and 0 total
>> >>>>>>>> submission count, even if inst1 is actively decoding, that's 
>> between the jobs,
>> >>>>>>>>         - as begin_use increaments vcn.total_submission_cnt and
>> >>>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>> >>>>>>>>         - if outstanding fences are cleared and no new emitted
>> >>>>>>>> fence, between jobs , can be 0.
>> >>>>>>>>         - both of the above conditions do not mean video decode
>> >>>>>>>> is complete on inst1, it is actively decoding.
>> >>>>>>> How can there be active decoding without an outstanding fence?
>> >>>>>>> In that case, total_fences (fences from both instances) would 
>> be non-0.
>> >>>>>> I mean on inst1 the job scheduled is already complete, so 0
>> >>>>>> outstanding fences, newer job is yet to be scheduled
>> >>>>>>
>> >>>>>> and commited to ring (inst1) , this doesn't mean decode has
>> >>>>>> stopped on
>> >>>>>> inst1 right (I am saying if timing of inst0 idle work coincides
>> >>>>>> with this),
>> >>>>>>
>> >>>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>> >>>>>> correct my understanding here.
>> >>>>> The flow looks like:
>> >>>>>
>> >>>>> begin_use(inst)
>> >>>>> emit_fence(inst)
>> >>>>> end_use(inst)
>> >>>>>
>> >>>>> ...later
>> >>>>> fence signals
>> >>>>> ...later
>> >>>>> work handler
>> >>>>>
>> >>>>> In begin_use we increment the global and per instance submission.
>> >>>>> This protects the power gating and profile until end_use.  In end
>> >>>>> use we decrement the submissions because we don't need to protect
>> >>>>> anything any more as we have the fence that was submitted via the
>> >>>>> ring.  That fence won't signal until the job is complete.
>> >>>> Is a next begin_use always guaranteed to be run before current 
>> job fence signals ?
>> >>>>
>> >>>> if not then both total submission and total fence are zero , example
>> >>>> delayed job/packet submissions
>> >>>>
>> >>>> from user space, or next job schedule happens after current job 
>> fence signals.
>> >>>>
>> >>>> if this is never possible then (v3) is perfect.
>> >>>>
>> >>>> Regards,
>> >>>>
>> >>>> Sathish
>> >>>>
>> >>>>> For power gating, we
>> >>>>> only care about the submission count and fences for that instance,
>> >>>>> for the profile, we care about submission count and fences all 
>> instances.
>> >>>>> Once the fences have signalled, the outstanding fences will be 0
>> >>>>> and there won't be any active work.
>> >>>>>
>> >>>>> Alex
>> >>>>>
>> >>>>>> Regards,
>> >>>>>>
>> >>>>>> Sathish
>> >>>>>>
>> >>>>>>> Alex
>> >>>>>>>
>> >>>>>>>> Whereas if instances are powered off we are sure idle time is
>> >>>>>>>> past and it is powered off, no possible way of active video
>> >>>>>>>> decode, when all instances are off we can safely assume no
>> >>>>>>>> active decode and global lock protects it against new 
>> begin_use on any instance.
>> >>>>>>>> But the only distant concern is global common locks w.r.t perf,
>> >>>>>>>> but we are already having a global workprofile mutex , so there
>> >>>>>>>> shouldn't be any drop in perf, with just one single global lock
>> >>>>>>>> for all instances.
>> >>>>>>>>
>> >>>>>>>> Just sending out a patch with this fix, will leave it to you to
>> >>>>>>>> decide the right method. If you think outstanding total fences
>> >>>>>>>> can never be 0 during decode, then your previous version (v3)
>> >>>>>>>> itself is good, there is no real benefit of splitting the 
>> handlers as such.
>> >>>>>>>>
>> >>>>>>>> Regards,
>> >>>>>>>> Sathish
>> >>>>>>>>> If it is possible, maybe it would be easier to just split the
>> >>>>>>>>> profile and powergating into separate handlers.  The profile
>> >>>>>>>>> one would be global and the powergating one would be per 
>> instance.
>> >>>>>>>>> See the attached patches.
>> >>>>>>>>>
>> >>>>>>>>> Alex
>> >>>>>>>>>
>> >>>>>>>>>> can be a sure shot indication of break in a video decode, the
>> >>>>>>>>>> mistake in my patch was using per instance mutex, I should
>> >>>>>>>>>> have used a common global mutex, then that covers the 
>> situation David is trying to bring out.
>> >>>>>>>>>>
>> >>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>> >>>>>>>>>> flags to track power state could help us totally avoid 
>> this situation.
>> >>>>>>>>>>
>> >>>>>>>>>> Regards,
>> >>>>>>>>>>
>> >>>>>>>>>> Sathish
>> >>>>>>>>>>
>> >>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>> >>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>> >>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David 
>> <davidwu2@amd.com> wrote:
>> >>>>>>>>>>>>> Hi Alex,
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>> >>>>>>>>>>>>> it is unlikely to have a context switch right after the 
>> begin_use().
>> >>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>> >>>>>>>>>>>>> in case someone adds more before the lock and not reviewed
>> >>>>>>>>>>>>> thoroughly)
>> >>>>>>>>>>>>>           - up to you to decide.
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> Thanks,
>> >>>>>>>>>>>>> David
>> >>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>> >>>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>> >>>>>>>>>>>>>> end up switching the video profile while another instance
>> >>>>>>>>>>>>>> is active because we only take into account the current
>> >>>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences
>> >>>>>>>>>>>>>> for the video profile.
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> v2: drop early exit in begin_use()
>> >>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>> >>>>>>>>>>>>>> profile
>> >>>>>>>>>>>>>> handling")
>> >>>>>>>>>>>>>> Reviewed-by: Sathishkumar S
>> >>>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>> >>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>> >>>>>>>>>>>>>> ---
>> >>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>> >>>>>>>>>>>>>> ++++++++++++-------------
>> >>>>>>>>>>>>>>          drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>> >>>>>>>>>>>>>>           2 files changed, 21 insertions(+), 20
>> >>>>>>>>>>>>>> deletions(-)
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>> >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>> >>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>> >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>> >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>> >>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>> >>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>> >>>>>>>>>>>>>>              struct amdgpu_vcn_inst *vcn_inst =
>> >>>>>>>>>>>>>>                      container_of(work, struct
>> >>>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>> >>>>>>>>>>>>>>              struct amdgpu_device *adev = vcn_inst->adev;
>> >>>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>> >>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>> >>>>>>>>>>>>>> +     unsigned int total_fences = 0,
>> >>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>> >>>>>>>>>>>>>> +     unsigned int i, j;
>> >>>>>>>>>>>>>>              int r = 0;
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>> >>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 <<
>> >>>>>>>>>>>>>> + vcn_inst->inst))
>> >>>>>>>>>>>>>>                      return;
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>> >>>>>>>>>>>>>> -            fence[i] +=
>> >>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>> >>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>> >>>>>>>>>>>>>> +            struct amdgpu_vcn_inst *v =
>> >>>>>>>>>>>>>> + &adev->vcn.inst[i];
>> >>>>>>>>>>>>>> +
>> >>>>>>>>>>>>>> +            for (j = 0; j < v->num_enc_rings; ++j)
>> >>>>>>>>>>>>>> +                     fence[i] +=
>> >>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>> >>>>>>>>>>>>>> +            fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>> >>>>>>>>>>>>>> +            total_fences += fence[i];
>> >>>>>>>>>>>>>> +     }
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>>              /* Only set DPG pause for VCN3 or below, VCN4
>> >>>>>>>>>>>>>> and above will be handled by FW */
>> >>>>>>>>>>>>>>              if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>> >>>>>>>>>>>>>> -        !adev->vcn.inst[i].using_unified_queue) {
>> >>>>>>>>>>>>>> +        !vcn_inst->using_unified_queue) {
>> >>>>>>>>>>>>>>                      struct dpg_pause_state new_state;
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>>                      if (fence[i] || @@ -436,18 +442,18 @@
>> >>>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct
>> >>>>>>>>>>>>>> work_struct
>> >>>>>>>>>>>>>> *work)
>> >>>>>>>>>>>>>>                      else
>> >>>>>>>>>>>>>>                              new_state.fw_based =
>> >>>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> -            adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>> >>>>>>>>>>>>>> +            vcn_inst->pause_dpg_mode(vcn_inst,
>> >>>>>>>>>>>>>> + &new_state);
>> >>>>>>>>>>>>>>               }
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>> >>>>>>>>>>>>>> -     fences += fence[i];
>> >>>>>>>>>>>>>> -
>> >>>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>> >>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>> >>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>> >>>>>>>>>>>>>> +            /* This is specific to this instance */
>> >>>>>>>>>>>>>>                      mutex_lock(&vcn_inst->vcn_pg_lock);
>> >>>>>>>>>>>>>>                      vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>> >>>>>>>>>>>>>>                      mutex_unlock(&vcn_inst->vcn_pg_lock);
>> >>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>> >>>>>>>>>>>>>> -            if (adev->vcn.workload_profile_active) {
>> >>>>>>>>>>>>>> +            /* This is global and depends on all VCN instances */
>> >>>>>>>>>>>>>> +            if (adev->vcn.workload_profile_active &&
>> >>>>>>>>>>>>>> !total_fences &&
>> >>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>> >>>>>>>>>>>>>>                              r =
>> >>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>> >>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
>> >>>>>>>>>>>>>>                              if (r) @@ -467,16 +473,10 @@
>> >>>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>> >>>>>>>>>>>>>>              int r = 0;
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>>              atomic_inc(&vcn_inst->total_submission_cnt);
>> >>>>>>>>>>>>>> +    atomic_inc(&adev->vcn.total_submission_cnt);
>> >>>>>>>>>>>>> move this addition down inside the mutex lock
>> >>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>> >>>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>> >>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>> >>>>>>>>>>>>>> -      */
>> >>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>> >>>>>>>>>>>>>> -            goto pg_lock;
>> >>>>>>>>>>>>>> -
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>> >>>>>>>>>>>>> move to here:
>> >>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>> >>>>>>>>>>>>> I think this should work for multiple instances.
>> >>>>>>>>>>>> Why does this need to be protected by the mutex?
>> >>>>>>>>>>> hmm.. OK - no need and it is actually better before the 
>> mutex.
>> >>>>>>>>>>> David
>> >>>>>>>>>>>> Alex
>> >>>>>>>>>>>>
>> >>>>>>>>>>>>> David
>> >>>>>>>>>>>>>>              if (!adev->vcn.workload_profile_active) {
>> >>>>>>>>>>>>>>                      r =
>> >>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>> >>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
>> >>>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>> >>>>>>>>>>>>>>               }
>> >>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> -pg_lock:
>> >>>>>>>>>>>>>>              mutex_lock(&vcn_inst->vcn_pg_lock);
>> >>>>>>>>>>>>>>              vcn_inst->set_pg_state(vcn_inst,
>> >>>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>> >>>>>>>>>>>>>> amdgpu_ring
>> >>>>>>>>>>>>>> *ring)
>> >>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submiss
>> >>>>>>>>>>>>>> i
>> >>>>>>>>>>>>>> o
>> >>>>>>>>>>>>>> n
>> >>>>>>>>>>>>>> _cnt);
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submissio
>> >>>>>>>>>>>>>> n
>> >>>>>>>>>>>>>> _
>> >>>>>>>>>>>>>> c
>> >>>>>>>>>>>>>> nt);
>> >>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>> >>>>>>>>>>>>>>                                    VCN_IDLE_TIMEOUT); diff
>> >>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>> >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>> >>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>> >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>> >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>> >>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>>              uint16_t inst_mask;
>> >>>>>>>>>>>>>>              uint8_t num_inst_per_aid;
>> >>>>>>>>>>>>>> +    atomic_t                total_submission_cnt;
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>>              /* IP reg dump */
>> >>>>>>>>>>>>>>              uint32_t                *ip_dump;

[-- Attachment #2: Type: text/html, Size: 59797 bytes --]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-18 19:22                                           ` David Wu
@ 2025-08-18 19:31                                             ` Alex Deucher
  0 siblings, 0 replies; 36+ messages in thread
From: Alex Deucher @ 2025-08-18 19:31 UTC (permalink / raw)
  To: David Wu
  Cc: Lazar, Lijo, Wu, David, Sundararaju, Sathishkumar,
	Deucher, Alexander, amd-gfx@lists.freedesktop.org

On Mon, Aug 18, 2025 at 3:23 PM David Wu <davidwu2@amd.com> wrote:
>
> Hi Lijo and Alex,
>
> I prefer Lijo's method as it should not cause unexpected power profile change for the active session.
> Liji, could you make some adjustments for your patch such as removing extra blank line and function
> declaration such as using "void" instead of "int" for the new functions (should they be static?)

I still don't understand how you can get a case where the profile
would change for the active session with my patch.  That said, I think
Lijo's patch is nicer because it avoids the extra atomic.

Alex

>
> Thanks,
>
> David
>
> On 2025-08-14 14:42, David Wu wrote:
>
> hmm.. it is my concern for the same instance. but I got it now. Your patch is good.
> Thanks,
> David
> On 2025-08-14 14:06, Lazar, Lijo wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> I didn't fully understand the question.
>
> For the same instance, begin_thread will set the power state only after cancelling any idle worker for the instance. If idle worker is already under progress, then it needs to complete as that's a cancel_sync (it's the existing logic).
>
> Basically, by the time begin_thread sets the PG state, no idle worker for the same vcn instance would be active. If it's about context switch to another vcn instance's begin_thread, I think that won't be a problem.
>
> Thanks,
> Lijo
> ________________________________
> From: Wu, David <David.Wu3@amd.com>
> Sent: Thursday, August 14, 2025 11:14:26 PM
> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>; Alex Deucher <alexdeucher@gmail.com>
> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>
> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
>
> amdgpu_vcn_idle_work_handler():
>      if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> ----------- could it be possible a context switch here to
> amdgpu_vcn_ring_begin_use()?
>   if it could then AMD_PG_STATE_GATE will be set by mistake.
>
> David
>
> On 2025-08-14 08:54, Lazar, Lijo wrote:
> > [Public]
> >
> > The request profile can be moved outside the pg_lock in begin_use as in the attached patch. It needs  set power state -> set profile order.
> >
> > This is the premise -
> >
> > Let's say there are two threads, begin_use thread and idle_work threads. begin_use and idle_work will need the workprofile mutex to request a profile.
> >
> > Case 1) Idle thread gets the lock first.
> >          a) Idle thread sees vinst power state as PG_UNGATE, no harm done. It exits without requesting power profile change. begin_use thread gets the lock next, it sees profile as active and continues.
> >          b) Idle thread sees vinst power state as PG_GATE, it will make workprofile_active to false and exit. Now when begin_use thread gets the mutex next, it's guaranteed to see the workprofile_active as false, hence it will request the profile.
> >
> > Case 2) begin_use thread gets the lock first.
> >          a) Workload profile is active, hence it doesn't do anything and exits. The change made by begin_use thread to vinst power state (state = on) will now be visible to idle thread which gets the lock next. It will do nothing and exit.
> >          b) Workload profile is inactive, hence it requests a profile change. Again, the change made by begin_use thread to vinst power state will now be visible to idle thread which gets the lock next. It will do nothing and exit.
> >
> > Thanks,
> > Lijo
> >
> > -----Original Message-----
> > From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> > Sent: Thursday, August 14, 2025 6:18 PM
> > To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher <alexdeucher@gmail.com>
> > Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> > Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
> >
> >
> > On 8/14/2025 5:33 PM, Lazar, Lijo wrote:
> >> [Public]
> >>
> >> There is no need for nested lock. It only needs to follow the order
> >>           set instance power_state
> >>           set profile (this takes a global lock and hence instance power state will be visible to whichever thread that gets the work profile lock).
> >>
> >> You are seeing nested lock just because I added the code just after power state setting.
> > Pasting your code from the file for ref :
> >
> > @@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring
> > *ring)
> >
> > -pg_lock:
> >
> >        mutex_lock(&vcn_inst->vcn_pg_lock);
> >        vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >
> > +   amdgpu_vcn_get_profile(adev);
> >
> > vcn_pg_lock isn't  released here yet right ? And in-case you intend to only order the locks, then still there is an un-necessary OFF followed by ON, but yes that is acceptable,
> >
> > May be you want to move that vcn_pg_lock after amdgpu_vcn_get_profile to protect concurrent dpg_state access in begin_use.
> >
> > The concern is, this patch access power_state that is protected by some other mutex lock hoping it reflects right values also when holding powerprofile_lock.
> >
> > Or
> >
> > Have shared a patch with global workload_profile_mutex that simplifies this handling, and renamed pg_lock -> dpg_lock  and used
> >
> > that only for dpg_state changes per instance.
> >
> > Regards,
> >
> > Sathish
> >
> >> Thanks,
> >> Lijo
> >>
> >> -----Original Message-----
> >> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> >> Sent: Thursday, August 14, 2025 5:23 PM
> >> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
> >> <alexdeucher@gmail.com>
> >> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> >> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> >> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> >> (v3)
> >>
> >>
> >> On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
> >>> [Public]
> >>>
> >>> I see your point now. Attached should work, I guess. Is the concern more about having to take the lock for every begin?
> >> This is closer,  but the thing is, IMO we shouldn't have to use 2 locks and go into nested locking, we can do with just one global lock.
> >>
> >> Power_state of each instance, and global workload_profile_active are
> >> inter-related, they need to be guarded together,
> >>
> >> nested could work , but why nested if single lock is enough ? nested complicates it.
> >>
> >> Regards,
> >>
> >> Sathish
> >>
> >>> Thanks,
> >>> Lijo
> >>>
> >>> -----Original Message-----
> >>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> >>> Lazar, Lijo
> >>> Sent: Thursday, August 14, 2025 2:55 PM
> >>> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>;
> >>> Alex Deucher <alexdeucher@gmail.com>
> >>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> >>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> >>> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> >>> (v3)
> >>>
> >>> [Public]
> >>>
> >>> That is not required I think. The power profile is set by an instance *after* setting itself to power on. Also, it's switched back after changing its power state to off.  If idle worker is run by another instance, it won't be seeing the inst0 as power gated and won't change power profile.
> >>>
> >>> Thanks,
> >>> Lijo
> >>> -----Original Message-----
> >>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
> >>> Sent: Thursday, August 14, 2025 2:41 PM
> >>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
> >>> <alexdeucher@gmail.com>
> >>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> >>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> >>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
> >>> (v3)
> >>>
> >>> Hi Lijo,
> >>>
> >>> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
> >>>> [Public]
> >>>>
> >>>> We already have a per instance power state that can be tracked. What about something like attached?
> >>> This also has concurrent access of the power state ,
> >>> vcn.inst[i].cur_state is not protected by workload_profile_mutex
> >>>
> >>> every where, it can still change while you are holding workload_profile_mutex and checking it.
> >>>
> >>> Regards,
> >>>
> >>> Sathish
> >>>
> >>>> Thanks,
> >>>> Lijo
> >>>> -----Original Message-----
> >>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> >>>> Sundararaju, Sathishkumar
> >>>> Sent: Thursday, August 14, 2025 4:43 AM
> >>>> To: Alex Deucher <alexdeucher@gmail.com>
> >>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
> >>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
> >>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race
> >>>> condition
> >>>> (v3)
> >>>>
> >>>>
> >>>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
> >>>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
> >>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
> >>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> >>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
> >>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> >>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>>>>>> Hi Alex, Hi David,
> >>>>>>>>>>
> >>>>>>>>>> I see David's concern but his suggestion yet wont solve the
> >>>>>>>>>> problem, neither the current form , reason :-
> >>>>>>>>>>
> >>>>>>>>>> The emitted fence count and total submission count are fast
> >>>>>>>>>> transients which frequently become 0 in between video decodes
> >>>>>>>>>> (between jobs) even with the atomics and locks there can be a
> >>>>>>>>>> switch of video power profile, in the current form of patch
> >>>>>>>>>> that window is minimized, but still can happen if stress
> >>>>>>>>>> tested. But power state of any instance becoming zero
> >>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
> >>>>>>>> Consider this situation, inst0 and inst1 actively decoding,
> >>>>>>>> inst0 decode completes, delayed idle work starts.
> >>>>>>>> inst0 idle handler can read 0 total fences and 0 total
> >>>>>>>> submission count, even if inst1 is actively decoding, that's between the jobs,
> >>>>>>>>         - as begin_use increaments vcn.total_submission_cnt and
> >>>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
> >>>>>>>>         - if outstanding fences are cleared and no new emitted
> >>>>>>>> fence, between jobs , can be 0.
> >>>>>>>>         - both of the above conditions do not mean video decode
> >>>>>>>> is complete on inst1, it is actively decoding.
> >>>>>>> How can there be active decoding without an outstanding fence?
> >>>>>>> In that case, total_fences (fences from both instances) would be non-0.
> >>>>>> I mean on inst1 the job scheduled is already complete, so 0
> >>>>>> outstanding fences, newer job is yet to be scheduled
> >>>>>>
> >>>>>> and commited to ring (inst1) , this doesn't mean decode has
> >>>>>> stopped on
> >>>>>> inst1 right (I am saying if timing of inst0 idle work coincides
> >>>>>> with this),
> >>>>>>
> >>>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
> >>>>>> correct my understanding here.
> >>>>> The flow looks like:
> >>>>>
> >>>>> begin_use(inst)
> >>>>> emit_fence(inst)
> >>>>> end_use(inst)
> >>>>>
> >>>>> ...later
> >>>>> fence signals
> >>>>> ...later
> >>>>> work handler
> >>>>>
> >>>>> In begin_use we increment the global and per instance submission.
> >>>>> This protects the power gating and profile until end_use.  In end
> >>>>> use we decrement the submissions because we don't need to protect
> >>>>> anything any more as we have the fence that was submitted via the
> >>>>> ring.  That fence won't signal until the job is complete.
> >>>> Is a next begin_use always guaranteed to be run before current job fence signals ?
> >>>>
> >>>> if not then both total submission and total fence are zero , example
> >>>> delayed job/packet submissions
> >>>>
> >>>> from user space, or next job schedule happens after current job fence signals.
> >>>>
> >>>> if this is never possible then (v3) is perfect.
> >>>>
> >>>> Regards,
> >>>>
> >>>> Sathish
> >>>>
> >>>>> For power gating, we
> >>>>> only care about the submission count and fences for that instance,
> >>>>> for the profile, we care about submission count and fences all instances.
> >>>>> Once the fences have signalled, the outstanding fences will be 0
> >>>>> and there won't be any active work.
> >>>>>
> >>>>> Alex
> >>>>>
> >>>>>> Regards,
> >>>>>>
> >>>>>> Sathish
> >>>>>>
> >>>>>>> Alex
> >>>>>>>
> >>>>>>>> Whereas if instances are powered off we are sure idle time is
> >>>>>>>> past and it is powered off, no possible way of active video
> >>>>>>>> decode, when all instances are off we can safely assume no
> >>>>>>>> active decode and global lock protects it against new begin_use on any instance.
> >>>>>>>> But the only distant concern is global common locks w.r.t perf,
> >>>>>>>> but we are already having a global workprofile mutex , so there
> >>>>>>>> shouldn't be any drop in perf, with just one single global lock
> >>>>>>>> for all instances.
> >>>>>>>>
> >>>>>>>> Just sending out a patch with this fix, will leave it to you to
> >>>>>>>> decide the right method. If you think outstanding total fences
> >>>>>>>> can never be 0 during decode, then your previous version (v3)
> >>>>>>>> itself is good, there is no real benefit of splitting the handlers as such.
> >>>>>>>>
> >>>>>>>> Regards,
> >>>>>>>> Sathish
> >>>>>>>>> If it is possible, maybe it would be easier to just split the
> >>>>>>>>> profile and powergating into separate handlers.  The profile
> >>>>>>>>> one would be global and the powergating one would be per instance.
> >>>>>>>>> See the attached patches.
> >>>>>>>>>
> >>>>>>>>> Alex
> >>>>>>>>>
> >>>>>>>>>> can be a sure shot indication of break in a video decode, the
> >>>>>>>>>> mistake in my patch was using per instance mutex, I should
> >>>>>>>>>> have used a common global mutex, then that covers the situation David is trying to bring out.
> >>>>>>>>>>
> >>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
> >>>>>>>>>> flags to track power state could help us totally avoid this situation.
> >>>>>>>>>>
> >>>>>>>>>> Regards,
> >>>>>>>>>>
> >>>>>>>>>> Sathish
> >>>>>>>>>>
> >>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
> >>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
> >>>>>>>>>>>>> Hi Alex,
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
> >>>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
> >>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
> >>>>>>>>>>>>> in case someone adds more before the lock and not reviewed
> >>>>>>>>>>>>> thoroughly)
> >>>>>>>>>>>>>           - up to you to decide.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Thanks,
> >>>>>>>>>>>>> David
> >>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
> >>>>>>>>>>>>>> end up switching the video profile while another instance
> >>>>>>>>>>>>>> is active because we only take into account the current
> >>>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences
> >>>>>>>>>>>>>> for the video profile.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> v2: drop early exit in begin_use()
> >>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
> >>>>>>>>>>>>>> profile
> >>>>>>>>>>>>>> handling")
> >>>>>>>>>>>>>> Reviewed-by: Sathishkumar S
> >>>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
> >>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>>>>>>>>>> ---
> >>>>>>>>>>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>>>>>>>>>>>> ++++++++++++-------------
> >>>>>>>>>>>>>>           drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>>>>>>>>>>>           2 files changed, 21 insertions(+), 20
> >>>>>>>>>>>>>> deletions(-)
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
> >>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>>>>>>>               struct amdgpu_vcn_inst *vcn_inst =
> >>>>>>>>>>>>>>                       container_of(work, struct
> >>>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
> >>>>>>>>>>>>>>               struct amdgpu_device *adev = vcn_inst->adev;
> >>>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
> >>>>>>>>>>>>>> +     unsigned int total_fences = 0,
> >>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>>>>>> +     unsigned int i, j;
> >>>>>>>>>>>>>>               int r = 0;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 <<
> >>>>>>>>>>>>>> + vcn_inst->inst))
> >>>>>>>>>>>>>>                       return;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>>>>>>>>>>>> -             fence[i] +=
> >>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
> >>>>>>>>>>>>>> + &adev->vcn.inst[i];
> >>>>>>>>>>>>>> +
> >>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>>>>>>>>>>>> +                     fence[i] +=
> >>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> >>>>>>>>>>>>>> +             total_fences += fence[i];
> >>>>>>>>>>>>>> +     }
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>               /* Only set DPG pause for VCN3 or below, VCN4
> >>>>>>>>>>>>>> and above will be handled by FW */
> >>>>>>>>>>>>>>               if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
> >>>>>>>>>>>>>>                       struct dpg_pause_state new_state;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>                       if (fence[i] || @@ -436,18 +442,18 @@
> >>>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct
> >>>>>>>>>>>>>> work_struct
> >>>>>>>>>>>>>> *work)
> >>>>>>>>>>>>>>                       else
> >>>>>>>>>>>>>>                               new_state.fw_based =
> >>>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst,
> >>>>>>>>>>>>>> + &new_state);
> >>>>>>>>>>>>>>               }
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>>>>>>>>>>>> -     fences += fence[i];
> >>>>>>>>>>>>>> -
> >>>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
> >>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>>>>>> +             /* This is specific to this instance */
> >>>>>>>>>>>>>>                       mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>>                       vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>>>>>>>>>>>                       mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
> >>>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
> >>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
> >>>>>>>>>>>>>> !total_fences &&
> >>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>>>>>>>>>>>                               r =
> >>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
> >>>>>>>>>>>>>>                               if (r) @@ -467,16 +473,10 @@
> >>>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
> >>>>>>>>>>>>>>               int r = 0;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>               atomic_inc(&vcn_inst->total_submission_cnt);
> >>>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>> move this addition down inside the mutex lock
> >>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
> >>>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
> >>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
> >>>>>>>>>>>>>> -      */
> >>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
> >>>>>>>>>>>>>> -             goto pg_lock;
> >>>>>>>>>>>>>> -
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>> move to here:
> >>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>> I think this should work for multiple instances.
> >>>>>>>>>>>> Why does this need to be protected by the mutex?
> >>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
> >>>>>>>>>>> David
> >>>>>>>>>>>> Alex
> >>>>>>>>>>>>
> >>>>>>>>>>>>> David
> >>>>>>>>>>>>>>               if (!adev->vcn.workload_profile_active) {
> >>>>>>>>>>>>>>                       r =
> >>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
> >>>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
> >>>>>>>>>>>>>>               }
> >>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -pg_lock:
> >>>>>>>>>>>>>>               mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>>               vcn_inst->set_pg_state(vcn_inst,
> >>>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
> >>>>>>>>>>>>>> amdgpu_ring
> >>>>>>>>>>>>>> *ring)
> >>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submiss
> >>>>>>>>>>>>>> i
> >>>>>>>>>>>>>> o
> >>>>>>>>>>>>>> n
> >>>>>>>>>>>>>> _cnt);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submissio
> >>>>>>>>>>>>>> n
> >>>>>>>>>>>>>> _
> >>>>>>>>>>>>>> c
> >>>>>>>>>>>>>> nt);
> >>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>>>>>>>>>>>                                     VCN_IDLE_TIMEOUT); diff
> >>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>               uint16_t inst_mask;
> >>>>>>>>>>>>>>               uint8_t num_inst_per_aid;
> >>>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>               /* IP reg dump */
> >>>>>>>>>>>>>>               uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 12:48                                 ` Sundararaju, Sathishkumar
  2025-08-14 12:54                                   ` Lazar, Lijo
@ 2025-08-14 17:06                                   ` Sundararaju, Sathishkumar
  1 sibling, 0 replies; 36+ messages in thread
From: Sundararaju, Sathishkumar @ 2025-08-14 17:06 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher
  Cc: Wu, David, Deucher, Alexander, amd-gfx@lists.freedesktop.org

Hi Lijo,

On 8/14/2025 6:18 PM, Sundararaju, Sathishkumar wrote:
>
> On 8/14/2025 5:33 PM, Lazar, Lijo wrote:
>> [Public]
>>
>> There is no need for nested lock. It only needs to follow the order
>>          set instance power_state
>>          set profile (this takes a global lock and hence instance 
>> power state will be visible to whichever thread that gets the work 
>> profile lock).
>>
>> You are seeing nested lock just because I added the code just after 
>> power state setting.
>
> Pasting your code from the file for ref :
>
> @@ -464,32 +509,14 @@ void amdgpu_vcn_ring_begin_use(struct 
> amdgpu_ring *ring)
>
> -pg_lock:
>
>      mutex_lock(&vcn_inst->vcn_pg_lock);
>      vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>
> +   amdgpu_vcn_get_profile(adev);
>
> vcn_pg_lock isn't  released here yet right ? And in-case you intend to 
> only order the locks, then still there is an un-necessary OFF followed 
> by ON, but yes that is acceptable,
>
> May be you want to move that vcn_pg_lock after amdgpu_vcn_get_profile 
> to protect concurrent dpg_state access in begin_use.
>
> The concern is, this patch access power_state that is protected by 
> some other mutex lock hoping it reflects right values also when 
> holding powerprofile_lock.

Thanks for taking time to explain this personally to me , I get it now, 
your patch should work fine. This above is a wrong hypothetical issue I 
imagined.

amdgpu_vcn_get_profile() and amdgpu_vcn_put_profile can be void return 
type.

And please remove the typo to avoid nested locking.

With above changes, your patch is  :-

Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com>

And there is other thread which also has Alex's (v3) fix for this issue, 
leave it to Alex/yourself and other review participants to decide for 
the best approach, I am inclined with your approach, as it is centered 
around pg_state.


Regards,

Sathish

>
> Or
>
> Have shared a patch with global workload_profile_mutex that simplifies 
> this handling, and renamed pg_lock -> dpg_lock  and used
>
> that only for dpg_state changes per instance.
>
> Regards,
>
> Sathish
>
>>
>> Thanks,
>> Lijo
>>
>> -----Original Message-----
>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>> Sent: Thursday, August 14, 2025 5:23 PM
>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher 
>> <alexdeucher@gmail.com>
>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander 
>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition 
>> (v3)
>>
>>
>> On 8/14/2025 3:16 PM, Lazar, Lijo wrote:
>>> [Public]
>>>
>>> I see your point now. Attached should work, I guess. Is the concern 
>>> more about having to take the lock for every begin?
>> This is closer,  but the thing is, IMO we shouldn't have to use 2 
>> locks and go into nested locking, we can do with just one global lock.
>>
>> Power_state of each instance, and global workload_profile_active are 
>> inter-related, they need to be guarded together,
>>
>> nested could work , but why nested if single lock is enough ? nested 
>> complicates it.
>>
>> Regards,
>>
>> Sathish
>>
>>> Thanks,
>>> Lijo
>>>
>>> -----Original Message-----
>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>> Lazar, Lijo
>>> Sent: Thursday, August 14, 2025 2:55 PM
>>> To: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>; Alex
>>> Deucher <alexdeucher@gmail.com>
>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>>> (v3)
>>>
>>> [Public]
>>>
>>> That is not required I think. The power profile is set by an 
>>> instance *after* setting itself to power on. Also, it's switched 
>>> back after changing its power state to off.  If idle worker is run 
>>> by another instance, it won't be seeing the inst0 as power gated and 
>>> won't change power profile.
>>>
>>> Thanks,
>>> Lijo
>>> -----Original Message-----
>>> From: Sundararaju, Sathishkumar <Sathishkumar.Sundararaju@amd.com>
>>> Sent: Thursday, August 14, 2025 2:41 PM
>>> To: Lazar, Lijo <Lijo.Lazar@amd.com>; Alex Deucher
>>> <alexdeucher@gmail.com>
>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>>> (v3)
>>>
>>> Hi Lijo,
>>>
>>> On 8/14/2025 2:11 PM, Lazar, Lijo wrote:
>>>> [Public]
>>>>
>>>> We already have a per instance power state that can be tracked. 
>>>> What about something like attached?
>>> This also has concurrent access of the power state ,
>>> vcn.inst[i].cur_state is not protected by workload_profile_mutex
>>>
>>> every where, it can still change while you are holding 
>>> workload_profile_mutex and checking it.
>>>
>>> Regards,
>>>
>>> Sathish
>>>
>>>> Thanks,
>>>> Lijo
>>>> -----Original Message-----
>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>> Sundararaju, Sathishkumar
>>>> Sent: Thursday, August 14, 2025 4:43 AM
>>>> To: Alex Deucher <alexdeucher@gmail.com>
>>>> Cc: Wu, David <David.Wu3@amd.com>; Deucher, Alexander
>>>> <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition
>>>> (v3)
>>>>
>>>>
>>>> On 8/14/2025 3:38 AM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 5:1 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> On 8/14/2025 2:33 AM, Alex Deucher wrote:
>>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>>>> Hi Alex, Hi David,
>>>>>>>>>>
>>>>>>>>>> I see David's concern but his suggestion yet wont solve the
>>>>>>>>>> problem, neither the current form , reason :-
>>>>>>>>>>
>>>>>>>>>> The emitted fence count and total submission count are fast
>>>>>>>>>> transients which frequently become 0 in between video decodes
>>>>>>>>>> (between jobs) even with the atomics and locks there can be a
>>>>>>>>>> switch of video power profile, in the current form of patch
>>>>>>>>>> that window is minimized, but still can happen if stress
>>>>>>>>>> tested. But power state of any instance becoming zero
>>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0
>>>>>>>> decode completes, delayed idle work starts.
>>>>>>>> inst0 idle handler can read 0 total fences and 0 total submission
>>>>>>>> count, even if inst1 is actively decoding, that's between the 
>>>>>>>> jobs,
>>>>>>>>        - as begin_use increaments vcn.total_submission_cnt and
>>>>>>>> end_use decreaments vcn.total_submission_cnt that can be 0.
>>>>>>>>        - if outstanding fences are cleared and no new emitted
>>>>>>>> fence, between jobs , can be 0.
>>>>>>>>        - both of the above conditions do not mean video decode is
>>>>>>>> complete on inst1, it is actively decoding.
>>>>>>> How can there be active decoding without an outstanding fence?  In
>>>>>>> that case, total_fences (fences from both instances) would be 
>>>>>>> non-0.
>>>>>> I mean on inst1 the job scheduled is already complete, so 0
>>>>>> outstanding fences, newer job is yet to be scheduled
>>>>>>
>>>>>> and commited to ring (inst1) , this doesn't mean decode has stopped
>>>>>> on
>>>>>> inst1 right (I am saying if timing of inst0 idle work coincides
>>>>>> with this),
>>>>>>
>>>>>> Or am I wrong in assuming this ? Can't this ever happen ? Please
>>>>>> correct my understanding here.
>>>>> The flow looks like:
>>>>>
>>>>> begin_use(inst)
>>>>> emit_fence(inst)
>>>>> end_use(inst)
>>>>>
>>>>> ...later
>>>>> fence signals
>>>>> ...later
>>>>> work handler
>>>>>
>>>>> In begin_use we increment the global and per instance submission.
>>>>> This protects the power gating and profile until end_use. In end
>>>>> use we decrement the submissions because we don't need to protect
>>>>> anything any more as we have the fence that was submitted via the
>>>>> ring.  That fence won't signal until the job is complete.
>>>> Is a next begin_use always guaranteed to be run before current job 
>>>> fence signals ?
>>>>
>>>> if not then both total submission and total fence are zero , example
>>>> delayed job/packet submissions
>>>>
>>>> from user space, or next job schedule happens after current job 
>>>> fence signals.
>>>>
>>>> if this is never possible then (v3) is perfect.
>>>>
>>>> Regards,
>>>>
>>>> Sathish
>>>>
>>>>> For power gating, we
>>>>> only care about the submission count and fences for that instance,
>>>>> for the profile, we care about submission count and fences all 
>>>>> instances.
>>>>> Once the fences have signalled, the outstanding fences will be 0 and
>>>>> there won't be any active work.
>>>>>
>>>>> Alex
>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> Sathish
>>>>>>
>>>>>>> Alex
>>>>>>>
>>>>>>>> Whereas if instances are powered off we are sure idle time is
>>>>>>>> past and it is powered off, no possible way of active video
>>>>>>>> decode, when all instances are off we can safely assume no active
>>>>>>>> decode and global lock protects it against new begin_use on any 
>>>>>>>> instance.
>>>>>>>> But the only distant concern is global common locks w.r.t perf,
>>>>>>>> but we are already having a global workprofile mutex , so there
>>>>>>>> shouldn't be any drop in perf, with just one single global lock
>>>>>>>> for all instances.
>>>>>>>>
>>>>>>>> Just sending out a patch with this fix, will leave it to you to
>>>>>>>> decide the right method. If you think outstanding total fences
>>>>>>>> can never be 0 during decode, then your previous version (v3)
>>>>>>>> itself is good, there is no real benefit of splitting the 
>>>>>>>> handlers as such.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Sathish
>>>>>>>>> If it is possible, maybe it would be easier to just split the
>>>>>>>>> profile and powergating into separate handlers. The profile one
>>>>>>>>> would be global and the powergating one would be per instance.
>>>>>>>>> See the attached patches.
>>>>>>>>>
>>>>>>>>> Alex
>>>>>>>>>
>>>>>>>>>> can be a sure shot indication of break in a video decode, the
>>>>>>>>>> mistake in my patch was using per instance mutex, I should have
>>>>>>>>>> used a common global mutex, then that covers the situation 
>>>>>>>>>> David is trying to bring out.
>>>>>>>>>>
>>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using
>>>>>>>>>> flags to track power state could help us totally avoid this 
>>>>>>>>>> situation.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>>
>>>>>>>>>> Sathish
>>>>>>>>>>
>>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David 
>>>>>>>>>>>> <davidwu2@amd.com> wrote:
>>>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>>>
>>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>>>> it is unlikely to have a context switch right after the 
>>>>>>>>>>>>> begin_use().
>>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer
>>>>>>>>>>>>> in case someone adds more before the lock and not reviewed
>>>>>>>>>>>>> thoroughly)
>>>>>>>>>>>>>          - up to you to decide.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>> David
>>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>>>> If there are multiple instances of the VCN running, we may
>>>>>>>>>>>>>> end up switching the video profile while another instance
>>>>>>>>>>>>>> is active because we only take into account the current
>>>>>>>>>>>>>> instance's submissions.  Look at all outstanding fences for
>>>>>>>>>>>>>> the video profile.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload
>>>>>>>>>>>>>> profile
>>>>>>>>>>>>>> handling")
>>>>>>>>>>>>>> Reviewed-by: Sathishkumar S
>>>>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>>>          2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>>>              struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>>>                      container_of(work, struct
>>>>>>>>>>>>>> amdgpu_vcn_inst, idle_work.work);
>>>>>>>>>>>>>>              struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>>>> -     unsigned int fences = 0, 
>>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>>>              int r = 0;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>>>>                      return;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v =
>>>>>>>>>>>>>> + &adev->vcn.inst[i];
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>>>> +             fence[i] += 
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>>>> +     }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>              /* Only set DPG pause for VCN3 or below, VCN4
>>>>>>>>>>>>>> and above will be handled by FW */
>>>>>>>>>>>>>>              if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>>>> - !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>>>> + !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>>>                      struct dpg_pause_state new_state;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>                      if (fence[i] || @@ -436,18 +442,18 @@
>>>>>>>>>>>>>> static void amdgpu_vcn_idle_work_handler(struct work_struct
>>>>>>>>>>>>>> *work)
>>>>>>>>>>>>>>                      else
>>>>>>>>>>>>>> new_state.fw_based =
>>>>>>>>>>>>>> VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> - adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>>>> + vcn_inst->pause_dpg_mode(vcn_inst,
>>>>>>>>>>>>>> + &new_state);
>>>>>>>>>>>>>>              }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     fence[i] += 
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>> -     if (!fences && 
>>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>>> mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>> vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>>> mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>> +             /* This is global and depends on all VCN 
>>>>>>>>>>>>>> instances */
>>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>>>                              r =
>>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, false);
>>>>>>>>>>>>>>                              if (r) @@ -467,16 +473,10 @@
>>>>>>>>>>>>>> void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>>              int r = 0;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>>>> + atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     /* We can safely return early here because we've 
>>>>>>>>>>>>>> cancelled the
>>>>>>>>>>>>>> -      * the delayed work so there is no one else to set 
>>>>>>>>>>>>>> it to false
>>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>>>> -      */
>>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>> move to here:
>>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>>>> David
>>>>>>>>>>>> Alex
>>>>>>>>>>>>
>>>>>>>>>>>>> David
>>>>>>>>>>>>>>              if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>>                      r =
>>>>>>>>>>>>>> amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO, @@ -487,7 +487,6 @@ void
>>>>>>>>>>>>>> amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
>>>>>>>>>>>>>>              }
>>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>>> mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>> vcn_inst->set_pg_state(vcn_inst,
>>>>>>>>>>>>>> AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct
>>>>>>>>>>>>>> amdgpu_ring
>>>>>>>>>>>>>> *ring)
>>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submissi
>>>>>>>>>>>>>> o
>>>>>>>>>>>>>> n
>>>>>>>>>>>>>> _cnt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission
>>>>>>>>>>>>>> _
>>>>>>>>>>>>>> c
>>>>>>>>>>>>>> nt);
>>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work, 
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> VCN_IDLE_TIMEOUT); diff
>>>>>>>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>              uint16_t inst_mask;
>>>>>>>>>>>>>>              uint8_t num_inst_per_aid;
>>>>>>>>>>>>>> +     atomic_t total_submission_cnt;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>              /* IP reg dump */
>>>>>>>>>>>>>>              uint32_t *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 21:03             ` Alex Deucher
  2025-08-13 21:16               ` Sundararaju, Sathishkumar
@ 2025-08-13 21:47               ` Wu, David
  2025-08-13 22:11                 ` Alex Deucher
  1 sibling, 1 reply; 36+ messages in thread
From: Wu, David @ 2025-08-13 21:47 UTC (permalink / raw)
  To: Alex Deucher, Sundararaju, Sathishkumar; +Cc: Alex Deucher, amd-gfx

On 8/13/2025 5:03 PM, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> <sathishkumar.sundararaju@amd.com> wrote:
>>
>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>> Hi Alex, Hi David,
>>>>
>>>> I see David's concern but his suggestion yet wont solve the problem,
>>>> neither the current form , reason :-
>>>>
>>>> The emitted fence count and total submission count are fast transients
>>>> which frequently become 0 in between video decodes (between jobs) even
>>>> with the atomics and locks there can be a switch of video power profile,
>>>> in the current form of patch that window is minimized, but still can
>>>> happen if stress tested. But power state of any instance becoming zero
>>> Can you explain how this can happen?  I'm not seeing it.
>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
>> completes, delayed idle work starts.
>> inst0 idle handler can read 0 total fences and 0 total submission count,
>> even if inst1 is actively decoding,
>> that's between the jobs,
>>    - as begin_use increaments vcn.total_submission_cnt and end_use
>> decreaments vcn.total_submission_cnt that can be 0.
>>    - if outstanding fences are cleared and no new emitted fence, between
>> jobs , can be 0.
>>    - both of the above conditions do not mean video decode is complete on
>> inst1, it is actively decoding.
> How can there be active decoding without an outstanding fence?  In
> that case, total_fences (fences from both instances) would be non-0.
I think it should be non-0.
I do see a hiccup possible - i.e the power switching from ON to OFF then 
ON in the
middle of decoding, i.e inst0 idle handler turns it off then inst1 turns 
it on.
We should avoid this glitch. This requires the idle handler sets/clears 
a flag for
done for this instance as Sathish's original patch. When all instances 
set/clear the
flag then we can safely power off.
David
> Alex
>
>> Whereas if instances are powered off we are sure idle time is past and
>> it is powered off, no possible way of
>> active video decode, when all instances are off we can safely assume no
>> active decode and global lock protects
>> it against new begin_use on any instance. But the only distant concern
>> is global common locks w.r.t perf, but we
>> are already having a global workprofile mutex , so there shouldn't be
>> any drop in perf, with just one single
>> global lock for all instances.
>>
>> Just sending out a patch with this fix, will leave it to you to decide
>> the right method. If you think outstanding total fences
>> can never be 0 during decode, then your previous version (v3) itself is
>> good, there is no real benefit of splitting the handlers as such.
>>
>> Regards,
>> Sathish
>>> If it is possible, maybe it would be easier to just split the profile
>>> and powergating into separate handlers.  The profile one would be
>>> global and the powergating one would be per instance.  See the
>>> attached patches.
>>>
>>> Alex
>>>
>>>> can be a sure shot indication of break in a video decode, the mistake in
>>>> my patch was using per instance mutex, I should have used a common
>>>> global mutex, then that covers the situation David is trying to bring out.
>>>>
>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
>>>> track power state could help us totally avoid this situation.
>>>>
>>>> Regards,
>>>>
>>>> Sathish
>>>>
>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>> Hi Alex,
>>>>>>>
>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
>>>>>>> someone
>>>>>>> adds more before the lock and not reviewed thoroughly)
>>>>>>>      - up to you to decide.
>>>>>>>
>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>
>>>>>>> Thanks,
>>>>>>> David
>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>> If there are multiple instances of the VCN running,
>>>>>>>> we may end up switching the video profile while another
>>>>>>>> instance is active because we only take into account
>>>>>>>> the current instance's submissions.  Look at all
>>>>>>>> outstanding fences for the video profile.
>>>>>>>>
>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>
>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>>>> handling")
>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>> ++++++++++++-------------
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>      2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>          struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>                  container_of(work, struct amdgpu_vcn_inst,
>>>>>>>> idle_work.work);
>>>>>>>>          struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>> +     unsigned int i, j;
>>>>>>>>          int r = 0;
>>>>>>>>
>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>                  return;
>>>>>>>>
>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>> -             fence[i] +=
>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>>>> +
>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>> +                     fence[i] +=
>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>> +             total_fences += fence[i];
>>>>>>>> +     }
>>>>>>>>
>>>>>>>>          /* Only set DPG pause for VCN3 or below, VCN4 and above will
>>>>>>>> be handled by FW */
>>>>>>>>          if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>                  struct dpg_pause_state new_state;
>>>>>>>>
>>>>>>>>                  if (fence[i] ||
>>>>>>>> @@ -436,18 +442,18 @@ static void
>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>                  else
>>>>>>>>                          new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>>>>>>
>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>          }
>>>>>>>>
>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>> -     fences += fence[i];
>>>>>>>> -
>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>                  mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>                  vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>                  mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>> !total_fences &&
>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>                          r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>> false);
>>>>>>>>                          if (r)
>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>> amdgpu_ring *ring)
>>>>>>>>          int r = 0;
>>>>>>>>
>>>>>>>>          atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>> move this addition down inside the mutex lock
>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>
>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>> -      */
>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>> -             goto pg_lock;
>>>>>>>> -
>>>>>>>>          mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>> move to here:
>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>> I think this should work for multiple instances.
>>>>>> Why does this need to be protected by the mutex?
>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>> David
>>>>>> Alex
>>>>>>
>>>>>>> David
>>>>>>>>          if (!adev->vcn.workload_profile_active) {
>>>>>>>>                  r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>> amdgpu_ring *ring)
>>>>>>>>          }
>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>
>>>>>>>> -pg_lock:
>>>>>>>>          mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>          vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>>>>>>
>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
>>>>>>>> *ring)
>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>>>>>>
>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>
>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>                                VCN_IDLE_TIMEOUT);
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>
>>>>>>>>          uint16_t inst_mask;
>>>>>>>>          uint8_t num_inst_per_aid;
>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>
>>>>>>>>          /* IP reg dump */
>>>>>>>>          uint32_t                *ip_dump;


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 21:47               ` Wu, David
@ 2025-08-13 22:11                 ` Alex Deucher
  2025-08-13 23:06                   ` Wu, David
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-13 22:11 UTC (permalink / raw)
  To: Wu, David; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
>
> On 8/13/2025 5:03 PM, Alex Deucher wrote:
> > On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> > <sathishkumar.sundararaju@amd.com> wrote:
> >>
> >> On 8/14/2025 1:35 AM, Alex Deucher wrote:
> >>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> >>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>> Hi Alex, Hi David,
> >>>>
> >>>> I see David's concern but his suggestion yet wont solve the problem,
> >>>> neither the current form , reason :-
> >>>>
> >>>> The emitted fence count and total submission count are fast transients
> >>>> which frequently become 0 in between video decodes (between jobs) even
> >>>> with the atomics and locks there can be a switch of video power profile,
> >>>> in the current form of patch that window is minimized, but still can
> >>>> happen if stress tested. But power state of any instance becoming zero
> >>> Can you explain how this can happen?  I'm not seeing it.
> >> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
> >> completes, delayed idle work starts.
> >> inst0 idle handler can read 0 total fences and 0 total submission count,
> >> even if inst1 is actively decoding,
> >> that's between the jobs,
> >>    - as begin_use increaments vcn.total_submission_cnt and end_use
> >> decreaments vcn.total_submission_cnt that can be 0.
> >>    - if outstanding fences are cleared and no new emitted fence, between
> >> jobs , can be 0.
> >>    - both of the above conditions do not mean video decode is complete on
> >> inst1, it is actively decoding.
> > How can there be active decoding without an outstanding fence?  In
> > that case, total_fences (fences from both instances) would be non-0.
> I think it should be non-0.
> I do see a hiccup possible - i.e the power switching from ON to OFF then
> ON in the
> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
> it on.

How would that happen? As long as there submission cnt is non-0 and
there are outstanding fences on any instance, the video profile will
stay active.

Alex

> We should avoid this glitch. This requires the idle handler sets/clears
> a flag for
> done for this instance as Sathish's original patch. When all instances
> set/clear the
> flag then we can safely power off.
> David
> > Alex
> >
> >> Whereas if instances are powered off we are sure idle time is past and
> >> it is powered off, no possible way of
> >> active video decode, when all instances are off we can safely assume no
> >> active decode and global lock protects
> >> it against new begin_use on any instance. But the only distant concern
> >> is global common locks w.r.t perf, but we
> >> are already having a global workprofile mutex , so there shouldn't be
> >> any drop in perf, with just one single
> >> global lock for all instances.
> >>
> >> Just sending out a patch with this fix, will leave it to you to decide
> >> the right method. If you think outstanding total fences
> >> can never be 0 during decode, then your previous version (v3) itself is
> >> good, there is no real benefit of splitting the handlers as such.
> >>
> >> Regards,
> >> Sathish
> >>> If it is possible, maybe it would be easier to just split the profile
> >>> and powergating into separate handlers.  The profile one would be
> >>> global and the powergating one would be per instance.  See the
> >>> attached patches.
> >>>
> >>> Alex
> >>>
> >>>> can be a sure shot indication of break in a video decode, the mistake in
> >>>> my patch was using per instance mutex, I should have used a common
> >>>> global mutex, then that covers the situation David is trying to bring out.
> >>>>
> >>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
> >>>> track power state could help us totally avoid this situation.
> >>>>
> >>>> Regards,
> >>>>
> >>>> Sathish
> >>>>
> >>>> On 8/13/2025 11:46 PM, Wu, David wrote:
> >>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
> >>>>>>> Hi Alex,
> >>>>>>>
> >>>>>>> The addition of  total_submission_cnt should work - in that
> >>>>>>> it is unlikely to have a context switch right after the begin_use().
> >>>>>>> The suggestion of moving it inside the lock (which I prefer in case
> >>>>>>> someone
> >>>>>>> adds more before the lock and not reviewed thoroughly)
> >>>>>>>      - up to you to decide.
> >>>>>>>
> >>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>>>>>
> >>>>>>> Thanks,
> >>>>>>> David
> >>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>>>>>> If there are multiple instances of the VCN running,
> >>>>>>>> we may end up switching the video profile while another
> >>>>>>>> instance is active because we only take into account
> >>>>>>>> the current instance's submissions.  Look at all
> >>>>>>>> outstanding fences for the video profile.
> >>>>>>>>
> >>>>>>>> v2: drop early exit in begin_use()
> >>>>>>>> v3: handle possible race between begin_use() work handler
> >>>>>>>>
> >>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
> >>>>>>>> handling")
> >>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> >>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>>>> ---
> >>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>>>>>> ++++++++++++-------------
> >>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>>>>>      2 files changed, 21 insertions(+), 20 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>> @@ -415,19 +415,25 @@ static void
> >>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>          struct amdgpu_vcn_inst *vcn_inst =
> >>>>>>>>                  container_of(work, struct amdgpu_vcn_inst,
> >>>>>>>> idle_work.work);
> >>>>>>>>          struct amdgpu_device *adev = vcn_inst->adev;
> >>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>> -     unsigned int i = vcn_inst->inst, j;
> >>>>>>>> +     unsigned int total_fences = 0,
> >>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>> +     unsigned int i, j;
> >>>>>>>>          int r = 0;
> >>>>>>>>
> >>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
> >>>>>>>>                  return;
> >>>>>>>>
> >>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>>>>>> -             fence[i] +=
> >>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> >>>>>>>> +
> >>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>>>>>> +                     fence[i] +=
> >>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> >>>>>>>> +             total_fences += fence[i];
> >>>>>>>> +     }
> >>>>>>>>
> >>>>>>>>          /* Only set DPG pause for VCN3 or below, VCN4 and above will
> >>>>>>>> be handled by FW */
> >>>>>>>>          if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>>>>>> +         !vcn_inst->using_unified_queue) {
> >>>>>>>>                  struct dpg_pause_state new_state;
> >>>>>>>>
> >>>>>>>>                  if (fence[i] ||
> >>>>>>>> @@ -436,18 +442,18 @@ static void
> >>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>                  else
> >>>>>>>>                          new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
> >>>>>>>>
> >>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>          }
> >>>>>>>>
> >>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>>>>>> -     fences += fence[i];
> >>>>>>>> -
> >>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>> +     if (!fence[vcn_inst->inst] &&
> >>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>> +             /* This is specific to this instance */
> >>>>>>>>                  mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>                  vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>>>>>                  mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>> -             if (adev->vcn.workload_profile_active) {
> >>>>>>>> +             /* This is global and depends on all VCN instances */
> >>>>>>>> +             if (adev->vcn.workload_profile_active &&
> >>>>>>>> !total_fences &&
> >>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>>>>>                          r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>> false);
> >>>>>>>>                          if (r)
> >>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>> amdgpu_ring *ring)
> >>>>>>>>          int r = 0;
> >>>>>>>>
> >>>>>>>>          atomic_inc(&vcn_inst->total_submission_cnt);
> >>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>> move this addition down inside the mutex lock
> >>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>>>>>
> >>>>>>>> -     /* We can safely return early here because we've cancelled the
> >>>>>>>> -      * the delayed work so there is no one else to set it to false
> >>>>>>>> -      * and we don't care if someone else sets it to true.
> >>>>>>>> -      */
> >>>>>>>> -     if (adev->vcn.workload_profile_active)
> >>>>>>>> -             goto pg_lock;
> >>>>>>>> -
> >>>>>>>>          mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>> move to here:
> >>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>> I think this should work for multiple instances.
> >>>>>> Why does this need to be protected by the mutex?
> >>>>> hmm.. OK - no need and it is actually better before the mutex.
> >>>>> David
> >>>>>> Alex
> >>>>>>
> >>>>>>> David
> >>>>>>>>          if (!adev->vcn.workload_profile_active) {
> >>>>>>>>                  r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>> amdgpu_ring *ring)
> >>>>>>>>          }
> >>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>
> >>>>>>>> -pg_lock:
> >>>>>>>>          mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>          vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >>>>>>>>
> >>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
> >>>>>>>> *ring)
> >>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
> >>>>>>>>
> >>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> >>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>>>>>
> >>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>>>>>                                VCN_IDLE_TIMEOUT);
> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>>>>>
> >>>>>>>>          uint16_t inst_mask;
> >>>>>>>>          uint8_t num_inst_per_aid;
> >>>>>>>> +     atomic_t                total_submission_cnt;
> >>>>>>>>
> >>>>>>>>          /* IP reg dump */
> >>>>>>>>          uint32_t                *ip_dump;
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 22:11                 ` Alex Deucher
@ 2025-08-13 23:06                   ` Wu, David
  2025-08-14 12:56                     ` Alex Deucher
  0 siblings, 1 reply; 36+ messages in thread
From: Wu, David @ 2025-08-13 23:06 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

On 8/13/2025 6:11 PM, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
>> On 8/13/2025 5:03 PM, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> Hi Alex, Hi David,
>>>>>>
>>>>>> I see David's concern but his suggestion yet wont solve the problem,
>>>>>> neither the current form , reason :-
>>>>>>
>>>>>> The emitted fence count and total submission count are fast transients
>>>>>> which frequently become 0 in between video decodes (between jobs) even
>>>>>> with the atomics and locks there can be a switch of video power profile,
>>>>>> in the current form of patch that window is minimized, but still can
>>>>>> happen if stress tested. But power state of any instance becoming zero
>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
>>>> completes, delayed idle work starts.
>>>> inst0 idle handler can read 0 total fences and 0 total submission count,
>>>> even if inst1 is actively decoding,
>>>> that's between the jobs,
>>>>     - as begin_use increaments vcn.total_submission_cnt and end_use
>>>> decreaments vcn.total_submission_cnt that can be 0.
>>>>     - if outstanding fences are cleared and no new emitted fence, between
>>>> jobs , can be 0.
>>>>     - both of the above conditions do not mean video decode is complete on
>>>> inst1, it is actively decoding.
>>> How can there be active decoding without an outstanding fence?  In
>>> that case, total_fences (fences from both instances) would be non-0.
>> I think it should be non-0.
>> I do see a hiccup possible - i.e the power switching from ON to OFF then
>> ON in the
>> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
>> it on.
> How would that happen? As long as there submission cnt is non-0 and
> there are outstanding fences on any instance, the video profile will
> stay active.
there could be no jobs but it doesn't timeout yet and new jobs will come in
any ms - note all fences are done at this time. The idle handler sees no 
fences
and no jobs so it turns off the power - but just ms later a new job is 
submitted
from the same decode session which could be mpv player as it does not 
need to
submit jobs without delays. This will turn on the power.
David
> Alex
>
>> We should avoid this glitch. This requires the idle handler sets/clears
>> a flag for
>> done for this instance as Sathish's original patch. When all instances
>> set/clear the
>> flag then we can safely power off.
>> David
>>> Alex
>>>
>>>> Whereas if instances are powered off we are sure idle time is past and
>>>> it is powered off, no possible way of
>>>> active video decode, when all instances are off we can safely assume no
>>>> active decode and global lock protects
>>>> it against new begin_use on any instance. But the only distant concern
>>>> is global common locks w.r.t perf, but we
>>>> are already having a global workprofile mutex , so there shouldn't be
>>>> any drop in perf, with just one single
>>>> global lock for all instances.
>>>>
>>>> Just sending out a patch with this fix, will leave it to you to decide
>>>> the right method. If you think outstanding total fences
>>>> can never be 0 during decode, then your previous version (v3) itself is
>>>> good, there is no real benefit of splitting the handlers as such.
>>>>
>>>> Regards,
>>>> Sathish
>>>>> If it is possible, maybe it would be easier to just split the profile
>>>>> and powergating into separate handlers.  The profile one would be
>>>>> global and the powergating one would be per instance.  See the
>>>>> attached patches.
>>>>>
>>>>> Alex
>>>>>
>>>>>> can be a sure shot indication of break in a video decode, the mistake in
>>>>>> my patch was using per instance mutex, I should have used a common
>>>>>> global mutex, then that covers the situation David is trying to bring out.
>>>>>>
>>>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
>>>>>> track power state could help us totally avoid this situation.
>>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> Sathish
>>>>>>
>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>> Hi Alex,
>>>>>>>>>
>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
>>>>>>>>> someone
>>>>>>>>> adds more before the lock and not reviewed thoroughly)
>>>>>>>>>       - up to you to decide.
>>>>>>>>>
>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> David
>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>> If there are multiple instances of the VCN running,
>>>>>>>>>> we may end up switching the video profile while another
>>>>>>>>>> instance is active because we only take into account
>>>>>>>>>> the current instance's submissions.  Look at all
>>>>>>>>>> outstanding fences for the video profile.
>>>>>>>>>>
>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>
>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>>>>>> handling")
>>>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>> ---
>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>       2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>           struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>                   container_of(work, struct amdgpu_vcn_inst,
>>>>>>>>>> idle_work.work);
>>>>>>>>>>           struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>           int r = 0;
>>>>>>>>>>
>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>                   return;
>>>>>>>>>>
>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>> -             fence[i] +=
>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>>>>>> +
>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>> +     }
>>>>>>>>>>
>>>>>>>>>>           /* Only set DPG pause for VCN3 or below, VCN4 and above will
>>>>>>>>>> be handled by FW */
>>>>>>>>>>           if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>                   struct dpg_pause_state new_state;
>>>>>>>>>>
>>>>>>>>>>                   if (fence[i] ||
>>>>>>>>>> @@ -436,18 +442,18 @@ static void
>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>                   else
>>>>>>>>>>                           new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>
>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>           }
>>>>>>>>>>
>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>> -
>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>                   mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>                   vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>                   mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>> !total_fences &&
>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>                           r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>> false);
>>>>>>>>>>                           if (r)
>>>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>           int r = 0;
>>>>>>>>>>
>>>>>>>>>>           atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>
>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>> -      */
>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>> -
>>>>>>>>>>           mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>> move to here:
>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>> I think this should work for multiple instances.
>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>> David
>>>>>>>> Alex
>>>>>>>>
>>>>>>>>> David
>>>>>>>>>>           if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>                   r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>           }
>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>
>>>>>>>>>> -pg_lock:
>>>>>>>>>>           mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>           vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>>>>>>>>
>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
>>>>>>>>>> *ring)
>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>>>>>>>>
>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>
>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>                                 VCN_IDLE_TIMEOUT);
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>
>>>>>>>>>>           uint16_t inst_mask;
>>>>>>>>>>           uint8_t num_inst_per_aid;
>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>
>>>>>>>>>>           /* IP reg dump */
>>>>>>>>>>           uint32_t                *ip_dump;


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-13 23:06                   ` Wu, David
@ 2025-08-14 12:56                     ` Alex Deucher
  2025-08-14 15:35                       ` David Wu
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-14 12:56 UTC (permalink / raw)
  To: Wu, David; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

On Wed, Aug 13, 2025 at 7:06 PM Wu, David <davidwu2@amd.com> wrote:
>
> On 8/13/2025 6:11 PM, Alex Deucher wrote:
> > On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
> >> On 8/13/2025 5:03 PM, Alex Deucher wrote:
> >>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> >>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
> >>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> >>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>> Hi Alex, Hi David,
> >>>>>>
> >>>>>> I see David's concern but his suggestion yet wont solve the problem,
> >>>>>> neither the current form , reason :-
> >>>>>>
> >>>>>> The emitted fence count and total submission count are fast transients
> >>>>>> which frequently become 0 in between video decodes (between jobs) even
> >>>>>> with the atomics and locks there can be a switch of video power profile,
> >>>>>> in the current form of patch that window is minimized, but still can
> >>>>>> happen if stress tested. But power state of any instance becoming zero
> >>>>> Can you explain how this can happen?  I'm not seeing it.
> >>>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
> >>>> completes, delayed idle work starts.
> >>>> inst0 idle handler can read 0 total fences and 0 total submission count,
> >>>> even if inst1 is actively decoding,
> >>>> that's between the jobs,
> >>>>     - as begin_use increaments vcn.total_submission_cnt and end_use
> >>>> decreaments vcn.total_submission_cnt that can be 0.
> >>>>     - if outstanding fences are cleared and no new emitted fence, between
> >>>> jobs , can be 0.
> >>>>     - both of the above conditions do not mean video decode is complete on
> >>>> inst1, it is actively decoding.
> >>> How can there be active decoding without an outstanding fence?  In
> >>> that case, total_fences (fences from both instances) would be non-0.
> >> I think it should be non-0.
> >> I do see a hiccup possible - i.e the power switching from ON to OFF then
> >> ON in the
> >> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
> >> it on.
> > How would that happen? As long as there submission cnt is non-0 and
> > there are outstanding fences on any instance, the video profile will
> > stay active.
> there could be no jobs but it doesn't timeout yet and new jobs will come in
> any ms - note all fences are done at this time. The idle handler sees no
> fences
> and no jobs so it turns off the power - but just ms later a new job is
> submitted
> from the same decode session which could be mpv player as it does not
> need to
> submit jobs without delays. This will turn on the power.

I'm not following.  Every submission will start with begin_use().

Alex

> David
> > Alex
> >
> >> We should avoid this glitch. This requires the idle handler sets/clears
> >> a flag for
> >> done for this instance as Sathish's original patch. When all instances
> >> set/clear the
> >> flag then we can safely power off.
> >> David
> >>> Alex
> >>>
> >>>> Whereas if instances are powered off we are sure idle time is past and
> >>>> it is powered off, no possible way of
> >>>> active video decode, when all instances are off we can safely assume no
> >>>> active decode and global lock protects
> >>>> it against new begin_use on any instance. But the only distant concern
> >>>> is global common locks w.r.t perf, but we
> >>>> are already having a global workprofile mutex , so there shouldn't be
> >>>> any drop in perf, with just one single
> >>>> global lock for all instances.
> >>>>
> >>>> Just sending out a patch with this fix, will leave it to you to decide
> >>>> the right method. If you think outstanding total fences
> >>>> can never be 0 during decode, then your previous version (v3) itself is
> >>>> good, there is no real benefit of splitting the handlers as such.
> >>>>
> >>>> Regards,
> >>>> Sathish
> >>>>> If it is possible, maybe it would be easier to just split the profile
> >>>>> and powergating into separate handlers.  The profile one would be
> >>>>> global and the powergating one would be per instance.  See the
> >>>>> attached patches.
> >>>>>
> >>>>> Alex
> >>>>>
> >>>>>> can be a sure shot indication of break in a video decode, the mistake in
> >>>>>> my patch was using per instance mutex, I should have used a common
> >>>>>> global mutex, then that covers the situation David is trying to bring out.
> >>>>>>
> >>>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
> >>>>>> track power state could help us totally avoid this situation.
> >>>>>>
> >>>>>> Regards,
> >>>>>>
> >>>>>> Sathish
> >>>>>>
> >>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
> >>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
> >>>>>>>>> Hi Alex,
> >>>>>>>>>
> >>>>>>>>> The addition of  total_submission_cnt should work - in that
> >>>>>>>>> it is unlikely to have a context switch right after the begin_use().
> >>>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
> >>>>>>>>> someone
> >>>>>>>>> adds more before the lock and not reviewed thoroughly)
> >>>>>>>>>       - up to you to decide.
> >>>>>>>>>
> >>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>>>>>>>
> >>>>>>>>> Thanks,
> >>>>>>>>> David
> >>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>>>>>>>> If there are multiple instances of the VCN running,
> >>>>>>>>>> we may end up switching the video profile while another
> >>>>>>>>>> instance is active because we only take into account
> >>>>>>>>>> the current instance's submissions.  Look at all
> >>>>>>>>>> outstanding fences for the video profile.
> >>>>>>>>>>
> >>>>>>>>>> v2: drop early exit in begin_use()
> >>>>>>>>>> v3: handle possible race between begin_use() work handler
> >>>>>>>>>>
> >>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
> >>>>>>>>>> handling")
> >>>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> >>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>>>>>> ---
> >>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>>>>>>>> ++++++++++++-------------
> >>>>>>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>>>>>>>       2 files changed, 21 insertions(+), 20 deletions(-)
> >>>>>>>>>>
> >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>> @@ -415,19 +415,25 @@ static void
> >>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>>>           struct amdgpu_vcn_inst *vcn_inst =
> >>>>>>>>>>                   container_of(work, struct amdgpu_vcn_inst,
> >>>>>>>>>> idle_work.work);
> >>>>>>>>>>           struct amdgpu_device *adev = vcn_inst->adev;
> >>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
> >>>>>>>>>> +     unsigned int total_fences = 0,
> >>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>> +     unsigned int i, j;
> >>>>>>>>>>           int r = 0;
> >>>>>>>>>>
> >>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
> >>>>>>>>>>                   return;
> >>>>>>>>>>
> >>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>>>>>>>> -             fence[i] +=
> >>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> >>>>>>>>>> +
> >>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>>>>>>>> +                     fence[i] +=
> >>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> >>>>>>>>>> +             total_fences += fence[i];
> >>>>>>>>>> +     }
> >>>>>>>>>>
> >>>>>>>>>>           /* Only set DPG pause for VCN3 or below, VCN4 and above will
> >>>>>>>>>> be handled by FW */
> >>>>>>>>>>           if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>>>>>>>> +         !vcn_inst->using_unified_queue) {
> >>>>>>>>>>                   struct dpg_pause_state new_state;
> >>>>>>>>>>
> >>>>>>>>>>                   if (fence[i] ||
> >>>>>>>>>> @@ -436,18 +442,18 @@ static void
> >>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>>>                   else
> >>>>>>>>>>                           new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
> >>>>>>>>>>
> >>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>>>           }
> >>>>>>>>>>
> >>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>>>>>>>> -     fences += fence[i];
> >>>>>>>>>> -
> >>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
> >>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>> +             /* This is specific to this instance */
> >>>>>>>>>>                   mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>                   vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>>>>>>>                   mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
> >>>>>>>>>> +             /* This is global and depends on all VCN instances */
> >>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
> >>>>>>>>>> !total_fences &&
> >>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>>>>>>>                           r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>>>> false);
> >>>>>>>>>>                           if (r)
> >>>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>>>> amdgpu_ring *ring)
> >>>>>>>>>>           int r = 0;
> >>>>>>>>>>
> >>>>>>>>>>           atomic_inc(&vcn_inst->total_submission_cnt);
> >>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>> move this addition down inside the mutex lock
> >>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>>>>>>>
> >>>>>>>>>> -     /* We can safely return early here because we've cancelled the
> >>>>>>>>>> -      * the delayed work so there is no one else to set it to false
> >>>>>>>>>> -      * and we don't care if someone else sets it to true.
> >>>>>>>>>> -      */
> >>>>>>>>>> -     if (adev->vcn.workload_profile_active)
> >>>>>>>>>> -             goto pg_lock;
> >>>>>>>>>> -
> >>>>>>>>>>           mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>> move to here:
> >>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>> I think this should work for multiple instances.
> >>>>>>>> Why does this need to be protected by the mutex?
> >>>>>>> hmm.. OK - no need and it is actually better before the mutex.
> >>>>>>> David
> >>>>>>>> Alex
> >>>>>>>>
> >>>>>>>>> David
> >>>>>>>>>>           if (!adev->vcn.workload_profile_active) {
> >>>>>>>>>>                   r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>>>> amdgpu_ring *ring)
> >>>>>>>>>>           }
> >>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>
> >>>>>>>>>> -pg_lock:
> >>>>>>>>>>           mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>           vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >>>>>>>>>>
> >>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
> >>>>>>>>>> *ring)
> >>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
> >>>>>>>>>>
> >>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> >>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>>>>>>>
> >>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>>>>>>>                                 VCN_IDLE_TIMEOUT);
> >>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>>>>>>>
> >>>>>>>>>>           uint16_t inst_mask;
> >>>>>>>>>>           uint8_t num_inst_per_aid;
> >>>>>>>>>> +     atomic_t                total_submission_cnt;
> >>>>>>>>>>
> >>>>>>>>>>           /* IP reg dump */
> >>>>>>>>>>           uint32_t                *ip_dump;
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 12:56                     ` Alex Deucher
@ 2025-08-14 15:35                       ` David Wu
  2025-08-14 16:01                         ` Alex Deucher
  0 siblings, 1 reply; 36+ messages in thread
From: David Wu @ 2025-08-14 15:35 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

On 2025-08-14 08:56, Alex Deucher wrote:
> On Wed, Aug 13, 2025 at 7:06 PM Wu, David <davidwu2@amd.com> wrote:
>> On 8/13/2025 6:11 PM, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
>>>> On 8/13/2025 5:03 PM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>> Hi Alex, Hi David,
>>>>>>>>
>>>>>>>> I see David's concern but his suggestion yet wont solve the problem,
>>>>>>>> neither the current form , reason :-
>>>>>>>>
>>>>>>>> The emitted fence count and total submission count are fast transients
>>>>>>>> which frequently become 0 in between video decodes (between jobs) even
>>>>>>>> with the atomics and locks there can be a switch of video power profile,
>>>>>>>> in the current form of patch that window is minimized, but still can
>>>>>>>> happen if stress tested. But power state of any instance becoming zero
>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
>>>>>> completes, delayed idle work starts.
>>>>>> inst0 idle handler can read 0 total fences and 0 total submission count,
>>>>>> even if inst1 is actively decoding,
>>>>>> that's between the jobs,
>>>>>>      - as begin_use increaments vcn.total_submission_cnt and end_use
>>>>>> decreaments vcn.total_submission_cnt that can be 0.
>>>>>>      - if outstanding fences are cleared and no new emitted fence, between
>>>>>> jobs , can be 0.
>>>>>>      - both of the above conditions do not mean video decode is complete on
>>>>>> inst1, it is actively decoding.
>>>>> How can there be active decoding without an outstanding fence?  In
>>>>> that case, total_fences (fences from both instances) would be non-0.
>>>> I think it should be non-0.
>>>> I do see a hiccup possible - i.e the power switching from ON to OFF then
>>>> ON in the
>>>> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
>>>> it on.
>>> How would that happen? As long as there submission cnt is non-0 and
>>> there are outstanding fences on any instance, the video profile will
>>> stay active.
>> there could be no jobs but it doesn't timeout yet and new jobs will come in
>> any ms - note all fences are done at this time. The idle handler sees no
>> fences
>> and no jobs so it turns off the power - but just ms later a new job is
>> submitted
>> from the same decode session which could be mpv player as it does not
>> need to
>> submit jobs without delays. This will turn on the power.
> I'm not following.  Every submission will start with begin_use().
yes - it does - it can power on vcn but this happens in the middle of a
decode session which has 10s timeout to call its own idle handler - in fact
the other instance's idle handler will power off vcn because it does not 
know it needs
to wait until the decoding session times out.
> Alex
>
>> David
>>> Alex
>>>
>>>> We should avoid this glitch. This requires the idle handler sets/clears
>>>> a flag for
>>>> done for this instance as Sathish's original patch. When all instances
>>>> set/clear the
>>>> flag then we can safely power off.
>>>> David
>>>>> Alex
>>>>>
>>>>>> Whereas if instances are powered off we are sure idle time is past and
>>>>>> it is powered off, no possible way of
>>>>>> active video decode, when all instances are off we can safely assume no
>>>>>> active decode and global lock protects
>>>>>> it against new begin_use on any instance. But the only distant concern
>>>>>> is global common locks w.r.t perf, but we
>>>>>> are already having a global workprofile mutex , so there shouldn't be
>>>>>> any drop in perf, with just one single
>>>>>> global lock for all instances.
>>>>>>
>>>>>> Just sending out a patch with this fix, will leave it to you to decide
>>>>>> the right method. If you think outstanding total fences
>>>>>> can never be 0 during decode, then your previous version (v3) itself is
>>>>>> good, there is no real benefit of splitting the handlers as such.
>>>>>>
>>>>>> Regards,
>>>>>> Sathish
>>>>>>> If it is possible, maybe it would be easier to just split the profile
>>>>>>> and powergating into separate handlers.  The profile one would be
>>>>>>> global and the powergating one would be per instance.  See the
>>>>>>> attached patches.
>>>>>>>
>>>>>>> Alex
>>>>>>>
>>>>>>>> can be a sure shot indication of break in a video decode, the mistake in
>>>>>>>> my patch was using per instance mutex, I should have used a common
>>>>>>>> global mutex, then that covers the situation David is trying to bring out.
>>>>>>>>
>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
>>>>>>>> track power state could help us totally avoid this situation.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>>
>>>>>>>> Sathish
>>>>>>>>
>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>
>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
>>>>>>>>>>> someone
>>>>>>>>>>> adds more before the lock and not reviewed thoroughly)
>>>>>>>>>>>        - up to you to decide.
>>>>>>>>>>>
>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>
>>>>>>>>>>> Thanks,
>>>>>>>>>>> David
>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>> If there are multiple instances of the VCN running,
>>>>>>>>>>>> we may end up switching the video profile while another
>>>>>>>>>>>> instance is active because we only take into account
>>>>>>>>>>>> the current instance's submissions.  Look at all
>>>>>>>>>>>> outstanding fences for the video profile.
>>>>>>>>>>>>
>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>
>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>>>>>>>> handling")
>>>>>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>> ---
>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>        2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>            struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>                    container_of(work, struct amdgpu_vcn_inst,
>>>>>>>>>>>> idle_work.work);
>>>>>>>>>>>>            struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>            int r = 0;
>>>>>>>>>>>>
>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>>                    return;
>>>>>>>>>>>>
>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>>>>>>>> +
>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>> +     }
>>>>>>>>>>>>
>>>>>>>>>>>>            /* Only set DPG pause for VCN3 or below, VCN4 and above will
>>>>>>>>>>>> be handled by FW */
>>>>>>>>>>>>            if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>                    struct dpg_pause_state new_state;
>>>>>>>>>>>>
>>>>>>>>>>>>                    if (fence[i] ||
>>>>>>>>>>>> @@ -436,18 +442,18 @@ static void
>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>                    else
>>>>>>>>>>>>                            new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>
>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>>            }
>>>>>>>>>>>>
>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>> -
>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>                    mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>                    vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>                    mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>                            r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>>> false);
>>>>>>>>>>>>                            if (r)
>>>>>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>>>            int r = 0;
>>>>>>>>>>>>
>>>>>>>>>>>>            atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>
>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>> -      */
>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>> -
>>>>>>>>>>>>            mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>> move to here:
>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>> David
>>>>>>>>>> Alex
>>>>>>>>>>
>>>>>>>>>>> David
>>>>>>>>>>>>            if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>                    r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>>>            }
>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>
>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>            mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>            vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>
>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
>>>>>>>>>>>> *ring)
>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>>>>>>>>>>
>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>
>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>>                                  VCN_IDLE_TIMEOUT);
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>
>>>>>>>>>>>>            uint16_t inst_mask;
>>>>>>>>>>>>            uint8_t num_inst_per_aid;
>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>>
>>>>>>>>>>>>            /* IP reg dump */
>>>>>>>>>>>>            uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 15:35                       ` David Wu
@ 2025-08-14 16:01                         ` Alex Deucher
  2025-08-14 16:44                           ` David Wu
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-14 16:01 UTC (permalink / raw)
  To: David Wu; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

On Thu, Aug 14, 2025 at 11:35 AM David Wu <davidwu2@amd.com> wrote:
>
> On 2025-08-14 08:56, Alex Deucher wrote:
> > On Wed, Aug 13, 2025 at 7:06 PM Wu, David <davidwu2@amd.com> wrote:
> >> On 8/13/2025 6:11 PM, Alex Deucher wrote:
> >>> On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
> >>>> On 8/13/2025 5:03 PM, Alex Deucher wrote:
> >>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> >>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
> >>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> >>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>>>> Hi Alex, Hi David,
> >>>>>>>>
> >>>>>>>> I see David's concern but his suggestion yet wont solve the problem,
> >>>>>>>> neither the current form , reason :-
> >>>>>>>>
> >>>>>>>> The emitted fence count and total submission count are fast transients
> >>>>>>>> which frequently become 0 in between video decodes (between jobs) even
> >>>>>>>> with the atomics and locks there can be a switch of video power profile,
> >>>>>>>> in the current form of patch that window is minimized, but still can
> >>>>>>>> happen if stress tested. But power state of any instance becoming zero
> >>>>>>> Can you explain how this can happen?  I'm not seeing it.
> >>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
> >>>>>> completes, delayed idle work starts.
> >>>>>> inst0 idle handler can read 0 total fences and 0 total submission count,
> >>>>>> even if inst1 is actively decoding,
> >>>>>> that's between the jobs,
> >>>>>>      - as begin_use increaments vcn.total_submission_cnt and end_use
> >>>>>> decreaments vcn.total_submission_cnt that can be 0.
> >>>>>>      - if outstanding fences are cleared and no new emitted fence, between
> >>>>>> jobs , can be 0.
> >>>>>>      - both of the above conditions do not mean video decode is complete on
> >>>>>> inst1, it is actively decoding.
> >>>>> How can there be active decoding without an outstanding fence?  In
> >>>>> that case, total_fences (fences from both instances) would be non-0.
> >>>> I think it should be non-0.
> >>>> I do see a hiccup possible - i.e the power switching from ON to OFF then
> >>>> ON in the
> >>>> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
> >>>> it on.
> >>> How would that happen? As long as there submission cnt is non-0 and
> >>> there are outstanding fences on any instance, the video profile will
> >>> stay active.
> >> there could be no jobs but it doesn't timeout yet and new jobs will come in
> >> any ms - note all fences are done at this time. The idle handler sees no
> >> fences
> >> and no jobs so it turns off the power - but just ms later a new job is
> >> submitted
> >> from the same decode session which could be mpv player as it does not
> >> need to
> >> submit jobs without delays. This will turn on the power.
> > I'm not following.  Every submission will start with begin_use().
> yes - it does - it can power on vcn but this happens in the middle of a
> decode session which has 10s timeout to call its own idle handler - in fact
> the other instance's idle handler will power off vcn because it does not
> know it needs
> to wait until the decoding session times out.

I don't follow.  If there are no outstanding fences, there is no
reason to not power down the VCN instance and disable the video
profile. If there are still outstanding fences, then the VCN instance
those fences are associated with will stay on and the video profile
will stay enabled.  If the engine hangs and eventually gets reset, the
fence will be signalled and then there will be no outstanding fences
so the idle handler will eventually disable the power profile.  The
idle handler will keep getting rescheduled as long as there is still
oustanding work.

Alex


> > Alex
> >
> >> David
> >>> Alex
> >>>
> >>>> We should avoid this glitch. This requires the idle handler sets/clears
> >>>> a flag for
> >>>> done for this instance as Sathish's original patch. When all instances
> >>>> set/clear the
> >>>> flag then we can safely power off.
> >>>> David
> >>>>> Alex
> >>>>>
> >>>>>> Whereas if instances are powered off we are sure idle time is past and
> >>>>>> it is powered off, no possible way of
> >>>>>> active video decode, when all instances are off we can safely assume no
> >>>>>> active decode and global lock protects
> >>>>>> it against new begin_use on any instance. But the only distant concern
> >>>>>> is global common locks w.r.t perf, but we
> >>>>>> are already having a global workprofile mutex , so there shouldn't be
> >>>>>> any drop in perf, with just one single
> >>>>>> global lock for all instances.
> >>>>>>
> >>>>>> Just sending out a patch with this fix, will leave it to you to decide
> >>>>>> the right method. If you think outstanding total fences
> >>>>>> can never be 0 during decode, then your previous version (v3) itself is
> >>>>>> good, there is no real benefit of splitting the handlers as such.
> >>>>>>
> >>>>>> Regards,
> >>>>>> Sathish
> >>>>>>> If it is possible, maybe it would be easier to just split the profile
> >>>>>>> and powergating into separate handlers.  The profile one would be
> >>>>>>> global and the powergating one would be per instance.  See the
> >>>>>>> attached patches.
> >>>>>>>
> >>>>>>> Alex
> >>>>>>>
> >>>>>>>> can be a sure shot indication of break in a video decode, the mistake in
> >>>>>>>> my patch was using per instance mutex, I should have used a common
> >>>>>>>> global mutex, then that covers the situation David is trying to bring out.
> >>>>>>>>
> >>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
> >>>>>>>> track power state could help us totally avoid this situation.
> >>>>>>>>
> >>>>>>>> Regards,
> >>>>>>>>
> >>>>>>>> Sathish
> >>>>>>>>
> >>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
> >>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
> >>>>>>>>>>> Hi Alex,
> >>>>>>>>>>>
> >>>>>>>>>>> The addition of  total_submission_cnt should work - in that
> >>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
> >>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
> >>>>>>>>>>> someone
> >>>>>>>>>>> adds more before the lock and not reviewed thoroughly)
> >>>>>>>>>>>        - up to you to decide.
> >>>>>>>>>>>
> >>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>>>>>>>>>
> >>>>>>>>>>> Thanks,
> >>>>>>>>>>> David
> >>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>>>>>>>>>> If there are multiple instances of the VCN running,
> >>>>>>>>>>>> we may end up switching the video profile while another
> >>>>>>>>>>>> instance is active because we only take into account
> >>>>>>>>>>>> the current instance's submissions.  Look at all
> >>>>>>>>>>>> outstanding fences for the video profile.
> >>>>>>>>>>>>
> >>>>>>>>>>>> v2: drop early exit in begin_use()
> >>>>>>>>>>>> v3: handle possible race between begin_use() work handler
> >>>>>>>>>>>>
> >>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
> >>>>>>>>>>>> handling")
> >>>>>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> >>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>>>>>>>> ---
> >>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>>>>>>>>>> ++++++++++++-------------
> >>>>>>>>>>>>        drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>>>>>>>>>        2 files changed, 21 insertions(+), 20 deletions(-)
> >>>>>>>>>>>>
> >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
> >>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>>>>>            struct amdgpu_vcn_inst *vcn_inst =
> >>>>>>>>>>>>                    container_of(work, struct amdgpu_vcn_inst,
> >>>>>>>>>>>> idle_work.work);
> >>>>>>>>>>>>            struct amdgpu_device *adev = vcn_inst->adev;
> >>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
> >>>>>>>>>>>> +     unsigned int total_fences = 0,
> >>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>>>> +     unsigned int i, j;
> >>>>>>>>>>>>            int r = 0;
> >>>>>>>>>>>>
> >>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
> >>>>>>>>>>>>                    return;
> >>>>>>>>>>>>
> >>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>>>>>>>>>> -             fence[i] +=
> >>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> >>>>>>>>>>>> +
> >>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>>>>>>>>>> +                     fence[i] +=
> >>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> >>>>>>>>>>>> +             total_fences += fence[i];
> >>>>>>>>>>>> +     }
> >>>>>>>>>>>>
> >>>>>>>>>>>>            /* Only set DPG pause for VCN3 or below, VCN4 and above will
> >>>>>>>>>>>> be handled by FW */
> >>>>>>>>>>>>            if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
> >>>>>>>>>>>>                    struct dpg_pause_state new_state;
> >>>>>>>>>>>>
> >>>>>>>>>>>>                    if (fence[i] ||
> >>>>>>>>>>>> @@ -436,18 +442,18 @@ static void
> >>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>>>>>                    else
> >>>>>>>>>>>>                            new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
> >>>>>>>>>>>>
> >>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>>>>>            }
> >>>>>>>>>>>>
> >>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>>>>>>>>>> -     fences += fence[i];
> >>>>>>>>>>>> -
> >>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
> >>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>>>> +             /* This is specific to this instance */
> >>>>>>>>>>>>                    mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>                    vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>>>>>>>>>                    mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
> >>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
> >>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
> >>>>>>>>>>>> !total_fences &&
> >>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>>>>>>>>>                            r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>>>>>> false);
> >>>>>>>>>>>>                            if (r)
> >>>>>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>>>>>> amdgpu_ring *ring)
> >>>>>>>>>>>>            int r = 0;
> >>>>>>>>>>>>
> >>>>>>>>>>>>            atomic_inc(&vcn_inst->total_submission_cnt);
> >>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>>>> move this addition down inside the mutex lock
> >>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>>>>>>>>>
> >>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
> >>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
> >>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
> >>>>>>>>>>>> -      */
> >>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
> >>>>>>>>>>>> -             goto pg_lock;
> >>>>>>>>>>>> -
> >>>>>>>>>>>>            mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>> move to here:
> >>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>>>> I think this should work for multiple instances.
> >>>>>>>>>> Why does this need to be protected by the mutex?
> >>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
> >>>>>>>>> David
> >>>>>>>>>> Alex
> >>>>>>>>>>
> >>>>>>>>>>> David
> >>>>>>>>>>>>            if (!adev->vcn.workload_profile_active) {
> >>>>>>>>>>>>                    r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>>>>>> amdgpu_ring *ring)
> >>>>>>>>>>>>            }
> >>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>
> >>>>>>>>>>>> -pg_lock:
> >>>>>>>>>>>>            mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>            vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >>>>>>>>>>>>
> >>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
> >>>>>>>>>>>> *ring)
> >>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
> >>>>>>>>>>>>
> >>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> >>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>
> >>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>>>>>>>>>                                  VCN_IDLE_TIMEOUT);
> >>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>>>>>>>>>
> >>>>>>>>>>>>            uint16_t inst_mask;
> >>>>>>>>>>>>            uint8_t num_inst_per_aid;
> >>>>>>>>>>>> +     atomic_t                total_submission_cnt;
> >>>>>>>>>>>>
> >>>>>>>>>>>>            /* IP reg dump */
> >>>>>>>>>>>>            uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 16:01                         ` Alex Deucher
@ 2025-08-14 16:44                           ` David Wu
  2025-08-14 18:00                             ` Alex Deucher
  0 siblings, 1 reply; 36+ messages in thread
From: David Wu @ 2025-08-14 16:44 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx


On 2025-08-14 12:01, Alex Deucher wrote:
> On Thu, Aug 14, 2025 at 11:35 AM David Wu <davidwu2@amd.com> wrote:
>> On 2025-08-14 08:56, Alex Deucher wrote:
>>> On Wed, Aug 13, 2025 at 7:06 PM Wu, David <davidwu2@amd.com> wrote:
>>>> On 8/13/2025 6:11 PM, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>> On 8/13/2025 5:03 PM, Alex Deucher wrote:
>>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>>>> Hi Alex, Hi David,
>>>>>>>>>>
>>>>>>>>>> I see David's concern but his suggestion yet wont solve the problem,
>>>>>>>>>> neither the current form , reason :-
>>>>>>>>>>
>>>>>>>>>> The emitted fence count and total submission count are fast transients
>>>>>>>>>> which frequently become 0 in between video decodes (between jobs) even
>>>>>>>>>> with the atomics and locks there can be a switch of video power profile,
>>>>>>>>>> in the current form of patch that window is minimized, but still can
>>>>>>>>>> happen if stress tested. But power state of any instance becoming zero
>>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
>>>>>>>> completes, delayed idle work starts.
>>>>>>>> inst0 idle handler can read 0 total fences and 0 total submission count,
>>>>>>>> even if inst1 is actively decoding,
>>>>>>>> that's between the jobs,
>>>>>>>>       - as begin_use increaments vcn.total_submission_cnt and end_use
>>>>>>>> decreaments vcn.total_submission_cnt that can be 0.
>>>>>>>>       - if outstanding fences are cleared and no new emitted fence, between
>>>>>>>> jobs , can be 0.
>>>>>>>>       - both of the above conditions do not mean video decode is complete on
>>>>>>>> inst1, it is actively decoding.
>>>>>>> How can there be active decoding without an outstanding fence?  In
>>>>>>> that case, total_fences (fences from both instances) would be non-0.
>>>>>> I think it should be non-0.
>>>>>> I do see a hiccup possible - i.e the power switching from ON to OFF then
>>>>>> ON in the
>>>>>> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
>>>>>> it on.
>>>>> How would that happen? As long as there submission cnt is non-0 and
>>>>> there are outstanding fences on any instance, the video profile will
>>>>> stay active.
>>>> there could be no jobs but it doesn't timeout yet and new jobs will come in
>>>> any ms - note all fences are done at this time. The idle handler sees no
>>>> fences
>>>> and no jobs so it turns off the power - but just ms later a new job is
>>>> submitted
>>>> from the same decode session which could be mpv player as it does not
>>>> need to
>>>> submit jobs without delays. This will turn on the power.
>>> I'm not following.  Every submission will start with begin_use().
>> yes - it does - it can power on vcn but this happens in the middle of a
>> decode session which has 10s timeout to call its own idle handler - in fact
>> the other instance's idle handler will power off vcn because it does not
>> know it needs
>> to wait until the decoding session times out.
> I don't follow.  If there are no outstanding fences, there is no
> reason to not power down the VCN instance and disable the video
> profile. If there are still outstanding fences, then the VCN instance
> those fences are associated with will stay on and the video profile
> will stay enabled.  If the engine hangs and eventually gets reset, the
> fence will be signalled and then there will be no outstanding fences
> so the idle handler will eventually disable the power profile.  The
> idle handler will keep getting rescheduled as long as there is still
> oustanding work.
inst0 and inst1:
inst0 sends jobA, then ends jobA and no more job submitted in 500ms and
job queue is empty - at this point  inst1's idle handler sees no 
outstanding fences/jobs
then power off.  However inst0 starts to submit job after 500ms - inst0' 
idle handler
has not started/scheduled to run but inst1's has finished already which
does not know inst0 has not timed out or called its own idle handler. 
This violates the
logic for idle handler's timeout condition. (i.e 10s timeout designed 
but timed out in 500ms)
all this means it powered down too early for inst0.
> Alex
>
>
>>> Alex
>>>
>>>> David
>>>>> Alex
>>>>>
>>>>>> We should avoid this glitch. This requires the idle handler sets/clears
>>>>>> a flag for
>>>>>> done for this instance as Sathish's original patch. When all instances
>>>>>> set/clear the
>>>>>> flag then we can safely power off.
>>>>>> David
>>>>>>> Alex
>>>>>>>
>>>>>>>> Whereas if instances are powered off we are sure idle time is past and
>>>>>>>> it is powered off, no possible way of
>>>>>>>> active video decode, when all instances are off we can safely assume no
>>>>>>>> active decode and global lock protects
>>>>>>>> it against new begin_use on any instance. But the only distant concern
>>>>>>>> is global common locks w.r.t perf, but we
>>>>>>>> are already having a global workprofile mutex , so there shouldn't be
>>>>>>>> any drop in perf, with just one single
>>>>>>>> global lock for all instances.
>>>>>>>>
>>>>>>>> Just sending out a patch with this fix, will leave it to you to decide
>>>>>>>> the right method. If you think outstanding total fences
>>>>>>>> can never be 0 during decode, then your previous version (v3) itself is
>>>>>>>> good, there is no real benefit of splitting the handlers as such.
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Sathish
>>>>>>>>> If it is possible, maybe it would be easier to just split the profile
>>>>>>>>> and powergating into separate handlers.  The profile one would be
>>>>>>>>> global and the powergating one would be per instance.  See the
>>>>>>>>> attached patches.
>>>>>>>>>
>>>>>>>>> Alex
>>>>>>>>>
>>>>>>>>>> can be a sure shot indication of break in a video decode, the mistake in
>>>>>>>>>> my patch was using per instance mutex, I should have used a common
>>>>>>>>>> global mutex, then that covers the situation David is trying to bring out.
>>>>>>>>>>
>>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
>>>>>>>>>> track power state could help us totally avoid this situation.
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>>
>>>>>>>>>> Sathish
>>>>>>>>>>
>>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
>>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
>>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
>>>>>>>>>>>>> Hi Alex,
>>>>>>>>>>>>>
>>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
>>>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
>>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
>>>>>>>>>>>>> someone
>>>>>>>>>>>>> adds more before the lock and not reviewed thoroughly)
>>>>>>>>>>>>>         - up to you to decide.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Thanks,
>>>>>>>>>>>>> David
>>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
>>>>>>>>>>>>>> If there are multiple instances of the VCN running,
>>>>>>>>>>>>>> we may end up switching the video profile while another
>>>>>>>>>>>>>> instance is active because we only take into account
>>>>>>>>>>>>>> the current instance's submissions.  Look at all
>>>>>>>>>>>>>> outstanding fences for the video profile.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> v2: drop early exit in begin_use()
>>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
>>>>>>>>>>>>>> handling")
>>>>>>>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
>>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
>>>>>>>>>>>>>> ++++++++++++-------------
>>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
>>>>>>>>>>>>>>         2 files changed, 21 insertions(+), 20 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
>>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
>>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>>>             struct amdgpu_vcn_inst *vcn_inst =
>>>>>>>>>>>>>>                     container_of(work, struct amdgpu_vcn_inst,
>>>>>>>>>>>>>> idle_work.work);
>>>>>>>>>>>>>>             struct amdgpu_device *adev = vcn_inst->adev;
>>>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
>>>>>>>>>>>>>> +     unsigned int total_fences = 0,
>>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
>>>>>>>>>>>>>> +     unsigned int i, j;
>>>>>>>>>>>>>>             int r = 0;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
>>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
>>>>>>>>>>>>>>                     return;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
>>>>>>>>>>>>>> -             fence[i] +=
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
>>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
>>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
>>>>>>>>>>>>>> +                     fence[i] +=
>>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
>>>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
>>>>>>>>>>>>>> +             total_fences += fence[i];
>>>>>>>>>>>>>> +     }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>             /* Only set DPG pause for VCN3 or below, VCN4 and above will
>>>>>>>>>>>>>> be handled by FW */
>>>>>>>>>>>>>>             if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
>>>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
>>>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
>>>>>>>>>>>>>>                     struct dpg_pause_state new_state;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>                     if (fence[i] ||
>>>>>>>>>>>>>> @@ -436,18 +442,18 @@ static void
>>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
>>>>>>>>>>>>>>                     else
>>>>>>>>>>>>>>                             new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
>>>>>>>>>>>>>>             }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
>>>>>>>>>>>>>> -     fences += fence[i];
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
>>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
>>>>>>>>>>>>>> +             /* This is specific to this instance */
>>>>>>>>>>>>>>                     mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>>                     vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
>>>>>>>>>>>>>>                     mutex_unlock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
>>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
>>>>>>>>>>>>>> !total_fences &&
>>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
>>>>>>>>>>>>>>                             r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>>>>> false);
>>>>>>>>>>>>>>                             if (r)
>>>>>>>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>>>>>             int r = 0;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>             atomic_inc(&vcn_inst->total_submission_cnt);
>>>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>>> move this addition down inside the mutex lock
>>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
>>>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
>>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
>>>>>>>>>>>>>> -      */
>>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
>>>>>>>>>>>>>> -             goto pg_lock;
>>>>>>>>>>>>>> -
>>>>>>>>>>>>>>             mutex_lock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>> move to here:
>>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
>>>>>>>>>>>>> I think this should work for multiple instances.
>>>>>>>>>>>> Why does this need to be protected by the mutex?
>>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
>>>>>>>>>>> David
>>>>>>>>>>>> Alex
>>>>>>>>>>>>
>>>>>>>>>>>>> David
>>>>>>>>>>>>>>             if (!adev->vcn.workload_profile_active) {
>>>>>>>>>>>>>>                     r = amdgpu_dpm_switch_power_profile(adev,
>>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
>>>>>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
>>>>>>>>>>>>>> amdgpu_ring *ring)
>>>>>>>>>>>>>>             }
>>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> -pg_lock:
>>>>>>>>>>>>>>             mutex_lock(&vcn_inst->vcn_pg_lock);
>>>>>>>>>>>>>>             vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
>>>>>>>>>>>>>> *ring)
>>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
>>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
>>>>>>>>>>>>>>                                   VCN_IDLE_TIMEOUT);
>>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
>>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
>>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>             uint16_t inst_mask;
>>>>>>>>>>>>>>             uint8_t num_inst_per_aid;
>>>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>             /* IP reg dump */
>>>>>>>>>>>>>>             uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 16:44                           ` David Wu
@ 2025-08-14 18:00                             ` Alex Deucher
  2025-08-14 19:18                               ` David Wu
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-14 18:00 UTC (permalink / raw)
  To: David Wu; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

On Thu, Aug 14, 2025 at 12:44 PM David Wu <davidwu2@amd.com> wrote:
>
>
> On 2025-08-14 12:01, Alex Deucher wrote:
> > On Thu, Aug 14, 2025 at 11:35 AM David Wu <davidwu2@amd.com> wrote:
> >> On 2025-08-14 08:56, Alex Deucher wrote:
> >>> On Wed, Aug 13, 2025 at 7:06 PM Wu, David <davidwu2@amd.com> wrote:
> >>>> On 8/13/2025 6:11 PM, Alex Deucher wrote:
> >>>>> On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
> >>>>>> On 8/13/2025 5:03 PM, Alex Deucher wrote:
> >>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> >>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
> >>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> >>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
> >>>>>>>>>> Hi Alex, Hi David,
> >>>>>>>>>>
> >>>>>>>>>> I see David's concern but his suggestion yet wont solve the problem,
> >>>>>>>>>> neither the current form , reason :-
> >>>>>>>>>>
> >>>>>>>>>> The emitted fence count and total submission count are fast transients
> >>>>>>>>>> which frequently become 0 in between video decodes (between jobs) even
> >>>>>>>>>> with the atomics and locks there can be a switch of video power profile,
> >>>>>>>>>> in the current form of patch that window is minimized, but still can
> >>>>>>>>>> happen if stress tested. But power state of any instance becoming zero
> >>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
> >>>>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
> >>>>>>>> completes, delayed idle work starts.
> >>>>>>>> inst0 idle handler can read 0 total fences and 0 total submission count,
> >>>>>>>> even if inst1 is actively decoding,
> >>>>>>>> that's between the jobs,
> >>>>>>>>       - as begin_use increaments vcn.total_submission_cnt and end_use
> >>>>>>>> decreaments vcn.total_submission_cnt that can be 0.
> >>>>>>>>       - if outstanding fences are cleared and no new emitted fence, between
> >>>>>>>> jobs , can be 0.
> >>>>>>>>       - both of the above conditions do not mean video decode is complete on
> >>>>>>>> inst1, it is actively decoding.
> >>>>>>> How can there be active decoding without an outstanding fence?  In
> >>>>>>> that case, total_fences (fences from both instances) would be non-0.
> >>>>>> I think it should be non-0.
> >>>>>> I do see a hiccup possible - i.e the power switching from ON to OFF then
> >>>>>> ON in the
> >>>>>> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
> >>>>>> it on.
> >>>>> How would that happen? As long as there submission cnt is non-0 and
> >>>>> there are outstanding fences on any instance, the video profile will
> >>>>> stay active.
> >>>> there could be no jobs but it doesn't timeout yet and new jobs will come in
> >>>> any ms - note all fences are done at this time. The idle handler sees no
> >>>> fences
> >>>> and no jobs so it turns off the power - but just ms later a new job is
> >>>> submitted
> >>>> from the same decode session which could be mpv player as it does not
> >>>> need to
> >>>> submit jobs without delays. This will turn on the power.
> >>> I'm not following.  Every submission will start with begin_use().
> >> yes - it does - it can power on vcn but this happens in the middle of a
> >> decode session which has 10s timeout to call its own idle handler - in fact
> >> the other instance's idle handler will power off vcn because it does not
> >> know it needs
> >> to wait until the decoding session times out.
> > I don't follow.  If there are no outstanding fences, there is no
> > reason to not power down the VCN instance and disable the video
> > profile. If there are still outstanding fences, then the VCN instance
> > those fences are associated with will stay on and the video profile
> > will stay enabled.  If the engine hangs and eventually gets reset, the
> > fence will be signalled and then there will be no outstanding fences
> > so the idle handler will eventually disable the power profile.  The
> > idle handler will keep getting rescheduled as long as there is still
> > oustanding work.
> inst0 and inst1:
> inst0 sends jobA, then ends jobA and no more job submitted in 500ms and
> job queue is empty - at this point  inst1's idle handler sees no
> outstanding fences/jobs
> then power off.  However inst0 starts to submit job after 500ms - inst0'
> idle handler
> has not started/scheduled to run but inst1's has finished already which
> does not know inst0 has not timed out or called its own idle handler.
> This violates the
> logic for idle handler's timeout condition. (i.e 10s timeout designed
> but timed out in 500ms)
> all this means it powered down too early for inst0.

I still don't follow.  Here's a sample flow.  Job comes in on inst 0
and then slightly later on inst 1.

Inst 0 job submission
Inst 0 calls begin_use().  This cancels the current inst worker
thread.  It enables the video profile and ungates the instance.
IBs and fence packets get submitted to instance 0 of the engine
Inst 0 calls end_use().  This schedules the worker thread for
VCN_IDLE_TIMEOUT jiffies in the future.

Inst 1 job submission:
Inst 1 calls begin_use().  This cancels the current inst worker
thread.  It sees the video profile is enabled and ungates the
instance.
IBs and fence packets get submitted to instance 1 of the engine
Inst 1 calls end_use().  This schedules the worker thread for
VCN_IDLE_TIMEOUT jiffies in the future.

inst 0 work hander runs.  Sees outstanding fences on inst 0; skips
powergating inst 0, skips disabling video profile. Schedules the
worker thread for VCN_IDLE_TIMEOUT jiffies in the future.

inst 0 IB completes and fence signals

inst 1 IB completes and fence signals

inst 1 work hander runs.  Sees no outstanding fences on inst 1.
powergates inst 1.  Check if there are any outstanding fences on other
instances.  Sees the no fences from inst 0 so disables the video
profile.

inst 0 work hander runs.  Sees no outstanding fences on inst 0.
powergates inst 0.  Check if there are any outstanding fences on other
instances.  Sees the no fences from inst 1, sees that video profile is
already disabled.

You can insert additional job submissions anywhere you want in the timeline.

Alex

> > Alex
> >
> >
> >>> Alex
> >>>
> >>>> David
> >>>>> Alex
> >>>>>
> >>>>>> We should avoid this glitch. This requires the idle handler sets/clears
> >>>>>> a flag for
> >>>>>> done for this instance as Sathish's original patch. When all instances
> >>>>>> set/clear the
> >>>>>> flag then we can safely power off.
> >>>>>> David
> >>>>>>> Alex
> >>>>>>>
> >>>>>>>> Whereas if instances are powered off we are sure idle time is past and
> >>>>>>>> it is powered off, no possible way of
> >>>>>>>> active video decode, when all instances are off we can safely assume no
> >>>>>>>> active decode and global lock protects
> >>>>>>>> it against new begin_use on any instance. But the only distant concern
> >>>>>>>> is global common locks w.r.t perf, but we
> >>>>>>>> are already having a global workprofile mutex , so there shouldn't be
> >>>>>>>> any drop in perf, with just one single
> >>>>>>>> global lock for all instances.
> >>>>>>>>
> >>>>>>>> Just sending out a patch with this fix, will leave it to you to decide
> >>>>>>>> the right method. If you think outstanding total fences
> >>>>>>>> can never be 0 during decode, then your previous version (v3) itself is
> >>>>>>>> good, there is no real benefit of splitting the handlers as such.
> >>>>>>>>
> >>>>>>>> Regards,
> >>>>>>>> Sathish
> >>>>>>>>> If it is possible, maybe it would be easier to just split the profile
> >>>>>>>>> and powergating into separate handlers.  The profile one would be
> >>>>>>>>> global and the powergating one would be per instance.  See the
> >>>>>>>>> attached patches.
> >>>>>>>>>
> >>>>>>>>> Alex
> >>>>>>>>>
> >>>>>>>>>> can be a sure shot indication of break in a video decode, the mistake in
> >>>>>>>>>> my patch was using per instance mutex, I should have used a common
> >>>>>>>>>> global mutex, then that covers the situation David is trying to bring out.
> >>>>>>>>>>
> >>>>>>>>>> Using one global vcn.pg_lock for idle and begin_use and using flags to
> >>>>>>>>>> track power state could help us totally avoid this situation.
> >>>>>>>>>>
> >>>>>>>>>> Regards,
> >>>>>>>>>>
> >>>>>>>>>> Sathish
> >>>>>>>>>>
> >>>>>>>>>> On 8/13/2025 11:46 PM, Wu, David wrote:
> >>>>>>>>>>> On 8/13/2025 12:51 PM, Alex Deucher wrote:
> >>>>>>>>>>>> On Wed, Aug 13, 2025 at 12:39 PM Wu, David <davidwu2@amd.com> wrote:
> >>>>>>>>>>>>> Hi Alex,
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> The addition of  total_submission_cnt should work - in that
> >>>>>>>>>>>>> it is unlikely to have a context switch right after the begin_use().
> >>>>>>>>>>>>> The suggestion of moving it inside the lock (which I prefer in case
> >>>>>>>>>>>>> someone
> >>>>>>>>>>>>> adds more before the lock and not reviewed thoroughly)
> >>>>>>>>>>>>>         - up to you to decide.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Reviewed-by: David (Ming Qiang) Wu <David.Wu3@amd.com>
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Thanks,
> >>>>>>>>>>>>> David
> >>>>>>>>>>>>> On 8/13/2025 9:45 AM, Alex Deucher wrote:
> >>>>>>>>>>>>>> If there are multiple instances of the VCN running,
> >>>>>>>>>>>>>> we may end up switching the video profile while another
> >>>>>>>>>>>>>> instance is active because we only take into account
> >>>>>>>>>>>>>> the current instance's submissions.  Look at all
> >>>>>>>>>>>>>> outstanding fences for the video profile.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> v2: drop early exit in begin_use()
> >>>>>>>>>>>>>> v3: handle possible race between begin_use() work handler
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Fixes: 3b669df92c85 ("drm/amdgpu/vcn: adjust workload profile
> >>>>>>>>>>>>>> handling")
> >>>>>>>>>>>>>> Reviewed-by: Sathishkumar S <sathishkumar.sundararaju@amd.com> (v1)
> >>>>>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>>>>>>>>>> ---
> >>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 40
> >>>>>>>>>>>>>> ++++++++++++-------------
> >>>>>>>>>>>>>>         drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
> >>>>>>>>>>>>>>         2 files changed, 21 insertions(+), 20 deletions(-)
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> index 9a76e11d1c184..593c1ddf8819b 100644
> >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> >>>>>>>>>>>>>> @@ -415,19 +415,25 @@ static void
> >>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>>>>>>>             struct amdgpu_vcn_inst *vcn_inst =
> >>>>>>>>>>>>>>                     container_of(work, struct amdgpu_vcn_inst,
> >>>>>>>>>>>>>> idle_work.work);
> >>>>>>>>>>>>>>             struct amdgpu_device *adev = vcn_inst->adev;
> >>>>>>>>>>>>>> -     unsigned int fences = 0, fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>>>>>> -     unsigned int i = vcn_inst->inst, j;
> >>>>>>>>>>>>>> +     unsigned int total_fences = 0,
> >>>>>>>>>>>>>> fence[AMDGPU_MAX_VCN_INSTANCES] = {0};
> >>>>>>>>>>>>>> +     unsigned int i, j;
> >>>>>>>>>>>>>>             int r = 0;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     if (adev->vcn.harvest_config & (1 << i))
> >>>>>>>>>>>>>> +     if (adev->vcn.harvest_config & (1 << vcn_inst->inst))
> >>>>>>>>>>>>>>                     return;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     for (j = 0; j < adev->vcn.inst[i].num_enc_rings; ++j)
> >>>>>>>>>>>>>> -             fence[i] +=
> >>>>>>>>>>>>>> amdgpu_fence_count_emitted(&vcn_inst->ring_enc[j]);
> >>>>>>>>>>>>>> +     for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> >>>>>>>>>>>>>> +             struct amdgpu_vcn_inst *v = &adev->vcn.inst[i];
> >>>>>>>>>>>>>> +
> >>>>>>>>>>>>>> +             for (j = 0; j < v->num_enc_rings; ++j)
> >>>>>>>>>>>>>> +                     fence[i] +=
> >>>>>>>>>>>>>> amdgpu_fence_count_emitted(&v->ring_enc[j]);
> >>>>>>>>>>>>>> +             fence[i] += amdgpu_fence_count_emitted(&v->ring_dec);
> >>>>>>>>>>>>>> +             total_fences += fence[i];
> >>>>>>>>>>>>>> +     }
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>             /* Only set DPG pause for VCN3 or below, VCN4 and above will
> >>>>>>>>>>>>>> be handled by FW */
> >>>>>>>>>>>>>>             if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
> >>>>>>>>>>>>>> -         !adev->vcn.inst[i].using_unified_queue) {
> >>>>>>>>>>>>>> +         !vcn_inst->using_unified_queue) {
> >>>>>>>>>>>>>>                     struct dpg_pause_state new_state;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>                     if (fence[i] ||
> >>>>>>>>>>>>>> @@ -436,18 +442,18 @@ static void
> >>>>>>>>>>>>>> amdgpu_vcn_idle_work_handler(struct work_struct *work)
> >>>>>>>>>>>>>>                     else
> >>>>>>>>>>>>>>                             new_state.fw_based = VCN_DPG_STATE__UNPAUSE;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -             adev->vcn.inst[i].pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>>>>>>> +             vcn_inst->pause_dpg_mode(vcn_inst, &new_state);
> >>>>>>>>>>>>>>             }
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     fence[i] += amdgpu_fence_count_emitted(&vcn_inst->ring_dec);
> >>>>>>>>>>>>>> -     fences += fence[i];
> >>>>>>>>>>>>>> -
> >>>>>>>>>>>>>> -     if (!fences && !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>>>>>> +     if (!fence[vcn_inst->inst] &&
> >>>>>>>>>>>>>> !atomic_read(&vcn_inst->total_submission_cnt)) {
> >>>>>>>>>>>>>> +             /* This is specific to this instance */
> >>>>>>>>>>>>>>                     mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>>                     vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_GATE);
> >>>>>>>>>>>>>>                     mutex_unlock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>> mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>>> -             if (adev->vcn.workload_profile_active) {
> >>>>>>>>>>>>>> +             /* This is global and depends on all VCN instances */
> >>>>>>>>>>>>>> +             if (adev->vcn.workload_profile_active &&
> >>>>>>>>>>>>>> !total_fences &&
> >>>>>>>>>>>>>> + !atomic_read(&adev->vcn.total_submission_cnt)) {
> >>>>>>>>>>>>>>                             r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>>>>>>>> false);
> >>>>>>>>>>>>>>                             if (r)
> >>>>>>>>>>>>>> @@ -467,16 +473,10 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>>>>>>>> amdgpu_ring *ring)
> >>>>>>>>>>>>>>             int r = 0;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>             atomic_inc(&vcn_inst->total_submission_cnt);
> >>>>>>>>>>>>>> +     atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>> move this addition down inside the mutex lock
> >>>>>>>>>>>>>> cancel_delayed_work_sync(&vcn_inst->idle_work);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -     /* We can safely return early here because we've cancelled the
> >>>>>>>>>>>>>> -      * the delayed work so there is no one else to set it to false
> >>>>>>>>>>>>>> -      * and we don't care if someone else sets it to true.
> >>>>>>>>>>>>>> -      */
> >>>>>>>>>>>>>> -     if (adev->vcn.workload_profile_active)
> >>>>>>>>>>>>>> -             goto pg_lock;
> >>>>>>>>>>>>>> -
> >>>>>>>>>>>>>>             mutex_lock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>> move to here:
> >>>>>>>>>>>>> atomic_inc(&adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>> I think this should work for multiple instances.
> >>>>>>>>>>>> Why does this need to be protected by the mutex?
> >>>>>>>>>>> hmm.. OK - no need and it is actually better before the mutex.
> >>>>>>>>>>> David
> >>>>>>>>>>>> Alex
> >>>>>>>>>>>>
> >>>>>>>>>>>>> David
> >>>>>>>>>>>>>>             if (!adev->vcn.workload_profile_active) {
> >>>>>>>>>>>>>>                     r = amdgpu_dpm_switch_power_profile(adev,
> >>>>>>>>>>>>>> PP_SMC_POWER_PROFILE_VIDEO,
> >>>>>>>>>>>>>> @@ -487,7 +487,6 @@ void amdgpu_vcn_ring_begin_use(struct
> >>>>>>>>>>>>>> amdgpu_ring *ring)
> >>>>>>>>>>>>>>             }
> >>>>>>>>>>>>>> mutex_unlock(&adev->vcn.workload_profile_mutex);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> -pg_lock:
> >>>>>>>>>>>>>>             mutex_lock(&vcn_inst->vcn_pg_lock);
> >>>>>>>>>>>>>>             vcn_inst->set_pg_state(vcn_inst, AMD_PG_STATE_UNGATE);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> @@ -528,6 +527,7 @@ void amdgpu_vcn_ring_end_use(struct amdgpu_ring
> >>>>>>>>>>>>>> *ring)
> >>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> atomic_dec(&ring->adev->vcn.inst[ring->me].total_submission_cnt);
> >>>>>>>>>>>>>> + atomic_dec(&ring->adev->vcn.total_submission_cnt);
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> schedule_delayed_work(&ring->adev->vcn.inst[ring->me].idle_work,
> >>>>>>>>>>>>>>                                   VCN_IDLE_TIMEOUT);
> >>>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> index b3fb1d0e43fc9..febc3ce8641ff 100644
> >>>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> >>>>>>>>>>>>>> @@ -352,6 +352,7 @@ struct amdgpu_vcn {
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>             uint16_t inst_mask;
> >>>>>>>>>>>>>>             uint8_t num_inst_per_aid;
> >>>>>>>>>>>>>> +     atomic_t                total_submission_cnt;
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>             /* IP reg dump */
> >>>>>>>>>>>>>>             uint32_t                *ip_dump;

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 18:00                             ` Alex Deucher
@ 2025-08-14 19:18                               ` David Wu
  2025-08-14 19:43                                 ` Alex Deucher
  0 siblings, 1 reply; 36+ messages in thread
From: David Wu @ 2025-08-14 19:18 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 7125 bytes --]

On 2025-08-14 14:00, Alex Deucher wrote:
> On Thu, Aug 14, 2025 at 12:44 PM David Wu<davidwu2@amd.com> wrote:
>>
>> On 2025-08-14 12:01, Alex Deucher wrote:
>>> On Thu, Aug 14, 2025 at 11:35 AM David Wu<davidwu2@amd.com> wrote:
>>>> On 2025-08-14 08:56, Alex Deucher wrote:
>>>>> On Wed, Aug 13, 2025 at 7:06 PM Wu, David<davidwu2@amd.com> wrote:
>>>>>> On 8/13/2025 6:11 PM, Alex Deucher wrote:
>>>>>>> On Wed, Aug 13, 2025 at 5:47 PM Wu, David<davidwu2@amd.com> wrote:
>>>>>>>> On 8/13/2025 5:03 PM, Alex Deucher wrote:
>>>>>>>>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>>>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>>>>>>>>>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>>>>>>>>>>> <sathishkumar.sundararaju@amd.com> wrote:
>>>>>>>>>>>> Hi Alex, Hi David,
>>>>>>>>>>>>
>>>>>>>>>>>> I see David's concern but his suggestion yet wont solve the problem,
>>>>>>>>>>>> neither the current form , reason :-
>>>>>>>>>>>>
>>>>>>>>>>>> The emitted fence count and total submission count are fast transients
>>>>>>>>>>>> which frequently become 0 in between video decodes (between jobs) even
>>>>>>>>>>>> with the atomics and locks there can be a switch of video power profile,
>>>>>>>>>>>> in the current form of patch that window is minimized, but still can
>>>>>>>>>>>> happen if stress tested. But power state of any instance becoming zero
>>>>>>>>>>> Can you explain how this can happen?  I'm not seeing it.
>>>>>>>>>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
>>>>>>>>>> completes, delayed idle work starts.
>>>>>>>>>> inst0 idle handler can read 0 total fences and 0 total submission count,
>>>>>>>>>> even if inst1 is actively decoding,
>>>>>>>>>> that's between the jobs,
>>>>>>>>>>        - as begin_use increaments vcn.total_submission_cnt and end_use
>>>>>>>>>> decreaments vcn.total_submission_cnt that can be 0.
>>>>>>>>>>        - if outstanding fences are cleared and no new emitted fence, between
>>>>>>>>>> jobs , can be 0.
>>>>>>>>>>        - both of the above conditions do not mean video decode is complete on
>>>>>>>>>> inst1, it is actively decoding.
>>>>>>>>> How can there be active decoding without an outstanding fence?  In
>>>>>>>>> that case, total_fences (fences from both instances) would be non-0.
>>>>>>>> I think it should be non-0.
>>>>>>>> I do see a hiccup possible - i.e the power switching from ON to OFF then
>>>>>>>> ON in the
>>>>>>>> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
>>>>>>>> it on.
>>>>>>> How would that happen? As long as there submission cnt is non-0 and
>>>>>>> there are outstanding fences on any instance, the video profile will
>>>>>>> stay active.
>>>>>> there could be no jobs but it doesn't timeout yet and new jobs will come in
>>>>>> any ms - note all fences are done at this time. The idle handler sees no
>>>>>> fences
>>>>>> and no jobs so it turns off the power - but just ms later a new job is
>>>>>> submitted
>>>>>> from the same decode session which could be mpv player as it does not
>>>>>> need to
>>>>>> submit jobs without delays. This will turn on the power.
>>>>> I'm not following.  Every submission will start with begin_use().
>>>> yes - it does - it can power on vcn but this happens in the middle of a
>>>> decode session which has 10s timeout to call its own idle handler - in fact
>>>> the other instance's idle handler will power off vcn because it does not
>>>> know it needs
>>>> to wait until the decoding session times out.
>>> I don't follow.  If there are no outstanding fences, there is no
>>> reason to not power down the VCN instance and disable the video
>>> profile. If there are still outstanding fences, then the VCN instance
>>> those fences are associated with will stay on and the video profile
>>> will stay enabled.  If the engine hangs and eventually gets reset, the
>>> fence will be signalled and then there will be no outstanding fences
>>> so the idle handler will eventually disable the power profile.  The
>>> idle handler will keep getting rescheduled as long as there is still
>>> oustanding work.
>> inst0 and inst1:
>> inst0 sends jobA, then ends jobA and no more job submitted in 500ms and
>> job queue is empty - at this point  inst1's idle handler sees no
>> outstanding fences/jobs
>> then power off.  However inst0 starts to submit job after 500ms - inst0'
>> idle handler
>> has not started/scheduled to run but inst1's has finished already which
>> does not know inst0 has not timed out or called its own idle handler.
>> This violates the
>> logic for idle handler's timeout condition. (i.e 10s timeout designed
>> but timed out in 500ms)
>> all this means it powered down too early for inst0.
> I still don't follow.  Here's a sample flow.  Job comes in on inst 0
> and then slightly later on inst 1.
>
> Inst 0 job submission
> Inst 0 calls begin_use().  This cancels the current inst worker
> thread.  It enables the video profile and ungates the instance.
> IBs and fence packets get submitted to instance 0 of the engine
> Inst 0 calls end_use().  This schedules the worker thread for
> VCN_IDLE_TIMEOUT jiffies in the future.
>
> Inst 1 job submission:
> Inst 1 calls begin_use().  This cancels the current inst worker
> thread.  It sees the video profile is enabled and ungates the
> instance.
> IBs and fence packets get submitted to instance 1 of the engine
> Inst 1 calls end_use().  This schedules the worker thread for
> VCN_IDLE_TIMEOUT jiffies in the future.
>
> inst 0 work hander runs.  Sees outstanding fences on inst 0; skips
> powergating inst 0, skips disabling video profile. Schedules the
> worker thread for VCN_IDLE_TIMEOUT jiffies in the future.
>
> inst 0 IB completes and fence signals
>
> inst 1 IB completes and fence signals
>
> inst 1 work hander runs.  Sees no outstanding fences on inst 1.
> powergates inst 1.  Check if there are any outstanding fences on other
> instances.  Sees the no fences from inst 0 so disables the video
> profile.
now there are jobs coming from inst0, so inst 0 idle handler won't run.
> inst 0 work hander runs.  Sees no outstanding fences on inst 0.
> powergates inst 0.  Check if there are any outstanding fences on other
> instances.  Sees the no fences from inst 1, sees that video profile is
> already disabled.
inst 0 work handler runs? could or could not - right? depends on if 
there are more jobs for inst0 and also
if VCN_IDLE_TIMEOUT jiffies has passed for inst0. There is possibly a 
point in the sequence that inst0
stops submit jobs but its idle handler has not run yet. Should we wait 
until all instances have finished their idle handlers?
if not then we will run into a power OFF(by inst1) -> ON(by inst0) for 
the active instance(inst0, expected ON). (active - I mean
those have not timed out in VCN_IDLE_TIMEOUT jiffies and still can 
submit jobs in any time.)
if this could happen then we powered off too early. (I did not say we 
cannot do it but it is not expected)
> You can insert additional job submissions anywhere you want in the timeline.
>
> Alex
>
>>>

[-- Attachment #2: Type: text/html, Size: 10023 bytes --]

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 19:18                               ` David Wu
@ 2025-08-14 19:43                                 ` Alex Deucher
  2025-08-14 21:39                                   ` David Wu
  0 siblings, 1 reply; 36+ messages in thread
From: Alex Deucher @ 2025-08-14 19:43 UTC (permalink / raw)
  To: David Wu; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

On Thu, Aug 14, 2025 at 3:18 PM David Wu <davidwu2@amd.com> wrote:
>
> On 2025-08-14 14:00, Alex Deucher wrote:
>
> On Thu, Aug 14, 2025 at 12:44 PM David Wu <davidwu2@amd.com> wrote:
>
> On 2025-08-14 12:01, Alex Deucher wrote:
>
> On Thu, Aug 14, 2025 at 11:35 AM David Wu <davidwu2@amd.com> wrote:
>
> On 2025-08-14 08:56, Alex Deucher wrote:
>
> On Wed, Aug 13, 2025 at 7:06 PM Wu, David <davidwu2@amd.com> wrote:
>
> On 8/13/2025 6:11 PM, Alex Deucher wrote:
>
> On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
>
> On 8/13/2025 5:03 PM, Alex Deucher wrote:
>
> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
> <sathishkumar.sundararaju@amd.com> wrote:
>
> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>
> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
> <sathishkumar.sundararaju@amd.com> wrote:
>
> Hi Alex, Hi David,
>
> I see David's concern but his suggestion yet wont solve the problem,
> neither the current form , reason :-
>
> The emitted fence count and total submission count are fast transients
> which frequently become 0 in between video decodes (between jobs) even
> with the atomics and locks there can be a switch of video power profile,
> in the current form of patch that window is minimized, but still can
> happen if stress tested. But power state of any instance becoming zero
>
> Can you explain how this can happen?  I'm not seeing it.
>
> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
> completes, delayed idle work starts.
> inst0 idle handler can read 0 total fences and 0 total submission count,
> even if inst1 is actively decoding,
> that's between the jobs,
>       - as begin_use increaments vcn.total_submission_cnt and end_use
> decreaments vcn.total_submission_cnt that can be 0.
>       - if outstanding fences are cleared and no new emitted fence, between
> jobs , can be 0.
>       - both of the above conditions do not mean video decode is complete on
> inst1, it is actively decoding.
>
> How can there be active decoding without an outstanding fence?  In
> that case, total_fences (fences from both instances) would be non-0.
>
> I think it should be non-0.
> I do see a hiccup possible - i.e the power switching from ON to OFF then
> ON in the
> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
> it on.
>
> How would that happen? As long as there submission cnt is non-0 and
> there are outstanding fences on any instance, the video profile will
> stay active.
>
> there could be no jobs but it doesn't timeout yet and new jobs will come in
> any ms - note all fences are done at this time. The idle handler sees no
> fences
> and no jobs so it turns off the power - but just ms later a new job is
> submitted
> from the same decode session which could be mpv player as it does not
> need to
> submit jobs without delays. This will turn on the power.
>
> I'm not following.  Every submission will start with begin_use().
>
> yes - it does - it can power on vcn but this happens in the middle of a
> decode session which has 10s timeout to call its own idle handler - in fact
> the other instance's idle handler will power off vcn because it does not
> know it needs
> to wait until the decoding session times out.
>
> I don't follow.  If there are no outstanding fences, there is no
> reason to not power down the VCN instance and disable the video
> profile. If there are still outstanding fences, then the VCN instance
> those fences are associated with will stay on and the video profile
> will stay enabled.  If the engine hangs and eventually gets reset, the
> fence will be signalled and then there will be no outstanding fences
> so the idle handler will eventually disable the power profile.  The
> idle handler will keep getting rescheduled as long as there is still
> oustanding work.
>
> inst0 and inst1:
> inst0 sends jobA, then ends jobA and no more job submitted in 500ms and
> job queue is empty - at this point  inst1's idle handler sees no
> outstanding fences/jobs
> then power off.  However inst0 starts to submit job after 500ms - inst0'
> idle handler
> has not started/scheduled to run but inst1's has finished already which
> does not know inst0 has not timed out or called its own idle handler.
> This violates the
> logic for idle handler's timeout condition. (i.e 10s timeout designed
> but timed out in 500ms)
> all this means it powered down too early for inst0.
>
> I still don't follow.  Here's a sample flow.  Job comes in on inst 0
> and then slightly later on inst 1.
>
> Inst 0 job submission
> Inst 0 calls begin_use().  This cancels the current inst worker
> thread.  It enables the video profile and ungates the instance.
> IBs and fence packets get submitted to instance 0 of the engine
> Inst 0 calls end_use().  This schedules the worker thread for
> VCN_IDLE_TIMEOUT jiffies in the future.
>
> Inst 1 job submission:
> Inst 1 calls begin_use().  This cancels the current inst worker
> thread.  It sees the video profile is enabled and ungates the
> instance.
> IBs and fence packets get submitted to instance 1 of the engine
> Inst 1 calls end_use().  This schedules the worker thread for
> VCN_IDLE_TIMEOUT jiffies in the future.
>
> inst 0 work hander runs.  Sees outstanding fences on inst 0; skips
> powergating inst 0, skips disabling video profile. Schedules the
> worker thread for VCN_IDLE_TIMEOUT jiffies in the future.
>
> inst 0 IB completes and fence signals
>
> inst 1 IB completes and fence signals
>
> inst 1 work hander runs.  Sees no outstanding fences on inst 1.
> powergates inst 1.  Check if there are any outstanding fences on other
> instances.  Sees the no fences from inst 0 so disables the video
> profile.
>
> now there are jobs coming from inst0, so inst 0 idle handler won't run.
>
> inst 0 work hander runs.  Sees no outstanding fences on inst 0.
> powergates inst 0.  Check if there are any outstanding fences on other
> instances.  Sees the no fences from inst 1, sees that video profile is
> already disabled.
>
> inst 0 work handler runs? could or could not - right? depends on if there are more jobs for inst0 and also
> if  VCN_IDLE_TIMEOUT jiffies has passed for inst0. There is possibly a point in the sequence that inst0
> stops submit jobs but its idle handler has not run yet. Should we wait until all instances have finished their idle handlers?

The work handler will run unless it is cancelled in begin_use(), but
in that case, it will get scheduled again in end_use().  If it runs,
but there is still outstanding work (fences), then it will get
scheduled to run again in the work handler.  All we are about from the
power and video profile perspective is whether or not there are still
outstanding fences.  Once all fences are completed on all instances,
we can disable the video profile regardless of whether then the idle
handlers have run or not because the hw is already idle.

> if not then we will run into a power OFF(by inst1) -> ON(by inst0) for the active instance(inst0, expected ON). (active - I mean
> those have not timed out in VCN_IDLE_TIMEOUT jiffies and still can submit jobs in any time.)
> if this could happen then we powered off too early. (I did not say we cannot do it but it is not expected)
>

VCN_IDLE_TIMEOUT is not a timeout.  We could call it something else.
VCN_IDLE_CHECK_PERIOD is a better description.  It just runs in the
future to check if all of the fences are signalled.  If they are not
signalled, we schedule the worker to run again further in the future.
If you reduce the time, it will check more often, you can potentially
save more power, but you add more CPU overhead because the handler may
run more often.

Alex

> You can insert additional job submissions anywhere you want in the timeline.
>
> Alex
>
>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3)
  2025-08-14 19:43                                 ` Alex Deucher
@ 2025-08-14 21:39                                   ` David Wu
  0 siblings, 0 replies; 36+ messages in thread
From: David Wu @ 2025-08-14 21:39 UTC (permalink / raw)
  To: Alex Deucher; +Cc: Sundararaju, Sathishkumar, Alex Deucher, amd-gfx

after discussion with Alex offline it is clear now as the power profile 
switch is fine and should not
affect the user experience (such as mpv playback).
Alex, please correct me if my understanding is incorrect. This patch 
(v3) is also good.

Thanks,
David
On 2025-08-14 15:43, Alex Deucher wrote:
> On Thu, Aug 14, 2025 at 3:18 PM David Wu <davidwu2@amd.com> wrote:
>> On 2025-08-14 14:00, Alex Deucher wrote:
>>
>> On Thu, Aug 14, 2025 at 12:44 PM David Wu <davidwu2@amd.com> wrote:
>>
>> On 2025-08-14 12:01, Alex Deucher wrote:
>>
>> On Thu, Aug 14, 2025 at 11:35 AM David Wu <davidwu2@amd.com> wrote:
>>
>> On 2025-08-14 08:56, Alex Deucher wrote:
>>
>> On Wed, Aug 13, 2025 at 7:06 PM Wu, David <davidwu2@amd.com> wrote:
>>
>> On 8/13/2025 6:11 PM, Alex Deucher wrote:
>>
>> On Wed, Aug 13, 2025 at 5:47 PM Wu, David <davidwu2@amd.com> wrote:
>>
>> On 8/13/2025 5:03 PM, Alex Deucher wrote:
>>
>> On Wed, Aug 13, 2025 at 4:58 PM Sundararaju, Sathishkumar
>> <sathishkumar.sundararaju@amd.com> wrote:
>>
>> On 8/14/2025 1:35 AM, Alex Deucher wrote:
>>
>> On Wed, Aug 13, 2025 at 2:23 PM Sundararaju, Sathishkumar
>> <sathishkumar.sundararaju@amd.com> wrote:
>>
>> Hi Alex, Hi David,
>>
>> I see David's concern but his suggestion yet wont solve the problem,
>> neither the current form , reason :-
>>
>> The emitted fence count and total submission count are fast transients
>> which frequently become 0 in between video decodes (between jobs) even
>> with the atomics and locks there can be a switch of video power profile,
>> in the current form of patch that window is minimized, but still can
>> happen if stress tested. But power state of any instance becoming zero
>>
>> Can you explain how this can happen?  I'm not seeing it.
>>
>> Consider this situation, inst0 and inst1 actively decoding, inst0 decode
>> completes, delayed idle work starts.
>> inst0 idle handler can read 0 total fences and 0 total submission count,
>> even if inst1 is actively decoding,
>> that's between the jobs,
>>        - as begin_use increaments vcn.total_submission_cnt and end_use
>> decreaments vcn.total_submission_cnt that can be 0.
>>        - if outstanding fences are cleared and no new emitted fence, between
>> jobs , can be 0.
>>        - both of the above conditions do not mean video decode is complete on
>> inst1, it is actively decoding.
>>
>> How can there be active decoding without an outstanding fence?  In
>> that case, total_fences (fences from both instances) would be non-0.
>>
>> I think it should be non-0.
>> I do see a hiccup possible - i.e the power switching from ON to OFF then
>> ON in the
>> middle of decoding, i.e inst0 idle handler turns it off then inst1 turns
>> it on.
>>
>> How would that happen? As long as there submission cnt is non-0 and
>> there are outstanding fences on any instance, the video profile will
>> stay active.
>>
>> there could be no jobs but it doesn't timeout yet and new jobs will come in
>> any ms - note all fences are done at this time. The idle handler sees no
>> fences
>> and no jobs so it turns off the power - but just ms later a new job is
>> submitted
>> from the same decode session which could be mpv player as it does not
>> need to
>> submit jobs without delays. This will turn on the power.
>>
>> I'm not following.  Every submission will start with begin_use().
>>
>> yes - it does - it can power on vcn but this happens in the middle of a
>> decode session which has 10s timeout to call its own idle handler - in fact
>> the other instance's idle handler will power off vcn because it does not
>> know it needs
>> to wait until the decoding session times out.
>>
>> I don't follow.  If there are no outstanding fences, there is no
>> reason to not power down the VCN instance and disable the video
>> profile. If there are still outstanding fences, then the VCN instance
>> those fences are associated with will stay on and the video profile
>> will stay enabled.  If the engine hangs and eventually gets reset, the
>> fence will be signalled and then there will be no outstanding fences
>> so the idle handler will eventually disable the power profile.  The
>> idle handler will keep getting rescheduled as long as there is still
>> oustanding work.
>>
>> inst0 and inst1:
>> inst0 sends jobA, then ends jobA and no more job submitted in 500ms and
>> job queue is empty - at this point  inst1's idle handler sees no
>> outstanding fences/jobs
>> then power off.  However inst0 starts to submit job after 500ms - inst0'
>> idle handler
>> has not started/scheduled to run but inst1's has finished already which
>> does not know inst0 has not timed out or called its own idle handler.
>> This violates the
>> logic for idle handler's timeout condition. (i.e 10s timeout designed
>> but timed out in 500ms)
>> all this means it powered down too early for inst0.
>>
>> I still don't follow.  Here's a sample flow.  Job comes in on inst 0
>> and then slightly later on inst 1.
>>
>> Inst 0 job submission
>> Inst 0 calls begin_use().  This cancels the current inst worker
>> thread.  It enables the video profile and ungates the instance.
>> IBs and fence packets get submitted to instance 0 of the engine
>> Inst 0 calls end_use().  This schedules the worker thread for
>> VCN_IDLE_TIMEOUT jiffies in the future.
>>
>> Inst 1 job submission:
>> Inst 1 calls begin_use().  This cancels the current inst worker
>> thread.  It sees the video profile is enabled and ungates the
>> instance.
>> IBs and fence packets get submitted to instance 1 of the engine
>> Inst 1 calls end_use().  This schedules the worker thread for
>> VCN_IDLE_TIMEOUT jiffies in the future.
>>
>> inst 0 work hander runs.  Sees outstanding fences on inst 0; skips
>> powergating inst 0, skips disabling video profile. Schedules the
>> worker thread for VCN_IDLE_TIMEOUT jiffies in the future.
>>
>> inst 0 IB completes and fence signals
>>
>> inst 1 IB completes and fence signals
>>
>> inst 1 work hander runs.  Sees no outstanding fences on inst 1.
>> powergates inst 1.  Check if there are any outstanding fences on other
>> instances.  Sees the no fences from inst 0 so disables the video
>> profile.
>>
>> now there are jobs coming from inst0, so inst 0 idle handler won't run.
>>
>> inst 0 work hander runs.  Sees no outstanding fences on inst 0.
>> powergates inst 0.  Check if there are any outstanding fences on other
>> instances.  Sees the no fences from inst 1, sees that video profile is
>> already disabled.
>>
>> inst 0 work handler runs? could or could not - right? depends on if there are more jobs for inst0 and also
>> if  VCN_IDLE_TIMEOUT jiffies has passed for inst0. There is possibly a point in the sequence that inst0
>> stops submit jobs but its idle handler has not run yet. Should we wait until all instances have finished their idle handlers?
> The work handler will run unless it is cancelled in begin_use(), but
> in that case, it will get scheduled again in end_use().  If it runs,
> but there is still outstanding work (fences), then it will get
> scheduled to run again in the work handler.  All we are about from the
> power and video profile perspective is whether or not there are still
> outstanding fences.  Once all fences are completed on all instances,
> we can disable the video profile regardless of whether then the idle
> handlers have run or not because the hw is already idle.
>
>> if not then we will run into a power OFF(by inst1) -> ON(by inst0) for the active instance(inst0, expected ON). (active - I mean
>> those have not timed out in VCN_IDLE_TIMEOUT jiffies and still can submit jobs in any time.)
>> if this could happen then we powered off too early. (I did not say we cannot do it but it is not expected)
>>
> VCN_IDLE_TIMEOUT is not a timeout.  We could call it something else.
> VCN_IDLE_CHECK_PERIOD is a better description.  It just runs in the
> future to check if all of the fences are signalled.  If they are not
> signalled, we schedule the worker to run again further in the future.
> If you reduce the time, it will check more often, you can potentially
> save more power, but you add more CPU overhead because the handler may
> run more often.
>
> Alex
>
>> You can insert additional job submissions anywhere you want in the timeline.
>>
>> Alex
>>
>>

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2025-08-18 19:31 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-13 13:45 [PATCH] drm/amdgpu/vcn: fix video profile race condition (v3) Alex Deucher
2025-08-13 16:31 ` Wu, David
2025-08-13 16:51   ` Alex Deucher
2025-08-13 18:16     ` Wu, David
2025-08-13 18:23       ` Sundararaju, Sathishkumar
2025-08-13 20:05         ` Alex Deucher
2025-08-13 20:58           ` Sundararaju, Sathishkumar
2025-08-13 21:03             ` Alex Deucher
2025-08-13 21:16               ` Sundararaju, Sathishkumar
2025-08-13 22:08                 ` Alex Deucher
2025-08-13 23:13                   ` Sundararaju, Sathishkumar
2025-08-14  8:41                     ` Lazar, Lijo
2025-08-14  9:11                       ` Sundararaju, Sathishkumar
2025-08-14  9:24                         ` Lazar, Lijo
2025-08-14  9:46                           ` Lazar, Lijo
2025-08-14 11:52                             ` Sundararaju, Sathishkumar
2025-08-14 12:03                               ` Lazar, Lijo
2025-08-14 12:48                                 ` Sundararaju, Sathishkumar
2025-08-14 12:54                                   ` Lazar, Lijo
2025-08-14 17:44                                     ` David Wu
2025-08-14 18:06                                       ` Lazar, Lijo
2025-08-14 18:42                                         ` David Wu
2025-08-18 19:22                                           ` David Wu
2025-08-18 19:31                                             ` Alex Deucher
2025-08-14 17:06                                   ` Sundararaju, Sathishkumar
2025-08-13 21:47               ` Wu, David
2025-08-13 22:11                 ` Alex Deucher
2025-08-13 23:06                   ` Wu, David
2025-08-14 12:56                     ` Alex Deucher
2025-08-14 15:35                       ` David Wu
2025-08-14 16:01                         ` Alex Deucher
2025-08-14 16:44                           ` David Wu
2025-08-14 18:00                             ` Alex Deucher
2025-08-14 19:18                               ` David Wu
2025-08-14 19:43                                 ` Alex Deucher
2025-08-14 21:39                                   ` David Wu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).