All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/admgpu: fix vcn sw init failed
@ 2024-11-12 14:30 Jesse.zhang@amd.com
  2024-11-12 14:30 ` [PATCH 2/2] drm/amdgpu: " Jesse.zhang@amd.com
  2024-11-13  5:30 ` [PATCH 1/2] drm/admgpu: " Alex Deucher
  0 siblings, 2 replies; 14+ messages in thread
From: Jesse.zhang@amd.com @ 2024-11-12 14:30 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexander.Deucher, Christian Koenig, vitaly.prosyak, Tim.Huang,
	Jesse.zhang@amd.com

For multiple vcn instances, to avoid creating reset sysfs multiple times,
 add the instance paramter in reset mask init.

Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c |  8 ++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  4 ++--
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 10 ++++------
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  4 ++--
 5 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 25f490ad3a85..1d4eda649845 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1295,11 +1295,11 @@ static ssize_t amdgpu_get_vcn_reset_mask(struct device *dev,
 static DEVICE_ATTR(vcn_reset_mask, 0444,
 		   amdgpu_get_vcn_reset_mask, NULL);
 
-int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst)
 {
 	int r = 0;
 
-	if (adev->vcn.num_vcn_inst) {
+	if (inst == 0) {
 		r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
 		if (r)
 			return r;
@@ -1308,12 +1308,12 @@ int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
 	return r;
 }
 
-void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev, int inst)
 {
 	int idx;
 
 	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
-		if (adev->vcn.num_vcn_inst)
+		if (inst == 0)
 			device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
 		drm_dev_exit(idx);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 7ff4ae2a0432..9b10044c61a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -519,7 +519,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
 			       enum AMDGPU_UCODE_ID ucode_id);
 int amdgpu_vcn_save_vcpu_bo(struct amdgpu_device *adev, int inst);
-int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
-void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);
+int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst);
+void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev, int inst);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 59f83409d323..109b27904984 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -250,11 +250,9 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block *ip_block)
 		ip_block->ip_dump = ptr;
 	}
 
-	if (inst == 0) {
-		r = amdgpu_vcn_sysfs_reset_mask_init(adev);
-		if (r)
-			return r;
-	}
+	r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
+	if (r)
+		return r;
 
 	return 0;
 }
@@ -292,7 +290,7 @@ static int vcn_v4_0_sw_fini(struct amdgpu_ip_block *ip_block)
 	if (r)
 		return r;
 
-	amdgpu_vcn_sysfs_reset_mask_fini(adev);
+	amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
 	r = amdgpu_vcn_sw_fini(adev, inst);
 
 	kfree(ip_block->ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index e9b869f373c9..ef3dfd44a022 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -217,7 +217,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
 		ip_block->ip_dump = ptr;
 	}
 
-	r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+	r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
 	if (r)
 		return r;
 
@@ -254,7 +254,7 @@ static int vcn_v4_0_3_sw_fini(struct amdgpu_ip_block *ip_block)
 	if (r)
 		return r;
 
-	amdgpu_vcn_sysfs_reset_mask_fini(adev);
+	amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
 	r = amdgpu_vcn_sw_fini(adev, inst);
 
 	kfree(ip_block->ip_dump);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
index 96ec01cffea3..8f9c19c68d88 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
@@ -186,7 +186,7 @@ static int vcn_v5_0_0_sw_init(struct amdgpu_ip_block *ip_block)
 		ip_block->ip_dump = ptr;
 	}
 
-	r = amdgpu_vcn_sysfs_reset_mask_init(adev);
+	r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
 	if (r)
 		return r;
 
@@ -223,7 +223,7 @@ static int vcn_v5_0_0_sw_fini(struct amdgpu_ip_block *ip_block)
 	if (r)
 		return r;
 
-	amdgpu_vcn_sysfs_reset_mask_fini(adev);
+	amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
 	r = amdgpu_vcn_sw_fini(adev, inst);
 
 	kfree(ip_block->ip_dump);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-12 14:30 [PATCH 1/2] drm/admgpu: fix vcn sw init failed Jesse.zhang@amd.com
@ 2024-11-12 14:30 ` Jesse.zhang@amd.com
  2024-11-12 14:54   ` Lazar, Lijo
  2024-11-13  5:30 ` [PATCH 1/2] drm/admgpu: " Alex Deucher
  1 sibling, 1 reply; 14+ messages in thread
From: Jesse.zhang@amd.com @ 2024-11-12 14:30 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexander.Deucher, Christian Koenig, vitaly.prosyak, Tim.Huang,
	Jesse.zhang@amd.com

[ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP block <vcn_v4_0_3> failed -22
[ 2875.880494] amdgpu 0000:01:00.0: amdgpu: amdgpu_device_ip_init failed
[ 2875.887689] amdgpu 0000:01:00.0: amdgpu: Fatal error during GPU init
[ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.

Add irqs with different IRQ source pointer for vcn0 and vcn1.

Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index ef3dfd44a022..82b90f1e6f33 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry vcn_reg_list_4_0_3[] = {
 
 #define NORMALIZE_VCN_REG_OFFSET(offset) \
 		(offset & 0x1FFFF)
+static int amdgpu_ih_clientid_vcns[] = {
+	SOC15_IH_CLIENTID_VCN,
+	SOC15_IH_CLIENTID_VCN1
+};
 
 static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
 static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device *adev, int inst);
@@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
 	if (r)
 		return r;
 
-	/* VCN DEC TRAP */
-	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
-		VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
+	/* VCN UNIFIED TRAP */
+	r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
+			VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst[inst].irq);
 	if (r)
 		return r;
 
@@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
 
 	ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
 	sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
-	r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
+	r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
 				 AMDGPU_RING_PRIO_DEFAULT,
 				 &adev->vcn.inst[inst].sched_score);
 	if (r)
@@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
  */
 static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int inst)
 {
-	adev->vcn.inst->irq.num_types++;
+	if (adev->vcn.harvest_config & (1 << inst))
+		return;
+
+	adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
 
-	adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
+	adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
 }
 
 static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block *ip_block, struct drm_printer *p)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-12 14:30 ` [PATCH 2/2] drm/amdgpu: " Jesse.zhang@amd.com
@ 2024-11-12 14:54   ` Lazar, Lijo
  2024-11-13  2:28     ` Zhang, Jesse(Jie)
  0 siblings, 1 reply; 14+ messages in thread
From: Lazar, Lijo @ 2024-11-12 14:54 UTC (permalink / raw)
  To: Jesse.zhang@amd.com, amd-gfx
  Cc: Alexander.Deucher, Christian Koenig, vitaly.prosyak, Tim.Huang



On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP block <vcn_v4_0_3> failed -22
> [ 2875.880494] amdgpu 0000:01:00.0: amdgpu: amdgpu_device_ip_init failed
> [ 2875.887689] amdgpu 0000:01:00.0: amdgpu: Fatal error during GPU init
> [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
> 
> Add irqs with different IRQ source pointer for vcn0 and vcn1.
> 
> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
>  1 file changed, 13 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> index ef3dfd44a022..82b90f1e6f33 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry vcn_reg_list_4_0_3[] = {
>  
>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
>  		(offset & 0x1FFFF)
> +static int amdgpu_ih_clientid_vcns[] = {
> +	SOC15_IH_CLIENTID_VCN,
> +	SOC15_IH_CLIENTID_VCN1

This is not valid for 4.0.3. It uses only the same client id, different
node_id to distinguish. Also, there are max of 4 instances.

I would say that entire IP instance series was done in a haste without
applying thought and breaks other things including ip block mask.

Thanks,
Lijo

> +};
>  
>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
>  static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device *adev, int inst);
> @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>  	if (r)
>  		return r;
>  
> -	/* VCN DEC TRAP */
> -	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
> -		VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
> +	/* VCN UNIFIED TRAP */
> +	r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
> +			VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst[inst].irq);
>  	if (r)
>  		return r;
>  
> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>  
>  	ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
>  	sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
> -	r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
> +	r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
>  				 AMDGPU_RING_PRIO_DEFAULT,
>  				 &adev->vcn.inst[inst].sched_score);
>  	if (r)
> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
>   */
>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int inst)
>  {
> -	adev->vcn.inst->irq.num_types++;
> +	if (adev->vcn.harvest_config & (1 << inst))
> +		return;
> +
> +	adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
>  
> -	adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
> +	adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
>  }
>  
>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block *ip_block, struct drm_printer *p)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-12 14:54   ` Lazar, Lijo
@ 2024-11-13  2:28     ` Zhang, Jesse(Jie)
  2024-11-13  3:10       ` Lazar, Lijo
  0 siblings, 1 reply; 14+ messages in thread
From: Zhang, Jesse(Jie) @ 2024-11-13  2:28 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx@lists.freedesktop.org
  Cc: Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim

[AMD Official Use Only - AMD Internal Distribution Only]

Hi, Lijo,

-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com>
Sent: Tuesday, November 12, 2024 10:54 PM
To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed



On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
>
> Add irqs with different IRQ source pointer for vcn0 and vcn1.
>
> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
>  1 file changed, 13 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> index ef3dfd44a022..82b90f1e6f33 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
> vcn_reg_list_4_0_3[] = {
>
>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
>               (offset & 0x1FFFF)
> +static int amdgpu_ih_clientid_vcns[] = {
> +     SOC15_IH_CLIENTID_VCN,
> +     SOC15_IH_CLIENTID_VCN1

This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.

I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.

If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.

amdgpu_irq_add_id:
if (adev->irq.client[client_id].sources[src_id] != NULL)
        return -EINVAL;

Regards
Jesse


Thanks,
Lijo

> +};
>
>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>       if (r)
>               return r;
>
> -     /* VCN DEC TRAP */
> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
> +     /* VCN UNIFIED TRAP */
> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
> +&adev->vcn.inst[inst].irq);
>       if (r)
>               return r;
>
> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
> amdgpu_ip_block *ip_block)
>
>       ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
>       sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
>                                AMDGPU_RING_PRIO_DEFAULT,
>                                &adev->vcn.inst[inst].sched_score);
>       if (r)
> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
>   */
>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
> inst)  {
> -     adev->vcn.inst->irq.num_types++;
> +     if (adev->vcn.harvest_config & (1 << inst))
> +             return;
> +
> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
>
> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
>  }
>
>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
> *ip_block, struct drm_printer *p)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-13  2:28     ` Zhang, Jesse(Jie)
@ 2024-11-13  3:10       ` Lazar, Lijo
  2024-11-13  4:46         ` Alex Deucher
  0 siblings, 1 reply; 14+ messages in thread
From: Lazar, Lijo @ 2024-11-13  3:10 UTC (permalink / raw)
  To: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org
  Cc: Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim



On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
> Hi, Lijo,
> 
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> Sent: Tuesday, November 12, 2024 10:54 PM
> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
> Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
> 
> 
> 
> On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
>> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
>> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
>> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
>> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
>>
>> Add irqs with different IRQ source pointer for vcn0 and vcn1.
>>
>> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>> index ef3dfd44a022..82b90f1e6f33 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
>> vcn_reg_list_4_0_3[] = {
>>
>>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
>>               (offset & 0x1FFFF)
>> +static int amdgpu_ih_clientid_vcns[] = {
>> +     SOC15_IH_CLIENTID_VCN,
>> +     SOC15_IH_CLIENTID_VCN1
> 
> This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
> 
> I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
> 
> If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
> 
> amdgpu_irq_add_id:
> if (adev->irq.client[client_id].sources[src_id] != NULL)
>         return -EINVAL;
> 

We had some side discussions on IP block-per-instance approach.
Personally, I was not in favour of it as I thought allowing IP block to
handle its own instances is the better approach and that could handle
dependencies between instances. Turns out that there are more like
handling common things for all instances as in this example.

I would prefer to revert the patch series and consider all angles before
moving forward on the approach. Will leave this to Alex/Christian/Leo on
the final decsion.

Thanks,
Lijo

> Regards
> Jesse
> 
> 
> Thanks,
> Lijo
> 
>> +};
>>
>>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
>> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
>> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>>       if (r)
>>               return r;
>>
>> -     /* VCN DEC TRAP */
>> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
>> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
>> +     /* VCN UNIFIED TRAP */
>> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
>> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
>> +&adev->vcn.inst[inst].irq);
>>       if (r)
>>               return r;
>>
>> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
>> amdgpu_ip_block *ip_block)
>>
>>       ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
>>       sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
>> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
>> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
>>                                AMDGPU_RING_PRIO_DEFAULT,
>>                                &adev->vcn.inst[inst].sched_score);
>>       if (r)
>> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
>>   */
>>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
>> inst)  {
>> -     adev->vcn.inst->irq.num_types++;
>> +     if (adev->vcn.harvest_config & (1 << inst))
>> +             return;
>> +
>> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
>>
>> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
>> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
>>  }
>>
>>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
>> *ip_block, struct drm_printer *p)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-13  3:10       ` Lazar, Lijo
@ 2024-11-13  4:46         ` Alex Deucher
  2024-11-13  5:03           ` Lazar, Lijo
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Deucher @ 2024-11-13  4:46 UTC (permalink / raw)
  To: Lazar, Lijo
  Cc: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
	Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim

[-- Attachment #1: Type: text/plain, Size: 5232 bytes --]

On Tue, Nov 12, 2024 at 10:24 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>
>
>
> On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
> > [AMD Official Use Only - AMD Internal Distribution Only]
> >
> > Hi, Lijo,
> >
> > -----Original Message-----
> > From: Lazar, Lijo <Lijo.Lazar@amd.com>
> > Sent: Tuesday, November 12, 2024 10:54 PM
> > To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
> > Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
> > Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
> >
> >
> >
> > On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
> >> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
> >> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
> >> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
> >> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
> >>
> >> Add irqs with different IRQ source pointer for vcn0 and vcn1.
> >>
> >> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
> >> ---
> >>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
> >>  1 file changed, 13 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >> index ef3dfd44a022..82b90f1e6f33 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
> >> vcn_reg_list_4_0_3[] = {
> >>
> >>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
> >>               (offset & 0x1FFFF)
> >> +static int amdgpu_ih_clientid_vcns[] = {
> >> +     SOC15_IH_CLIENTID_VCN,
> >> +     SOC15_IH_CLIENTID_VCN1
> >
> > This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
> >
> > I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
> >
> > If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
> >
> > amdgpu_irq_add_id:
> > if (adev->irq.client[client_id].sources[src_id] != NULL)
> >         return -EINVAL;
> >
>
> We had some side discussions on IP block-per-instance approach.
> Personally, I was not in favour of it as I thought allowing IP block to
> handle its own instances is the better approach and that could handle
> dependencies between instances. Turns out that there are more like
> handling common things for all instances as in this example.

We tried that route as well before this one and it was ugly as well.

>
> I would prefer to revert the patch series and consider all angles before
> moving forward on the approach. Will leave this to Alex/Christian/Leo on
> the final decsion.

Do the attached patches fix it?

Alex

>
> Thanks,
> Lijo
>
> > Regards
> > Jesse
> >
> >
> > Thanks,
> > Lijo
> >
> >> +};
> >>
> >>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
> >> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
> >> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
> >>       if (r)
> >>               return r;
> >>
> >> -     /* VCN DEC TRAP */
> >> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
> >> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
> >> +     /* VCN UNIFIED TRAP */
> >> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
> >> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
> >> +&adev->vcn.inst[inst].irq);
> >>       if (r)
> >>               return r;
> >>
> >> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
> >> amdgpu_ip_block *ip_block)
> >>
> >>       ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
> >>       sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
> >> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
> >> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
> >>                                AMDGPU_RING_PRIO_DEFAULT,
> >>                                &adev->vcn.inst[inst].sched_score);
> >>       if (r)
> >> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
> >>   */
> >>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
> >> inst)  {
> >> -     adev->vcn.inst->irq.num_types++;
> >> +     if (adev->vcn.harvest_config & (1 << inst))
> >> +             return;
> >> +
> >> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
> >>
> >> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
> >> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
> >>  }
> >>
> >>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
> >> *ip_block, struct drm_printer *p)

[-- Attachment #2: 0002-drm-amdgpu-use-a-single-set-of-interrupt-funcs-for-v.patch --]
[-- Type: text/x-patch, Size: 1398 bytes --]

From 939a93c835c130720a346dfb6baad297f1f26e25 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Tue, 12 Nov 2024 23:26:52 -0500
Subject: [PATCH 2/2] drm/amdgpu: use a single set of interrupt funcs for vcn
 4.0.3

There a single client and source id.  The node id from the IV
ring is used to determine which instance the interrupt belongs
to.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index 88cbf1a88a07..8534f5370094 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -153,7 +153,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
 	/* VCN DEC TRAP */
 	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
 		VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
-	if (r)
+	if (r && r != -EEXIST)
 		return r;
 
 	volatile struct amdgpu_vcn4_fw_shared *fw_shared;
@@ -1741,7 +1741,7 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
  */
 static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int inst)
 {
-	adev->vcn.inst->irq.num_types++;
+	adev->vcn.inst->irq.num_types = 1;
 
 	adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
 }
-- 
2.47.0


[-- Attachment #3: 0001-drm-amdgpu-return-EEXIST-if-an-interrupt-handler-exi.patch --]
[-- Type: text/x-patch, Size: 925 bytes --]

From c9814f2b031a8a08c53a67e91e25f4e06b3e0b19 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Tue, 12 Nov 2024 22:51:01 -0500
Subject: [PATCH 1/2] drm/amdgpu: return -EEXIST if an interrupt handler exists

So we can tell is one is already registered.

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 19ce4da285e8..2f66b8bba3c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -408,7 +408,7 @@ int amdgpu_irq_add_id(struct amdgpu_device *adev,
 	}
 
 	if (adev->irq.client[client_id].sources[src_id] != NULL)
-		return -EINVAL;
+		return -EEXIST;
 
 	if (source->num_types && !source->enabled_types) {
 		atomic_t *types;
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-13  4:46         ` Alex Deucher
@ 2024-11-13  5:03           ` Lazar, Lijo
  2024-11-13  5:22             ` Zhang, Jesse(Jie)
  2024-11-13  5:24             ` Alex Deucher
  0 siblings, 2 replies; 14+ messages in thread
From: Lazar, Lijo @ 2024-11-13  5:03 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
	Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim



On 11/13/2024 10:16 AM, Alex Deucher wrote:
> On Tue, Nov 12, 2024 at 10:24 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>>
>>
>>
>> On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>
>>> Hi, Lijo,
>>>
>>> -----Original Message-----
>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>> Sent: Tuesday, November 12, 2024 10:54 PM
>>> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
>>> Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
>>>
>>>
>>>
>>> On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
>>>> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
>>>> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
>>>> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
>>>> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
>>>>
>>>> Add irqs with different IRQ source pointer for vcn0 and vcn1.
>>>>
>>>> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
>>>> ---
>>>>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
>>>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>> index ef3dfd44a022..82b90f1e6f33 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
>>>> vcn_reg_list_4_0_3[] = {
>>>>
>>>>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
>>>>               (offset & 0x1FFFF)
>>>> +static int amdgpu_ih_clientid_vcns[] = {
>>>> +     SOC15_IH_CLIENTID_VCN,
>>>> +     SOC15_IH_CLIENTID_VCN1
>>>
>>> This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
>>>
>>> I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
>>>
>>> If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
>>>
>>> amdgpu_irq_add_id:
>>> if (adev->irq.client[client_id].sources[src_id] != NULL)
>>>         return -EINVAL;
>>>
>>
>> We had some side discussions on IP block-per-instance approach.
>> Personally, I was not in favour of it as I thought allowing IP block to
>> handle its own instances is the better approach and that could handle
>> dependencies between instances. Turns out that there are more like
>> handling common things for all instances as in this example.
> 
> We tried that route as well before this one and it was ugly as well.
> 
>>
>> I would prefer to revert the patch series and consider all angles before
>> moving forward on the approach. Will leave this to Alex/Christian/Leo on
>> the final decsion.
> 
> Do the attached patches fix it?
> 

This is kind of a piece-meal fix. This doesn't address the larger
problem of how to handle things common for all IP instances.

Thanks,
Lijo

> Alex
> 
>>
>> Thanks,
>> Lijo
>>
>>> Regards
>>> Jesse
>>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>> +};
>>>>
>>>>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
>>>> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
>>>> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>>>>       if (r)
>>>>               return r;
>>>>
>>>> -     /* VCN DEC TRAP */
>>>> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
>>>> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
>>>> +     /* VCN UNIFIED TRAP */
>>>> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
>>>> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
>>>> +&adev->vcn.inst[inst].irq);
>>>>       if (r)
>>>>               return r;
>>>>
>>>> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
>>>> amdgpu_ip_block *ip_block)
>>>>
>>>>       ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
>>>>       sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
>>>> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
>>>> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
>>>>                                AMDGPU_RING_PRIO_DEFAULT,
>>>>                                &adev->vcn.inst[inst].sched_score);
>>>>       if (r)
>>>> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
>>>>   */
>>>>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
>>>> inst)  {
>>>> -     adev->vcn.inst->irq.num_types++;
>>>> +     if (adev->vcn.harvest_config & (1 << inst))
>>>> +             return;
>>>> +
>>>> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
>>>>
>>>> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
>>>> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
>>>>  }
>>>>
>>>>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
>>>> *ip_block, struct drm_printer *p)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-13  5:03           ` Lazar, Lijo
@ 2024-11-13  5:22             ` Zhang, Jesse(Jie)
  2024-11-13  5:24             ` Alex Deucher
  1 sibling, 0 replies; 14+ messages in thread
From: Zhang, Jesse(Jie) @ 2024-11-13  5:22 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher
  Cc: amd-gfx@lists.freedesktop.org, Deucher,  Alexander,
	Koenig, Christian, Prosyak, Vitaly, Huang, Tim

[AMD Official Use Only - AMD Internal Distribution Only]

HI Alex,

-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com>
Sent: Wednesday, November 13, 2024 1:03 PM
To: Alex Deucher <alexdeucher@gmail.com>
Cc: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed



On 11/13/2024 10:16 AM, Alex Deucher wrote:
> On Tue, Nov 12, 2024 at 10:24 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>>
>>
>>
>> On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>
>>> Hi, Lijo,
>>>
>>> -----Original Message-----
>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>> Sent: Tuesday, November 12, 2024 10:54 PM
>>> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>;
>>> amd-gfx@lists.freedesktop.org
>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig,
>>> Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly
>>> <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
>>> Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
>>>
>>>
>>>
>>> On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
>>>> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of
>>>> IP block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
>>>> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
>>>> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
>>>>
>>>> Add irqs with different IRQ source pointer for vcn0 and vcn1.
>>>>
>>>> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
>>>> ---
>>>>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
>>>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>> index ef3dfd44a022..82b90f1e6f33 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
>>>> vcn_reg_list_4_0_3[] = {
>>>>
>>>>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
>>>>               (offset & 0x1FFFF)
>>>> +static int amdgpu_ih_clientid_vcns[] = {
>>>> +     SOC15_IH_CLIENTID_VCN,
>>>> +     SOC15_IH_CLIENTID_VCN1
>>>
>>> This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
>>>
>>> I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
>>>
>>> If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
>>>
>>> amdgpu_irq_add_id:
>>> if (adev->irq.client[client_id].sources[src_id] != NULL)
>>>         return -EINVAL;
>>>
>>
>> We had some side discussions on IP block-per-instance approach.
>> Personally, I was not in favour of it as I thought allowing IP block
>> to handle its own instances is the better approach and that could
>> handle dependencies between instances. Turns out that there are more
>> like handling common things for all instances as in this example.
>
> We tried that route as well before this one and it was ugly as well.
>
>>
>> I would prefer to revert the patch series and consider all angles
>> before moving forward on the approach. Will leave this to
>> Alex/Christian/Leo on the final decsion.
>
> Do the attached patches fix it?


Yes, using your patch vcn sw init will pass.
Please help to check this patch: [PATCH 1/2] drm/admgpu: fix vcn sw init failed
Otherwise duplicate sysfs warnings will be displayed.

 sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0/vcn_reset_mask'
[  562.443738] CPU: 13 PID: 4888 Comm: modprobe Tainted: G            E      6.10.0+ #51
[  562.443740] Hardware name: AMD Splinter/Splinter-RPL, BIOS VS2683299N.FD 05/10/2023
[  562.443741] Call Trace:
[  562.443743]  <TASK>
[  562.443746]  dump_stack_lvl+0x70/0x90
[  562.443751]  dump_stack+0x14/0x20
[  562.443753]  sysfs_warn_dup+0x60/0x80
[  562.443757]  sysfs_add_file_mode_ns+0x126/0x130
[  562.443760]  sysfs_create_file_ns+0x68/0xa0
[  562.443762]  device_create_file+0x46/0x90
[  562.443766]  amdgpu_vcn_sysfs_reset_mask_init+0x1c/0x30 [amdgpu]
[  562.443991]  vcn_v4_0_3_sw_init+0x270/0x3e0 [amdgpu]
[  562.444120]  amdgpu_device_init+0x1a0e/0x35a0 [amdgpu]

Regards
Jesse


This is kind of a piece-meal fix. This doesn't address the larger problem of how to handle things common for all IP instances.

Thanks,
Lijo

> Alex
>
>>
>> Thanks,
>> Lijo
>>
>>> Regards
>>> Jesse
>>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>> +};
>>>>
>>>>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
>>>> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
>>>> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>>>>       if (r)
>>>>               return r;
>>>>
>>>> -     /* VCN DEC TRAP */
>>>> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
>>>> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
>>>> +     /* VCN UNIFIED TRAP */
>>>> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
>>>> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
>>>> +&adev->vcn.inst[inst].irq);
>>>>       if (r)
>>>>               return r;
>>>>
>>>> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
>>>> amdgpu_ip_block *ip_block)
>>>>
>>>>       ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
>>>>       sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
>>>> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
>>>> +     r = amdgpu_ring_init(adev, ring, 512,
>>>> + &adev->vcn.inst[inst].irq, 0,
>>>>                                AMDGPU_RING_PRIO_DEFAULT,
>>>>                                &adev->vcn.inst[inst].sched_score);
>>>>       if (r)
>>>> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
>>>>   */
>>>>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev,
>>>> int
>>>> inst)  {
>>>> -     adev->vcn.inst->irq.num_types++;
>>>> +     if (adev->vcn.harvest_config & (1 << inst))
>>>> +             return;
>>>> +
>>>> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings
>>>> + + 1;
>>>>
>>>> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
>>>> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
>>>>  }
>>>>
>>>>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
>>>> *ip_block, struct drm_printer *p)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-13  5:03           ` Lazar, Lijo
  2024-11-13  5:22             ` Zhang, Jesse(Jie)
@ 2024-11-13  5:24             ` Alex Deucher
  2024-11-13  5:32               ` Lazar, Lijo
  1 sibling, 1 reply; 14+ messages in thread
From: Alex Deucher @ 2024-11-13  5:24 UTC (permalink / raw)
  To: Lazar, Lijo
  Cc: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
	Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim

On Wed, Nov 13, 2024 at 12:03 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>
>
>
> On 11/13/2024 10:16 AM, Alex Deucher wrote:
> > On Tue, Nov 12, 2024 at 10:24 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
> >>
> >>
> >>
> >> On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
> >>> [AMD Official Use Only - AMD Internal Distribution Only]
> >>>
> >>> Hi, Lijo,
> >>>
> >>> -----Original Message-----
> >>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> >>> Sent: Tuesday, November 12, 2024 10:54 PM
> >>> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
> >>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
> >>> Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
> >>>
> >>>
> >>>
> >>> On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
> >>>> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
> >>>> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
> >>>> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
> >>>> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
> >>>>
> >>>> Add irqs with different IRQ source pointer for vcn0 and vcn1.
> >>>>
> >>>> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
> >>>> ---
> >>>>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
> >>>>  1 file changed, 13 insertions(+), 6 deletions(-)
> >>>>
> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>> index ef3dfd44a022..82b90f1e6f33 100644
> >>>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
> >>>> vcn_reg_list_4_0_3[] = {
> >>>>
> >>>>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
> >>>>               (offset & 0x1FFFF)
> >>>> +static int amdgpu_ih_clientid_vcns[] = {
> >>>> +     SOC15_IH_CLIENTID_VCN,
> >>>> +     SOC15_IH_CLIENTID_VCN1
> >>>
> >>> This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
> >>>
> >>> I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
> >>>
> >>> If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
> >>>
> >>> amdgpu_irq_add_id:
> >>> if (adev->irq.client[client_id].sources[src_id] != NULL)
> >>>         return -EINVAL;
> >>>
> >>
> >> We had some side discussions on IP block-per-instance approach.
> >> Personally, I was not in favour of it as I thought allowing IP block to
> >> handle its own instances is the better approach and that could handle
> >> dependencies between instances. Turns out that there are more like
> >> handling common things for all instances as in this example.
> >
> > We tried that route as well before this one and it was ugly as well.
> >
> >>
> >> I would prefer to revert the patch series and consider all angles before
> >> moving forward on the approach. Will leave this to Alex/Christian/Leo on
> >> the final decsion.
> >
> > Do the attached patches fix it?
> >
>
> This is kind of a piece-meal fix. This doesn't address the larger
> problem of how to handle things common for all IP instances.

I think we'll need to handle them as we encounter them.  We can always
split common stuff out to helpers which can be used by multiple
instances.  But I think once we get past this refactoring it will put
us in a better place for dealing with multiple IP instances.  Consider
the case of a part with multiple blocks of the same type with
different IP versions.  Those would not easily be handled with a
single IP block handling multiple IP instances.

Alex

>
> Thanks,
> Lijo
>
> > Alex
> >
> >>
> >> Thanks,
> >> Lijo
> >>
> >>> Regards
> >>> Jesse
> >>>
> >>>
> >>> Thanks,
> >>> Lijo
> >>>
> >>>> +};
> >>>>
> >>>>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
> >>>> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
> >>>> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
> >>>>       if (r)
> >>>>               return r;
> >>>>
> >>>> -     /* VCN DEC TRAP */
> >>>> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
> >>>> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
> >>>> +     /* VCN UNIFIED TRAP */
> >>>> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
> >>>> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
> >>>> +&adev->vcn.inst[inst].irq);
> >>>>       if (r)
> >>>>               return r;
> >>>>
> >>>> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
> >>>> amdgpu_ip_block *ip_block)
> >>>>
> >>>>       ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
> >>>>       sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
> >>>> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
> >>>> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
> >>>>                                AMDGPU_RING_PRIO_DEFAULT,
> >>>>                                &adev->vcn.inst[inst].sched_score);
> >>>>       if (r)
> >>>> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
> >>>>   */
> >>>>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
> >>>> inst)  {
> >>>> -     adev->vcn.inst->irq.num_types++;
> >>>> +     if (adev->vcn.harvest_config & (1 << inst))
> >>>> +             return;
> >>>> +
> >>>> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
> >>>>
> >>>> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
> >>>> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
> >>>>  }
> >>>>
> >>>>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
> >>>> *ip_block, struct drm_printer *p)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] drm/admgpu: fix vcn sw init failed
  2024-11-12 14:30 [PATCH 1/2] drm/admgpu: fix vcn sw init failed Jesse.zhang@amd.com
  2024-11-12 14:30 ` [PATCH 2/2] drm/amdgpu: " Jesse.zhang@amd.com
@ 2024-11-13  5:30 ` Alex Deucher
  1 sibling, 0 replies; 14+ messages in thread
From: Alex Deucher @ 2024-11-13  5:30 UTC (permalink / raw)
  To: Jesse.zhang@amd.com
  Cc: amd-gfx, Alexander.Deucher, Christian Koenig, vitaly.prosyak,
	Tim.Huang

On Tue, Nov 12, 2024 at 9:31 AM Jesse.zhang@amd.com <jesse.zhang@amd.com> wrote:
>
> For multiple vcn instances, to avoid creating reset sysfs multiple times,
>  add the instance paramter in reset mask init.

I think it would be better to create one sysfs file per instance.  E.g.,
vcn_reset_mask, vcn1_reset_mask, vcn2_reset_mask, etc.

Alex

>
> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c |  8 ++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  4 ++--
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 10 ++++------
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c |  4 ++--
>  drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c |  4 ++--
>  5 files changed, 14 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index 25f490ad3a85..1d4eda649845 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -1295,11 +1295,11 @@ static ssize_t amdgpu_get_vcn_reset_mask(struct device *dev,
>  static DEVICE_ATTR(vcn_reset_mask, 0444,
>                    amdgpu_get_vcn_reset_mask, NULL);
>
> -int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
> +int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst)
>  {
>         int r = 0;
>
> -       if (adev->vcn.num_vcn_inst) {
> +       if (inst == 0) {
>                 r = device_create_file(adev->dev, &dev_attr_vcn_reset_mask);
>                 if (r)
>                         return r;
> @@ -1308,12 +1308,12 @@ int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev)
>         return r;
>  }
>
> -void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev)
> +void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev, int inst)
>  {
>         int idx;
>
>         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
> -               if (adev->vcn.num_vcn_inst)
> +               if (inst == 0)
>                         device_remove_file(adev->dev, &dev_attr_vcn_reset_mask);
>                 drm_dev_exit(idx);
>         }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> index 7ff4ae2a0432..9b10044c61a3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
> @@ -519,7 +519,7 @@ int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
>  int amdgpu_vcn_psp_update_sram(struct amdgpu_device *adev, int inst_idx,
>                                enum AMDGPU_UCODE_ID ucode_id);
>  int amdgpu_vcn_save_vcpu_bo(struct amdgpu_device *adev, int inst);
> -int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev);
> -void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev);
> +int amdgpu_vcn_sysfs_reset_mask_init(struct amdgpu_device *adev, int inst);
> +void amdgpu_vcn_sysfs_reset_mask_fini(struct amdgpu_device *adev, int inst);
>
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> index 59f83409d323..109b27904984 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> @@ -250,11 +250,9 @@ static int vcn_v4_0_sw_init(struct amdgpu_ip_block *ip_block)
>                 ip_block->ip_dump = ptr;
>         }
>
> -       if (inst == 0) {
> -               r = amdgpu_vcn_sysfs_reset_mask_init(adev);
> -               if (r)
> -                       return r;
> -       }
> +       r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
> +       if (r)
> +               return r;
>
>         return 0;
>  }
> @@ -292,7 +290,7 @@ static int vcn_v4_0_sw_fini(struct amdgpu_ip_block *ip_block)
>         if (r)
>                 return r;
>
> -       amdgpu_vcn_sysfs_reset_mask_fini(adev);
> +       amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
>         r = amdgpu_vcn_sw_fini(adev, inst);
>
>         kfree(ip_block->ip_dump);
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> index e9b869f373c9..ef3dfd44a022 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> @@ -217,7 +217,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>                 ip_block->ip_dump = ptr;
>         }
>
> -       r = amdgpu_vcn_sysfs_reset_mask_init(adev);
> +       r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
>         if (r)
>                 return r;
>
> @@ -254,7 +254,7 @@ static int vcn_v4_0_3_sw_fini(struct amdgpu_ip_block *ip_block)
>         if (r)
>                 return r;
>
> -       amdgpu_vcn_sysfs_reset_mask_fini(adev);
> +       amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
>         r = amdgpu_vcn_sw_fini(adev, inst);
>
>         kfree(ip_block->ip_dump);
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
> index 96ec01cffea3..8f9c19c68d88 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
> @@ -186,7 +186,7 @@ static int vcn_v5_0_0_sw_init(struct amdgpu_ip_block *ip_block)
>                 ip_block->ip_dump = ptr;
>         }
>
> -       r = amdgpu_vcn_sysfs_reset_mask_init(adev);
> +       r = amdgpu_vcn_sysfs_reset_mask_init(adev, inst);
>         if (r)
>                 return r;
>
> @@ -223,7 +223,7 @@ static int vcn_v5_0_0_sw_fini(struct amdgpu_ip_block *ip_block)
>         if (r)
>                 return r;
>
> -       amdgpu_vcn_sysfs_reset_mask_fini(adev);
> +       amdgpu_vcn_sysfs_reset_mask_fini(adev, inst);
>         r = amdgpu_vcn_sw_fini(adev, inst);
>
>         kfree(ip_block->ip_dump);
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-13  5:24             ` Alex Deucher
@ 2024-11-13  5:32               ` Lazar, Lijo
  2024-11-13 21:43                 ` Alex Deucher
  0 siblings, 1 reply; 14+ messages in thread
From: Lazar, Lijo @ 2024-11-13  5:32 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
	Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim



On 11/13/2024 10:54 AM, Alex Deucher wrote:
> On Wed, Nov 13, 2024 at 12:03 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>>
>>
>>
>> On 11/13/2024 10:16 AM, Alex Deucher wrote:
>>> On Tue, Nov 12, 2024 at 10:24 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>>>>
>>>>
>>>>
>>>> On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
>>>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>>>
>>>>> Hi, Lijo,
>>>>>
>>>>> -----Original Message-----
>>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>>> Sent: Tuesday, November 12, 2024 10:54 PM
>>>>> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
>>>>> Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
>>>>>
>>>>>
>>>>>
>>>>> On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
>>>>>> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
>>>>>> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
>>>>>> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
>>>>>> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
>>>>>>
>>>>>> Add irqs with different IRQ source pointer for vcn0 and vcn1.
>>>>>>
>>>>>> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
>>>>>> ---
>>>>>>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
>>>>>>  1 file changed, 13 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>>>> index ef3dfd44a022..82b90f1e6f33 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>>>> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
>>>>>> vcn_reg_list_4_0_3[] = {
>>>>>>
>>>>>>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
>>>>>>               (offset & 0x1FFFF)
>>>>>> +static int amdgpu_ih_clientid_vcns[] = {
>>>>>> +     SOC15_IH_CLIENTID_VCN,
>>>>>> +     SOC15_IH_CLIENTID_VCN1
>>>>>
>>>>> This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
>>>>>
>>>>> I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
>>>>>
>>>>> If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
>>>>>
>>>>> amdgpu_irq_add_id:
>>>>> if (adev->irq.client[client_id].sources[src_id] != NULL)
>>>>>         return -EINVAL;
>>>>>
>>>>
>>>> We had some side discussions on IP block-per-instance approach.
>>>> Personally, I was not in favour of it as I thought allowing IP block to
>>>> handle its own instances is the better approach and that could handle
>>>> dependencies between instances. Turns out that there are more like
>>>> handling common things for all instances as in this example.
>>>
>>> We tried that route as well before this one and it was ugly as well.
>>>
>>>>
>>>> I would prefer to revert the patch series and consider all angles before
>>>> moving forward on the approach. Will leave this to Alex/Christian/Leo on
>>>> the final decsion.
>>>
>>> Do the attached patches fix it?
>>>
>>
>> This is kind of a piece-meal fix. This doesn't address the larger
>> problem of how to handle things common for all IP instances.
> 
> I think we'll need to handle them as we encounter them.  We can always
> split common stuff out to helpers which can be used by multiple
> instances.

I don't think so. It made a fundamental change. We changed the base
layer of considering IP as a single block. A common swinit or swfini is
no longer the case. Consider how a sysfs initialization like
enable_isolation could be handled if the same approach is taken for GFX IP.

I would still say that we broke the current foundation with this
approach and hoping that uppper layer fixes can help to hold things
together. Or, it needs a start-from-scratch approach.

Thanks,
Lijo

  But I think once we get past this refactoring it will put
> us in a better place for dealing with multiple IP instances.  Consider
> the case of a part with multiple blocks of the same type with
> different IP versions.  Those would not easily be handled with a
> single IP block handling multiple IP instances.
> 
> Alex
> 
>>
>> Thanks,
>> Lijo
>>
>>> Alex
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> Regards
>>>>> Jesse
>>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>> +};
>>>>>>
>>>>>>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
>>>>>> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
>>>>>> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>>>>>>       if (r)
>>>>>>               return r;
>>>>>>
>>>>>> -     /* VCN DEC TRAP */
>>>>>> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
>>>>>> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
>>>>>> +     /* VCN UNIFIED TRAP */
>>>>>> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
>>>>>> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
>>>>>> +&adev->vcn.inst[inst].irq);
>>>>>>       if (r)
>>>>>>               return r;
>>>>>>
>>>>>> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
>>>>>> amdgpu_ip_block *ip_block)
>>>>>>
>>>>>>       ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
>>>>>>       sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
>>>>>> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
>>>>>> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
>>>>>>                                AMDGPU_RING_PRIO_DEFAULT,
>>>>>>                                &adev->vcn.inst[inst].sched_score);
>>>>>>       if (r)
>>>>>> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
>>>>>>   */
>>>>>>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
>>>>>> inst)  {
>>>>>> -     adev->vcn.inst->irq.num_types++;
>>>>>> +     if (adev->vcn.harvest_config & (1 << inst))
>>>>>> +             return;
>>>>>> +
>>>>>> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
>>>>>>
>>>>>> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
>>>>>> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
>>>>>>  }
>>>>>>
>>>>>>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
>>>>>> *ip_block, struct drm_printer *p)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-13  5:32               ` Lazar, Lijo
@ 2024-11-13 21:43                 ` Alex Deucher
  2024-11-15 12:34                   ` Christian König
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Deucher @ 2024-11-13 21:43 UTC (permalink / raw)
  To: Lazar, Lijo
  Cc: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
	Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim

On Wed, Nov 13, 2024 at 12:32 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>
>
>
> On 11/13/2024 10:54 AM, Alex Deucher wrote:
> > On Wed, Nov 13, 2024 at 12:03 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
> >>
> >>
> >>
> >> On 11/13/2024 10:16 AM, Alex Deucher wrote:
> >>> On Tue, Nov 12, 2024 at 10:24 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
> >>>>> [AMD Official Use Only - AMD Internal Distribution Only]
> >>>>>
> >>>>> Hi, Lijo,
> >>>>>
> >>>>> -----Original Message-----
> >>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> >>>>> Sent: Tuesday, November 12, 2024 10:54 PM
> >>>>> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
> >>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
> >>>>> Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
> >>>>>
> >>>>>
> >>>>>
> >>>>> On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
> >>>>>> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
> >>>>>> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
> >>>>>> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
> >>>>>> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
> >>>>>>
> >>>>>> Add irqs with different IRQ source pointer for vcn0 and vcn1.
> >>>>>>
> >>>>>> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
> >>>>>> ---
> >>>>>>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
> >>>>>>  1 file changed, 13 insertions(+), 6 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>>>> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>>>> index ef3dfd44a022..82b90f1e6f33 100644
> >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>>>> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
> >>>>>> vcn_reg_list_4_0_3[] = {
> >>>>>>
> >>>>>>  #define NORMALIZE_VCN_REG_OFFSET(offset) \
> >>>>>>               (offset & 0x1FFFF)
> >>>>>> +static int amdgpu_ih_clientid_vcns[] = {
> >>>>>> +     SOC15_IH_CLIENTID_VCN,
> >>>>>> +     SOC15_IH_CLIENTID_VCN1
> >>>>>
> >>>>> This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
> >>>>>
> >>>>> I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
> >>>>>
> >>>>> If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
> >>>>>
> >>>>> amdgpu_irq_add_id:
> >>>>> if (adev->irq.client[client_id].sources[src_id] != NULL)
> >>>>>         return -EINVAL;
> >>>>>
> >>>>
> >>>> We had some side discussions on IP block-per-instance approach.
> >>>> Personally, I was not in favour of it as I thought allowing IP block to
> >>>> handle its own instances is the better approach and that could handle
> >>>> dependencies between instances. Turns out that there are more like
> >>>> handling common things for all instances as in this example.
> >>>
> >>> We tried that route as well before this one and it was ugly as well.
> >>>
> >>>>
> >>>> I would prefer to revert the patch series and consider all angles before
> >>>> moving forward on the approach. Will leave this to Alex/Christian/Leo on
> >>>> the final decsion.
> >>>
> >>> Do the attached patches fix it?
> >>>
> >>
> >> This is kind of a piece-meal fix. This doesn't address the larger
> >> problem of how to handle things common for all IP instances.
> >
> > I think we'll need to handle them as we encounter them.  We can always
> > split common stuff out to helpers which can be used by multiple
> > instances.
>
> I don't think so. It made a fundamental change. We changed the base
> layer of considering IP as a single block. A common swinit or swfini is
> no longer the case. Consider how a sysfs initialization like
> enable_isolation could be handled if the same approach is taken for GFX IP.
>
> I would still say that we broke the current foundation with this
> approach and hoping that uppper layer fixes can help to hold things
> together. Or, it needs a start-from-scratch approach.

This was the original intention when we first designed the driver and
the IP block structures.  Unfortunately all of the early chips only
had one instance of each IP block and since then we've just built up
technical debt with respect to the instance handling.  That said, I
think the rework of this level at this point is probably too much, in
digging through the current state, there are just too many corner
cases to fix up.  I agree we should probably forgo it at this point.

Alex


Alex

>
> Thanks,
> Lijo
>
>   But I think once we get past this refactoring it will put
> > us in a better place for dealing with multiple IP instances.  Consider
> > the case of a part with multiple blocks of the same type with
> > different IP versions.  Those would not easily be handled with a
> > single IP block handling multiple IP instances.
> >
> > Alex
> >
> >>
> >> Thanks,
> >> Lijo
> >>
> >>> Alex
> >>>
> >>>>
> >>>> Thanks,
> >>>> Lijo
> >>>>
> >>>>> Regards
> >>>>> Jesse
> >>>>>
> >>>>>
> >>>>> Thanks,
> >>>>> Lijo
> >>>>>
> >>>>>> +};
> >>>>>>
> >>>>>>  static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
> >>>>>> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
> >>>>>> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
> >>>>>>       if (r)
> >>>>>>               return r;
> >>>>>>
> >>>>>> -     /* VCN DEC TRAP */
> >>>>>> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
> >>>>>> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
> >>>>>> +     /* VCN UNIFIED TRAP */
> >>>>>> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
> >>>>>> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
> >>>>>> +&adev->vcn.inst[inst].irq);
> >>>>>>       if (r)
> >>>>>>               return r;
> >>>>>>
> >>>>>> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
> >>>>>> amdgpu_ip_block *ip_block)
> >>>>>>
> >>>>>>       ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
> >>>>>>       sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
> >>>>>> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
> >>>>>> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
> >>>>>>                                AMDGPU_RING_PRIO_DEFAULT,
> >>>>>>                                &adev->vcn.inst[inst].sched_score);
> >>>>>>       if (r)
> >>>>>> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
> >>>>>>   */
> >>>>>>  static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
> >>>>>> inst)  {
> >>>>>> -     adev->vcn.inst->irq.num_types++;
> >>>>>> +     if (adev->vcn.harvest_config & (1 << inst))
> >>>>>> +             return;
> >>>>>> +
> >>>>>> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
> >>>>>>
> >>>>>> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
> >>>>>> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
> >>>>>>  }
> >>>>>>
> >>>>>>  static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
> >>>>>> *ip_block, struct drm_printer *p)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-13 21:43                 ` Alex Deucher
@ 2024-11-15 12:34                   ` Christian König
  2024-11-15 14:24                     ` Alex Deucher
  0 siblings, 1 reply; 14+ messages in thread
From: Christian König @ 2024-11-15 12:34 UTC (permalink / raw)
  To: Alex Deucher, Lazar, Lijo
  Cc: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
	Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim, Zhang, Boyuan

Am 13.11.24 um 22:43 schrieb Alex Deucher:
> On Wed, Nov 13, 2024 at 12:32 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>>
>>
>> On 11/13/2024 10:54 AM, Alex Deucher wrote:
>>> On Wed, Nov 13, 2024 at 12:03 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>>>>
>>>>
>>>> On 11/13/2024 10:16 AM, Alex Deucher wrote:
>>>>> On Tue, Nov 12, 2024 at 10:24 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
>>>>>>
>>>>>>
>>>>>> On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
>>>>>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>>>>>
>>>>>>> Hi, Lijo,
>>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
>>>>>>> Sent: Tuesday, November 12, 2024 10:54 PM
>>>>>>> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
>>>>>>> Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
>>>>>>>> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
>>>>>>>> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
>>>>>>>> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
>>>>>>>> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
>>>>>>>>
>>>>>>>> Add irqs with different IRQ source pointer for vcn0 and vcn1.
>>>>>>>>
>>>>>>>> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
>>>>>>>> ---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
>>>>>>>>   1 file changed, 13 insertions(+), 6 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>>>>>> index ef3dfd44a022..82b90f1e6f33 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
>>>>>>>> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
>>>>>>>> vcn_reg_list_4_0_3[] = {
>>>>>>>>
>>>>>>>>   #define NORMALIZE_VCN_REG_OFFSET(offset) \
>>>>>>>>                (offset & 0x1FFFF)
>>>>>>>> +static int amdgpu_ih_clientid_vcns[] = {
>>>>>>>> +     SOC15_IH_CLIENTID_VCN,
>>>>>>>> +     SOC15_IH_CLIENTID_VCN1
>>>>>>> This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
>>>>>>>
>>>>>>> I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
>>>>>>>
>>>>>>> If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
>>>>>>>
>>>>>>> amdgpu_irq_add_id:
>>>>>>> if (adev->irq.client[client_id].sources[src_id] != NULL)
>>>>>>>          return -EINVAL;
>>>>>>>
>>>>>> We had some side discussions on IP block-per-instance approach.
>>>>>> Personally, I was not in favour of it as I thought allowing IP block to
>>>>>> handle its own instances is the better approach and that could handle
>>>>>> dependencies between instances. Turns out that there are more like
>>>>>> handling common things for all instances as in this example.
>>>>> We tried that route as well before this one and it was ugly as well.
>>>>>
>>>>>> I would prefer to revert the patch series and consider all angles before
>>>>>> moving forward on the approach. Will leave this to Alex/Christian/Leo on
>>>>>> the final decsion.
>>>>> Do the attached patches fix it?
>>>>>
>>>> This is kind of a piece-meal fix. This doesn't address the larger
>>>> problem of how to handle things common for all IP instances.
>>> I think we'll need to handle them as we encounter them.  We can always
>>> split common stuff out to helpers which can be used by multiple
>>> instances.
>> I don't think so. It made a fundamental change. We changed the base
>> layer of considering IP as a single block. A common swinit or swfini is
>> no longer the case. Consider how a sysfs initialization like
>> enable_isolation could be handled if the same approach is taken for GFX IP.
>>
>> I would still say that we broke the current foundation with this
>> approach and hoping that uppper layer fixes can help to hold things
>> together. Or, it needs a start-from-scratch approach.
> This was the original intention when we first designed the driver and
> the IP block structures.  Unfortunately all of the early chips only
> had one instance of each IP block and since then we've just built up
> technical debt with respect to the instance handling.  That said, I
> think the rework of this level at this point is probably too much, in
> digging through the current state, there are just too many corner
> cases to fix up.  I agree we should probably forgo it at this point.

I would really like to keep the design as is. The fallout we see is 
basically just points out that we have some more broken concepts here.

For example sysfs file should never be initialized from IP specific 
functions in the first place. Why the heck do we do that?

We should probably rather keep the design for the VCN generation Boyuan 
actually tested and see what else we need to fix to get to that design.

Regards,
Christian.

>
> Alex
>
>
> Alex
>
>> Thanks,
>> Lijo
>>
>>    But I think once we get past this refactoring it will put
>>> us in a better place for dealing with multiple IP instances.  Consider
>>> the case of a part with multiple blocks of the same type with
>>> different IP versions.  Those would not easily be handled with a
>>> single IP block handling multiple IP instances.
>>>
>>> Alex
>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> Alex
>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>>> Regards
>>>>>>> Jesse
>>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>>
>>>>>>>> +};
>>>>>>>>
>>>>>>>>   static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
>>>>>>>> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
>>>>>>>> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>>>>>>>>        if (r)
>>>>>>>>                return r;
>>>>>>>>
>>>>>>>> -     /* VCN DEC TRAP */
>>>>>>>> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
>>>>>>>> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
>>>>>>>> +     /* VCN UNIFIED TRAP */
>>>>>>>> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
>>>>>>>> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
>>>>>>>> +&adev->vcn.inst[inst].irq);
>>>>>>>>        if (r)
>>>>>>>>                return r;
>>>>>>>>
>>>>>>>> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
>>>>>>>> amdgpu_ip_block *ip_block)
>>>>>>>>
>>>>>>>>        ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
>>>>>>>>        sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
>>>>>>>> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
>>>>>>>> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
>>>>>>>>                                 AMDGPU_RING_PRIO_DEFAULT,
>>>>>>>>                                 &adev->vcn.inst[inst].sched_score);
>>>>>>>>        if (r)
>>>>>>>> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
>>>>>>>>    */
>>>>>>>>   static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
>>>>>>>> inst)  {
>>>>>>>> -     adev->vcn.inst->irq.num_types++;
>>>>>>>> +     if (adev->vcn.harvest_config & (1 << inst))
>>>>>>>> +             return;
>>>>>>>> +
>>>>>>>> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
>>>>>>>>
>>>>>>>> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
>>>>>>>> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
>>>>>>>> *ip_block, struct drm_printer *p)


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
  2024-11-15 12:34                   ` Christian König
@ 2024-11-15 14:24                     ` Alex Deucher
  0 siblings, 0 replies; 14+ messages in thread
From: Alex Deucher @ 2024-11-15 14:24 UTC (permalink / raw)
  To: Christian König
  Cc: Lazar, Lijo, Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org,
	Deucher, Alexander, Koenig, Christian, Prosyak, Vitaly,
	Huang, Tim, Zhang, Boyuan

On Fri, Nov 15, 2024 at 7:34 AM Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Am 13.11.24 um 22:43 schrieb Alex Deucher:
> > On Wed, Nov 13, 2024 at 12:32 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
> >>
> >>
> >> On 11/13/2024 10:54 AM, Alex Deucher wrote:
> >>> On Wed, Nov 13, 2024 at 12:03 AM Lazar, Lijo <lijo.lazar@amd.com> wrote:
> >>>>
> >>>>
> >>>> On 11/13/2024 10:16 AM, Alex Deucher wrote:
> >>>>> On Tue, Nov 12, 2024 at 10:24 PM Lazar, Lijo <lijo.lazar@amd.com> wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 11/13/2024 7:58 AM, Zhang, Jesse(Jie) wrote:
> >>>>>>> [AMD Official Use Only - AMD Internal Distribution Only]
> >>>>>>>
> >>>>>>> Hi, Lijo,
> >>>>>>>
> >>>>>>> -----Original Message-----
> >>>>>>> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> >>>>>>> Sent: Tuesday, November 12, 2024 10:54 PM
> >>>>>>> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
> >>>>>>> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Prosyak, Vitaly <Vitaly.Prosyak@amd.com>; Huang, Tim <Tim.Huang@amd.com>
> >>>>>>> Subject: Re: [PATCH 2/2] drm/amdgpu: fix vcn sw init failed
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> On 11/12/2024 8:00 PM, Jesse.zhang@amd.com wrote:
> >>>>>>>> [ 2875.870277] [drm:amdgpu_device_init [amdgpu]] *ERROR* sw_init of IP
> >>>>>>>> block <vcn_v4_0_3> failed -22 [ 2875.880494] amdgpu 0000:01:00.0:
> >>>>>>>> amdgpu: amdgpu_device_ip_init failed [ 2875.887689] amdgpu
> >>>>>>>> 0000:01:00.0: amdgpu: Fatal error during GPU init [ 2875.894791] amdgpu 0000:01:00.0: amdgpu: amdgpu: finishing device.
> >>>>>>>>
> >>>>>>>> Add irqs with different IRQ source pointer for vcn0 and vcn1.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
> >>>>>>>> ---
> >>>>>>>>   drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 19 +++++++++++++------
> >>>>>>>>   1 file changed, 13 insertions(+), 6 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>>>>>> index ef3dfd44a022..82b90f1e6f33 100644
> >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> >>>>>>>> @@ -83,6 +83,10 @@ static const struct amdgpu_hwip_reg_entry
> >>>>>>>> vcn_reg_list_4_0_3[] = {
> >>>>>>>>
> >>>>>>>>   #define NORMALIZE_VCN_REG_OFFSET(offset) \
> >>>>>>>>                (offset & 0x1FFFF)
> >>>>>>>> +static int amdgpu_ih_clientid_vcns[] = {
> >>>>>>>> +     SOC15_IH_CLIENTID_VCN,
> >>>>>>>> +     SOC15_IH_CLIENTID_VCN1
> >>>>>>> This is not valid for 4.0.3. It uses only the same client id, different node_id to distinguish. Also, there are max of 4 instances.
> >>>>>>>
> >>>>>>> I would say that entire IP instance series was done in a haste without applying thought and breaks other things including ip block mask.
> >>>>>>>
> >>>>>>> If the same client id is used, it returns -EINVAL (because of the following check) and sw init fails at step vcn_v4_0_3_sw_init / amdgpu_irq_add_id.
> >>>>>>>
> >>>>>>> amdgpu_irq_add_id:
> >>>>>>> if (adev->irq.client[client_id].sources[src_id] != NULL)
> >>>>>>>          return -EINVAL;
> >>>>>>>
> >>>>>> We had some side discussions on IP block-per-instance approach.
> >>>>>> Personally, I was not in favour of it as I thought allowing IP block to
> >>>>>> handle its own instances is the better approach and that could handle
> >>>>>> dependencies between instances. Turns out that there are more like
> >>>>>> handling common things for all instances as in this example.
> >>>>> We tried that route as well before this one and it was ugly as well.
> >>>>>
> >>>>>> I would prefer to revert the patch series and consider all angles before
> >>>>>> moving forward on the approach. Will leave this to Alex/Christian/Leo on
> >>>>>> the final decsion.
> >>>>> Do the attached patches fix it?
> >>>>>
> >>>> This is kind of a piece-meal fix. This doesn't address the larger
> >>>> problem of how to handle things common for all IP instances.
> >>> I think we'll need to handle them as we encounter them.  We can always
> >>> split common stuff out to helpers which can be used by multiple
> >>> instances.
> >> I don't think so. It made a fundamental change. We changed the base
> >> layer of considering IP as a single block. A common swinit or swfini is
> >> no longer the case. Consider how a sysfs initialization like
> >> enable_isolation could be handled if the same approach is taken for GFX IP.
> >>
> >> I would still say that we broke the current foundation with this
> >> approach and hoping that uppper layer fixes can help to hold things
> >> together. Or, it needs a start-from-scratch approach.
> > This was the original intention when we first designed the driver and
> > the IP block structures.  Unfortunately all of the early chips only
> > had one instance of each IP block and since then we've just built up
> > technical debt with respect to the instance handling.  That said, I
> > think the rework of this level at this point is probably too much, in
> > digging through the current state, there are just too many corner
> > cases to fix up.  I agree we should probably forgo it at this point.
>
> I would really like to keep the design as is. The fallout we see is
> basically just points out that we have some more broken concepts here.
>
> For example sysfs file should never be initialized from IP specific
> functions in the first place. Why the heck do we do that?

Where else would we do it?  At the moment all of the IPs set up their
IP specific sysfs files.  Also, if we have one IP block per instance,
we may have different reset capabilities per instance.  In that case,
the sysfs files should be per instance.

Alex

>
> We should probably rather keep the design for the VCN generation Boyuan
> actually tested and see what else we need to fix to get to that design.
>
> Regards,
> Christian.
>
> >
> > Alex
> >
> >
> > Alex
> >
> >> Thanks,
> >> Lijo
> >>
> >>    But I think once we get past this refactoring it will put
> >>> us in a better place for dealing with multiple IP instances.  Consider
> >>> the case of a part with multiple blocks of the same type with
> >>> different IP versions.  Those would not easily be handled with a
> >>> single IP block handling multiple IP instances.
> >>>
> >>> Alex
> >>>
> >>>> Thanks,
> >>>> Lijo
> >>>>
> >>>>> Alex
> >>>>>
> >>>>>> Thanks,
> >>>>>> Lijo
> >>>>>>
> >>>>>>> Regards
> >>>>>>> Jesse
> >>>>>>>
> >>>>>>>
> >>>>>>> Thanks,
> >>>>>>> Lijo
> >>>>>>>
> >>>>>>>> +};
> >>>>>>>>
> >>>>>>>>   static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
> >>>>>>>> static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device
> >>>>>>>> *adev, int inst); @@ -150,9 +154,9 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
> >>>>>>>>        if (r)
> >>>>>>>>                return r;
> >>>>>>>>
> >>>>>>>> -     /* VCN DEC TRAP */
> >>>>>>>> -     r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
> >>>>>>>> -             VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE, &adev->vcn.inst->irq);
> >>>>>>>> +     /* VCN UNIFIED TRAP */
> >>>>>>>> +     r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_vcns[inst],
> >>>>>>>> +                     VCN_4_0__SRCID__UVD_ENC_GENERAL_PURPOSE,
> >>>>>>>> +&adev->vcn.inst[inst].irq);
> >>>>>>>>        if (r)
> >>>>>>>>                return r;
> >>>>>>>>
> >>>>>>>> @@ -174,7 +178,7 @@ static int vcn_v4_0_3_sw_init(struct
> >>>>>>>> amdgpu_ip_block *ip_block)
> >>>>>>>>
> >>>>>>>>        ring->vm_hub = AMDGPU_MMHUB0(adev->vcn.inst[inst].aid_id);
> >>>>>>>>        sprintf(ring->name, "vcn_unified_%d", adev->vcn.inst[inst].aid_id);
> >>>>>>>> -     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst->irq, 0,
> >>>>>>>> +     r = amdgpu_ring_init(adev, ring, 512, &adev->vcn.inst[inst].irq, 0,
> >>>>>>>>                                 AMDGPU_RING_PRIO_DEFAULT,
> >>>>>>>>                                 &adev->vcn.inst[inst].sched_score);
> >>>>>>>>        if (r)
> >>>>>>>> @@ -1734,9 +1738,12 @@ static const struct amdgpu_irq_src_funcs vcn_v4_0_3_irq_funcs = {
> >>>>>>>>    */
> >>>>>>>>   static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev, int
> >>>>>>>> inst)  {
> >>>>>>>> -     adev->vcn.inst->irq.num_types++;
> >>>>>>>> +     if (adev->vcn.harvest_config & (1 << inst))
> >>>>>>>> +             return;
> >>>>>>>> +
> >>>>>>>> +     adev->vcn.inst[inst].irq.num_types = adev->vcn.num_enc_rings + 1;
> >>>>>>>>
> >>>>>>>> -     adev->vcn.inst->irq.funcs = &vcn_v4_0_3_irq_funcs;
> >>>>>>>> +     adev->vcn.inst[inst].irq.funcs = &vcn_v4_0_3_irq_funcs;
> >>>>>>>>   }
> >>>>>>>>
> >>>>>>>>   static void vcn_v4_0_3_print_ip_state(struct amdgpu_ip_block
> >>>>>>>> *ip_block, struct drm_printer *p)
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2024-11-15 14:25 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-11-12 14:30 [PATCH 1/2] drm/admgpu: fix vcn sw init failed Jesse.zhang@amd.com
2024-11-12 14:30 ` [PATCH 2/2] drm/amdgpu: " Jesse.zhang@amd.com
2024-11-12 14:54   ` Lazar, Lijo
2024-11-13  2:28     ` Zhang, Jesse(Jie)
2024-11-13  3:10       ` Lazar, Lijo
2024-11-13  4:46         ` Alex Deucher
2024-11-13  5:03           ` Lazar, Lijo
2024-11-13  5:22             ` Zhang, Jesse(Jie)
2024-11-13  5:24             ` Alex Deucher
2024-11-13  5:32               ` Lazar, Lijo
2024-11-13 21:43                 ` Alex Deucher
2024-11-15 12:34                   ` Christian König
2024-11-15 14:24                     ` Alex Deucher
2024-11-13  5:30 ` [PATCH 1/2] drm/admgpu: " Alex Deucher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.