AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/amdgpu: print VCN instance dump for valid instance
@ 2024-07-26 12:47 Sunil Khatri
  2024-07-26 12:47 ` [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's Sunil Khatri
  2024-07-26 13:12 ` [PATCH 1/2] drm/amdgpu: print VCN instance dump for valid instance Alex Deucher
  0 siblings, 2 replies; 15+ messages in thread
From: Sunil Khatri @ 2024-07-26 12:47 UTC (permalink / raw)
  To: Alex Deucher, Liu Leo, Christian König; +Cc: amd-gfx, Sunil Khatri

VCN dump is dependent on power state of the ip. Dump is
valid if VCN was powered up at the time of ip dump.

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 28 +++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index 9e1cbeee10db..c2278cc49dd5 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
@@ -2329,7 +2329,7 @@ static void vcn_v3_0_print_ip_state(void *handle, struct drm_printer *p)
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int i, j;
 	uint32_t reg_count = ARRAY_SIZE(vcn_reg_list_3_0);
-	uint32_t inst_off;
+	uint32_t inst_off, is_powered;
 
 	if (!adev->vcn.ip_dump)
 		return;
@@ -2342,11 +2342,17 @@ static void vcn_v3_0_print_ip_state(void *handle, struct drm_printer *p)
 		}
 
 		inst_off = i * reg_count;
-		drm_printf(p, "\nActive Instance:VCN%d\n", i);
+		is_powered = (adev->vcn.ip_dump[inst_off] &
+				UVD_POWER_STATUS__UVD_POWER_STATUS_MASK) != 1;
 
-		for (j = 0; j < reg_count; j++)
-			drm_printf(p, "%-50s \t 0x%08x\n", vcn_reg_list_3_0[j].reg_name,
-				   adev->vcn.ip_dump[inst_off + j]);
+		if (is_powered) {
+			drm_printf(p, "\nActive Instance:VCN%d\n", i);
+			for (j = 0; j < reg_count; j++)
+				drm_printf(p, "%-50s \t 0x%08x\n", vcn_reg_list_3_0[j].reg_name,
+					   adev->vcn.ip_dump[inst_off + j]);
+		} else {
+			drm_printf(p, "\nInactive Instance:VCN%d\n", i);
+		}
 	}
 }
 
@@ -2354,7 +2360,7 @@ static void vcn_v3_0_dump_ip_state(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 	int i, j;
-	bool reg_safe;
+	bool is_powered;
 	uint32_t inst_off;
 	uint32_t reg_count = ARRAY_SIZE(vcn_reg_list_3_0);
 
@@ -2366,11 +2372,13 @@ static void vcn_v3_0_dump_ip_state(void *handle)
 			continue;
 
 		inst_off = i * reg_count;
-		reg_safe = (RREG32_SOC15(VCN, i, mmUVD_POWER_STATUS) &
-			    UVD_POWER_STATUS__UVD_POWER_STATUS_MASK) != 1;
+		/* mmUVD_POWER_STATUS is always readable and is first element of the array */
+		adev->vcn.ip_dump[inst_off] = RREG32_SOC15(VCN, i, mmUVD_POWER_STATUS);
+		is_powered = (adev->vcn.ip_dump[inst_off] &
+				UVD_POWER_STATUS__UVD_POWER_STATUS_MASK) != 1;
 
-		if (reg_safe)
-			for (j = 0; j < reg_count; j++)
+		if (is_powered)
+			for (j = 1; j < reg_count; j++)
 				adev->vcn.ip_dump[inst_off + j] =
 					RREG32(SOC15_REG_ENTRY_OFFSET_INST(vcn_reg_list_3_0[j], i));
 	}
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 12:47 [PATCH 1/2] drm/amdgpu: print VCN instance dump for valid instance Sunil Khatri
@ 2024-07-26 12:47 ` Sunil Khatri
  2024-07-26 13:12   ` Alex Deucher
  2024-07-26 13:12 ` [PATCH 1/2] drm/amdgpu: print VCN instance dump for valid instance Alex Deucher
  1 sibling, 1 reply; 15+ messages in thread
From: Sunil Khatri @ 2024-07-26 12:47 UTC (permalink / raw)
  To: Alex Deucher, Liu Leo, Christian König; +Cc: amd-gfx, Sunil Khatri

Problem:
IP dump right now is done post suspend of
all IP's which for some IP's could change power
state and software state too which we do not want
to reflect in the dump as it might not be same at
the time of hang.

Solution:
IP should be dumped as close to the HW state when
the GPU was in hung state without trying to reinitialize
any resource.

Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++-----------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 730dae77570c..74f6f15e73b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
 	return ret;
 }
 
+static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
+{
+	int i;
+
+	lockdep_assert_held(&adev->reset_domain->sem);
+
+	for (i = 0; i < adev->reset_info.num_regs; i++) {
+		adev->reset_info.reset_dump_reg_value[i] =
+			RREG32(adev->reset_info.reset_dump_reg_list[i]);
+
+		trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
+					     adev->reset_info.reset_dump_reg_value[i]);
+	}
+
+	return 0;
+}
+
 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 				 struct amdgpu_reset_context *reset_context)
 {
 	int i, r = 0;
 	struct amdgpu_job *job = NULL;
+	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
 	bool need_full_reset =
 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
@@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 			}
 		}
 
+		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
+			amdgpu_reset_reg_dumps(tmp_adev);
+
+			dev_info(tmp_adev->dev, "Dumping IP State\n");
+			/* Trigger ip dump before we reset the asic */
+			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
+				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
+					tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
+							(void *)tmp_adev);
+			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
+		}
+
 		if (need_full_reset)
 			r = amdgpu_device_ip_suspend(adev);
 		if (need_full_reset)
@@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
-{
-	int i;
-
-	lockdep_assert_held(&adev->reset_domain->sem);
-
-	for (i = 0; i < adev->reset_info.num_regs; i++) {
-		adev->reset_info.reset_dump_reg_value[i] =
-			RREG32(adev->reset_info.reset_dump_reg_list[i]);
-
-		trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
-					     adev->reset_info.reset_dump_reg_value[i]);
-	}
-
-	return 0;
-}
-
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 			 struct amdgpu_reset_context *reset_context)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset, skip_hw_reset, vram_lost = false;
 	int r = 0;
-	uint32_t i;
 
 	/* Try reset handler method first */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 				    reset_list);
 
-	if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
-		amdgpu_reset_reg_dumps(tmp_adev);
-
-		dev_info(tmp_adev->dev, "Dumping IP State\n");
-		/* Trigger ip dump before we reset the asic */
-		for (i = 0; i < tmp_adev->num_ip_blocks; i++)
-			if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
-				tmp_adev->ip_blocks[i].version->funcs
-				->dump_ip_state((void *)tmp_adev);
-		dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
-	}
-
 	reset_context->reset_device_list = device_list_handle;
 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
 	/* If reset handler not implemented, continue; otherwise return */
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 12:47 ` [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's Sunil Khatri
@ 2024-07-26 13:12   ` Alex Deucher
  2024-07-26 13:48     ` Lazar, Lijo
  0 siblings, 1 reply; 15+ messages in thread
From: Alex Deucher @ 2024-07-26 13:12 UTC (permalink / raw)
  To: Sunil Khatri; +Cc: Alex Deucher, Liu Leo, Christian König, amd-gfx

On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>
> Problem:
> IP dump right now is done post suspend of
> all IP's which for some IP's could change power
> state and software state too which we do not want
> to reflect in the dump as it might not be same at
> the time of hang.
>
> Solution:
> IP should be dumped as close to the HW state when
> the GPU was in hung state without trying to reinitialize
> any resource.
>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>

Acked-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++-----------
>  1 file changed, 30 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 730dae77570c..74f6f15e73b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
>         return ret;
>  }
>
> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> +{
> +       int i;
> +
> +       lockdep_assert_held(&adev->reset_domain->sem);
> +
> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
> +               adev->reset_info.reset_dump_reg_value[i] =
> +                       RREG32(adev->reset_info.reset_dump_reg_list[i]);
> +
> +               trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> +                                            adev->reset_info.reset_dump_reg_value[i]);
> +       }
> +
> +       return 0;
> +}
> +
>  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>                                  struct amdgpu_reset_context *reset_context)
>  {
>         int i, r = 0;
>         struct amdgpu_job *job = NULL;
> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>         bool need_full_reset =
>                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
>
> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>                         }
>                 }
>
> +               if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
> +                       amdgpu_reset_reg_dumps(tmp_adev);
> +
> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
> +                       /* Trigger ip dump before we reset the asic */
> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
> +                               if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> +                                       tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
> +                                                       (void *)tmp_adev);
> +                       dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
> +               }
> +
>                 if (need_full_reset)
>                         r = amdgpu_device_ip_suspend(adev);
>                 if (need_full_reset)
> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>         return r;
>  }
>
> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> -{
> -       int i;
> -
> -       lockdep_assert_held(&adev->reset_domain->sem);
> -
> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
> -               adev->reset_info.reset_dump_reg_value[i] =
> -                       RREG32(adev->reset_info.reset_dump_reg_list[i]);
> -
> -               trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> -                                            adev->reset_info.reset_dump_reg_value[i]);
> -       }
> -
> -       return 0;
> -}
> -
>  int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>                          struct amdgpu_reset_context *reset_context)
>  {
>         struct amdgpu_device *tmp_adev = NULL;
>         bool need_full_reset, skip_hw_reset, vram_lost = false;
>         int r = 0;
> -       uint32_t i;
>
>         /* Try reset handler method first */
>         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
>                                     reset_list);
>
> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
> -               amdgpu_reset_reg_dumps(tmp_adev);
> -
> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
> -               /* Trigger ip dump before we reset the asic */
> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
> -                       if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> -                               tmp_adev->ip_blocks[i].version->funcs
> -                               ->dump_ip_state((void *)tmp_adev);
> -               dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
> -       }
> -
>         reset_context->reset_device_list = device_list_handle;
>         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
>         /* If reset handler not implemented, continue; otherwise return */
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] drm/amdgpu: print VCN instance dump for valid instance
  2024-07-26 12:47 [PATCH 1/2] drm/amdgpu: print VCN instance dump for valid instance Sunil Khatri
  2024-07-26 12:47 ` [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's Sunil Khatri
@ 2024-07-26 13:12 ` Alex Deucher
  1 sibling, 0 replies; 15+ messages in thread
From: Alex Deucher @ 2024-07-26 13:12 UTC (permalink / raw)
  To: Sunil Khatri; +Cc: Alex Deucher, Liu Leo, Christian König, amd-gfx

On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>
> VCN dump is dependent on power state of the ip. Dump is
> valid if VCN was powered up at the time of ip dump.
>
> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 28 +++++++++++++++++----------
>  1 file changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
> index 9e1cbeee10db..c2278cc49dd5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
> @@ -2329,7 +2329,7 @@ static void vcn_v3_0_print_ip_state(void *handle, struct drm_printer *p)
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>         int i, j;
>         uint32_t reg_count = ARRAY_SIZE(vcn_reg_list_3_0);
> -       uint32_t inst_off;
> +       uint32_t inst_off, is_powered;
>
>         if (!adev->vcn.ip_dump)
>                 return;
> @@ -2342,11 +2342,17 @@ static void vcn_v3_0_print_ip_state(void *handle, struct drm_printer *p)
>                 }
>
>                 inst_off = i * reg_count;
> -               drm_printf(p, "\nActive Instance:VCN%d\n", i);
> +               is_powered = (adev->vcn.ip_dump[inst_off] &
> +                               UVD_POWER_STATUS__UVD_POWER_STATUS_MASK) != 1;
>
> -               for (j = 0; j < reg_count; j++)
> -                       drm_printf(p, "%-50s \t 0x%08x\n", vcn_reg_list_3_0[j].reg_name,
> -                                  adev->vcn.ip_dump[inst_off + j]);
> +               if (is_powered) {
> +                       drm_printf(p, "\nActive Instance:VCN%d\n", i);
> +                       for (j = 0; j < reg_count; j++)
> +                               drm_printf(p, "%-50s \t 0x%08x\n", vcn_reg_list_3_0[j].reg_name,
> +                                          adev->vcn.ip_dump[inst_off + j]);
> +               } else {
> +                       drm_printf(p, "\nInactive Instance:VCN%d\n", i);
> +               }
>         }
>  }
>
> @@ -2354,7 +2360,7 @@ static void vcn_v3_0_dump_ip_state(void *handle)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>         int i, j;
> -       bool reg_safe;
> +       bool is_powered;
>         uint32_t inst_off;
>         uint32_t reg_count = ARRAY_SIZE(vcn_reg_list_3_0);
>
> @@ -2366,11 +2372,13 @@ static void vcn_v3_0_dump_ip_state(void *handle)
>                         continue;
>
>                 inst_off = i * reg_count;
> -               reg_safe = (RREG32_SOC15(VCN, i, mmUVD_POWER_STATUS) &
> -                           UVD_POWER_STATUS__UVD_POWER_STATUS_MASK) != 1;
> +               /* mmUVD_POWER_STATUS is always readable and is first element of the array */
> +               adev->vcn.ip_dump[inst_off] = RREG32_SOC15(VCN, i, mmUVD_POWER_STATUS);
> +               is_powered = (adev->vcn.ip_dump[inst_off] &
> +                               UVD_POWER_STATUS__UVD_POWER_STATUS_MASK) != 1;
>
> -               if (reg_safe)
> -                       for (j = 0; j < reg_count; j++)
> +               if (is_powered)
> +                       for (j = 1; j < reg_count; j++)
>                                 adev->vcn.ip_dump[inst_off + j] =
>                                         RREG32(SOC15_REG_ENTRY_OFFSET_INST(vcn_reg_list_3_0[j], i));
>         }
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 13:12   ` Alex Deucher
@ 2024-07-26 13:48     ` Lazar, Lijo
  2024-07-26 14:23       ` Khatri, Sunil
  0 siblings, 1 reply; 15+ messages in thread
From: Lazar, Lijo @ 2024-07-26 13:48 UTC (permalink / raw)
  To: Alex Deucher, Sunil Khatri
  Cc: Alex Deucher, Liu Leo, Christian König, amd-gfx



On 7/26/2024 6:42 PM, Alex Deucher wrote:
> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>>
>> Problem:
>> IP dump right now is done post suspend of
>> all IP's which for some IP's could change power
>> state and software state too which we do not want
>> to reflect in the dump as it might not be same at
>> the time of hang.
>>
>> Solution:
>> IP should be dumped as close to the HW state when
>> the GPU was in hung state without trying to reinitialize
>> any resource.
>>
>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> 
> Acked-by: Alex Deucher <alexander.deucher@amd.com>
> 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++-----------
>>  1 file changed, 30 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 730dae77570c..74f6f15e73b5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
>>         return ret;
>>  }
>>
>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>> +{
>> +       int i;
>> +
>> +       lockdep_assert_held(&adev->reset_domain->sem);
>> +
>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>> +               adev->reset_info.reset_dump_reg_value[i] =
>> +                       RREG32(adev->reset_info.reset_dump_reg_list[i]);

A suspend also involves power/clock ungate. When reg dump is moved
earlier, I'm not sure if this read works for all. If it's left to
individual IP call backs, they could just do the same or better to move
these up before a dump.

        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

Thanks,
Lijo

>> +
>> +               trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>> +                                            adev->reset_info.reset_dump_reg_value[i]);
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>>  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>                                  struct amdgpu_reset_context *reset_context)
>>  {
>>         int i, r = 0;
>>         struct amdgpu_job *job = NULL;
>> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>>         bool need_full_reset =
>>                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
>>
>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>                         }
>>                 }
>>
>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>> +
>> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
>> +                       /* Trigger ip dump before we reset the asic */
>> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>> +                               if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>> +                                       tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>> +                                                       (void *)tmp_adev);
>> +                       dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>> +               }
>> +
>>                 if (need_full_reset)
>>                         r = amdgpu_device_ip_suspend(adev);
>>                 if (need_full_reset)
>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>         return r;
>>  }
>>
>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>> -{
>> -       int i;
>> -
>> -       lockdep_assert_held(&adev->reset_domain->sem);
>> -
>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>> -               adev->reset_info.reset_dump_reg_value[i] =
>> -                       RREG32(adev->reset_info.reset_dump_reg_list[i]);
>> -
>> -               trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>> -                                            adev->reset_info.reset_dump_reg_value[i]);
>> -       }
>> -
>> -       return 0;
>> -}
>> -
>>  int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>                          struct amdgpu_reset_context *reset_context)
>>  {
>>         struct amdgpu_device *tmp_adev = NULL;
>>         bool need_full_reset, skip_hw_reset, vram_lost = false;
>>         int r = 0;
>> -       uint32_t i;
>>
>>         /* Try reset handler method first */
>>         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
>>                                     reset_list);
>>
>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>> -               amdgpu_reset_reg_dumps(tmp_adev);
>> -
>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>> -               /* Trigger ip dump before we reset the asic */
>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>> -                       if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>> -                               tmp_adev->ip_blocks[i].version->funcs
>> -                               ->dump_ip_state((void *)tmp_adev);
>> -               dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>> -       }
>> -
>>         reset_context->reset_device_list = device_list_handle;
>>         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
>>         /* If reset handler not implemented, continue; otherwise return */
>> --
>> 2.34.1
>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 13:48     ` Lazar, Lijo
@ 2024-07-26 14:23       ` Khatri, Sunil
  2024-07-26 14:41         ` Khatri, Sunil
  0 siblings, 1 reply; 15+ messages in thread
From: Khatri, Sunil @ 2024-07-26 14:23 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher, Sunil Khatri
  Cc: Alex Deucher, Liu Leo, Christian König, amd-gfx


On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>
> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com> wrote:
>>> Problem:
>>> IP dump right now is done post suspend of
>>> all IP's which for some IP's could change power
>>> state and software state too which we do not want
>>> to reflect in the dump as it might not be same at
>>> the time of hang.
>>>
>>> Solution:
>>> IP should be dumped as close to the HW state when
>>> the GPU was in hung state without trying to reinitialize
>>> any resource.
>>>
>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++-----------
>>>   1 file changed, 30 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 730dae77570c..74f6f15e73b5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
>>>          return ret;
>>>   }
>>>
>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>> +{
>>> +       int i;
>>> +
>>> +       lockdep_assert_held(&adev->reset_domain->sem);
>>> +
>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>> +                       RREG32(adev->reset_info.reset_dump_reg_list[i]);
> A suspend also involves power/clock ungate. When reg dump is moved
> earlier, I'm not sure if this read works for all. If it's left to
> individual IP call backs, they could just do the same or better to move
> these up before a dump.
Suspend also put the status.hw = false and each IP in their respective 
suspend state which i feel does change the state of the HW.
To get the correct snapshot of the GPU register we should not be 
fiddling with the HW IP at least till we capture the dump and that is 
the intention behind the change.

Do you think there is a problem in this approach?
>
>          amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>          amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
Adding this does sounds better to enable just before we dump the 
registers but i am not very sure if ungating would be clean here or not. 
i Could try quickly adding these two calls just before dump.

All i am worried if it does change some register reflecting the original 
state of registers at dump.

What u suggest ?
Regards
Sunil

>
> Thanks,
> Lijo
>
>>> +
>>> +               trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>> +                                            adev->reset_info.reset_dump_reg_value[i]);
>>> +       }
>>> +
>>> +       return 0;
>>> +}
>>> +
>>>   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>                                   struct amdgpu_reset_context *reset_context)
>>>   {
>>>          int i, r = 0;
>>>          struct amdgpu_job *job = NULL;
>>> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>>>          bool need_full_reset =
>>>                  test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
>>>
>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>                          }
>>>                  }
>>>
>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>> +
>>> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
>>> +                       /* Trigger ip dump before we reset the asic */
>>> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>> +                               if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>> +                                       tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>> +                                                       (void *)tmp_adev);
>>> +                       dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>>> +               }
>>> +
>>>                  if (need_full_reset)
>>>                          r = amdgpu_device_ip_suspend(adev);
>>>                  if (need_full_reset)
>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>          return r;
>>>   }
>>>
>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>> -{
>>> -       int i;
>>> -
>>> -       lockdep_assert_held(&adev->reset_domain->sem);
>>> -
>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>> -                       RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>> -
>>> -               trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>> -                                            adev->reset_info.reset_dump_reg_value[i]);
>>> -       }
>>> -
>>> -       return 0;
>>> -}
>>> -
>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>                           struct amdgpu_reset_context *reset_context)
>>>   {
>>>          struct amdgpu_device *tmp_adev = NULL;
>>>          bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>          int r = 0;
>>> -       uint32_t i;
>>>
>>>          /* Try reset handler method first */
>>>          tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
>>>                                      reset_list);
>>>
>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>> -
>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>> -               /* Trigger ip dump before we reset the asic */
>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>> -                       if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>> -                               tmp_adev->ip_blocks[i].version->funcs
>>> -                               ->dump_ip_state((void *)tmp_adev);
>>> -               dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>>> -       }
>>> -
>>>          reset_context->reset_device_list = device_list_handle;
>>>          r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
>>>          /* If reset handler not implemented, continue; otherwise return */
>>> --
>>> 2.34.1
>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 14:23       ` Khatri, Sunil
@ 2024-07-26 14:41         ` Khatri, Sunil
  2024-07-26 15:06           ` Lazar, Lijo
  0 siblings, 1 reply; 15+ messages in thread
From: Khatri, Sunil @ 2024-07-26 14:41 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher, Sunil Khatri
  Cc: Alex Deucher, Liu Leo, Christian König, amd-gfx

[-- Attachment #1: Type: text/plain, Size: 8003 bytes --]


On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
>
> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>>
>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com> 
>>> wrote:
>>>> Problem:
>>>> IP dump right now is done post suspend of
>>>> all IP's which for some IP's could change power
>>>> state and software state too which we do not want
>>>> to reflect in the dump as it might not be same at
>>>> the time of hang.
>>>>
>>>> Solution:
>>>> IP should be dumped as close to the HW state when
>>>> the GPU was in hung state without trying to reinitialize
>>>> any resource.
>>>>
>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 
>>>> +++++++++++-----------
>>>>   1 file changed, 30 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 730dae77570c..74f6f15e73b5 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct 
>>>> amdgpu_device *adev)
>>>>          return ret;
>>>>   }
>>>>
>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>> +{
>>>> +       int i;
>>>> +
>>>> + lockdep_assert_held(&adev->reset_domain->sem);
>>>> +
>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>>> + RREG32(adev->reset_info.reset_dump_reg_list[i]);
>> A suspend also involves power/clock ungate. When reg dump is moved
>> earlier, I'm not sure if this read works for all. If it's left to
>> individual IP call backs, they could just do the same or better to move
>> these up before a dump.
> Suspend also put the status.hw = false and each IP in their respective 
> suspend state which i feel does change the state of the HW.
> To get the correct snapshot of the GPU register we should not be 
> fiddling with the HW IP at least till we capture the dump and that is 
> the intention behind the change.
>
> Do you think there is a problem in this approach?
>>
>>          amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>          amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
> Adding this does sounds better to enable just before we dump the 
> registers but i am not very sure if ungating would be clean here or 
> not. i Could try quickly adding these two calls just before dump.
>
> All i am worried if it does change some register reflecting the 
> original state of registers at dump.
>
> What u suggest ?
> Regards
> Sunil
I quickly validated on Navi22 by adding below call just before we dump 
registers
if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
amdgpu_reset_reg_dumps(tmp_adev);
dev_info(tmp_adev->dev, "Dumping IP State\n");
     /* Trigger ip dump before we reset the asic */
for(i=0; i<tmp_adev->num_ip_blocks; i++)
if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
                                     (void*)tmp_adev);
dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
}
If this sounds fine with you i am update that. Regards Sunil Khatri
>
>>
>> Thanks,
>> Lijo
>>
>>>> +
>>>> + 
>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>> + adev->reset_info.reset_dump_reg_value[i]);
>>>> +       }
>>>> +
>>>> +       return 0;
>>>> +}
>>>> +
>>>>   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>                                   struct amdgpu_reset_context 
>>>> *reset_context)
>>>>   {
>>>>          int i, r = 0;
>>>>          struct amdgpu_job *job = NULL;
>>>> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>>>>          bool need_full_reset =
>>>>                  test_bit(AMDGPU_NEED_FULL_RESET, 
>>>> &reset_context->flags);
>>>>
>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct 
>>>> amdgpu_device *adev,
>>>>                          }
>>>>                  }
>>>>
>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP, 
>>>> &reset_context->flags)) {
>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>>> +
>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>> +                       /* Trigger ip dump before we reset the asic */
>>>> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>> +                               if 
>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>> + tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>> + (void *)tmp_adev);
>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State 
>>>> Completed\n");
>>>> +               }
>>>> +
>>>>                  if (need_full_reset)
>>>>                          r = amdgpu_device_ip_suspend(adev);
>>>>                  if (need_full_reset)
>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct 
>>>> amdgpu_device *adev,
>>>>          return r;
>>>>   }
>>>>
>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>> -{
>>>> -       int i;
>>>> -
>>>> - lockdep_assert_held(&adev->reset_domain->sem);
>>>> -
>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>>> - RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>> -
>>>> - 
>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>> - adev->reset_info.reset_dump_reg_value[i]);
>>>> -       }
>>>> -
>>>> -       return 0;
>>>> -}
>>>> -
>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>                           struct amdgpu_reset_context *reset_context)
>>>>   {
>>>>          struct amdgpu_device *tmp_adev = NULL;
>>>>          bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>>          int r = 0;
>>>> -       uint32_t i;
>>>>
>>>>          /* Try reset handler method first */
>>>>          tmp_adev = list_first_entry(device_list_handle, struct 
>>>> amdgpu_device,
>>>>                                      reset_list);
>>>>
>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>>> -
>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>> -               /* Trigger ip dump before we reset the asic */
>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>> -                       if 
>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>> - tmp_adev->ip_blocks[i].version->funcs
>>>> -                               ->dump_ip_state((void *)tmp_adev);
>>>> -               dev_info(tmp_adev->dev, "Dumping IP State 
>>>> Completed\n");
>>>> -       }
>>>> -
>>>>          reset_context->reset_device_list = device_list_handle;
>>>>          r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
>>>>          /* If reset handler not implemented, continue; otherwise 
>>>> return */
>>>> -- 
>>>> 2.34.1
>>>>

[-- Attachment #2: Type: text/html, Size: 22665 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 14:41         ` Khatri, Sunil
@ 2024-07-26 15:06           ` Lazar, Lijo
  2024-07-26 17:16             ` Khatri, Sunil
  0 siblings, 1 reply; 15+ messages in thread
From: Lazar, Lijo @ 2024-07-26 15:06 UTC (permalink / raw)
  To: Khatri, Sunil, Alex Deucher, Sunil Khatri
  Cc: Alex Deucher, Liu Leo, Christian König, amd-gfx



On 7/26/2024 8:11 PM, Khatri, Sunil wrote:
> 
> On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
>>
>> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>>>
>>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com>
>>>> wrote:
>>>>> Problem:
>>>>> IP dump right now is done post suspend of
>>>>> all IP's which for some IP's could change power
>>>>> state and software state too which we do not want
>>>>> to reflect in the dump as it might not be same at
>>>>> the time of hang.
>>>>>
>>>>> Solution:
>>>>> IP should be dumped as close to the HW state when
>>>>> the GPU was in hung state without trying to reinitialize
>>>>> any resource.
>>>>>
>>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>>>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60
>>>>> +++++++++++-----------
>>>>>   1 file changed, 30 insertions(+), 30 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 730dae77570c..74f6f15e73b5 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct
>>>>> amdgpu_device *adev)
>>>>>          return ret;
>>>>>   }
>>>>>
>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>> +{
>>>>> +       int i;
>>>>> +
>>>>> +       lockdep_assert_held(&adev->reset_domain->sem);
>>>>> +
>>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>>>> +                      
>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>> A suspend also involves power/clock ungate. When reg dump is moved
>>> earlier, I'm not sure if this read works for all. If it's left to
>>> individual IP call backs, they could just do the same or better to move
>>> these up before a dump.
>> Suspend also put the status.hw = false and each IP in their respective
>> suspend state which i feel does change the state of the HW.
>> To get the correct snapshot of the GPU register we should not be
>> fiddling with the HW IP at least till we capture the dump and that is
>> the intention behind the change.
>>
>> Do you think there is a problem in this approach?
>>>
>>>          amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>          amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>> Adding this does sounds better to enable just before we dump the
>> registers but i am not very sure if ungating would be clean here or
>> not. i Could try quickly adding these two calls just before dump.
>>
>> All i am worried if it does change some register reflecting the
>> original state of registers at dump.
>>

I was thinking that if it includes some GFX regs and the hang happened
because of some SDMA/VCN jobs which somehow keeps GFXOFF state intact.

BTW, since there is already dump_ip state which could capture IP regs
separately and handle their power/clock gate situations, do you think
this generic one is still needed?

Thanks,
Lijo

>> What u suggest ?
>> Regards
>> Sunil
> I quickly validated on Navi22 by adding below call just before we dump
> registers
> if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>                        
>     amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>     amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>                        
>     amdgpu_reset_reg_dumps(tmp_adev);
>     dev_info(tmp_adev->dev, "Dumping IP State\n");
>     /* Trigger ip dump before we reset the asic */
>     for(i=0; i<tmp_adev->num_ip_blocks; i++)
>         if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>             tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>                                     (void*)tmp_adev);
>     dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
> }
> If this sounds fine with you i am update that. Regards Sunil Khatri
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>>> +
>>>>> +              
>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>> +                                           
>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>> +       }
>>>>> +
>>>>> +       return 0;
>>>>> +}
>>>>> +
>>>>>   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>                                   struct amdgpu_reset_context
>>>>> *reset_context)
>>>>>   {
>>>>>          int i, r = 0;
>>>>>          struct amdgpu_job *job = NULL;
>>>>> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>>>>>          bool need_full_reset =
>>>>>                  test_bit(AMDGPU_NEED_FULL_RESET,
>>>>> &reset_context->flags);
>>>>>
>>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct
>>>>> amdgpu_device *adev,
>>>>>                          }
>>>>>                  }
>>>>>
>>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>> &reset_context->flags)) {
>>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>>>> +
>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>> +                       /* Trigger ip dump before we reset the asic */
>>>>> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>> +                               if
>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>> +                                      
>>>>> tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>> +                                                       (void
>>>>> *)tmp_adev);
>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State
>>>>> Completed\n");
>>>>> +               }
>>>>> +
>>>>>                  if (need_full_reset)
>>>>>                          r = amdgpu_device_ip_suspend(adev);
>>>>>                  if (need_full_reset)
>>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct
>>>>> amdgpu_device *adev,
>>>>>          return r;
>>>>>   }
>>>>>
>>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>> -{
>>>>> -       int i;
>>>>> -
>>>>> -       lockdep_assert_held(&adev->reset_domain->sem);
>>>>> -
>>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>>>> -                      
>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>> -
>>>>> -              
>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>> -                                           
>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>> -       }
>>>>> -
>>>>> -       return 0;
>>>>> -}
>>>>> -
>>>>>   int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>                           struct amdgpu_reset_context *reset_context)
>>>>>   {
>>>>>          struct amdgpu_device *tmp_adev = NULL;
>>>>>          bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>>>          int r = 0;
>>>>> -       uint32_t i;
>>>>>
>>>>>          /* Try reset handler method first */
>>>>>          tmp_adev = list_first_entry(device_list_handle, struct
>>>>> amdgpu_device,
>>>>>                                      reset_list);
>>>>>
>>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>>>> -
>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>> -               /* Trigger ip dump before we reset the asic */
>>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>> -                       if
>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>> -                               tmp_adev->ip_blocks[i].version->funcs
>>>>> -                               ->dump_ip_state((void *)tmp_adev);
>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State
>>>>> Completed\n");
>>>>> -       }
>>>>> -
>>>>>          reset_context->reset_device_list = device_list_handle;
>>>>>          r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
>>>>>          /* If reset handler not implemented, continue; otherwise
>>>>> return */
>>>>> -- 
>>>>> 2.34.1
>>>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 15:06           ` Lazar, Lijo
@ 2024-07-26 17:16             ` Khatri, Sunil
  2024-07-26 18:43               ` Alex Deucher
  0 siblings, 1 reply; 15+ messages in thread
From: Khatri, Sunil @ 2024-07-26 17:16 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher, Sunil Khatri
  Cc: Alex Deucher, Liu Leo, Christian König, amd-gfx


On 7/26/2024 8:36 PM, Lazar, Lijo wrote:
>
> On 7/26/2024 8:11 PM, Khatri, Sunil wrote:
>> On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
>>> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>>>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>>>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com>
>>>>> wrote:
>>>>>> Problem:
>>>>>> IP dump right now is done post suspend of
>>>>>> all IP's which for some IP's could change power
>>>>>> state and software state too which we do not want
>>>>>> to reflect in the dump as it might not be same at
>>>>>> the time of hang.
>>>>>>
>>>>>> Solution:
>>>>>> IP should be dumped as close to the HW state when
>>>>>> the GPU was in hung state without trying to reinitialize
>>>>>> any resource.
>>>>>>
>>>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>
>>>>>> ---
>>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60
>>>>>> +++++++++++-----------
>>>>>>    1 file changed, 30 insertions(+), 30 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index 730dae77570c..74f6f15e73b5 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct
>>>>>> amdgpu_device *adev)
>>>>>>           return ret;
>>>>>>    }
>>>>>>
>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>> +{
>>>>>> +       int i;
>>>>>> +
>>>>>> +       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>> +
>>>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>>>>> +
>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>> A suspend also involves power/clock ungate. When reg dump is moved
>>>> earlier, I'm not sure if this read works for all. If it's left to
>>>> individual IP call backs, they could just do the same or better to move
>>>> these up before a dump.
>>> Suspend also put the status.hw = false and each IP in their respective
>>> suspend state which i feel does change the state of the HW.
>>> To get the correct snapshot of the GPU register we should not be
>>> fiddling with the HW IP at least till we capture the dump and that is
>>> the intention behind the change.
>>>
>>> Do you think there is a problem in this approach?
>>>>           amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>           amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>> Adding this does sounds better to enable just before we dump the
>>> registers but i am not very sure if ungating would be clean here or
>>> not. i Could try quickly adding these two calls just before dump.
>>>
>>> All i am worried if it does change some register reflecting the
>>> original state of registers at dump.
>>>
> I was thinking that if it includes some GFX regs and the hang happened
> because of some SDMA/VCN jobs which somehow keeps GFXOFF state intact.

For GFX and SDMA hangs we make sure to disable GFXOFF before so for 
those IP's
I am not worried and also based on my testing on NAVI22 for GPU hang 
their registers
seems to be read cleanly.
Another point that i was thinking is after a GPU hang no where till the 
point of dump
any registers are touched and that is what we need considering we are 
able to read the registers.

For VCN there is dynamic gating which is controlled by the FW if i am 
not wrong which makes the VCN
off and registers cant be read. Only in case of VCN hang i am assuming 
due to a Job pending VCN should be in power ON
state and out intent of reading the registers should work fine. Sadly i 
am unable to validate it as there arent any test readily
available to hang VCN.

>
> BTW, since there is already dump_ip state which could capture IP regs
> separately and handle their power/clock gate situations, do you think
> this generic one is still needed?
>
> For whatever we have implemented till now seems to work fine in case of GPU hang withotu fidling the
> power state explicitly. But Calling suspend of each IP surely seems to change some state in IPs and SW state too.
> So i think until we experience a real problem we should go without the generic UNGATE call for all IP's
>
> But i am not an expert of power, so whatever you suggest would make more sense for the driver.
Regards
Sunil Khatri
>
> Thanks,
> Lijo
>
>>> What u suggest ?
>>> Regards
>>> Sunil
>> I quickly validated on Navi22 by adding below call just before we dump
>> registers
>> if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>                         
>>      amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>      amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>                         
>>      amdgpu_reset_reg_dumps(tmp_adev);
>>      dev_info(tmp_adev->dev, "Dumping IP State\n");
>>      /* Trigger ip dump before we reset the asic */
>>      for(i=0; i<tmp_adev->num_ip_blocks; i++)
>>          if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>              tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>                                      (void*)tmp_adev);
>>      dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>> }
>> If this sounds fine with you i am update that. Regards Sunil Khatri
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>> +
>>>>>> +
>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>> +
>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>> +       }
>>>>>> +
>>>>>> +       return 0;
>>>>>> +}
>>>>>> +
>>>>>>    int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>>                                    struct amdgpu_reset_context
>>>>>> *reset_context)
>>>>>>    {
>>>>>>           int i, r = 0;
>>>>>>           struct amdgpu_job *job = NULL;
>>>>>> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>>>>>>           bool need_full_reset =
>>>>>>                   test_bit(AMDGPU_NEED_FULL_RESET,
>>>>>> &reset_context->flags);
>>>>>>
>>>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>> amdgpu_device *adev,
>>>>>>                           }
>>>>>>                   }
>>>>>>
>>>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>> &reset_context->flags)) {
>>>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>>>>> +
>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>> +                       /* Trigger ip dump before we reset the asic */
>>>>>> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>>> +                               if
>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>> +
>>>>>> tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>> +                                                       (void
>>>>>> *)tmp_adev);
>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State
>>>>>> Completed\n");
>>>>>> +               }
>>>>>> +
>>>>>>                   if (need_full_reset)
>>>>>>                           r = amdgpu_device_ip_suspend(adev);
>>>>>>                   if (need_full_reset)
>>>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>> amdgpu_device *adev,
>>>>>>           return r;
>>>>>>    }
>>>>>>
>>>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>> -{
>>>>>> -       int i;
>>>>>> -
>>>>>> -       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>> -
>>>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>>>>> -
>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>> -
>>>>>> -
>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>> -
>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>> -       }
>>>>>> -
>>>>>> -       return 0;
>>>>>> -}
>>>>>> -
>>>>>>    int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>                            struct amdgpu_reset_context *reset_context)
>>>>>>    {
>>>>>>           struct amdgpu_device *tmp_adev = NULL;
>>>>>>           bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>>>>           int r = 0;
>>>>>> -       uint32_t i;
>>>>>>
>>>>>>           /* Try reset handler method first */
>>>>>>           tmp_adev = list_first_entry(device_list_handle, struct
>>>>>> amdgpu_device,
>>>>>>                                       reset_list);
>>>>>>
>>>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>>>>> -
>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>> -               /* Trigger ip dump before we reset the asic */
>>>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>>> -                       if
>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>> -                               tmp_adev->ip_blocks[i].version->funcs
>>>>>> -                               ->dump_ip_state((void *)tmp_adev);
>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State
>>>>>> Completed\n");
>>>>>> -       }
>>>>>> -
>>>>>>           reset_context->reset_device_list = device_list_handle;
>>>>>>           r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
>>>>>>           /* If reset handler not implemented, continue; otherwise
>>>>>> return */
>>>>>> -- 
>>>>>> 2.34.1
>>>>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 17:16             ` Khatri, Sunil
@ 2024-07-26 18:43               ` Alex Deucher
  2024-07-26 19:21                 ` Khatri, Sunil
  0 siblings, 1 reply; 15+ messages in thread
From: Alex Deucher @ 2024-07-26 18:43 UTC (permalink / raw)
  To: Khatri, Sunil
  Cc: Lazar, Lijo, Sunil Khatri, Alex Deucher, Liu Leo,
	Christian König, amd-gfx

On Fri, Jul 26, 2024 at 1:16 PM Khatri, Sunil <sukhatri@amd.com> wrote:
>
>
> On 7/26/2024 8:36 PM, Lazar, Lijo wrote:
> >
> > On 7/26/2024 8:11 PM, Khatri, Sunil wrote:
> >> On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
> >>> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
> >>>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
> >>>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com>
> >>>>> wrote:
> >>>>>> Problem:
> >>>>>> IP dump right now is done post suspend of
> >>>>>> all IP's which for some IP's could change power
> >>>>>> state and software state too which we do not want
> >>>>>> to reflect in the dump as it might not be same at
> >>>>>> the time of hang.
> >>>>>>
> >>>>>> Solution:
> >>>>>> IP should be dumped as close to the HW state when
> >>>>>> the GPU was in hung state without trying to reinitialize
> >>>>>> any resource.
> >>>>>>
> >>>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
> >>>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
> >>>>>
> >>>>>> ---
> >>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60
> >>>>>> +++++++++++-----------
> >>>>>>    1 file changed, 30 insertions(+), 30 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>> index 730dae77570c..74f6f15e73b5 100644
> >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> >>>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct
> >>>>>> amdgpu_device *adev)
> >>>>>>           return ret;
> >>>>>>    }
> >>>>>>
> >>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> >>>>>> +{
> >>>>>> +       int i;
> >>>>>> +
> >>>>>> +       lockdep_assert_held(&adev->reset_domain->sem);
> >>>>>> +
> >>>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
> >>>>>> +               adev->reset_info.reset_dump_reg_value[i] =
> >>>>>> +
> >>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
> >>>> A suspend also involves power/clock ungate. When reg dump is moved
> >>>> earlier, I'm not sure if this read works for all. If it's left to
> >>>> individual IP call backs, they could just do the same or better to move
> >>>> these up before a dump.
> >>> Suspend also put the status.hw = false and each IP in their respective
> >>> suspend state which i feel does change the state of the HW.
> >>> To get the correct snapshot of the GPU register we should not be
> >>> fiddling with the HW IP at least till we capture the dump and that is
> >>> the intention behind the change.
> >>>
> >>> Do you think there is a problem in this approach?
> >>>>           amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> >>>>           amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
> >>> Adding this does sounds better to enable just before we dump the
> >>> registers but i am not very sure if ungating would be clean here or
> >>> not. i Could try quickly adding these two calls just before dump.
> >>>
> >>> All i am worried if it does change some register reflecting the
> >>> original state of registers at dump.
> >>>
> > I was thinking that if it includes some GFX regs and the hang happened
> > because of some SDMA/VCN jobs which somehow keeps GFXOFF state intact.
>
> For GFX and SDMA hangs we make sure to disable GFXOFF before so for
> those IP's
> I am not worried and also based on my testing on NAVI22 for GPU hang
> their registers
> seems to be read cleanly.
> Another point that i was thinking is after a GPU hang no where till the
> point of dump
> any registers are touched and that is what we need considering we are
> able to read the registers.
>
> For VCN there is dynamic gating which is controlled by the FW if i am
> not wrong which makes the VCN
> off and registers cant be read. Only in case of VCN hang i am assuming
> due to a Job pending VCN should be in power ON
> state and out intent of reading the registers should work fine. Sadly i
> am unable to validate it as there arent any test readily
> available to hang VCN.

We want to take the register dump as early as possible in the reset
sequence, ideally before any of the hw gets touched as part of the
reset sequence.  Otherwise, the dump is not going to be useful.

Alex


>
> >
> > BTW, since there is already dump_ip state which could capture IP regs
> > separately and handle their power/clock gate situations, do you think
> > this generic one is still needed?
> >
> > For whatever we have implemented till now seems to work fine in case of GPU hang withotu fidling the
> > power state explicitly. But Calling suspend of each IP surely seems to change some state in IPs and SW state too.
> > So i think until we experience a real problem we should go without the generic UNGATE call for all IP's
> >
> > But i am not an expert of power, so whatever you suggest would make more sense for the driver.
> Regards
> Sunil Khatri
> >
> > Thanks,
> > Lijo
> >
> >>> What u suggest ?
> >>> Regards
> >>> Sunil
> >> I quickly validated on Navi22 by adding below call just before we dump
> >> registers
> >> if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
> >>
> >>      amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
> >>      amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
> >>
> >>      amdgpu_reset_reg_dumps(tmp_adev);
> >>      dev_info(tmp_adev->dev, "Dumping IP State\n");
> >>      /* Trigger ip dump before we reset the asic */
> >>      for(i=0; i<tmp_adev->num_ip_blocks; i++)
> >>          if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> >>              tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
> >>                                      (void*)tmp_adev);
> >>      dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
> >> }
> >> If this sounds fine with you i am update that. Regards Sunil Khatri
> >>>> Thanks,
> >>>> Lijo
> >>>>
> >>>>>> +
> >>>>>> +
> >>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> >>>>>> +
> >>>>>> adev->reset_info.reset_dump_reg_value[i]);
> >>>>>> +       }
> >>>>>> +
> >>>>>> +       return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>>    int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> >>>>>>                                    struct amdgpu_reset_context
> >>>>>> *reset_context)
> >>>>>>    {
> >>>>>>           int i, r = 0;
> >>>>>>           struct amdgpu_job *job = NULL;
> >>>>>> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
> >>>>>>           bool need_full_reset =
> >>>>>>                   test_bit(AMDGPU_NEED_FULL_RESET,
> >>>>>> &reset_context->flags);
> >>>>>>
> >>>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct
> >>>>>> amdgpu_device *adev,
> >>>>>>                           }
> >>>>>>                   }
> >>>>>>
> >>>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP,
> >>>>>> &reset_context->flags)) {
> >>>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
> >>>>>> +
> >>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
> >>>>>> +                       /* Trigger ip dump before we reset the asic */
> >>>>>> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
> >>>>>> +                               if
> >>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> >>>>>> +
> >>>>>> tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
> >>>>>> +                                                       (void
> >>>>>> *)tmp_adev);
> >>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State
> >>>>>> Completed\n");
> >>>>>> +               }
> >>>>>> +
> >>>>>>                   if (need_full_reset)
> >>>>>>                           r = amdgpu_device_ip_suspend(adev);
> >>>>>>                   if (need_full_reset)
> >>>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct
> >>>>>> amdgpu_device *adev,
> >>>>>>           return r;
> >>>>>>    }
> >>>>>>
> >>>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> >>>>>> -{
> >>>>>> -       int i;
> >>>>>> -
> >>>>>> -       lockdep_assert_held(&adev->reset_domain->sem);
> >>>>>> -
> >>>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
> >>>>>> -               adev->reset_info.reset_dump_reg_value[i] =
> >>>>>> -
> >>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
> >>>>>> -
> >>>>>> -
> >>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> >>>>>> -
> >>>>>> adev->reset_info.reset_dump_reg_value[i]);
> >>>>>> -       }
> >>>>>> -
> >>>>>> -       return 0;
> >>>>>> -}
> >>>>>> -
> >>>>>>    int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> >>>>>>                            struct amdgpu_reset_context *reset_context)
> >>>>>>    {
> >>>>>>           struct amdgpu_device *tmp_adev = NULL;
> >>>>>>           bool need_full_reset, skip_hw_reset, vram_lost = false;
> >>>>>>           int r = 0;
> >>>>>> -       uint32_t i;
> >>>>>>
> >>>>>>           /* Try reset handler method first */
> >>>>>>           tmp_adev = list_first_entry(device_list_handle, struct
> >>>>>> amdgpu_device,
> >>>>>>                                       reset_list);
> >>>>>>
> >>>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
> >>>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
> >>>>>> -
> >>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
> >>>>>> -               /* Trigger ip dump before we reset the asic */
> >>>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
> >>>>>> -                       if
> >>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> >>>>>> -                               tmp_adev->ip_blocks[i].version->funcs
> >>>>>> -                               ->dump_ip_state((void *)tmp_adev);
> >>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State
> >>>>>> Completed\n");
> >>>>>> -       }
> >>>>>> -
> >>>>>>           reset_context->reset_device_list = device_list_handle;
> >>>>>>           r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
> >>>>>>           /* If reset handler not implemented, continue; otherwise
> >>>>>> return */
> >>>>>> --
> >>>>>> 2.34.1
> >>>>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 18:43               ` Alex Deucher
@ 2024-07-26 19:21                 ` Khatri, Sunil
  2024-07-29  4:38                   ` Lazar, Lijo
  0 siblings, 1 reply; 15+ messages in thread
From: Khatri, Sunil @ 2024-07-26 19:21 UTC (permalink / raw)
  To: Alex Deucher
  Cc: Lazar, Lijo, Sunil Khatri, Alex Deucher, Liu Leo,
	Christian König, amd-gfx


On 7/27/2024 12:13 AM, Alex Deucher wrote:
> On Fri, Jul 26, 2024 at 1:16 PM Khatri, Sunil <sukhatri@amd.com> wrote:
>>
>> On 7/26/2024 8:36 PM, Lazar, Lijo wrote:
>>> On 7/26/2024 8:11 PM, Khatri, Sunil wrote:
>>>> On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
>>>>> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>>>>>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>>>>>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com>
>>>>>>> wrote:
>>>>>>>> Problem:
>>>>>>>> IP dump right now is done post suspend of
>>>>>>>> all IP's which for some IP's could change power
>>>>>>>> state and software state too which we do not want
>>>>>>>> to reflect in the dump as it might not be same at
>>>>>>>> the time of hang.
>>>>>>>>
>>>>>>>> Solution:
>>>>>>>> IP should be dumped as close to the HW state when
>>>>>>>> the GPU was in hung state without trying to reinitialize
>>>>>>>> any resource.
>>>>>>>>
>>>>>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>>>>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>
>>>>>>>> ---
>>>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60
>>>>>>>> +++++++++++-----------
>>>>>>>>     1 file changed, 30 insertions(+), 30 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> index 730dae77570c..74f6f15e73b5 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct
>>>>>>>> amdgpu_device *adev)
>>>>>>>>            return ret;
>>>>>>>>     }
>>>>>>>>
>>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>> +{
>>>>>>>> +       int i;
>>>>>>>> +
>>>>>>>> +       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>> +
>>>>>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>> +
>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>> A suspend also involves power/clock ungate. When reg dump is moved
>>>>>> earlier, I'm not sure if this read works for all. If it's left to
>>>>>> individual IP call backs, they could just do the same or better to move
>>>>>> these up before a dump.
>>>>> Suspend also put the status.hw = false and each IP in their respective
>>>>> suspend state which i feel does change the state of the HW.
>>>>> To get the correct snapshot of the GPU register we should not be
>>>>> fiddling with the HW IP at least till we capture the dump and that is
>>>>> the intention behind the change.
>>>>>
>>>>> Do you think there is a problem in this approach?
>>>>>>            amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>>            amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>> Adding this does sounds better to enable just before we dump the
>>>>> registers but i am not very sure if ungating would be clean here or
>>>>> not. i Could try quickly adding these two calls just before dump.
>>>>>
>>>>> All i am worried if it does change some register reflecting the
>>>>> original state of registers at dump.
>>>>>
>>> I was thinking that if it includes some GFX regs and the hang happened
>>> because of some SDMA/VCN jobs which somehow keeps GFXOFF state intact.
>> For GFX and SDMA hangs we make sure to disable GFXOFF before so for
>> those IP's
>> I am not worried and also based on my testing on NAVI22 for GPU hang
>> their registers
>> seems to be read cleanly.
>> Another point that i was thinking is after a GPU hang no where till the
>> point of dump
>> any registers are touched and that is what we need considering we are
>> able to read the registers.
>>
>> For VCN there is dynamic gating which is controlled by the FW if i am
>> not wrong which makes the VCN
>> off and registers cant be read. Only in case of VCN hang i am assuming
>> due to a Job pending VCN should be in power ON
>> state and out intent of reading the registers should work fine. Sadly i
>> am unable to validate it as there arent any test readily
>> available to hang VCN.
> We want to take the register dump as early as possible in the reset
> sequence, ideally before any of the hw gets touched as part of the
> reset sequence.  Otherwise, the dump is not going to be useful.
>
> Alex

Sure Alex. I am also of the same view that we dont want to change any 
power status of any IP as it could change the values.

Regards
Sunil Khatri

>
>
>>> BTW, since there is already dump_ip state which could capture IP regs
>>> separately and handle their power/clock gate situations, do you think
>>> this generic one is still needed?
>>>
>>> For whatever we have implemented till now seems to work fine in case of GPU hang withotu fidling the
>>> power state explicitly. But Calling suspend of each IP surely seems to change some state in IPs and SW state too.
>>> So i think until we experience a real problem we should go without the generic UNGATE call for all IP's
>>>
>>> But i am not an expert of power, so whatever you suggest would make more sense for the driver.
>> Regards
>> Sunil Khatri
>>> Thanks,
>>> Lijo
>>>
>>>>> What u suggest ?
>>>>> Regards
>>>>> Sunil
>>>> I quickly validated on Navi22 by adding below call just before we dump
>>>> registers
>>>> if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>>
>>>>       amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>       amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>
>>>>       amdgpu_reset_reg_dumps(tmp_adev);
>>>>       dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>       /* Trigger ip dump before we reset the asic */
>>>>       for(i=0; i<tmp_adev->num_ip_blocks; i++)
>>>>           if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>               tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>                                       (void*)tmp_adev);
>>>>       dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>>>> }
>>>> If this sounds fine with you i am update that. Regards Sunil Khatri
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>>>> +
>>>>>>>> +
>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>> +
>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>> +       }
>>>>>>>> +
>>>>>>>> +       return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>     int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>>>>                                     struct amdgpu_reset_context
>>>>>>>> *reset_context)
>>>>>>>>     {
>>>>>>>>            int i, r = 0;
>>>>>>>>            struct amdgpu_job *job = NULL;
>>>>>>>> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>>>>>>>>            bool need_full_reset =
>>>>>>>>                    test_bit(AMDGPU_NEED_FULL_RESET,
>>>>>>>> &reset_context->flags);
>>>>>>>>
>>>>>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>                            }
>>>>>>>>                    }
>>>>>>>>
>>>>>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>> &reset_context->flags)) {
>>>>>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>> +
>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>>> +                       /* Trigger ip dump before we reset the asic */
>>>>>>>> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>>>>> +                               if
>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>> +
>>>>>>>> tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>>>> +                                                       (void
>>>>>>>> *)tmp_adev);
>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>> Completed\n");
>>>>>>>> +               }
>>>>>>>> +
>>>>>>>>                    if (need_full_reset)
>>>>>>>>                            r = amdgpu_device_ip_suspend(adev);
>>>>>>>>                    if (need_full_reset)
>>>>>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>> amdgpu_device *adev,
>>>>>>>>            return r;
>>>>>>>>     }
>>>>>>>>
>>>>>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>> -{
>>>>>>>> -       int i;
>>>>>>>> -
>>>>>>>> -       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>> -
>>>>>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>> -
>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>>> -
>>>>>>>> -
>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>> -
>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>> -       }
>>>>>>>> -
>>>>>>>> -       return 0;
>>>>>>>> -}
>>>>>>>> -
>>>>>>>>     int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>>>                             struct amdgpu_reset_context *reset_context)
>>>>>>>>     {
>>>>>>>>            struct amdgpu_device *tmp_adev = NULL;
>>>>>>>>            bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>>>>>>            int r = 0;
>>>>>>>> -       uint32_t i;
>>>>>>>>
>>>>>>>>            /* Try reset handler method first */
>>>>>>>>            tmp_adev = list_first_entry(device_list_handle, struct
>>>>>>>> amdgpu_device,
>>>>>>>>                                        reset_list);
>>>>>>>>
>>>>>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>>>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>> -
>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>>> -               /* Trigger ip dump before we reset the asic */
>>>>>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>>>>> -                       if
>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>> -                               tmp_adev->ip_blocks[i].version->funcs
>>>>>>>> -                               ->dump_ip_state((void *)tmp_adev);
>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>> Completed\n");
>>>>>>>> -       }
>>>>>>>> -
>>>>>>>>            reset_context->reset_device_list = device_list_handle;
>>>>>>>>            r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
>>>>>>>>            /* If reset handler not implemented, continue; otherwise
>>>>>>>> return */
>>>>>>>> --
>>>>>>>> 2.34.1
>>>>>>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-26 19:21                 ` Khatri, Sunil
@ 2024-07-29  4:38                   ` Lazar, Lijo
  2024-07-29  5:38                     ` Khatri, Sunil
  0 siblings, 1 reply; 15+ messages in thread
From: Lazar, Lijo @ 2024-07-29  4:38 UTC (permalink / raw)
  To: Khatri, Sunil, Alex Deucher
  Cc: Sunil Khatri, Alex Deucher, Liu Leo, Christian König,
	amd-gfx



On 7/27/2024 12:51 AM, Khatri, Sunil wrote:
> 
> On 7/27/2024 12:13 AM, Alex Deucher wrote:
>> On Fri, Jul 26, 2024 at 1:16 PM Khatri, Sunil <sukhatri@amd.com> wrote:
>>>
>>> On 7/26/2024 8:36 PM, Lazar, Lijo wrote:
>>>> On 7/26/2024 8:11 PM, Khatri, Sunil wrote:
>>>>> On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
>>>>>> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>>>>>>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>>>>>>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com>
>>>>>>>> wrote:
>>>>>>>>> Problem:
>>>>>>>>> IP dump right now is done post suspend of
>>>>>>>>> all IP's which for some IP's could change power
>>>>>>>>> state and software state too which we do not want
>>>>>>>>> to reflect in the dump as it might not be same at
>>>>>>>>> the time of hang.
>>>>>>>>>
>>>>>>>>> Solution:
>>>>>>>>> IP should be dumped as close to the HW state when
>>>>>>>>> the GPU was in hung state without trying to reinitialize
>>>>>>>>> any resource.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>>>>>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>
>>>>>>>>> ---
>>>>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60
>>>>>>>>> +++++++++++-----------
>>>>>>>>>     1 file changed, 30 insertions(+), 30 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> index 730dae77570c..74f6f15e73b5 100644
>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct
>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>            return ret;
>>>>>>>>>     }
>>>>>>>>>
>>>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>> +{
>>>>>>>>> +       int i;
>>>>>>>>> +
>>>>>>>>> +       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>>> +
>>>>>>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>>> +
>>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>> A suspend also involves power/clock ungate. When reg dump is moved
>>>>>>> earlier, I'm not sure if this read works for all. If it's left to
>>>>>>> individual IP call backs, they could just do the same or better
>>>>>>> to move
>>>>>>> these up before a dump.
>>>>>> Suspend also put the status.hw = false and each IP in their
>>>>>> respective
>>>>>> suspend state which i feel does change the state of the HW.
>>>>>> To get the correct snapshot of the GPU register we should not be
>>>>>> fiddling with the HW IP at least till we capture the dump and that is
>>>>>> the intention behind the change.
>>>>>>
>>>>>> Do you think there is a problem in this approach?
>>>>>>>            amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>>>            amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>> Adding this does sounds better to enable just before we dump the
>>>>>> registers but i am not very sure if ungating would be clean here or
>>>>>> not. i Could try quickly adding these two calls just before dump.
>>>>>>
>>>>>> All i am worried if it does change some register reflecting the
>>>>>> original state of registers at dump.
>>>>>>
>>>> I was thinking that if it includes some GFX regs and the hang happened
>>>> because of some SDMA/VCN jobs which somehow keeps GFXOFF state intact.
>>> For GFX and SDMA hangs we make sure to disable GFXOFF before so for
>>> those IP's
>>> I am not worried and also based on my testing on NAVI22 for GPU hang
>>> their registers
>>> seems to be read cleanly.
>>> Another point that i was thinking is after a GPU hang no where till the
>>> point of dump
>>> any registers are touched and that is what we need considering we are
>>> able to read the registers.
>>>
>>> For VCN there is dynamic gating which is controlled by the FW if i am
>>> not wrong which makes the VCN
>>> off and registers cant be read. Only in case of VCN hang i am assuming
>>> due to a Job pending VCN should be in power ON
>>> state and out intent of reading the registers should work fine. Sadly i
>>> am unable to validate it as there arent any test readily
>>> available to hang VCN.
>> We want to take the register dump as early as possible in the reset
>> sequence, ideally before any of the hw gets touched as part of the
>> reset sequence.  Otherwise, the dump is not going to be useful.
>>
>> Alex
> 
> Sure Alex. I am also of the same view that we dont want to change any
> power status of any IP as it could change the values.

There is a debugfs interface 'amdgpu_reset_dump_register_list_write' tp
add registers to reset_info.reset_dump_reg_list. Presently there is no
check about which registers are added to that list. For ex: if user has
added some GFX related registers, this is going to hang while in GFXOFF
as ip dump state comes later.

Also, all IPs don't handle dynamic wakeup; therefore, regardless of a
reset scenario, direct access to powergated IPs could result in a hang
and that will make things worse.

Thanks,
Lijo

> 
> Regards
> Sunil Khatri
> 
>>
>>
>>>> BTW, since there is already dump_ip state which could capture IP regs
>>>> separately and handle their power/clock gate situations, do you think
>>>> this generic one is still needed?
>>>>
>>>> For whatever we have implemented till now seems to work fine in case
>>>> of GPU hang withotu fidling the
>>>> power state explicitly. But Calling suspend of each IP surely seems
>>>> to change some state in IPs and SW state too.
>>>> So i think until we experience a real problem we should go without
>>>> the generic UNGATE call for all IP's
>>>>
>>>> But i am not an expert of power, so whatever you suggest would make
>>>> more sense for the driver.
>>> Regards
>>> Sunil Khatri
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>> What u suggest ?
>>>>>> Regards
>>>>>> Sunil
>>>>> I quickly validated on Navi22 by adding below call just before we dump
>>>>> registers
>>>>> if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>>>
>>>>>       amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>       amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>
>>>>>       amdgpu_reset_reg_dumps(tmp_adev);
>>>>>       dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>       /* Trigger ip dump before we reset the asic */
>>>>>       for(i=0; i<tmp_adev->num_ip_blocks; i++)
>>>>>           if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>               tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>                                       (void*)tmp_adev);
>>>>>       dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>>>>> }
>>>>> If this sounds fine with you i am update that. Regards Sunil Khatri
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>>
>>>>>>>>> +
>>>>>>>>> +
>>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>>> +
>>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>>> +       }
>>>>>>>>> +
>>>>>>>>> +       return 0;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>     int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>>>>>                                     struct amdgpu_reset_context
>>>>>>>>> *reset_context)
>>>>>>>>>     {
>>>>>>>>>            int i, r = 0;
>>>>>>>>>            struct amdgpu_job *job = NULL;
>>>>>>>>> +       struct amdgpu_device *tmp_adev =
>>>>>>>>> reset_context->reset_req_dev;
>>>>>>>>>            bool need_full_reset =
>>>>>>>>>                    test_bit(AMDGPU_NEED_FULL_RESET,
>>>>>>>>> &reset_context->flags);
>>>>>>>>>
>>>>>>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>                            }
>>>>>>>>>                    }
>>>>>>>>>
>>>>>>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>>> &reset_context->flags)) {
>>>>>>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>> +
>>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP
>>>>>>>>> State\n");
>>>>>>>>> +                       /* Trigger ip dump before we reset the
>>>>>>>>> asic */
>>>>>>>>> +                       for (i = 0; i <
>>>>>>>>> tmp_adev->num_ip_blocks; i++)
>>>>>>>>> +                               if
>>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>> +
>>>>>>>>> tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>>>>> +                                                       (void
>>>>>>>>> *)tmp_adev);
>>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>>> Completed\n");
>>>>>>>>> +               }
>>>>>>>>> +
>>>>>>>>>                    if (need_full_reset)
>>>>>>>>>                            r = amdgpu_device_ip_suspend(adev);
>>>>>>>>>                    if (need_full_reset)
>>>>>>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>            return r;
>>>>>>>>>     }
>>>>>>>>>
>>>>>>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>> -{
>>>>>>>>> -       int i;
>>>>>>>>> -
>>>>>>>>> -       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>>> -
>>>>>>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>>> -
>>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>>>> -
>>>>>>>>> -
>>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>>> -
>>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>>> -       }
>>>>>>>>> -
>>>>>>>>> -       return 0;
>>>>>>>>> -}
>>>>>>>>> -
>>>>>>>>>     int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>>>>                             struct amdgpu_reset_context
>>>>>>>>> *reset_context)
>>>>>>>>>     {
>>>>>>>>>            struct amdgpu_device *tmp_adev = NULL;
>>>>>>>>>            bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>>>>>>>            int r = 0;
>>>>>>>>> -       uint32_t i;
>>>>>>>>>
>>>>>>>>>            /* Try reset handler method first */
>>>>>>>>>            tmp_adev = list_first_entry(device_list_handle, struct
>>>>>>>>> amdgpu_device,
>>>>>>>>>                                        reset_list);
>>>>>>>>>
>>>>>>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>>> &reset_context->flags)) {
>>>>>>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>> -
>>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>>>> -               /* Trigger ip dump before we reset the asic */
>>>>>>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>>>>>> -                       if
>>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>> -                              
>>>>>>>>> tmp_adev->ip_blocks[i].version->funcs
>>>>>>>>> -                               ->dump_ip_state((void *)tmp_adev);
>>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>>> Completed\n");
>>>>>>>>> -       }
>>>>>>>>> -
>>>>>>>>>            reset_context->reset_device_list = device_list_handle;
>>>>>>>>>            r = amdgpu_reset_perform_reset(tmp_adev,
>>>>>>>>> reset_context);
>>>>>>>>>            /* If reset handler not implemented, continue;
>>>>>>>>> otherwise
>>>>>>>>> return */
>>>>>>>>> -- 
>>>>>>>>> 2.34.1
>>>>>>>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-29  4:38                   ` Lazar, Lijo
@ 2024-07-29  5:38                     ` Khatri, Sunil
  2024-07-29  5:47                       ` Lazar, Lijo
  0 siblings, 1 reply; 15+ messages in thread
From: Khatri, Sunil @ 2024-07-29  5:38 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher
  Cc: Sunil Khatri, Alex Deucher, Liu Leo, Christian König,
	amd-gfx

[-- Attachment #1: Type: text/plain, Size: 13116 bytes --]


On 7/29/2024 10:08 AM, Lazar, Lijo wrote:
>
> On 7/27/2024 12:51 AM, Khatri, Sunil wrote:
>> On 7/27/2024 12:13 AM, Alex Deucher wrote:
>>> On Fri, Jul 26, 2024 at 1:16 PM Khatri, Sunil<sukhatri@amd.com>  wrote:
>>>> On 7/26/2024 8:36 PM, Lazar, Lijo wrote:
>>>>> On 7/26/2024 8:11 PM, Khatri, Sunil wrote:
>>>>>> On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
>>>>>>> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>>>>>>>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>>>>>>>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri<sunil.khatri@amd.com>
>>>>>>>>> wrote:
>>>>>>>>>> Problem:
>>>>>>>>>> IP dump right now is done post suspend of
>>>>>>>>>> all IP's which for some IP's could change power
>>>>>>>>>> state and software state too which we do not want
>>>>>>>>>> to reflect in the dump as it might not be same at
>>>>>>>>>> the time of hang.
>>>>>>>>>>
>>>>>>>>>> Solution:
>>>>>>>>>> IP should be dumped as close to the HW state when
>>>>>>>>>> the GPU was in hung state without trying to reinitialize
>>>>>>>>>> any resource.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Sunil Khatri<sunil.khatri@amd.com>
>>>>>>>>> Acked-by: Alex Deucher<alexander.deucher@amd.com>
>>>>>>>>>
>>>>>>>>>> ---
>>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60
>>>>>>>>>> +++++++++++-----------
>>>>>>>>>>      1 file changed, 30 insertions(+), 30 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> index 730dae77570c..74f6f15e73b5 100644
>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct
>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>             return ret;
>>>>>>>>>>      }
>>>>>>>>>>
>>>>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>>> +{
>>>>>>>>>> +       int i;
>>>>>>>>>> +
>>>>>>>>>> +       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>>>> +
>>>>>>>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>>>> +
>>>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>>> A suspend also involves power/clock ungate. When reg dump is moved
>>>>>>>> earlier, I'm not sure if this read works for all. If it's left to
>>>>>>>> individual IP call backs, they could just do the same or better
>>>>>>>> to move
>>>>>>>> these up before a dump.
>>>>>>> Suspend also put the status.hw = false and each IP in their
>>>>>>> respective
>>>>>>> suspend state which i feel does change the state of the HW.
>>>>>>> To get the correct snapshot of the GPU register we should not be
>>>>>>> fiddling with the HW IP at least till we capture the dump and that is
>>>>>>> the intention behind the change.
>>>>>>>
>>>>>>> Do you think there is a problem in this approach?
>>>>>>>>             amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>>>>             amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>>> Adding this does sounds better to enable just before we dump the
>>>>>>> registers but i am not very sure if ungating would be clean here or
>>>>>>> not. i Could try quickly adding these two calls just before dump.
>>>>>>>
>>>>>>> All i am worried if it does change some register reflecting the
>>>>>>> original state of registers at dump.
>>>>>>>
>>>>> I was thinking that if it includes some GFX regs and the hang happened
>>>>> because of some SDMA/VCN jobs which somehow keeps GFXOFF state intact.
>>>> For GFX and SDMA hangs we make sure to disable GFXOFF before so for
>>>> those IP's
>>>> I am not worried and also based on my testing on NAVI22 for GPU hang
>>>> their registers
>>>> seems to be read cleanly.
>>>> Another point that i was thinking is after a GPU hang no where till the
>>>> point of dump
>>>> any registers are touched and that is what we need considering we are
>>>> able to read the registers.
>>>>
>>>> For VCN there is dynamic gating which is controlled by the FW if i am
>>>> not wrong which makes the VCN
>>>> off and registers cant be read. Only in case of VCN hang i am assuming
>>>> due to a Job pending VCN should be in power ON
>>>> state and out intent of reading the registers should work fine. Sadly i
>>>> am unable to validate it as there arent any test readily
>>>> available to hang VCN.
>>> We want to take the register dump as early as possible in the reset
>>> sequence, ideally before any of the hw gets touched as part of the
>>> reset sequence.  Otherwise, the dump is not going to be useful.
>>>
>>> Alex
>> Sure Alex. I am also of the same view that we dont want to change any
>> power status of any IP as it could change the values.
> There is a debugfs interface 'amdgpu_reset_dump_register_list_write' tp
> add registers to reset_info.reset_dump_reg_list. Presently there is no
> check about which registers are added to that list. For ex: if user has
> added some GFX related registers, this is going to hang while in GFXOFF
> as ip dump state comes later.
>
this isnt being used and i will clean it up. its original intent was to 
for dump only which we based on all conditions are taking care. So this 
needs clean up and i will check on it.

> Also, all IPs don't handle dynamic wakeup; therefore, regardless of a
> reset scenario, direct access to powergated IPs could result in a hang
> and that will make things worse.

Before dumping any IP we are taking care of Power status of the IP so we 
should be fine. Like for GFX, SDMA we make sure GFXOFF is disabled. VCN 
we are dumping only if its power is shown as ON and like wise it will be 
done for other IPs too.

Regards

Sunil Khatri

> Thanks,
> Lijo
>
>> Regards
>> Sunil Khatri
>>
>>>
>>>>> BTW, since there is already dump_ip state which could capture IP regs
>>>>> separately and handle their power/clock gate situations, do you think
>>>>> this generic one is still needed?
>>>>>
>>>>> For whatever we have implemented till now seems to work fine in case
>>>>> of GPU hang withotu fidling the
>>>>> power state explicitly. But Calling suspend of each IP surely seems
>>>>> to change some state in IPs and SW state too.
>>>>> So i think until we experience a real problem we should go without
>>>>> the generic UNGATE call for all IP's
>>>>>
>>>>> But i am not an expert of power, so whatever you suggest would make
>>>>> more sense for the driver.
>>>> Regards
>>>> Sunil Khatri
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>>> What u suggest ?
>>>>>>> Regards
>>>>>>> Sunil
>>>>>> I quickly validated on Navi22 by adding below call just before we dump
>>>>>> registers
>>>>>> if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>>>>
>>>>>>        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>>        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>>
>>>>>>        amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>        dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>        /* Trigger ip dump before we reset the asic */
>>>>>>        for(i=0; i<tmp_adev->num_ip_blocks; i++)
>>>>>>            if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>                tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>>                                        (void*)tmp_adev);
>>>>>>        dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>>>>>> }
>>>>>> If this sounds fine with you i am update that. Regards Sunil Khatri
>>>>>>>> Thanks,
>>>>>>>> Lijo
>>>>>>>>
>>>>>>>>>> +
>>>>>>>>>> +
>>>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>>>> +
>>>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>>>> +       }
>>>>>>>>>> +
>>>>>>>>>> +       return 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>      int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>>>>>>                                      struct amdgpu_reset_context
>>>>>>>>>> *reset_context)
>>>>>>>>>>      {
>>>>>>>>>>             int i, r = 0;
>>>>>>>>>>             struct amdgpu_job *job = NULL;
>>>>>>>>>> +       struct amdgpu_device *tmp_adev =
>>>>>>>>>> reset_context->reset_req_dev;
>>>>>>>>>>             bool need_full_reset =
>>>>>>>>>>                     test_bit(AMDGPU_NEED_FULL_RESET,
>>>>>>>>>> &reset_context->flags);
>>>>>>>>>>
>>>>>>>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>                             }
>>>>>>>>>>                     }
>>>>>>>>>>
>>>>>>>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>>>> &reset_context->flags)) {
>>>>>>>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>>> +
>>>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP
>>>>>>>>>> State\n");
>>>>>>>>>> +                       /* Trigger ip dump before we reset the
>>>>>>>>>> asic */
>>>>>>>>>> +                       for (i = 0; i <
>>>>>>>>>> tmp_adev->num_ip_blocks; i++)
>>>>>>>>>> +                               if
>>>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>>> +
>>>>>>>>>> tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>>>>>> +                                                       (void
>>>>>>>>>> *)tmp_adev);
>>>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>>>> Completed\n");
>>>>>>>>>> +               }
>>>>>>>>>> +
>>>>>>>>>>                     if (need_full_reset)
>>>>>>>>>>                             r = amdgpu_device_ip_suspend(adev);
>>>>>>>>>>                     if (need_full_reset)
>>>>>>>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>             return r;
>>>>>>>>>>      }
>>>>>>>>>>
>>>>>>>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>>> -{
>>>>>>>>>> -       int i;
>>>>>>>>>> -
>>>>>>>>>> -       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>>>> -
>>>>>>>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>>>> -
>>>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>>>>> -
>>>>>>>>>> -
>>>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>>>> -
>>>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>>>> -       }
>>>>>>>>>> -
>>>>>>>>>> -       return 0;
>>>>>>>>>> -}
>>>>>>>>>> -
>>>>>>>>>>      int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>>>>>                              struct amdgpu_reset_context
>>>>>>>>>> *reset_context)
>>>>>>>>>>      {
>>>>>>>>>>             struct amdgpu_device *tmp_adev = NULL;
>>>>>>>>>>             bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>>>>>>>>             int r = 0;
>>>>>>>>>> -       uint32_t i;
>>>>>>>>>>
>>>>>>>>>>             /* Try reset handler method first */
>>>>>>>>>>             tmp_adev = list_first_entry(device_list_handle, struct
>>>>>>>>>> amdgpu_device,
>>>>>>>>>>                                         reset_list);
>>>>>>>>>>
>>>>>>>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>>>> &reset_context->flags)) {
>>>>>>>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>>> -
>>>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>>>>> -               /* Trigger ip dump before we reset the asic */
>>>>>>>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>>>>>>> -                       if
>>>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>>> -
>>>>>>>>>> tmp_adev->ip_blocks[i].version->funcs
>>>>>>>>>> -                               ->dump_ip_state((void *)tmp_adev);
>>>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>>>> Completed\n");
>>>>>>>>>> -       }
>>>>>>>>>> -
>>>>>>>>>>             reset_context->reset_device_list = device_list_handle;
>>>>>>>>>>             r = amdgpu_reset_perform_reset(tmp_adev,
>>>>>>>>>> reset_context);
>>>>>>>>>>             /* If reset handler not implemented, continue;
>>>>>>>>>> otherwise
>>>>>>>>>> return */
>>>>>>>>>> -- 
>>>>>>>>>> 2.34.1
>>>>>>>>>>

[-- Attachment #2: Type: text/html, Size: 19295 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-29  5:38                     ` Khatri, Sunil
@ 2024-07-29  5:47                       ` Lazar, Lijo
  2024-07-29  8:27                         ` Khatri, Sunil
  0 siblings, 1 reply; 15+ messages in thread
From: Lazar, Lijo @ 2024-07-29  5:47 UTC (permalink / raw)
  To: Khatri, Sunil, Alex Deucher
  Cc: Sunil Khatri, Alex Deucher, Liu Leo, Christian König,
	amd-gfx



On 7/29/2024 11:08 AM, Khatri, Sunil wrote:
> 
> On 7/29/2024 10:08 AM, Lazar, Lijo wrote:
>> On 7/27/2024 12:51 AM, Khatri, Sunil wrote:
>>> On 7/27/2024 12:13 AM, Alex Deucher wrote:
>>>> On Fri, Jul 26, 2024 at 1:16 PM Khatri, Sunil <sukhatri@amd.com> wrote:
>>>>> On 7/26/2024 8:36 PM, Lazar, Lijo wrote:
>>>>>> On 7/26/2024 8:11 PM, Khatri, Sunil wrote:
>>>>>>> On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
>>>>>>>> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>>>>>>>>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>>>>>>>>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com>
>>>>>>>>>> wrote:
>>>>>>>>>>> Problem:
>>>>>>>>>>> IP dump right now is done post suspend of
>>>>>>>>>>> all IP's which for some IP's could change power
>>>>>>>>>>> state and software state too which we do not want
>>>>>>>>>>> to reflect in the dump as it might not be same at
>>>>>>>>>>> the time of hang.
>>>>>>>>>>>
>>>>>>>>>>> Solution:
>>>>>>>>>>> IP should be dumped as close to the HW state when
>>>>>>>>>>> the GPU was in hung state without trying to reinitialize
>>>>>>>>>>> any resource.
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>>>>>>>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>
>>>>>>>>>>> ---
>>>>>>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60
>>>>>>>>>>> +++++++++++-----------
>>>>>>>>>>>     1 file changed, 30 insertions(+), 30 deletions(-)
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> index 730dae77570c..74f6f15e73b5 100644
>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct
>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>            return ret;
>>>>>>>>>>>     }
>>>>>>>>>>>
>>>>>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>>>> +{
>>>>>>>>>>> +       int i;
>>>>>>>>>>> +
>>>>>>>>>>> +       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>>>>> +
>>>>>>>>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>>>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>>>>> +
>>>>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>>>> A suspend also involves power/clock ungate. When reg dump is moved
>>>>>>>>> earlier, I'm not sure if this read works for all. If it's left to
>>>>>>>>> individual IP call backs, they could just do the same or better
>>>>>>>>> to move
>>>>>>>>> these up before a dump.
>>>>>>>> Suspend also put the status.hw = false and each IP in their
>>>>>>>> respective
>>>>>>>> suspend state which i feel does change the state of the HW.
>>>>>>>> To get the correct snapshot of the GPU register we should not be
>>>>>>>> fiddling with the HW IP at least till we capture the dump and that is
>>>>>>>> the intention behind the change.
>>>>>>>>
>>>>>>>> Do you think there is a problem in this approach?
>>>>>>>>>            amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>>>>>            amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>>>> Adding this does sounds better to enable just before we dump the
>>>>>>>> registers but i am not very sure if ungating would be clean here or
>>>>>>>> not. i Could try quickly adding these two calls just before dump.
>>>>>>>>
>>>>>>>> All i am worried if it does change some register reflecting the
>>>>>>>> original state of registers at dump.
>>>>>>>>
>>>>>> I was thinking that if it includes some GFX regs and the hang happened
>>>>>> because of some SDMA/VCN jobs which somehow keeps GFXOFF state intact.
>>>>> For GFX and SDMA hangs we make sure to disable GFXOFF before so for
>>>>> those IP's
>>>>> I am not worried and also based on my testing on NAVI22 for GPU hang
>>>>> their registers
>>>>> seems to be read cleanly.
>>>>> Another point that i was thinking is after a GPU hang no where till the
>>>>> point of dump
>>>>> any registers are touched and that is what we need considering we are
>>>>> able to read the registers.
>>>>>
>>>>> For VCN there is dynamic gating which is controlled by the FW if i am
>>>>> not wrong which makes the VCN
>>>>> off and registers cant be read. Only in case of VCN hang i am assuming
>>>>> due to a Job pending VCN should be in power ON
>>>>> state and out intent of reading the registers should work fine. Sadly i
>>>>> am unable to validate it as there arent any test readily
>>>>> available to hang VCN.
>>>> We want to take the register dump as early as possible in the reset
>>>> sequence, ideally before any of the hw gets touched as part of the
>>>> reset sequence.  Otherwise, the dump is not going to be useful.
>>>>
>>>> Alex
>>> Sure Alex. I am also of the same view that we dont want to change any
>>> power status of any IP as it could change the values.
>> There is a debugfs interface 'amdgpu_reset_dump_register_list_write' tp
>> add registers to reset_info.reset_dump_reg_list. Presently there is no
>> check about which registers are added to that list. For ex: if user has
>> added some GFX related registers, this is going to hang while in GFXOFF
>> as ip dump state comes later.
>>
> this isnt being used and i will clean it up. its original intent was to
> for dump only which we based on all conditions are taking care. So this
> needs clean up and i will check on it.
> 

Right, that's why I asked before whether this generic dump is really
required - "since there is already dump_ip state which could capture IP
regs separately and handle their power/clock gate situations, do you
think this generic one is still needed?"

>> Also, all IPs don't handle dynamic wakeup; therefore, regardless of a
>> reset scenario, direct access to powergated IPs could result in a hang
>> and that will make things worse.
> 
> Before dumping any IP we are taking care of Power status of the IP so we
> should be fine. Like for GFX, SDMA we make sure GFXOFF is disabled. VCN
> we are dumping only if its power is shown as ON and like wise it will be
> done for other IPs too.
> 

Yes, it's better to handle at IP level itself and remove the generic
interface. If the IP that is hung is known, it's likely that there is no
need to power ungate as the IP could be in some sort of busy state.
Also, there may not be any response to further operations from that IP.

Thanks,
Lijo

> Regards
> 
> Sunil Khatri
> 
>> Thanks,
>> Lijo
>>
>>> Regards
>>> Sunil Khatri
>>>
>>>>>> BTW, since there is already dump_ip state which could capture IP regs
>>>>>> separately and handle their power/clock gate situations, do you think
>>>>>> this generic one is still needed?
>>>>>>
>>>>>> For whatever we have implemented till now seems to work fine in case
>>>>>> of GPU hang withotu fidling the
>>>>>> power state explicitly. But Calling suspend of each IP surely seems
>>>>>> to change some state in IPs and SW state too.
>>>>>> So i think until we experience a real problem we should go without
>>>>>> the generic UNGATE call for all IP's
>>>>>>
>>>>>> But i am not an expert of power, so whatever you suggest would make
>>>>>> more sense for the driver.
>>>>> Regards
>>>>> Sunil Khatri
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>>>> What u suggest ?
>>>>>>>> Regards
>>>>>>>> Sunil
>>>>>>> I quickly validated on Navi22 by adding below call just before we dump
>>>>>>> registers
>>>>>>> if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>>>>>
>>>>>>>       amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>>>       amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>>>
>>>>>>>       amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>       dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>>       /* Trigger ip dump before we reset the asic */
>>>>>>>       for(i=0; i<tmp_adev->num_ip_blocks; i++)
>>>>>>>           if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>               tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>>>                                       (void*)tmp_adev);
>>>>>>>       dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>>>>>>> }
>>>>>>> If this sounds fine with you i am update that. Regards Sunil Khatri
>>>>>>>>> Thanks,
>>>>>>>>> Lijo
>>>>>>>>>
>>>>>>>>>>> +
>>>>>>>>>>> +
>>>>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>>>>> +
>>>>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>>>>> +       }
>>>>>>>>>>> +
>>>>>>>>>>> +       return 0;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>>     int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>>>>>>>                                     struct amdgpu_reset_context
>>>>>>>>>>> *reset_context)
>>>>>>>>>>>     {
>>>>>>>>>>>            int i, r = 0;
>>>>>>>>>>>            struct amdgpu_job *job = NULL;
>>>>>>>>>>> +       struct amdgpu_device *tmp_adev =
>>>>>>>>>>> reset_context->reset_req_dev;
>>>>>>>>>>>            bool need_full_reset =
>>>>>>>>>>>                    test_bit(AMDGPU_NEED_FULL_RESET,
>>>>>>>>>>> &reset_context->flags);
>>>>>>>>>>>
>>>>>>>>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>                            }
>>>>>>>>>>>                    }
>>>>>>>>>>>
>>>>>>>>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>>>>> &reset_context->flags)) {
>>>>>>>>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>>>> +
>>>>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP
>>>>>>>>>>> State\n");
>>>>>>>>>>> +                       /* Trigger ip dump before we reset the
>>>>>>>>>>> asic */
>>>>>>>>>>> +                       for (i = 0; i <
>>>>>>>>>>> tmp_adev->num_ip_blocks; i++)
>>>>>>>>>>> +                               if
>>>>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>>>> +
>>>>>>>>>>> tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>>>>>>> +                                                       (void
>>>>>>>>>>> *)tmp_adev);
>>>>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>>>>> Completed\n");
>>>>>>>>>>> +               }
>>>>>>>>>>> +
>>>>>>>>>>>                    if (need_full_reset)
>>>>>>>>>>>                            r = amdgpu_device_ip_suspend(adev);
>>>>>>>>>>>                    if (need_full_reset)
>>>>>>>>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>            return r;
>>>>>>>>>>>     }
>>>>>>>>>>>
>>>>>>>>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>>>> -{
>>>>>>>>>>> -       int i;
>>>>>>>>>>> -
>>>>>>>>>>> -       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>>>>> -
>>>>>>>>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>>>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>>>>> -
>>>>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>>>>>> -
>>>>>>>>>>> -
>>>>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>>>>> -
>>>>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>>>>> -       }
>>>>>>>>>>> -
>>>>>>>>>>> -       return 0;
>>>>>>>>>>> -}
>>>>>>>>>>> -
>>>>>>>>>>>     int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>>>>>>                             struct amdgpu_reset_context
>>>>>>>>>>> *reset_context)
>>>>>>>>>>>     {
>>>>>>>>>>>            struct amdgpu_device *tmp_adev = NULL;
>>>>>>>>>>>            bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>>>>>>>>>            int r = 0;
>>>>>>>>>>> -       uint32_t i;
>>>>>>>>>>>
>>>>>>>>>>>            /* Try reset handler method first */
>>>>>>>>>>>            tmp_adev = list_first_entry(device_list_handle, struct
>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>                                        reset_list);
>>>>>>>>>>>
>>>>>>>>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>>>>> &reset_context->flags)) {
>>>>>>>>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>>>> -
>>>>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>>>>>> -               /* Trigger ip dump before we reset the asic */
>>>>>>>>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>>>>>>>> -                       if
>>>>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>>>> -                              
>>>>>>>>>>> tmp_adev->ip_blocks[i].version->funcs
>>>>>>>>>>> -                               ->dump_ip_state((void *)tmp_adev);
>>>>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>>>>> Completed\n");
>>>>>>>>>>> -       }
>>>>>>>>>>> -
>>>>>>>>>>>            reset_context->reset_device_list = device_list_handle;
>>>>>>>>>>>            r = amdgpu_reset_perform_reset(tmp_adev,
>>>>>>>>>>> reset_context);
>>>>>>>>>>>            /* If reset handler not implemented, continue;
>>>>>>>>>>> otherwise
>>>>>>>>>>> return */
>>>>>>>>>>> -- 
>>>>>>>>>>> 2.34.1
>>>>>>>>>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
  2024-07-29  5:47                       ` Lazar, Lijo
@ 2024-07-29  8:27                         ` Khatri, Sunil
  0 siblings, 0 replies; 15+ messages in thread
From: Khatri, Sunil @ 2024-07-29  8:27 UTC (permalink / raw)
  To: Lazar, Lijo, Alex Deucher
  Cc: Sunil Khatri, Alex Deucher, Liu Leo, Christian König,
	amd-gfx


On 7/29/2024 11:17 AM, Lazar, Lijo wrote:
>
> On 7/29/2024 11:08 AM, Khatri, Sunil wrote:
>> On 7/29/2024 10:08 AM, Lazar, Lijo wrote:
>>> On 7/27/2024 12:51 AM, Khatri, Sunil wrote:
>>>> On 7/27/2024 12:13 AM, Alex Deucher wrote:
>>>>> On Fri, Jul 26, 2024 at 1:16 PM Khatri, Sunil <sukhatri@amd.com> wrote:
>>>>>> On 7/26/2024 8:36 PM, Lazar, Lijo wrote:
>>>>>>> On 7/26/2024 8:11 PM, Khatri, Sunil wrote:
>>>>>>>> On 7/26/2024 7:53 PM, Khatri, Sunil wrote:
>>>>>>>>> On 7/26/2024 7:18 PM, Lazar, Lijo wrote:
>>>>>>>>>> On 7/26/2024 6:42 PM, Alex Deucher wrote:
>>>>>>>>>>> On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@amd.com>
>>>>>>>>>>> wrote:
>>>>>>>>>>>> Problem:
>>>>>>>>>>>> IP dump right now is done post suspend of
>>>>>>>>>>>> all IP's which for some IP's could change power
>>>>>>>>>>>> state and software state too which we do not want
>>>>>>>>>>>> to reflect in the dump as it might not be same at
>>>>>>>>>>>> the time of hang.
>>>>>>>>>>>>
>>>>>>>>>>>> Solution:
>>>>>>>>>>>> IP should be dumped as close to the HW state when
>>>>>>>>>>>> the GPU was in hung state without trying to reinitialize
>>>>>>>>>>>> any resource.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Sunil Khatri <sunil.khatri@amd.com>
>>>>>>>>>>> Acked-by: Alex Deucher <alexander.deucher@amd.com>
>>>>>>>>>>>
>>>>>>>>>>>> ---
>>>>>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60
>>>>>>>>>>>> +++++++++++-----------
>>>>>>>>>>>>      1 file changed, 30 insertions(+), 30 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> index 730dae77570c..74f6f15e73b5 100644
>>>>>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>>>>>> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct
>>>>>>>>>>>> amdgpu_device *adev)
>>>>>>>>>>>>             return ret;
>>>>>>>>>>>>      }
>>>>>>>>>>>>
>>>>>>>>>>>> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>>>>> +{
>>>>>>>>>>>> +       int i;
>>>>>>>>>>>> +
>>>>>>>>>>>> +       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>>>>>> +
>>>>>>>>>>>> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>>>>>> +               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>>>>>> +
>>>>>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>>>>> A suspend also involves power/clock ungate. When reg dump is moved
>>>>>>>>>> earlier, I'm not sure if this read works for all. If it's left to
>>>>>>>>>> individual IP call backs, they could just do the same or better
>>>>>>>>>> to move
>>>>>>>>>> these up before a dump.
>>>>>>>>> Suspend also put the status.hw = false and each IP in their
>>>>>>>>> respective
>>>>>>>>> suspend state which i feel does change the state of the HW.
>>>>>>>>> To get the correct snapshot of the GPU register we should not be
>>>>>>>>> fiddling with the HW IP at least till we capture the dump and that is
>>>>>>>>> the intention behind the change.
>>>>>>>>>
>>>>>>>>> Do you think there is a problem in this approach?
>>>>>>>>>>             amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>>>>>>             amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>>>>> Adding this does sounds better to enable just before we dump the
>>>>>>>>> registers but i am not very sure if ungating would be clean here or
>>>>>>>>> not. i Could try quickly adding these two calls just before dump.
>>>>>>>>>
>>>>>>>>> All i am worried if it does change some register reflecting the
>>>>>>>>> original state of registers at dump.
>>>>>>>>>
>>>>>>> I was thinking that if it includes some GFX regs and the hang happened
>>>>>>> because of some SDMA/VCN jobs which somehow keeps GFXOFF state intact.
>>>>>> For GFX and SDMA hangs we make sure to disable GFXOFF before so for
>>>>>> those IP's
>>>>>> I am not worried and also based on my testing on NAVI22 for GPU hang
>>>>>> their registers
>>>>>> seems to be read cleanly.
>>>>>> Another point that i was thinking is after a GPU hang no where till the
>>>>>> point of dump
>>>>>> any registers are touched and that is what we need considering we are
>>>>>> able to read the registers.
>>>>>>
>>>>>> For VCN there is dynamic gating which is controlled by the FW if i am
>>>>>> not wrong which makes the VCN
>>>>>> off and registers cant be read. Only in case of VCN hang i am assuming
>>>>>> due to a Job pending VCN should be in power ON
>>>>>> state and out intent of reading the registers should work fine. Sadly i
>>>>>> am unable to validate it as there arent any test readily
>>>>>> available to hang VCN.
>>>>> We want to take the register dump as early as possible in the reset
>>>>> sequence, ideally before any of the hw gets touched as part of the
>>>>> reset sequence.  Otherwise, the dump is not going to be useful.
>>>>>
>>>>> Alex
>>>> Sure Alex. I am also of the same view that we dont want to change any
>>>> power status of any IP as it could change the values.
>>> There is a debugfs interface 'amdgpu_reset_dump_register_list_write' tp
>>> add registers to reset_info.reset_dump_reg_list. Presently there is no
>>> check about which registers are added to that list. For ex: if user has
>>> added some GFX related registers, this is going to hang while in GFXOFF
>>> as ip dump state comes later.
>>>
>> this isnt being used and i will clean it up. its original intent was to
>> for dump only which we based on all conditions are taking care. So this
>> needs clean up and i will check on it.
>>
> Right, that's why I asked before whether this generic dump is really
> required - "since there is already dump_ip state which could capture IP
> regs separately and handle their power/clock gate situations, do you
> think this generic one is still needed?"
Sorry i could not understand that you were talking about debugfs reg 
dump first.
i got it and i will clean that up. With that in mind we will not be 
needing to ungate
or change the state of IP but just before we dump we would use the ip 
specific power
calls based on need and hence generic ones are not needed.
>
>>> Also, all IPs don't handle dynamic wakeup; therefore, regardless of a
>>> reset scenario, direct access to powergated IPs could result in a hang
>>> and that will make things worse.
Noted.
>> Before dumping any IP we are taking care of Power status of the IP so we
>> should be fine. Like for GFX, SDMA we make sure GFXOFF is disabled. VCN
>> we are dumping only if its power is shown as ON and like wise it will be
>> done for other IPs too.
>>
> Yes, it's better to handle at IP level itself and remove the generic
> interface. If the IP that is hung is known, it's likely that there is no
> need to power ungate as the IP could be in some sort of busy state.
> Also, there may not be any response to further operations from that IP.
Got it.
>
> Thanks,
> Lijo
>
>> Regards
>>
>> Sunil Khatri
>>
>>> Thanks,
>>> Lijo
>>>
>>>> Regards
>>>> Sunil Khatri
>>>>
>>>>>>> BTW, since there is already dump_ip state which could capture IP regs
>>>>>>> separately and handle their power/clock gate situations, do you think
>>>>>>> this generic one is still needed?
>>>>>>>
>>>>>>> For whatever we have implemented till now seems to work fine in case
>>>>>>> of GPU hang withotu fidling the
>>>>>>> power state explicitly. But Calling suspend of each IP surely seems
>>>>>>> to change some state in IPs and SW state too.
>>>>>>> So i think until we experience a real problem we should go without
>>>>>>> the generic UNGATE call for all IP's
>>>>>>>
>>>>>>> But i am not an expert of power, so whatever you suggest would make
>>>>>>> more sense for the driver.
>>>>>> Regards
>>>>>> Sunil Khatri
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>>
>>>>>>>>> What u suggest ?
>>>>>>>>> Regards
>>>>>>>>> Sunil
>>>>>>>> I quickly validated on Navi22 by adding below call just before we dump
>>>>>>>> registers
>>>>>>>> if(!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
>>>>>>>>
>>>>>>>>        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>>>>>>>>        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>>>>>>>>
>>>>>>>>        amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>        dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>>>        /* Trigger ip dump before we reset the asic */
>>>>>>>>        for(i=0; i<tmp_adev->num_ip_blocks; i++)
>>>>>>>>            if(tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>                tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>>>>                                        (void*)tmp_adev);
>>>>>>>>        dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
>>>>>>>> }
>>>>>>>> If this sounds fine with you i am update that. Regards Sunil Khatri
>>>>>>>>>> Thanks,
>>>>>>>>>> Lijo
>>>>>>>>>>
>>>>>>>>>>>> +
>>>>>>>>>>>> +
>>>>>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>>>>>> +
>>>>>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>>>>>> +       }
>>>>>>>>>>>> +
>>>>>>>>>>>> +       return 0;
>>>>>>>>>>>> +}
>>>>>>>>>>>> +
>>>>>>>>>>>>      int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>>>>>>>>>>                                      struct amdgpu_reset_context
>>>>>>>>>>>> *reset_context)
>>>>>>>>>>>>      {
>>>>>>>>>>>>             int i, r = 0;
>>>>>>>>>>>>             struct amdgpu_job *job = NULL;
>>>>>>>>>>>> +       struct amdgpu_device *tmp_adev =
>>>>>>>>>>>> reset_context->reset_req_dev;
>>>>>>>>>>>>             bool need_full_reset =
>>>>>>>>>>>>                     test_bit(AMDGPU_NEED_FULL_RESET,
>>>>>>>>>>>> &reset_context->flags);
>>>>>>>>>>>>
>>>>>>>>>>>> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>                             }
>>>>>>>>>>>>                     }
>>>>>>>>>>>>
>>>>>>>>>>>> +               if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>>>>>> &reset_context->flags)) {
>>>>>>>>>>>> +                       amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>>>>> +
>>>>>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP
>>>>>>>>>>>> State\n");
>>>>>>>>>>>> +                       /* Trigger ip dump before we reset the
>>>>>>>>>>>> asic */
>>>>>>>>>>>> +                       for (i = 0; i <
>>>>>>>>>>>> tmp_adev->num_ip_blocks; i++)
>>>>>>>>>>>> +                               if
>>>>>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>>>>> +
>>>>>>>>>>>> tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
>>>>>>>>>>>> +                                                       (void
>>>>>>>>>>>> *)tmp_adev);
>>>>>>>>>>>> +                       dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>>>>>> Completed\n");
>>>>>>>>>>>> +               }
>>>>>>>>>>>> +
>>>>>>>>>>>>                     if (need_full_reset)
>>>>>>>>>>>>                             r = amdgpu_device_ip_suspend(adev);
>>>>>>>>>>>>                     if (need_full_reset)
>>>>>>>>>>>> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct
>>>>>>>>>>>> amdgpu_device *adev,
>>>>>>>>>>>>             return r;
>>>>>>>>>>>>      }
>>>>>>>>>>>>
>>>>>>>>>>>> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
>>>>>>>>>>>> -{
>>>>>>>>>>>> -       int i;
>>>>>>>>>>>> -
>>>>>>>>>>>> -       lockdep_assert_held(&adev->reset_domain->sem);
>>>>>>>>>>>> -
>>>>>>>>>>>> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>>>>>>>>>> -               adev->reset_info.reset_dump_reg_value[i] =
>>>>>>>>>>>> -
>>>>>>>>>>>> RREG32(adev->reset_info.reset_dump_reg_list[i]);
>>>>>>>>>>>> -
>>>>>>>>>>>> -
>>>>>>>>>>>> trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
>>>>>>>>>>>> -
>>>>>>>>>>>> adev->reset_info.reset_dump_reg_value[i]);
>>>>>>>>>>>> -       }
>>>>>>>>>>>> -
>>>>>>>>>>>> -       return 0;
>>>>>>>>>>>> -}
>>>>>>>>>>>> -
>>>>>>>>>>>>      int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>>>>>>>>>>                              struct amdgpu_reset_context
>>>>>>>>>>>> *reset_context)
>>>>>>>>>>>>      {
>>>>>>>>>>>>             struct amdgpu_device *tmp_adev = NULL;
>>>>>>>>>>>>             bool need_full_reset, skip_hw_reset, vram_lost = false;
>>>>>>>>>>>>             int r = 0;
>>>>>>>>>>>> -       uint32_t i;
>>>>>>>>>>>>
>>>>>>>>>>>>             /* Try reset handler method first */
>>>>>>>>>>>>             tmp_adev = list_first_entry(device_list_handle, struct
>>>>>>>>>>>> amdgpu_device,
>>>>>>>>>>>>                                         reset_list);
>>>>>>>>>>>>
>>>>>>>>>>>> -       if (!test_bit(AMDGPU_SKIP_COREDUMP,
>>>>>>>>>>>> &reset_context->flags)) {
>>>>>>>>>>>> -               amdgpu_reset_reg_dumps(tmp_adev);
>>>>>>>>>>>> -
>>>>>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
>>>>>>>>>>>> -               /* Trigger ip dump before we reset the asic */
>>>>>>>>>>>> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
>>>>>>>>>>>> -                       if
>>>>>>>>>>>> (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
>>>>>>>>>>>> -
>>>>>>>>>>>> tmp_adev->ip_blocks[i].version->funcs
>>>>>>>>>>>> -                               ->dump_ip_state((void *)tmp_adev);
>>>>>>>>>>>> -               dev_info(tmp_adev->dev, "Dumping IP State
>>>>>>>>>>>> Completed\n");
>>>>>>>>>>>> -       }
>>>>>>>>>>>> -
>>>>>>>>>>>>             reset_context->reset_device_list = device_list_handle;
>>>>>>>>>>>>             r = amdgpu_reset_perform_reset(tmp_adev,
>>>>>>>>>>>> reset_context);
>>>>>>>>>>>>             /* If reset handler not implemented, continue;
>>>>>>>>>>>> otherwise
>>>>>>>>>>>> return */
>>>>>>>>>>>> -- 
>>>>>>>>>>>> 2.34.1
>>>>>>>>>>>>

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2024-07-29  8:27 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-07-26 12:47 [PATCH 1/2] drm/amdgpu: print VCN instance dump for valid instance Sunil Khatri
2024-07-26 12:47 ` [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's Sunil Khatri
2024-07-26 13:12   ` Alex Deucher
2024-07-26 13:48     ` Lazar, Lijo
2024-07-26 14:23       ` Khatri, Sunil
2024-07-26 14:41         ` Khatri, Sunil
2024-07-26 15:06           ` Lazar, Lijo
2024-07-26 17:16             ` Khatri, Sunil
2024-07-26 18:43               ` Alex Deucher
2024-07-26 19:21                 ` Khatri, Sunil
2024-07-29  4:38                   ` Lazar, Lijo
2024-07-29  5:38                     ` Khatri, Sunil
2024-07-29  5:47                       ` Lazar, Lijo
2024-07-29  8:27                         ` Khatri, Sunil
2024-07-26 13:12 ` [PATCH 1/2] drm/amdgpu: print VCN instance dump for valid instance Alex Deucher

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox