AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Felix Kuehling <felix.kuehling@amd.com>
To: Jonathan Kim <jonathan.kim@amd.com>, amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls
Date: Tue, 22 Nov 2022 18:59:06 -0500	[thread overview]
Message-ID: <0e6d5813-7031-451d-be18-da08f6632530@amd.com> (raw)
In-Reply-To: <20221031162359.445805-7-jonathan.kim@amd.com>


On 2022-10-31 12:23, Jonathan Kim wrote:
> On GFX9.4.1, the implicit wait count instruction on s_barrier is
> disabled by default in the driver during normal operation for
> performance requirements.
>
> There is a hardware bug in GFX9.4.1 where if the implicit wait count
> instruction after an s_barrier instruction is disabled, any wave that
> hits an exception may step over the s_barrier when returning from the
> trap handler with the barrier logic having no ability to be
> aware of this, thereby causing other waves to wait at the barrier
> indefinitely resulting in a shader hang.  This bug has been corrected
> for GFX9.4.2 and onward.
>
> Since the debugger subscribes to hardware exceptions, in order to avoid
> this bug, the debugger must enable implicit wait count on s_barrier
> for a debug session and disable it on detach.
>
> In order to change this setting in the in the device global SQ_CONFIG
> register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
> will either dispatch work through the compute ring buffers used for
> image post processing or through the hardware scheduler by the KFD.
>
> Have the KGD suspend and drain the compute ring buffer, then suspend the
> hardware scheduler and block any future KFD process job requests before
> changing the implicit wait count setting.  Once set, resume all work.
>
> Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   3 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 105 +++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c         |   4 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      |   2 +-
>   4 files changed, 110 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0e6ddf05c23c..9f2499f52d2c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1034,6 +1034,9 @@ struct amdgpu_device {
>   	struct pci_saved_state          *pci_state;
>   	pci_channel_state_t		pci_channel_state;
>   
> +	/* Track auto wait count on s_barrier settings */
> +	bool				barrier_has_auto_waitcnt;
> +
>   	struct amdgpu_reset_control     *reset_cntl;
>   	uint32_t                        ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 4191af5a3f13..13f02a0aa828 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -26,6 +26,7 @@
>   #include "amdgpu.h"
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu_amdkfd_arcturus.h"
> +#include "amdgpu_reset.h"
>   #include "sdma0/sdma0_4_2_2_offset.h"
>   #include "sdma0/sdma0_4_2_2_sh_mask.h"
>   #include "sdma1/sdma1_4_2_2_offset.h"
> @@ -48,6 +49,8 @@
>   #include "amdgpu_amdkfd_gfx_v9.h"
>   #include "gfxhub_v1_0.h"
>   #include "mmhub_v9_4.h"
> +#include "gc/gc_9_0_offset.h"
> +#include "gc/gc_9_0_sh_mask.h"
>   
>   #define HQD_N_REGS 56
>   #define DUMP_REG(addr) do {				\
> @@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
>   	return 0;
>   }
>   
> +/*
> + * Helper used to suspend/resume gfx pipe for image post process work to set
> + * barrier behaviour.
> + */
> +static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend)
> +{
> +	int i, r = 0;
> +
> +	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +		struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
> +
> +		if (!(ring && ring->sched.thread))
> +			continue;
> +
> +		/* stop secheduler and drain ring. */
> +		if (suspend) {
> +			drm_sched_stop(&ring->sched, NULL);
> +			r = amdgpu_fence_wait_empty(ring);
> +			if (r)
> +				goto out;
> +		} else {
> +			drm_sched_start(&ring->sched, false);
> +		}
> +	}
> +
> +out:
> +	/* return on resume or failure to drain rings. */
> +	if (!suspend || r)
> +		return r;
> +
> +	return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
> +}
> +
> +static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt)
> +{
> +	uint32_t data;
> +
> +	WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
> +
> +	if (!down_read_trylock(&adev->reset_domain->sem))
> +		return;
> +
> +	amdgpu_amdkfd_suspend(adev, false);
> +
> +	if (suspend_resume_compute_scheduler(adev, true))
> +		goto out;
> +
> +	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
> +	data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
> +						enable_waitcnt ? 0 : 1);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
> +
> +out:
> +	suspend_resume_compute_scheduler(adev, false);
> +
> +	amdgpu_amdkfd_resume(adev, false);
> +
> +	up_read(&adev->reset_domain->sem);
> +}
> +
> +static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev,
> +				bool restore_dbg_registers,
> +				uint32_t vmid)
> +{
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	set_barrier_auto_waitcnt(adev, true);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
> +static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev,
> +					bool keep_trap_enabled,
> +					uint32_t vmid)
> +{
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	set_barrier_auto_waitcnt(adev, false);
> +
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
>   const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -294,6 +395,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
>   	.set_vm_context_page_table_base =
>   				kgd_gfx_v9_set_vm_context_page_table_base,
> +	.enable_debug_trap = kgd_arcturus_enable_debug_trap,
> +	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> -	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings
> +	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index a0e5ad342f13..8ed1b5d255f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2424,8 +2424,8 @@ static void gfx_v9_0_init_sq_config(struct amdgpu_device *adev)
>   	switch (adev->ip_versions[GC_HWIP][0]) {
>   	case IP_VERSION(9, 4, 1):
>   		tmp = RREG32_SOC15(GC, 0, mmSQ_CONFIG);
> -		tmp = REG_SET_FIELD(tmp, SQ_CONFIG,
> -					DISABLE_BARRIER_WAITCNT, 1);
> +		tmp = REG_SET_FIELD(tmp, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
> +				READ_ONCE(adev->barrier_has_auto_waitcnt) ? 0 : 1);
>   		WREG32_SOC15(GC, 0, mmSQ_CONFIG, tmp);
>   		break;
>   	default:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 56ad38fcd26e..efb81ccef8f5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1946,7 +1946,7 @@ void kfd_suspend_all_processes(void)
>   	WARN(debug_evictions, "Evicting all processes");
>   	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>   		cancel_delayed_work_sync(&p->eviction_work);
> -		cancel_delayed_work_sync(&p->restore_work);
> +		flush_delayed_work(&p->restore_work);

This looks like a sneak bug fix. Should this be a separate patch 
independent of this path series?

Regards,
   Felix


>   
>   		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>   			pr_err("Failed to suspend process 0x%x\n", p->pasid);

  reply	other threads:[~2022-11-23  1:44 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-31 16:23 [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Jonathan Kim
2022-10-31 16:23 ` [PATCH 02/29] drm/amdkfd: display debug capabilities Jonathan Kim
2022-11-22 23:08   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 03/29] drm/amdkfd: prepare per-process debug enable and disable Jonathan Kim
2022-11-22 23:31   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 04/29] drm/amdgpu: add kgd hw debug mode setting interface Jonathan Kim
2022-12-01  0:08   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 05/29] drm/amdgpu: setup hw debug registers on driver initialization Jonathan Kim
2022-11-22 23:38   ` Felix Kuehling
2022-11-23 20:53     ` Kim, Jonathan
2022-12-01  0:18     ` Felix Kuehling
2022-12-01  0:23   ` Felix Kuehling
2022-12-02 17:42     ` Kim, Jonathan
2022-10-31 16:23 ` [PATCH 06/29] drm/amdgpu: add gfx9 hw debug mode enable and disable calls Jonathan Kim
2022-11-22 23:50   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 07/29] drm/amdgpu: add gfx9.4.1 " Jonathan Kim
2022-11-22 23:59   ` Felix Kuehling [this message]
2022-11-24 14:58     ` Kim, Jonathan
2022-11-24 16:25       ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 08/29] drm/amdgpu: add gfx10 " Jonathan Kim
2022-10-31 16:23 ` [PATCH 09/29] drm/amdgpu: add gfx9.4.2 " Jonathan Kim
2022-10-31 16:23 ` [PATCH 10/29] drm/amdgpu: add configurable grace period for unmap queues Jonathan Kim
2022-11-23  0:21   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 11/29] drm/amdkfd: prepare map process for single process debug devices Jonathan Kim
2022-10-31 16:23 ` [PATCH 12/29] drm/amdgpu: prepare map process for multi-process " Jonathan Kim
2022-10-31 16:23 ` [PATCH 13/29] drm/amdkfd: add per process hw trap enable and disable functions Jonathan Kim
2022-10-31 16:23 ` [PATCH 14/29] drm/amdkfd: add raise exception event function Jonathan Kim
2022-10-31 16:23 ` [PATCH 15/29] drm/amdkfd: add send exception operation Jonathan Kim
2022-10-31 16:23 ` [PATCH 16/29] drm/amdkfd: add runtime enable operation Jonathan Kim
2022-11-23  0:52   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 17/29] drm/amdkfd: Add debug trap enabled flag to TMA Jonathan Kim
2022-11-23  0:44   ` Felix Kuehling
2022-11-24 14:51     ` Kim, Jonathan
2022-11-24 16:23       ` Felix Kuehling
2022-11-24 20:27         ` Kim, Jonathan
2022-11-25 16:53           ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 18/29] drm/amdkfd: update process interrupt handling for debug events Jonathan Kim
2022-10-31 16:23 ` [PATCH 19/29] drm/amdkfd: add debug set exceptions enabled operation Jonathan Kim
2022-11-24 21:24   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 20/29] drm/amdkfd: add debug wave launch override operation Jonathan Kim
2022-11-29 22:37   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 21/29] drm/amdkfd: add debug wave launch mode operation Jonathan Kim
2022-12-01  0:02   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 22/29] drm/amdkfd: add debug suspend and resume process queues operation Jonathan Kim
2022-11-29 23:55   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 23/29] drm/amdkfd: add debug set and clear address watch points operation Jonathan Kim
2022-11-30  0:34   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 24/29] drm/amdkfd: add debug set flags operation Jonathan Kim
2022-11-30  0:39   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 25/29] drm/amdkfd: add debug query event operation Jonathan Kim
2022-11-30  0:44   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 26/29] drm/amdkfd: add debug query exception info operation Jonathan Kim
2022-11-30  0:50   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 27/29] drm/amdkfd: add debug queue snapshot operation Jonathan Kim
2022-11-30 23:55   ` Felix Kuehling
2022-12-02 19:13     ` Kim, Jonathan
2022-10-31 16:23 ` [PATCH 28/29] drm/amdkfd: add debug device " Jonathan Kim
2022-12-01  0:00   ` Felix Kuehling
2022-10-31 16:23 ` [PATCH 29/29] drm/amdkfd: bump kfd ioctl minor version for debug api availability Jonathan Kim
2022-12-01  0:00   ` Felix Kuehling
2022-11-22 23:05 ` [PATCH 01/29] drm/amdkfd: add debug and runtime enable interface Felix Kuehling
2022-11-23 20:45   ` Kim, Jonathan
  -- strict thread matches above, loose matches on Subject: below --
2022-08-29 14:29 [PATCH 0/29] Introduce AMD GPU ISA Debugging for HSA Compute Jonathan Kim
2022-08-29 14:30 ` [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls Jonathan Kim

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=0e6d5813-7031-451d-be18-da08f6632530@amd.com \
    --to=felix.kuehling@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=jonathan.kim@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox