AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] drm/amdkfd: Trap handler support for expert scheduling mode
@ 2025-11-14 20:48 Jay Cornwall
  2025-12-01  9:53 ` Lancelot SIX
  0 siblings, 1 reply; 2+ messages in thread
From: Jay Cornwall @ 2025-11-14 20:48 UTC (permalink / raw)
  To: amd-gfx; +Cc: Jay Cornwall, Lancelot Six

The trap may be entered with dependency checking disabled.
Wait for dependency counters and save/restore scheduling mode.

v2:

Use ttmp1 instead of ttmp11. ttmp11 is not zero-initialized.
While the trap handler does zero this field before use, a user-mode
second-level trap handler could not rely on this being zero when
using an older kernel mode driver.

v3:

Use ttmp11 primarily but copy to ttmp1 before jumping to the
second level trap handler. ttmp1 is inspectable by a debugger.
Unexpected bits in the unused space may regress existing software.

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Cc: Lancelot Six <lancelot.six@amd.com>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 62 +++++++++++--------
 .../amd/amdkfd/cwsr_trap_handler_gfx12.asm    | 37 +++++++++++
 2 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 0320163b6e74..f98c735b2905 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -3644,14 +3644,18 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = {
 };
 
 static const uint32_t cwsr_trap_gfx12_hex[] = {
-	0xbfa00001, 0xbfa002a2,
-	0xb0804009, 0xb8f8f804,
+	0xbfa00001, 0xbfa002b2,
+	0xb0804009, 0xb8eef81a,
+	0xbf880000, 0xb980081a,
+	0x00000000, 0xb8f8f804,
+	0x9177ff77, 0x0c000000,
+	0x846e9a6e, 0x8c776e77,
 	0x9178ff78, 0x00008c00,
 	0xb8fbf811, 0x8b6eff78,
 	0x00004000, 0xbfa10008,
 	0x8b6eff7b, 0x00000080,
 	0xbfa20018, 0x8b6ea07b,
-	0xbfa20042, 0xbf830010,
+	0xbfa2004a, 0xbf830010,
 	0xb8fbf811, 0xbfa0fffb,
 	0x8b6eff7b, 0x00000bd0,
 	0xbfa20010, 0xb8eef812,
@@ -3662,28 +3666,32 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0xf0000000, 0xbfa20005,
 	0x8b6fff6f, 0x00000200,
 	0xbfa20002, 0x8b6ea07b,
-	0xbfa2002c, 0xbefa4d82,
+	0xbfa20034, 0xbefa4d82,
 	0xbf8a0000, 0x84fa887a,
 	0xbf0d8f7b, 0xbfa10002,
 	0x8c7bff7b, 0xffff0000,
-	0xf4601bbd, 0xf8000010,
-	0xbf8a0000, 0x846e976e,
-	0x9177ff77, 0x00800000,
-	0x8c776e77, 0xf4603bbd,
-	0xf8000000, 0xbf8a0000,
-	0xf4603ebd, 0xf8000008,
-	0xbf8a0000, 0x8bee6e6e,
-	0xbfa10001, 0xbe80486e,
-	0x8b6eff6d, 0xf0000000,
-	0xbfa20009, 0xb8eef811,
-	0x8b6eff6e, 0x00000080,
-	0xbfa20007, 0x8c78ff78,
-	0x00004000, 0x80ec886c,
-	0x82ed806d, 0xbfa00002,
-	0x806c846c, 0x826d806d,
-	0x8b6dff6d, 0x0000ffff,
-	0x8bfe7e7e, 0x8bea6a6a,
-	0x85788978, 0xb9783244,
+	0x8b6eff77, 0x0c000000,
+	0x916dff6d, 0x0c000000,
+	0x8c6d6e6d, 0xf4601bbd,
+	0xf8000010, 0xbf8a0000,
+	0x846e976e, 0x9177ff77,
+	0x00800000, 0x8c776e77,
+	0xf4603bbd, 0xf8000000,
+	0xbf8a0000, 0xf4603ebd,
+	0xf8000008, 0xbf8a0000,
+	0x8bee6e6e, 0xbfa10001,
+	0xbe80486e, 0x8b6eff6d,
+	0xf0000000, 0xbfa20009,
+	0xb8eef811, 0x8b6eff6e,
+	0x00000080, 0xbfa20007,
+	0x8c78ff78, 0x00004000,
+	0x80ec886c, 0x82ed806d,
+	0xbfa00002, 0x806c846c,
+	0x826d806d, 0x8b6dff6d,
+	0x0000ffff, 0x8bfe7e7e,
+	0x8bea6a6a, 0x85788978,
+	0x936eff77, 0x0002001a,
+	0xb96ef81a, 0xb9783244,
 	0xbe804a6c, 0xb8faf802,
 	0xbf0d987a, 0xbfa10001,
 	0xbfb00000, 0x8b6dff6d,
@@ -3981,7 +3989,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x008ce800, 0x00000000,
 	0x807d817d, 0x8070ff70,
 	0x00000080, 0xbf0a7b7d,
-	0xbfa2fff7, 0xbfa0016e,
+	0xbfa2fff7, 0xbfa00171,
 	0xbef4007e, 0x8b75ff7f,
 	0x0000ffff, 0x8c75ff75,
 	0x00040000, 0xbef60080,
@@ -4163,12 +4171,14 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0xf8000074, 0xbf8a0000,
 	0x8b6dff6d, 0x0000ffff,
 	0x8bfe7e7e, 0x8bea6a6a,
-	0xb97af804, 0xbe804ec2,
-	0xbf94fffe, 0xbe804a6c,
+	0x936eff77, 0x0002001a,
+	0xb96ef81a, 0xb97af804,
 	0xbe804ec2, 0xbf94fffe,
-	0xbfb10000, 0xbf9f0000,
+	0xbe804a6c, 0xbe804ec2,
+	0xbf94fffe, 0xbfb10000,
 	0xbf9f0000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
+	0xbf9f0000, 0x00000000,
 };
 
 static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
index 5a1a1b1f897f..07999b4649de 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
@@ -78,9 +78,16 @@ var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL
 var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE	= SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
 var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
 var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE	= 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT
+
+var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT		= 0
+var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE		= 2
+
 var BARRIER_STATE_SIGNAL_OFFSET			= 16
 var BARRIER_STATE_VALID_OFFSET			= 0
 
+var TTMP11_SCHED_MODE_SHIFT			= 26
+var TTMP11_SCHED_MODE_SIZE			= 2
+var TTMP11_SCHED_MODE_MASK			= 0xC000000
 var TTMP11_DEBUG_TRAP_ENABLED_SHIFT		= 23
 var TTMP11_DEBUG_TRAP_ENABLED_MASK		= 0x800000
 
@@ -160,8 +167,19 @@ L_JUMP_TO_RESTORE:
 	s_branch	L_RESTORE
 
 L_SKIP_RESTORE:
+	// Assume most relaxed scheduling mode is set. Save and revert to normal mode.
+	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE)
+	s_wait_alu	0
+	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_SCHED_MODE, \
+		SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0
+
 	s_getreg_b32	s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV)	//save STATUS since we will change SCC
 
+	// Save SCHED_MODE[1:0] into ttmp11[27:26].
+	s_andn2_b32	ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK
+	s_lshl_b32	ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT
+	s_or_b32	ttmp11, ttmp11, ttmp2
+
 	// Clear SPI_PRIO: do not save with elevated priority.
 	// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
 	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK
@@ -238,6 +256,13 @@ L_FETCH_2ND_TRAP:
 	s_cbranch_scc0	L_NO_SIGN_EXTEND_TMA
 	s_or_b32	ttmp15, ttmp15, 0xFFFF0000
 L_NO_SIGN_EXTEND_TMA:
+#if ASIC_FAMILY == CHIP_GFX12
+	// Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI).
+	// The second-level trap will restore from ttmp1 for backwards compatibility.
+	s_and_b32	ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK
+	s_andn2_b32	ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK
+	s_or_b32	ttmp1, ttmp1, ttmp2
+#endif
 
 	s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS		// debug trap enabled flag
 	s_wait_idle
@@ -287,6 +312,10 @@ L_EXIT_TRAP:
 	// STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
 	// Only restore fields which the trap handler changes.
 	s_lshr_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
+
+	// Assume relaxed scheduling mode after this point.
+	restore_sched_mode(ttmp2)
+
 	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
 		SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv
 
@@ -1043,6 +1072,9 @@ L_SKIP_BARRIER_RESTORE:
 	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
 	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
 
+	// Assume relaxed scheduling mode after this point.
+	restore_sched_mode(s_restore_tmp)
+
 	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv	// SCC is included, which is changed by previous salu
 
 	// Make barrier and LDS state visible to all waves in the group.
@@ -1134,3 +1166,8 @@ function valu_sgpr_hazard
 	end
 #endif
 end
+
+function restore_sched_mode(s_tmp)
+	s_bfe_u32	s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10))
+	s_setreg_b32	hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp
+end
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH v3] drm/amdkfd: Trap handler support for expert scheduling mode
  2025-11-14 20:48 [PATCH v3] drm/amdkfd: Trap handler support for expert scheduling mode Jay Cornwall
@ 2025-12-01  9:53 ` Lancelot SIX
  0 siblings, 0 replies; 2+ messages in thread
From: Lancelot SIX @ 2025-12-01  9:53 UTC (permalink / raw)
  To: Jay Cornwall, amd-gfx

Hi,

I thought I replied to the list, but only replied to the author.

This looks good to me, thanks.
I noted one possible improvement below; it could be added in a later patch.

Reviewed-by: Lancelot Six <lancelot.six@amd.com>

Best,
Lancelot.

On 14/11/2025 20:48, Jay Cornwall wrote:
> The trap may be entered with dependency checking disabled.
> Wait for dependency counters and save/restore scheduling mode.
> 
> v2:
> 
> Use ttmp1 instead of ttmp11. ttmp11 is not zero-initialized.
> While the trap handler does zero this field before use, a user-mode
> second-level trap handler could not rely on this being zero when
> using an older kernel mode driver.
> 
> v3:
> 
> Use ttmp11 primarily but copy to ttmp1 before jumping to the
> second level trap handler. ttmp1 is inspectable by a debugger.
> Unexpected bits in the unused space may regress existing software.
> 
> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
> Cc: Lancelot Six <lancelot.six@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 62 +++++++++++--------
>   .../amd/amdkfd/cwsr_trap_handler_gfx12.asm    | 37 +++++++++++
>   2 files changed, 73 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> index 0320163b6e74..f98c735b2905 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> @@ -3644,14 +3644,18 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = {
>   };
>   
>   static const uint32_t cwsr_trap_gfx12_hex[] = {
> -	0xbfa00001, 0xbfa002a2,
> -	0xb0804009, 0xb8f8f804,
> +	0xbfa00001, 0xbfa002b2,
> +	0xb0804009, 0xb8eef81a,
> +	0xbf880000, 0xb980081a,
> +	0x00000000, 0xb8f8f804,
> +	0x9177ff77, 0x0c000000,
> +	0x846e9a6e, 0x8c776e77,
>   	0x9178ff78, 0x00008c00,
>   	0xb8fbf811, 0x8b6eff78,
>   	0x00004000, 0xbfa10008,
>   	0x8b6eff7b, 0x00000080,
>   	0xbfa20018, 0x8b6ea07b,
> -	0xbfa20042, 0xbf830010,
> +	0xbfa2004a, 0xbf830010,
>   	0xb8fbf811, 0xbfa0fffb,
>   	0x8b6eff7b, 0x00000bd0,
>   	0xbfa20010, 0xb8eef812,
> @@ -3662,28 +3666,32 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0xf0000000, 0xbfa20005,
>   	0x8b6fff6f, 0x00000200,
>   	0xbfa20002, 0x8b6ea07b,
> -	0xbfa2002c, 0xbefa4d82,
> +	0xbfa20034, 0xbefa4d82,
>   	0xbf8a0000, 0x84fa887a,
>   	0xbf0d8f7b, 0xbfa10002,
>   	0x8c7bff7b, 0xffff0000,
> -	0xf4601bbd, 0xf8000010,
> -	0xbf8a0000, 0x846e976e,
> -	0x9177ff77, 0x00800000,
> -	0x8c776e77, 0xf4603bbd,
> -	0xf8000000, 0xbf8a0000,
> -	0xf4603ebd, 0xf8000008,
> -	0xbf8a0000, 0x8bee6e6e,
> -	0xbfa10001, 0xbe80486e,
> -	0x8b6eff6d, 0xf0000000,
> -	0xbfa20009, 0xb8eef811,
> -	0x8b6eff6e, 0x00000080,
> -	0xbfa20007, 0x8c78ff78,
> -	0x00004000, 0x80ec886c,
> -	0x82ed806d, 0xbfa00002,
> -	0x806c846c, 0x826d806d,
> -	0x8b6dff6d, 0x0000ffff,
> -	0x8bfe7e7e, 0x8bea6a6a,
> -	0x85788978, 0xb9783244,
> +	0x8b6eff77, 0x0c000000,
> +	0x916dff6d, 0x0c000000,
> +	0x8c6d6e6d, 0xf4601bbd,
> +	0xf8000010, 0xbf8a0000,
> +	0x846e976e, 0x9177ff77,
> +	0x00800000, 0x8c776e77,
> +	0xf4603bbd, 0xf8000000,
> +	0xbf8a0000, 0xf4603ebd,
> +	0xf8000008, 0xbf8a0000,
> +	0x8bee6e6e, 0xbfa10001,
> +	0xbe80486e, 0x8b6eff6d,
> +	0xf0000000, 0xbfa20009,
> +	0xb8eef811, 0x8b6eff6e,
> +	0x00000080, 0xbfa20007,
> +	0x8c78ff78, 0x00004000,
> +	0x80ec886c, 0x82ed806d,
> +	0xbfa00002, 0x806c846c,
> +	0x826d806d, 0x8b6dff6d,
> +	0x0000ffff, 0x8bfe7e7e,
> +	0x8bea6a6a, 0x85788978,
> +	0x936eff77, 0x0002001a,
> +	0xb96ef81a, 0xb9783244,
>   	0xbe804a6c, 0xb8faf802,
>   	0xbf0d987a, 0xbfa10001,
>   	0xbfb00000, 0x8b6dff6d,
> @@ -3981,7 +3989,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x008ce800, 0x00000000,
>   	0x807d817d, 0x8070ff70,
>   	0x00000080, 0xbf0a7b7d,
> -	0xbfa2fff7, 0xbfa0016e,
> +	0xbfa2fff7, 0xbfa00171,
>   	0xbef4007e, 0x8b75ff7f,
>   	0x0000ffff, 0x8c75ff75,
>   	0x00040000, 0xbef60080,
> @@ -4163,12 +4171,14 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0xf8000074, 0xbf8a0000,
>   	0x8b6dff6d, 0x0000ffff,
>   	0x8bfe7e7e, 0x8bea6a6a,
> -	0xb97af804, 0xbe804ec2,
> -	0xbf94fffe, 0xbe804a6c,
> +	0x936eff77, 0x0002001a,
> +	0xb96ef81a, 0xb97af804,
>   	0xbe804ec2, 0xbf94fffe,
> -	0xbfb10000, 0xbf9f0000,
> +	0xbe804a6c, 0xbe804ec2,
> +	0xbf94fffe, 0xbfb10000,
>   	0xbf9f0000, 0xbf9f0000,
>   	0xbf9f0000, 0xbf9f0000,
> +	0xbf9f0000, 0x00000000,
>   };
>   
>   static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
> diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
> index 5a1a1b1f897f..07999b4649de 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
> +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
> @@ -78,9 +78,16 @@ var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL
>   var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_2_SIZE	= SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
>   var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT	= SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT
>   var SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SIZE	= 32 - SQ_WAVE_EXCP_FLAG_PRIV_RESTORE_PART_3_SHIFT
> +
> +var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT		= 0
> +var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE		= 2
> +
>   var BARRIER_STATE_SIGNAL_OFFSET			= 16
>   var BARRIER_STATE_VALID_OFFSET			= 0
>   
> +var TTMP11_SCHED_MODE_SHIFT			= 26
> +var TTMP11_SCHED_MODE_SIZE			= 2
> +var TTMP11_SCHED_MODE_MASK			= 0xC000000
>   var TTMP11_DEBUG_TRAP_ENABLED_SHIFT		= 23
>   var TTMP11_DEBUG_TRAP_ENABLED_MASK		= 0x800000
>   
> @@ -160,8 +167,19 @@ L_JUMP_TO_RESTORE:
>   	s_branch	L_RESTORE
>   
>   L_SKIP_RESTORE:
> +	// Assume most relaxed scheduling mode is set. Save and revert to normal mode.
> +	s_getreg_b32	ttmp2, hwreg(HW_REG_WAVE_SCHED_MODE)

Could be hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), so that only the two
DEP_MODE bits are read before they are shifted into ttmp11[27:26] below.

> +	s_wait_alu	0
> +	s_setreg_imm32_b32	hwreg(HW_REG_WAVE_SCHED_MODE, \
> +		SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT, SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE), 0
> +
>   	s_getreg_b32	s_save_state_priv, hwreg(HW_REG_WAVE_STATE_PRIV)	//save STATUS since we will change SCC
>   
> +	// Save SCHED_MODE[1:0] into ttmp11[27:26].
> +	s_andn2_b32	ttmp11, ttmp11, TTMP11_SCHED_MODE_MASK
> +	s_lshl_b32	ttmp2, ttmp2, TTMP11_SCHED_MODE_SHIFT
> +	s_or_b32	ttmp11, ttmp11, ttmp2
> +
>   	// Clear SPI_PRIO: do not save with elevated priority.
>   	// Clear ECC_ERR: prevents SQC store and triggers FATAL_HALT if setreg'd.
>   	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALWAYS_CLEAR_MASK
> @@ -238,6 +256,13 @@ L_FETCH_2ND_TRAP:
>   	s_cbranch_scc0	L_NO_SIGN_EXTEND_TMA
>   	s_or_b32	ttmp15, ttmp15, 0xFFFF0000
>   L_NO_SIGN_EXTEND_TMA:
> +#if ASIC_FAMILY == CHIP_GFX12
> +	// Move SCHED_MODE[1:0] from ttmp11 to unused bits in ttmp1[27:26] (return PC_HI).
> +	// The second-level trap will restore from ttmp1 for backwards compatibility.
> +	s_and_b32	ttmp2, ttmp11, TTMP11_SCHED_MODE_MASK
> +	s_andn2_b32	ttmp1, ttmp1, TTMP11_SCHED_MODE_MASK
> +	s_or_b32	ttmp1, ttmp1, ttmp2
> +#endif
>   
>   	s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 scope:SCOPE_SYS		// debug trap enabled flag
>   	s_wait_idle
> @@ -287,6 +312,10 @@ L_EXIT_TRAP:
>   	// STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
>   	// Only restore fields which the trap handler changes.
>   	s_lshr_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_SCC_SHIFT
> +
> +	// Assume relaxed scheduling mode after this point.
> +	restore_sched_mode(ttmp2)
> +
>   	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
>   		SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_state_priv
>   
> @@ -1043,6 +1072,9 @@ L_SKIP_BARRIER_RESTORE:
>   	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
>   	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
>   
> +	// Assume relaxed scheduling mode after this point.
> +	restore_sched_mode(s_restore_tmp)
> +
>   	s_setreg_b32	hwreg(HW_REG_WAVE_STATE_PRIV), s_restore_state_priv	// SCC is included, which is changed by previous salu
>   
>   	// Make barrier and LDS state visible to all waves in the group.
> @@ -1134,3 +1166,8 @@ function valu_sgpr_hazard
>   	end
>   #endif
>   end
> +
> +function restore_sched_mode(s_tmp)
> +	s_bfe_u32	s_tmp, ttmp11, (TTMP11_SCHED_MODE_SHIFT | (TTMP11_SCHED_MODE_SIZE << 0x10))
> +	s_setreg_b32	hwreg(HW_REG_WAVE_SCHED_MODE), s_tmp
> +end


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-12-01  9:53 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-11-14 20:48 [PATCH v3] drm/amdkfd: Trap handler support for expert scheduling mode Jay Cornwall
2025-12-01  9:53 ` Lancelot SIX

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox