AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Jay Cornwall <jay.cornwall@amd.com>
To: <amd-gfx@lists.freedesktop.org>
Cc: Jay Cornwall <jay.cornwall@amd.com>, Gang Ba <Gang.Ba@amd.com>,
	"Harish Kasiviswanathan" <Harish.Kasiviswanathan@amd.com>,
	Lancelot Six <lancelot.six@amd.com>,
	Vladimir Indic <vladimir.indic@amd.com>
Subject: [PATCH 3/5] drm/amdkfd: gfx12.1 cluster barrier context save workaround
Date: Fri, 16 Jan 2026 14:39:30 -0600	[thread overview]
Message-ID: <20260116203932.988704-4-jay.cornwall@amd.com> (raw)
In-Reply-To: <20260116203932.988704-1-jay.cornwall@amd.com>

Under some circumstances the trap cluster barrier may not serialize
with a user cluster barrier. During context save, add a check that
waits for any pending user cluster barrier to complete.

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Tested-by: Gang Ba <Gang.Ba@amd.com>
Cc: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Cc: Lancelot Six <lancelot.six@amd.com>
Cc: Vladimir Indic <vladimir.indic@amd.com>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 31 +++++++++-------
 .../amd/amdkfd/cwsr_trap_handler_gfx12.asm    | 36 +++++++++++++++----
 2 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 453c08845d74..d86bccc49e3f 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -3754,11 +3754,11 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x84708a70, 0x8070ff70,
 	0x00000200, 0x7e000280,
 	0x7e020280, 0x7e040280,
-	0xbefd0080, 0xbe804ec2,
-	0xbf94fffe, 0xb8faf804,
-	0x8b7a847a, 0x91788478,
-	0x8c787a78, 0xd7610002,
+	0xbefd0080, 0xd7610002,
 	0x0000fa71, 0x807d817d,
+	0xbe804ec2, 0xbf94fffe,
+	0xb8faf804, 0x8b7a847a,
+	0x91788478, 0x8c787a78,
 	0xd7610002, 0x0000fa6c,
 	0x807d817d, 0x917aff6d,
 	0x80000000, 0xd7610002,
@@ -4587,7 +4587,7 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
 };
 
 static const uint32_t cwsr_trap_gfx12_1_0_hex[] = {
-	0xbfa00001, 0xbfa003aa,
+	0xbfa00001, 0xbfa003b4,
 	0xb0804009, 0xb8eef81a,
 	0xbf880000, 0xb980081a,
 	0x00000000, 0xb8f8f804,
@@ -4838,15 +4838,20 @@ static const uint32_t cwsr_trap_gfx12_1_0_hex[] = {
 	0x84708a70, 0x8070ff70,
 	0x00000200, 0x7e000280,
 	0x7e020280, 0x7e040280,
-	0xbefd0080, 0xb8faf802,
-	0xbf0c8b7a, 0xbfa20003,
-	0xbe804fc2, 0xbf94fffe,
-	0xbfa10001, 0xbe804ec4,
-	0xbf94fffc, 0xb8faf804,
-	0x8b7aff7a, 0x0001000c,
-	0x9178ff78, 0x0001000c,
-	0x8c787a78, 0xd7610002,
+	0xbefd0080, 0xd7610002,
 	0x0000fa71, 0x807d817d,
+	0xb8faf802, 0xbf0c8b7a,
+	0xbfa20003, 0xbe804fc2,
+	0xbf94fffe, 0xbfa10001,
+	0xbe804ec4, 0xbf94fffc,
+	0xbefa4c88, 0xbfc70000,
+	0xbf0c807a, 0xbfa20006,
+	0x9371ff7a, 0x00070004,
+	0x937aff7a, 0x00070010,
+	0xbf06717a, 0xbfa2fff6,
+	0xb8faf804, 0x8b7aff7a,
+	0x0001000c, 0x9178ff78,
+	0x0001000c, 0x8c787a78,
 	0xd7610002, 0x0000fa6c,
 	0x807d817d, 0x917aff6d,
 	0x80000000, 0xd7610002,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
index 7ed4b502eb22..ace2a9f2ac73 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm
@@ -35,6 +35,7 @@
 #define HAVE_BANKED_VGPRS (ASIC_FAMILY == CHIP_GC_12_0_3)
 #define NUM_NAMED_BARRIERS (ASIC_FAMILY == CHIP_GC_12_0_3 ? 0x10 : 0)
 #define HAVE_CLUSTER_BARRIER (ASIC_FAMILY == CHIP_GC_12_0_3)
+#define CLUSTER_BARRIER_SERIALIZE_WORKAROUND (ASIC_FAMILY == CHIP_GC_12_0_3)
 
 #define SINGLE_STEP_MISSED_WORKAROUND 1	//workaround for lost TRAP_AFTER_INST exception when SAVECTX raised
 #define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12)
@@ -104,6 +105,7 @@ var SQ_WAVE_SCHED_MODE_DEP_MODE_SHIFT		= 0
 var SQ_WAVE_SCHED_MODE_DEP_MODE_SIZE		= 2
 
 var BARRIER_STATE_SIGNAL_OFFSET			= 16
+var BARRIER_STATE_SIGNAL_SIZE			= 7
 var BARRIER_STATE_MEMBER_OFFSET			= 4
 var BARRIER_STATE_MEMBER_SIZE			= 7
 var BARRIER_STATE_VALID_OFFSET			= 0
@@ -520,9 +522,11 @@ L_SAVE_HWREG:
 	v_mov_b32	v2, 0x0							//Set of SGPRs for TCP store
 	s_mov_b32	m0, 0x0							//Next lane of v2 to write to
 
+	write_hwreg_to_v2(s_save_m0)
+
 	// Ensure no further changes to barrier or LDS state.
 	// STATE_PRIV.*BARRIER_COMPLETE may change up to this point.
-	wait_trap_barriers(s_save_tmp)
+	wait_trap_barriers(s_save_tmp, s_save_m0, 1)
 
 	// Re-read final state of *BARRIER_COMPLETE fields for save.
 	s_getreg_b32	s_save_tmp, hwreg(HW_REG_WAVE_STATE_PRIV)
@@ -530,7 +534,6 @@ L_SAVE_HWREG:
 	s_andn2_b32	s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_ALL_BARRIER_COMPLETE_MASK
 	s_or_b32	s_save_state_priv, s_save_state_priv, s_save_tmp
 
-	write_hwreg_to_v2(s_save_m0)
 	write_hwreg_to_v2(s_save_pc_lo)
 	s_andn2_b32	s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK
 	write_hwreg_to_v2(s_save_tmp)
@@ -1198,7 +1201,7 @@ L_SKIP_CLUSTER_BARRIER_RESTORE:
 
 	// Make barrier and LDS state visible to all waves in the group/cluster.
 	// STATE_PRIV.*BARRIER_COMPLETE may change after this point.
-	wait_trap_barriers(s_restore_tmp)
+	wait_trap_barriers(s_restore_tmp, 0, 0)
 
 #if HAVE_CLUSTER_BARRIER
 	// SCC is changed by wait_trap_barriers, restore it separately.
@@ -1211,7 +1214,7 @@ L_SKIP_CLUSTER_BARRIER_RESTORE:
 L_END_PGM:
 	// Make sure that no wave of the group/cluster can exit the trap handler
 	// before the group/cluster barrier state is saved.
-	wait_trap_barriers(s_restore_tmp)
+	wait_trap_barriers(s_restore_tmp, 0, 0)
 
 	s_endpgm_saved
 end
@@ -1301,11 +1304,11 @@ function restore_xnack_state_priv(s_tmp)
 end
 #endif
 
-function wait_trap_barriers(s_tmp)
+function wait_trap_barriers(s_tmp1, s_tmp2, serialize_wa)
 #if HAVE_CLUSTER_BARRIER
 	// If not in a WG then wave cannot use s_barrier_signal_isfirst.
-	s_getreg_b32	s_tmp, hwreg(HW_REG_WAVE_STATUS)
-	s_bitcmp0_b32	s_tmp, SQ_WAVE_STATUS_IN_WG_SHIFT
+	s_getreg_b32	s_tmp1, hwreg(HW_REG_WAVE_STATUS)
+	s_bitcmp0_b32	s_tmp1, SQ_WAVE_STATUS_IN_WG_SHIFT
 	s_cbranch_scc1	L_TRAP_CLUSTER_BARRIER_SIGNAL
 
 	s_barrier_signal_isfirst	-2
@@ -1319,6 +1322,25 @@ L_TRAP_CLUSTER_BARRIER_SIGNAL:
 
 L_SKIP_TRAP_CLUSTER_BARRIER_SIGNAL:
 	s_barrier_wait	-4
+
+#if CLUSTER_BARRIER_SERIALIZE_WORKAROUND
+if serialize_wa
+	// Trap cluster barrier may complete with a user cluster barrier in-flight.
+	// This is indicated if user cluster member count and signal count are equal.
+L_WAIT_USER_CLUSTER_BARRIER_COMPLETE:
+	s_sendmsg_rtn_b32	s_tmp1, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
+	s_wait_kmcnt	0
+	s_bitcmp0_b32	s_tmp1, BARRIER_STATE_VALID_OFFSET
+	s_cbranch_scc1	L_NOT_IN_CLUSTER
+
+	s_bfe_u32	s_tmp2, s_tmp1, (BARRIER_STATE_MEMBER_OFFSET | (BARRIER_STATE_MEMBER_SIZE << 0x10))
+	s_bfe_u32	s_tmp1, s_tmp1, (BARRIER_STATE_SIGNAL_OFFSET | (BARRIER_STATE_SIGNAL_SIZE << 0x10))
+	s_cmp_eq_u32	s_tmp1, s_tmp2
+	s_cbranch_scc1	L_WAIT_USER_CLUSTER_BARRIER_COMPLETE
+end
+L_NOT_IN_CLUSTER:
+#endif
+
 #else
 	s_barrier_signal	-2
 	s_barrier_wait	-2
-- 
2.34.1


  parent reply	other threads:[~2026-01-16 20:40 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-16 20:39 [PATCH 0/5] drm/amdkfd: Trap handler fixes and gfx12.1 support Jay Cornwall
2026-01-16 20:39 ` [PATCH 1/5] drm/amdkfd: Sync trap handler binary with source Jay Cornwall
2026-01-20 22:34   ` Lancelot SIX
2026-01-21 10:27     ` Indic, Vladimir
2026-01-16 20:39 ` [PATCH 2/5] drm/amdkfd: Fix scalar load ordering in gfx12.1 trap handler Jay Cornwall
2026-01-20 22:38   ` Lancelot SIX
2026-01-21 10:32     ` Indic, Vladimir
2026-01-16 20:39 ` Jay Cornwall [this message]
2026-01-20 23:27   ` [PATCH 3/5] drm/amdkfd: gfx12.1 cluster barrier context save workaround Lancelot SIX
2026-01-21 10:37     ` Indic, Vladimir
2026-01-16 20:39 ` [PATCH 4/5] drm/amdkfd: gfx12.1 trap handler support for expert scheduling mode Jay Cornwall
2026-01-20 23:30   ` Lancelot SIX
2026-01-21 10:46     ` Indic, Vladimir
2026-01-16 20:39 ` [PATCH 5/5] drm/amdkfd: Do not include VGPR MSBs in saved PC during save Jay Cornwall

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260116203932.988704-4-jay.cornwall@amd.com \
    --to=jay.cornwall@amd.com \
    --cc=Gang.Ba@amd.com \
    --cc=Harish.Kasiviswanathan@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=lancelot.six@amd.com \
    --cc=vladimir.indic@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.