AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source
@ 2024-05-23 14:08 Jay Cornwall
  2024-05-23 14:08 ` [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions Jay Cornwall
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Jay Cornwall @ 2024-05-23 14:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Jay Cornwall, Lancelot Six

Source and binary have become mismatched during branch activity.

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Cc: Lancelot Six <lancelot.six@amd.com>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 57 ++++++++-----------
 1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 73d3772cdb76..11d076eb770c 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -718,12 +718,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0xf4051ebd, 0xfa000008,
 	0xbf8cc07f, 0x87ee6e6e,
 	0xbf840001, 0xbe80206e,
-	0x876eff6d, 0x01ff0000,
-	0xbf850005, 0x8878ff78,
-	0x00002000, 0x80ec886c,
-	0x82ed806d, 0xbf820005,
-	0x876eff6d, 0x01000000,
-	0xbf850002, 0x806c846c,
+	0x876eff6d, 0x00ff0000,
+	0xbf850008, 0x876eff6d,
+	0x01000000, 0xbf850007,
+	0x8878ff78, 0x00002000,
+	0x80ec886c, 0x82ed806d,
+	0xbf820002, 0x806c846c,
 	0x826d806d, 0x876dff6d,
 	0x0000ffff, 0x907a8977,
 	0x877bff7a, 0x003f8000,
@@ -1136,7 +1136,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0xe0704000, 0x705d0000,
 	0x807c817c, 0x8070ff70,
 	0x00000080, 0xbf0a7b7c,
-	0xbf85fff8, 0xbf820144,
+	0xbf85fff8, 0xbf82013e,
 	0xbef4037e, 0x8775ff7f,
 	0x0000ffff, 0x8875ff75,
 	0x00040000, 0xbef60380,
@@ -1276,10 +1276,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x80788478, 0xbf8cc07f,
 	0xb9eef815, 0xbefc036f,
 	0xbefe0370, 0xbeff0371,
-	0x876f7bff, 0x000003ff,
-	0xb9ef4803, 0xb9f9f816,
-	0x876f7bff, 0xfffff800,
-	0x906f8b6f, 0xb9efa2c3,
+	0xb9f9f816, 0xb9fbf803,
 	0xb9f3f801, 0xb96e3a05,
 	0x806e816e, 0xbf0d9972,
 	0xbf850002, 0x8f6e896e,
@@ -2309,12 +2306,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xf4051ebd, 0xfa000008,
 	0xbf8cc07f, 0x87ee6e6e,
 	0xbf840001, 0xbe80206e,
-	0x876eff6d, 0x01ff0000,
-	0xbf850005, 0x8878ff78,
-	0x00002000, 0x80ec886c,
-	0x82ed806d, 0xbf820005,
-	0x876eff6d, 0x01000000,
-	0xbf850002, 0x806c846c,
+	0x876eff6d, 0x00ff0000,
+	0xbf850008, 0x876eff6d,
+	0x01000000, 0xbf850007,
+	0x8878ff78, 0x00002000,
+	0x80ec886c, 0x82ed806d,
+	0xbf820002, 0x806c846c,
 	0x826d806d, 0x876dff6d,
 	0x0000ffff, 0x87fe7e7e,
 	0x87ea6a6a, 0xb9f8f802,
@@ -2549,7 +2546,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0x705d0000, 0x807c817c,
 	0x8070ff70, 0x00000080,
 	0xbf0a7b7c, 0xbf85fff8,
-	0xbf82013b, 0xbef4037e,
+	0xbf820135, 0xbef4037e,
 	0x8775ff7f, 0x0000ffff,
 	0x8875ff75, 0x00040000,
 	0xbef60380, 0xbef703ff,
@@ -2688,10 +2685,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xf0000000, 0x80788478,
 	0xbf8cc07f, 0xb9eef815,
 	0xbefc036f, 0xbefe0370,
-	0xbeff0371, 0x876f7bff,
-	0x000003ff, 0xb9ef4803,
-	0x876f7bff, 0xfffff800,
-	0x906f8b6f, 0xb9efa2c3,
+	0xbeff0371, 0xb9fbf803,
 	0xb9f3f801, 0xb96e3a05,
 	0x806e816e, 0xbf0d9972,
 	0xbf850002, 0x8f6e896e,
@@ -2749,11 +2743,11 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xf8000008, 0xbf89fc07,
 	0x8bee6e6e, 0xbfa10001,
 	0xbe80486e, 0x8b6eff6d,
-	0x01ff0000, 0xbfa20005,
-	0x8c78ff78, 0x00002000,
-	0x80ec886c, 0x82ed806d,
-	0xbfa00005, 0x8b6eff6d,
-	0x01000000, 0xbfa20002,
+	0x00ff0000, 0xbfa20008,
+	0x8b6eff6d, 0x01000000,
+	0xbfa20007, 0x8c78ff78,
+	0x00002000, 0x80ec886c,
+	0x82ed806d, 0xbfa00002,
 	0x806c846c, 0x826d806d,
 	0x8b6dff6d, 0x0000ffff,
 	0x8bfe7e7e, 0x8bea6a6a,
@@ -2988,7 +2982,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0x701d0000, 0x807d817d,
 	0x8070ff70, 0x00000080,
 	0xbf0a7b7d, 0xbfa2fff8,
-	0xbfa00146, 0xbef4007e,
+	0xbfa00140, 0xbef4007e,
 	0x8b75ff7f, 0x0000ffff,
 	0x8c75ff75, 0x00040000,
 	0xbef60080, 0xbef700ff,
@@ -3130,10 +3124,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xf0000000, 0x80788478,
 	0xbf89fc07, 0xb96ef815,
 	0xbefd006f, 0xbefe0070,
-	0xbeff0071, 0x8b6f7bff,
-	0x000003ff, 0xb96f4803,
-	0x8b6f7bff, 0xfffff800,
-	0x856f8b6f, 0xb96fa2c3,
+	0xbeff0071, 0xb97bf803,
 	0xb973f801, 0xb8ee3b05,
 	0x806e816e, 0xbf0d9972,
 	0xbfa20002, 0x846e896e,
@@ -4119,7 +4110,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x8b6dff6d, 0x0000ffff,
 	0x8bfe7e7e, 0x8bea6a6a,
 	0xb97af804, 0xbe804a6c,
-	0xbfb00000, 0xbf9f0000,
+	0xbfb10000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
 };
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions
  2024-05-23 14:08 [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source Jay Cornwall
@ 2024-05-23 14:08 ` Jay Cornwall
  2024-05-23 18:43   ` Lancelot SIX
  2024-05-23 14:08 ` [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes Jay Cornwall
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Jay Cornwall @ 2024-05-23 14:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Jay Cornwall, Lancelot Six

Newer assemblers reject S_WAITCNT when targeting gfx12. All instances of
S_WAITCNT can be replaced by S_WAITCNT 0 (< gfx12) or S_WAIT_IDLE
(>= gfx12), because the trap handler never has different memory
instruction classes in flight concurrently.

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Cc: Lancelot Six <lancelot.six@amd.com>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 140 +++++++++---------
 .../amd/amdkfd/cwsr_trap_handler_gfx10.asm    |  52 +++----
 2 files changed, 97 insertions(+), 95 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 11d076eb770c..d61b2c3bd0ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -711,12 +711,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0xbf0d8f7b, 0xbf840002,
 	0x887bff7b, 0xffff0000,
 	0xf4011bbd, 0xfa000010,
-	0xbf8cc07f, 0x8f6e976e,
+	0xbf8c0000, 0x8f6e976e,
 	0x8a77ff77, 0x00800000,
 	0x88776e77, 0xf4051bbd,
-	0xfa000000, 0xbf8cc07f,
+	0xfa000000, 0xbf8c0000,
 	0xf4051ebd, 0xfa000008,
-	0xbf8cc07f, 0x87ee6e6e,
+	0xbf8c0000, 0x87ee6e6e,
 	0xbf840001, 0xbe80206e,
 	0x876eff6d, 0x00ff0000,
 	0xbf850008, 0x876eff6d,
@@ -1185,7 +1185,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x785d0000, 0xe0304080,
 	0x785d0100, 0xe0304100,
 	0x785d0200, 0xe0304180,
-	0x785d0300, 0xbf8c3f70,
+	0x785d0300, 0xbf8c0000,
 	0x7e008500, 0x7e028501,
 	0x7e048502, 0x7e068503,
 	0x807c847c, 0x8078ff78,
@@ -1194,7 +1194,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x6e5d0000, 0xe0304080,
 	0x6e5d0100, 0xe0304100,
 	0x6e5d0200, 0xe0304180,
-	0x6e5d0300, 0xbf8c3f70,
+	0x6e5d0300, 0xbf8c0000,
 	0xbf820034, 0xbef603ff,
 	0x01000000, 0xbeee0378,
 	0x8078ff78, 0x00000400,
@@ -1203,7 +1203,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x785d0000, 0xe0304100,
 	0x785d0100, 0xe0304200,
 	0x785d0200, 0xe0304300,
-	0x785d0300, 0xbf8c3f70,
+	0x785d0300, 0xbf8c0000,
 	0x7e008500, 0x7e028501,
 	0x7e048502, 0x7e068503,
 	0x807c847c, 0x8078ff78,
@@ -1213,7 +1213,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x8f6f836f, 0x806f7c6f,
 	0xbefe03c1, 0xbeff0380,
 	0xe0304000, 0x785d0000,
-	0xbf8c3f70, 0x7e008500,
+	0xbf8c0000, 0x7e008500,
 	0x807c817c, 0x8078ff78,
 	0x00000080, 0xbf0a6f7c,
 	0xbf85fff7, 0xbeff03c1,
@@ -1221,7 +1221,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0xe0304100, 0x6e5d0100,
 	0xe0304200, 0x6e5d0200,
 	0xe0304300, 0x6e5d0300,
-	0xbf8c3f70, 0xb9783a05,
+	0xbf8c0000, 0xb9783a05,
 	0x80788178, 0xbf0d9972,
 	0xbf850002, 0x8f788978,
 	0xbf820001, 0x8f788a78,
@@ -1232,16 +1232,16 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x01000000, 0xbefc03ff,
 	0x0000006c, 0x80f89078,
 	0xf429003a, 0xf0000000,
-	0xbf8cc07f, 0x80fc847c,
+	0xbf8c0000, 0x80fc847c,
 	0xbf800000, 0xbe803100,
 	0xbe823102, 0x80f8a078,
 	0xf42d003a, 0xf0000000,
-	0xbf8cc07f, 0x80fc887c,
+	0xbf8c0000, 0x80fc887c,
 	0xbf800000, 0xbe803100,
 	0xbe823102, 0xbe843104,
 	0xbe863106, 0x80f8c078,
 	0xf431003a, 0xf0000000,
-	0xbf8cc07f, 0x80fc907c,
+	0xbf8c0000, 0x80fc907c,
 	0xbf800000, 0xbe803100,
 	0xbe823102, 0xbe843104,
 	0xbe863106, 0xbe883108,
@@ -1271,9 +1271,9 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0xf4211cfa, 0xf0000000,
 	0x80788478, 0xf4211bba,
 	0xf0000000, 0x80788478,
-	0xbf8cc07f, 0xb9eef814,
+	0xbf8c0000, 0xb9eef814,
 	0xf4211bba, 0xf0000000,
-	0x80788478, 0xbf8cc07f,
+	0x80788478, 0xbf8c0000,
 	0xb9eef815, 0xbefc036f,
 	0xbefe0370, 0xbeff0371,
 	0xb9f9f816, 0xb9fbf803,
@@ -1288,7 +1288,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x0000ffff, 0xf4091c37,
 	0xfa000050, 0xf4091d37,
 	0xfa000060, 0xf4011e77,
-	0xfa000074, 0xbf8cc07f,
+	0xfa000074, 0xbf8c0000,
 	0x906e8977, 0x876fff6e,
 	0x003f8000, 0x906e8677,
 	0x876eff6e, 0x02000000,
@@ -2299,12 +2299,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xbf0d8f7b, 0xbf840002,
 	0x887bff7b, 0xffff0000,
 	0xf4011bbd, 0xfa000010,
-	0xbf8cc07f, 0x8f6e976e,
+	0xbf8c0000, 0x8f6e976e,
 	0x8a77ff77, 0x00800000,
 	0x88776e77, 0xf4051bbd,
-	0xfa000000, 0xbf8cc07f,
+	0xfa000000, 0xbf8c0000,
 	0xf4051ebd, 0xfa000008,
-	0xbf8cc07f, 0x87ee6e6e,
+	0xbf8c0000, 0x87ee6e6e,
 	0xbf840001, 0xbe80206e,
 	0x876eff6d, 0x00ff0000,
 	0xbf850008, 0x876eff6d,
@@ -2319,7 +2319,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0x0000ffff, 0xbefa0380,
 	0xb9fa0283, 0xbeee037e,
 	0xbeef037f, 0xbefe0480,
-	0xbf900004, 0xbf8cc07f,
+	0xbf900004, 0xbf8c0000,
 	0x877aff7f, 0x04000000,
 	0x8f7a857a, 0x886d7a6d,
 	0x7e008200, 0xbefa037e,
@@ -2595,7 +2595,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xe0304080, 0x785d0100,
 	0xe0304100, 0x785d0200,
 	0xe0304180, 0x785d0300,
-	0xbf8c3f70, 0x7e008500,
+	0xbf8c0000, 0x7e008500,
 	0x7e028501, 0x7e048502,
 	0x7e068503, 0x807c847c,
 	0x8078ff78, 0x00000200,
@@ -2604,7 +2604,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xe0304080, 0x6e5d0100,
 	0xe0304100, 0x6e5d0200,
 	0xe0304180, 0x6e5d0300,
-	0xbf8c3f70, 0xbf820034,
+	0xbf8c0000, 0xbf820034,
 	0xbef603ff, 0x01000000,
 	0xbeee0378, 0x8078ff78,
 	0x00000400, 0xbefc0384,
@@ -2613,7 +2613,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xe0304100, 0x785d0100,
 	0xe0304200, 0x785d0200,
 	0xe0304300, 0x785d0300,
-	0xbf8c3f70, 0x7e008500,
+	0xbf8c0000, 0x7e008500,
 	0x7e028501, 0x7e048502,
 	0x7e068503, 0x807c847c,
 	0x8078ff78, 0x00000400,
@@ -2622,7 +2622,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xbf84000e, 0x8f6f836f,
 	0x806f7c6f, 0xbefe03c1,
 	0xbeff0380, 0xe0304000,
-	0x785d0000, 0xbf8c3f70,
+	0x785d0000, 0xbf8c0000,
 	0x7e008500, 0x807c817c,
 	0x8078ff78, 0x00000080,
 	0xbf0a6f7c, 0xbf85fff7,
@@ -2630,7 +2630,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0x6e5d0000, 0xe0304100,
 	0x6e5d0100, 0xe0304200,
 	0x6e5d0200, 0xe0304300,
-	0x6e5d0300, 0xbf8c3f70,
+	0x6e5d0300, 0xbf8c0000,
 	0xb9783a05, 0x80788178,
 	0xbf0d9972, 0xbf850002,
 	0x8f788978, 0xbf820001,
@@ -2641,16 +2641,16 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xbef603ff, 0x01000000,
 	0xbefc03ff, 0x0000006c,
 	0x80f89078, 0xf429003a,
-	0xf0000000, 0xbf8cc07f,
+	0xf0000000, 0xbf8c0000,
 	0x80fc847c, 0xbf800000,
 	0xbe803100, 0xbe823102,
 	0x80f8a078, 0xf42d003a,
-	0xf0000000, 0xbf8cc07f,
+	0xf0000000, 0xbf8c0000,
 	0x80fc887c, 0xbf800000,
 	0xbe803100, 0xbe823102,
 	0xbe843104, 0xbe863106,
 	0x80f8c078, 0xf431003a,
-	0xf0000000, 0xbf8cc07f,
+	0xf0000000, 0xbf8c0000,
 	0x80fc907c, 0xbf800000,
 	0xbe803100, 0xbe823102,
 	0xbe843104, 0xbe863106,
@@ -2680,10 +2680,10 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0x80788478, 0xf4211cfa,
 	0xf0000000, 0x80788478,
 	0xf4211bba, 0xf0000000,
-	0x80788478, 0xbf8cc07f,
+	0x80788478, 0xbf8c0000,
 	0xb9eef814, 0xf4211bba,
 	0xf0000000, 0x80788478,
-	0xbf8cc07f, 0xb9eef815,
+	0xbf8c0000, 0xb9eef815,
 	0xbefc036f, 0xbefe0370,
 	0xbeff0371, 0xb9fbf803,
 	0xb9f3f801, 0xb96e3a05,
@@ -2697,7 +2697,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0x0000ffff, 0xf4091c37,
 	0xfa000050, 0xf4091d37,
 	0xfa000060, 0xf4011e77,
-	0xfa000074, 0xbf8cc07f,
+	0xfa000074, 0xbf8c0000,
 	0x876dff6d, 0x0000ffff,
 	0x87fe7e7e, 0x87ea6a6a,
 	0xb9faf802, 0xbe80226c,
@@ -2731,16 +2731,16 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0x8b6eff6e, 0x00000800,
 	0xbfa20003, 0x8b6eff7b,
 	0x00000400, 0xbfa2002a,
-	0xbefa4d82, 0xbf89fc07,
+	0xbefa4d82, 0xbf890000,
 	0x84fa887a, 0xbf0d8f7b,
 	0xbfa10002, 0x8c7bff7b,
 	0xffff0000, 0xf4005bbd,
-	0xf8000010, 0xbf89fc07,
+	0xf8000010, 0xbf890000,
 	0x846e976e, 0x9177ff77,
 	0x00800000, 0x8c776e77,
 	0xf4045bbd, 0xf8000000,
-	0xbf89fc07, 0xf4045ebd,
-	0xf8000008, 0xbf89fc07,
+	0xbf890000, 0xf4045ebd,
+	0xf8000008, 0xbf890000,
 	0x8bee6e6e, 0xbfa10001,
 	0xbe80486e, 0x8b6eff6d,
 	0x00ff0000, 0xbfa20008,
@@ -2756,7 +2756,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xbefa0080, 0xb97a0283,
 	0xbeee007e, 0xbeef007f,
 	0xbefe0180, 0xbefe4d84,
-	0xbf89fc07, 0x8b7aff7f,
+	0xbf890000, 0x8b7aff7f,
 	0x04000000, 0x847a857a,
 	0x8c6d7a6d, 0xbefa007e,
 	0x8b7bff7f, 0x0000ffff,
@@ -3007,13 +3007,13 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0x857d9972, 0x8b7d817d,
 	0xbf06817d, 0xbefd0080,
 	0xbfa2000c, 0xe0500000,
-	0x781d0000, 0xbf8903f7,
+	0x781d0000, 0xbf890000,
 	0xdac00000, 0x00000000,
 	0x807dff7d, 0x00000080,
 	0x8078ff78, 0x00000080,
 	0xbf0a6f7d, 0xbfa2fff5,
 	0xbfa0000b, 0xe0500000,
-	0x781d0000, 0xbf8903f7,
+	0x781d0000, 0xbf890000,
 	0xdac00000, 0x00000000,
 	0x807dff7d, 0x00000100,
 	0x8078ff78, 0x00000100,
@@ -3034,7 +3034,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xe0505080, 0x781d0100,
 	0xe0505100, 0x781d0200,
 	0xe0505180, 0x781d0300,
-	0xbf8903f7, 0x7e008500,
+	0xbf890000, 0x7e008500,
 	0x7e028501, 0x7e048502,
 	0x7e068503, 0x807d847d,
 	0x8078ff78, 0x00000200,
@@ -3043,7 +3043,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xe0505080, 0x6e1d0100,
 	0xe0505100, 0x6e1d0200,
 	0xe0505180, 0x6e1d0300,
-	0xbf8903f7, 0xbfa00034,
+	0xbf890000, 0xbfa00034,
 	0xbef600ff, 0x01000000,
 	0xbeee0078, 0x8078ff78,
 	0x00000400, 0xbefd0084,
@@ -3052,7 +3052,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xe0505100, 0x781d0100,
 	0xe0505200, 0x781d0200,
 	0xe0505300, 0x781d0300,
-	0xbf8903f7, 0x7e008500,
+	0xbf890000, 0x7e008500,
 	0x7e028501, 0x7e048502,
 	0x7e068503, 0x807d847d,
 	0x8078ff78, 0x00000400,
@@ -3061,7 +3061,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xbfa1000e, 0x846f836f,
 	0x806f7d6f, 0xbefe00c1,
 	0xbeff0080, 0xe0505000,
-	0x781d0000, 0xbf8903f7,
+	0x781d0000, 0xbf890000,
 	0x7e008500, 0x807d817d,
 	0x8078ff78, 0x00000080,
 	0xbf0a6f7d, 0xbfa2fff7,
@@ -3069,7 +3069,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0x6e1d0000, 0xe0505100,
 	0x6e1d0100, 0xe0505200,
 	0x6e1d0200, 0xe0505300,
-	0x6e1d0300, 0xbf8903f7,
+	0x6e1d0300, 0xbf890000,
 	0xb8f83b05, 0x80788178,
 	0xbf0d9972, 0xbfa20002,
 	0x84788978, 0xbfa00001,
@@ -3080,16 +3080,16 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xbef600ff, 0x01000000,
 	0xbefd00ff, 0x0000006c,
 	0x80f89078, 0xf428403a,
-	0xf0000000, 0xbf89fc07,
+	0xf0000000, 0xbf890000,
 	0x80fd847d, 0xbf800000,
 	0xbe804300, 0xbe824302,
 	0x80f8a078, 0xf42c403a,
-	0xf0000000, 0xbf89fc07,
+	0xf0000000, 0xbf890000,
 	0x80fd887d, 0xbf800000,
 	0xbe804300, 0xbe824302,
 	0xbe844304, 0xbe864306,
 	0x80f8c078, 0xf430403a,
-	0xf0000000, 0xbf89fc07,
+	0xf0000000, 0xbf890000,
 	0x80fd907d, 0xbf800000,
 	0xbe804300, 0xbe824302,
 	0xbe844304, 0xbe864306,
@@ -3119,10 +3119,10 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0x80788478, 0xf4205cfa,
 	0xf0000000, 0x80788478,
 	0xf4205bba, 0xf0000000,
-	0x80788478, 0xbf89fc07,
+	0x80788478, 0xbf890000,
 	0xb96ef814, 0xf4205bba,
 	0xf0000000, 0x80788478,
-	0xbf89fc07, 0xb96ef815,
+	0xbf890000, 0xb96ef815,
 	0xbefd006f, 0xbefe0070,
 	0xbeff0071, 0xb97bf803,
 	0xb973f801, 0xb8ee3b05,
@@ -3136,7 +3136,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0x0000ffff, 0xf4085c37,
 	0xf8000050, 0xf4085d37,
 	0xf8000060, 0xf4005e77,
-	0xf8000074, 0xbf89fc07,
+	0xf8000074, 0xbf890000,
 	0x8b6dff6d, 0x0000ffff,
 	0x8bfe7e7e, 0x8bea6a6a,
 	0xb8eef802, 0xbf0d866e,
@@ -3657,16 +3657,16 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x8b6fff6f, 0x00000200,
 	0xbfa20002, 0x8b6ea07b,
 	0xbfa2002b, 0xbefa4d82,
-	0xbf89fc07, 0x84fa887a,
+	0xbf8a0000, 0x84fa887a,
 	0xbf0d8f7b, 0xbfa10002,
 	0x8c7bff7b, 0xffff0000,
 	0xf4601bbd, 0xf8000010,
-	0xbf89fc07, 0x846e976e,
+	0xbf8a0000, 0x846e976e,
 	0x9177ff77, 0x00800000,
 	0x8c776e77, 0xf4603bbd,
-	0xf8000000, 0xbf89fc07,
+	0xf8000000, 0xbf8a0000,
 	0xf4603ebd, 0xf8000008,
-	0xbf89fc07, 0x8bee6e6e,
+	0xbf8a0000, 0x8bee6e6e,
 	0xbfa10001, 0xbe80486e,
 	0x8b6eff6d, 0xf0000000,
 	0xbfa20009, 0xb8eef811,
@@ -3682,7 +3682,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0xbefa0080, 0xb97a0151,
 	0xbeee007e, 0xbeef007f,
 	0xbefe0180, 0xbefe4d84,
-	0xbf89fc07, 0x8b7aff7f,
+	0xbf8a0000, 0x8b7aff7f,
 	0x04000000, 0x847a857a,
 	0x8c6d7a6d, 0xbefa007e,
 	0x8b7bff7f, 0x0000ffff,
@@ -3869,7 +3869,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x00000080, 0xbf800000,
 	0xbf800000, 0xbf800000,
 	0xd8d80000, 0x01000000,
-	0xbf890000, 0xc4068070,
+	0xbf8a0000, 0xc4068070,
 	0x008ce801, 0x00000000,
 	0x807d037d, 0x80700370,
 	0xd5250000, 0x0001ff00,
@@ -3878,7 +3878,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0xbe8300ff, 0x00000100,
 	0xbf800000, 0xbf800000,
 	0xbf800000, 0xd8d80000,
-	0x01000000, 0xbf890000,
+	0x01000000, 0xbf8a0000,
 	0xc4068070, 0x008ce801,
 	0x00000000, 0x807d037d,
 	0x80700370, 0xd5250000,
@@ -3954,14 +3954,14 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x8b7d817d, 0xbf06817d,
 	0xbefd0080, 0xbfa2000d,
 	0xc4050078, 0x0080e800,
-	0x00000000, 0xbf8903f7,
+	0x00000000, 0xbf8a0000,
 	0xdac00000, 0x00000000,
 	0x807dff7d, 0x00000080,
 	0x8078ff78, 0x00000080,
 	0xbf0a6f7d, 0xbfa2fff4,
 	0xbfa0000c, 0xc4050078,
 	0x0080e800, 0x00000000,
-	0xbf8903f7, 0xdac00000,
+	0xbf8a0000, 0xdac00000,
 	0x00000000, 0x807dff7d,
 	0x00000100, 0x8078ff78,
 	0x00000100, 0xbf0a6f7d,
@@ -3983,7 +3983,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x00008000, 0xc4050078,
 	0x008ce802, 0x00010000,
 	0xc4050078, 0x008ce803,
-	0x00018000, 0xbf8903f7,
+	0x00018000, 0xbf8a0000,
 	0x7e008500, 0x7e028501,
 	0x7e048502, 0x7e068503,
 	0x807d847d, 0x8078ff78,
@@ -3994,7 +3994,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x00008000, 0xc405006e,
 	0x008ce802, 0x00010000,
 	0xc405006e, 0x008ce803,
-	0x00018000, 0xbf8903f7,
+	0x00018000, 0xbf8a0000,
 	0xbfa0003d, 0xbef600ff,
 	0x01000000, 0xbeee0078,
 	0x8078ff78, 0x00000400,
@@ -4005,7 +4005,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x00010000, 0xc4050078,
 	0x008ce802, 0x00020000,
 	0xc4050078, 0x008ce803,
-	0x00030000, 0xbf8903f7,
+	0x00030000, 0xbf8a0000,
 	0x7e008500, 0x7e028501,
 	0x7e048502, 0x7e068503,
 	0x807d847d, 0x8078ff78,
@@ -4015,7 +4015,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x846f836f, 0x806f7d6f,
 	0xbefe00c1, 0xbeff0080,
 	0xc4050078, 0x008ce800,
-	0x00000000, 0xbf8903f7,
+	0x00000000, 0xbf8a0000,
 	0x7e008500, 0x807d817d,
 	0x8078ff78, 0x00000080,
 	0xbf0a6f7d, 0xbfa2fff6,
@@ -4025,7 +4025,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x00010000, 0xc405006e,
 	0x008ce802, 0x00020000,
 	0xc405006e, 0x008ce803,
-	0x00030000, 0xbf8903f7,
+	0x00030000, 0xbf8a0000,
 	0xb8f83b05, 0x80788178,
 	0xbf0d9972, 0xbfa20002,
 	0x84788978, 0xbfa00001,
@@ -4036,16 +4036,16 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0xbef600ff, 0x01000000,
 	0xbefd00ff, 0x0000006c,
 	0x80f89078, 0xf462403a,
-	0xf0000000, 0xbf89fc07,
+	0xf0000000, 0xbf8a0000,
 	0x80fd847d, 0xbf800000,
 	0xbe804300, 0xbe824302,
 	0x80f8a078, 0xf462603a,
-	0xf0000000, 0xbf89fc07,
+	0xf0000000, 0xbf8a0000,
 	0x80fd887d, 0xbf800000,
 	0xbe804300, 0xbe824302,
 	0xbe844304, 0xbe864306,
 	0x80f8c078, 0xf462803a,
-	0xf0000000, 0xbf89fc07,
+	0xf0000000, 0xbf8a0000,
 	0x80fd907d, 0xbf800000,
 	0xbe804300, 0xbe824302,
 	0xbe844304, 0xbe864306,
@@ -4075,19 +4075,19 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x80788478, 0xf4621cfa,
 	0xf0000000, 0x80788478,
 	0xf4621bba, 0xf0000000,
-	0x80788478, 0xbf89fc07,
+	0x80788478, 0xbf8a0000,
 	0xb96ef814, 0xf4621bba,
 	0xf0000000, 0x80788478,
-	0xbf89fc07, 0xb96ef815,
+	0xbf8a0000, 0xb96ef815,
 	0xf4621bba, 0xf0000000,
-	0x80788478, 0xbf89fc07,
+	0x80788478, 0xbf8a0000,
 	0xb96ef812, 0xf4621bba,
 	0xf0000000, 0x80788478,
-	0xbf89fc07, 0xb96ef813,
+	0xbf8a0000, 0xb96ef813,
 	0x8b6eff7f, 0x04000000,
 	0xbfa1000d, 0x80788478,
 	0xf4621bba, 0xf0000000,
-	0x80788478, 0xbf89fc07,
+	0x80788478, 0xbf8a0000,
 	0xbf0d806e, 0xbfa10006,
 	0x856e906e, 0x8b6e6e6e,
 	0xbfa10003, 0xbe804ec1,
@@ -4106,7 +4106,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x0000ffff, 0xf4605c37,
 	0xf8000050, 0xf4605d37,
 	0xf8000060, 0xf4601e77,
-	0xf8000074, 0xbf89fc07,
+	0xf8000074, 0xbf8a0000,
 	0x8b6dff6d, 0x0000ffff,
 	0x8bfe7e7e, 0x8bea6a6a,
 	0xb97af804, 0xbe804a6c,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index cb619e49228c..77ae25b6753c 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -55,9 +55,11 @@
 #if ASIC_FAMILY < CHIP_GFX12
 #define S_COHERENCE glc:1
 #define V_COHERENCE slc:1 glc:1
+#define S_WAITCNT_0 s_waitcnt 0
 #else
 #define S_COHERENCE scope:SCOPE_SYS
 #define V_COHERENCE scope:SCOPE_SYS
+#define S_WAITCNT_0 s_wait_idle
 
 #define HW_REG_SHADER_FLAT_SCRATCH_LO HW_REG_WAVE_SCRATCH_BASE_LO
 #define HW_REG_SHADER_FLAT_SCRATCH_HI HW_REG_WAVE_SCRATCH_BASE_HI
@@ -364,7 +366,7 @@ L_FETCH_2ND_TRAP:
 	// ttmp12 holds SQ_WAVE_STATUS
 #if HAVE_SENDMSG_RTN
 	s_sendmsg_rtn_b64       [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
-	s_waitcnt       lgkmcnt(0)
+	S_WAITCNT_0
 #else
 	s_getreg_b32	ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
 	s_getreg_b32	ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
@@ -377,15 +379,15 @@ L_FETCH_2ND_TRAP:
 L_NO_SIGN_EXTEND_TMA:
 
 	s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE		// debug trap enabled flag
-	s_waitcnt       lgkmcnt(0)
+	S_WAITCNT_0
 	s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
 	s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
 	s_or_b32        ttmp11, ttmp11, ttmp2
 
 	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE	// second-level TBA
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE	// second-level TMA
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 
 	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
 	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler not been set
@@ -460,7 +462,7 @@ L_SLEEP:
 	s_sleep		0x2
 	s_cbranch_execz	L_SLEEP
 #else
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 #endif
 
 	// Save first_wave flag so we can clear high bits of save address.
@@ -794,7 +796,7 @@ L_SAVE_LDS_W32:
 
 L_SAVE_LDS_LOOP_SQC_W32:
 	ds_read_b32	v1, v0
-	s_waitcnt	0
+	S_WAITCNT_0
 
 	write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)
 
@@ -814,7 +816,7 @@ L_SAVE_LDS_WITH_TCP_W32:
 	s_nop		0
 L_SAVE_LDS_LOOP_W32:
 	ds_read_b32	v1, v0
-	s_waitcnt	0
+	S_WAITCNT_0
 	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
 
 	s_add_u32	m0, m0, s3						//every buffer_store_lds does 128 bytes
@@ -832,7 +834,7 @@ L_SAVE_LDS_W64:
 
 L_SAVE_LDS_LOOP_SQC_W64:
 	ds_read_b32	v1, v0
-	s_waitcnt	0
+	S_WAITCNT_0
 
 	write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)
 
@@ -852,7 +854,7 @@ L_SAVE_LDS_WITH_TCP_W64:
 	s_nop		0
 L_SAVE_LDS_LOOP_W64:
 	ds_read_b32	v1, v0
-	s_waitcnt	0
+	S_WAITCNT_0
 	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
 
 	s_add_u32	m0, m0, s3						//every buffer_store_lds does 256 bytes
@@ -1073,7 +1075,7 @@ L_RESTORE_LDS_LOOP_W32:
 	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
 #else
 	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
-	s_waitcnt	vmcnt(0)
+	S_WAITCNT_0
 	ds_store_addtid_b32     v0
 #endif
 	s_add_u32	m0, m0, 128						// 128 DW
@@ -1087,7 +1089,7 @@ L_RESTORE_LDS_LOOP_W64:
 	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
 #else
 	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
-	s_waitcnt	vmcnt(0)
+	S_WAITCNT_0
 	ds_store_addtid_b32     v0
 #endif
 	s_add_u32	m0, m0, 256						// 256 DW
@@ -1132,7 +1134,7 @@ L_RESTORE_VGPR_WAVE32_LOOP:
 	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128
 	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2
 	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3
-	s_waitcnt	vmcnt(0)
+	S_WAITCNT_0
 	v_movreld_b32	v0, v0							//v[0+m0] = v0
 	v_movreld_b32	v1, v1
 	v_movreld_b32	v2, v2
@@ -1147,7 +1149,7 @@ L_RESTORE_VGPR_WAVE32_LOOP:
 	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128
 	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2
 	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3
-	s_waitcnt	vmcnt(0)
+	S_WAITCNT_0
 
 	s_branch	L_RESTORE_SGPR
 
@@ -1166,7 +1168,7 @@ L_RESTORE_VGPR_WAVE64_LOOP:
 	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256
 	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2
 	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3
-	s_waitcnt	vmcnt(0)
+	S_WAITCNT_0
 	v_movreld_b32	v0, v0							//v[0+m0] = v0
 	v_movreld_b32	v1, v1
 	v_movreld_b32	v2, v2
@@ -1189,7 +1191,7 @@ L_RESTORE_SHARED_VGPR:
 	s_mov_b32	exec_hi, 0x00000000
 L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
 	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
-	s_waitcnt	vmcnt(0)
+	S_WAITCNT_0
 	v_movreld_b32	v0, v0							//v[0+m0] = v0
 	s_add_u32	m0, m0, 1						//next vgpr index
 	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128
@@ -1204,7 +1206,7 @@ L_RESTORE_V0:
 	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256
 	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2
 	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3
-	s_waitcnt	vmcnt(0)
+	S_WAITCNT_0
 
 	/* restore SGPRs */
 	//will be 2+8+16*6
@@ -1221,7 +1223,7 @@ L_RESTORE_SGPR:
 	s_mov_b32	m0, s_sgpr_save_num
 
 	read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 
 	s_sub_u32	m0, m0, 4						// Restore from S[0] to S[104]
 	s_nop		0							// hazard SALU M0=> S_MOVREL
@@ -1230,7 +1232,7 @@ L_RESTORE_SGPR:
 	s_movreld_b64	s2, s2
 
 	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 
 	s_sub_u32	m0, m0, 8						// Restore from S[0] to S[96]
 	s_nop		0							// hazard SALU M0=> S_MOVREL
@@ -1242,7 +1244,7 @@ L_RESTORE_SGPR:
 
  L_RESTORE_SGPR_LOOP:
 	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 
 	s_sub_u32	m0, m0, 16						// Restore from S[n] to S[0]
 	s_nop		0							// hazard SALU M0=> S_MOVREL
@@ -1291,22 +1293,22 @@ L_RESTORE_HWREG:
 	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
 	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
 	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 
 	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch
 
 	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
-	s_waitcnt	lgkmcnt(0)						//from now on, it is safe to restore STATUS and IB_STS
+	S_WAITCNT_0
 
 	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch
 
 #if ASIC_FAMILY >= CHIP_GFX12
 	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp
 
 	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 	s_setreg_b32	hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp
 
 	// Only the first wave needs to restore the workgroup barrier.
@@ -1317,7 +1319,7 @@ L_RESTORE_HWREG:
 	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 4
 
 	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 
 	s_bitcmp1_b32	s_restore_tmp, BARRIER_STATE_VALID_OFFSET
 	s_cbranch_scc0	L_SKIP_BARRIER_RESTORE
@@ -1364,7 +1366,7 @@ L_SKIP_BARRIER_RESTORE:
 	s_load_dwordx4	[ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE
 	s_load_dwordx4	[ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE
 	s_load_dword	ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE
-	s_waitcnt	lgkmcnt(0)
+	S_WAITCNT_0
 
 #if HAVE_XNACK
 	restore_ib_sts(s_restore_tmp, s_restore_m0)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
  2024-05-23 14:08 [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source Jay Cornwall
  2024-05-23 14:08 ` [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions Jay Cornwall
@ 2024-05-23 14:08 ` Jay Cornwall
  2024-05-23 18:37   ` Lancelot SIX
  2024-05-23 18:27 ` [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source Alex Deucher
  2024-05-23 18:41 ` Lancelot SIX
  3 siblings, 1 reply; 9+ messages in thread
From: Jay Cornwall @ 2024-05-23 14:08 UTC (permalink / raw)
  To: amd-gfx; +Cc: Jay Cornwall, Lancelot Six

Fix LDS size interpretation: 512 bytes (>= gfx12) vs 256 bytes (< gfx12).

Ensure STATE_PRIV.BARRIER_COMPLETE cannot change after reading or
before writing. Other waves in the threadgroup may cause this field
to assert if they complete the barrier.

Do not overwrite EXCP_FLAG_PRIV.{SAVE_CONTEXT,HOST_TRAP} when
restoring this register. Both of these fields can assert while the
wavefront is running the trap handler.

Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
Cc: Lancelot Six <lancelot.six@amd.com>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 1191 +++++++++--------
 .../amd/amdkfd/cwsr_trap_handler_gfx10.asm    |   55 +-
 2 files changed, 639 insertions(+), 607 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index d61b2c3bd0ac..85a41e121cce 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -678,7 +678,7 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 };
 
 static const uint32_t cwsr_trap_nv1x_hex[] = {
-	0xbf820001, 0xbf820394,
+	0xbf820001, 0xbf820393,
 	0xb0804004, 0xb978f802,
 	0x8a78ff78, 0x00020006,
 	0xb97bf803, 0x876eff78,
@@ -932,23 +932,48 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0xbf850002, 0xbeff0380,
 	0xbf820001, 0xbeff03c1,
 	0xb97b4306, 0x877bc17b,
-	0xbf840086, 0xbf8a0000,
+	0xbf840085, 0xbf8a0000,
 	0x877aff6d, 0x80000000,
-	0xbf840082, 0x8f7b867b,
-	0x8f7b827b, 0xbef6037b,
-	0xb9703a05, 0x80708170,
-	0xbf0d9973, 0xbf850002,
-	0x8f708970, 0xbf820001,
-	0x8f708a70, 0xb97a1e06,
-	0x8f7a8a7a, 0x80707a70,
-	0x8070ff70, 0x00000200,
-	0x8070ff70, 0x00000080,
-	0xbef603ff, 0x01000000,
-	0xd7650000, 0x000100c1,
-	0xd7660000, 0x000200c1,
-	0x16000084, 0x907c9973,
-	0x877c817c, 0xbf06817c,
-	0xbefc0380, 0xbf850033,
+	0xbf840081, 0x8f7b887b,
+	0xbef6037b, 0xb9703a05,
+	0x80708170, 0xbf0d9973,
+	0xbf850002, 0x8f708970,
+	0xbf820001, 0x8f708a70,
+	0xb97a1e06, 0x8f7a8a7a,
+	0x80707a70, 0x8070ff70,
+	0x00000200, 0x8070ff70,
+	0x00000080, 0xbef603ff,
+	0x01000000, 0xd7650000,
+	0x000100c1, 0xd7660000,
+	0x000200c1, 0x16000084,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbefc0380,
+	0xbf850033, 0xb97af803,
+	0x8a7a7aff, 0x10000000,
+	0xbf85001d, 0xd8d80000,
+	0x01000000, 0xbf8c0000,
+	0xbe840380, 0xd7600000,
+	0x00000901, 0x80048104,
+	0xd7600001, 0x00000901,
+	0x80048104, 0xd7600002,
+	0x00000901, 0x80048104,
+	0xd7600003, 0x00000901,
+	0x80048104, 0xf469003a,
+	0xe0000000, 0x80709070,
+	0xbf06a004, 0xbf84ffef,
+	0x807cff7c, 0x00000080,
+	0xd5250000, 0x0001ff00,
+	0x00000080, 0xbf0a7b7c,
+	0xbf85ffe4, 0xbf820044,
+	0xbe8303ff, 0x00000080,
+	0xbf800000, 0xbf800000,
+	0xbf800000, 0xd8d80000,
+	0x01000000, 0xbf8c0000,
+	0xe0704000, 0x705d0100,
+	0x807c037c, 0x80700370,
+	0xd5250000, 0x0001ff00,
+	0x00000080, 0xbf0a7b7c,
+	0xbf85fff4, 0xbf820032,
 	0xb97af803, 0x8a7a7aff,
 	0x10000000, 0xbf85001d,
 	0xd8d80000, 0x01000000,
@@ -960,24 +985,45 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x80048104, 0xd7600003,
 	0x00000901, 0x80048104,
 	0xf469003a, 0xe0000000,
-	0x80709070, 0xbf06a004,
+	0x80709070, 0xbf06c004,
 	0xbf84ffef, 0x807cff7c,
-	0x00000080, 0xd5250000,
-	0x0001ff00, 0x00000080,
+	0x00000100, 0xd5250000,
+	0x0001ff00, 0x00000100,
 	0xbf0a7b7c, 0xbf85ffe4,
-	0xbf820044, 0xbe8303ff,
-	0x00000080, 0xbf800000,
+	0xbf820011, 0xbe8303ff,
+	0x00000100, 0xbf800000,
 	0xbf800000, 0xbf800000,
 	0xd8d80000, 0x01000000,
 	0xbf8c0000, 0xe0704000,
 	0x705d0100, 0x807c037c,
 	0x80700370, 0xd5250000,
-	0x0001ff00, 0x00000080,
+	0x0001ff00, 0x00000100,
 	0xbf0a7b7c, 0xbf85fff4,
-	0xbf820032, 0xb97af803,
-	0x8a7a7aff, 0x10000000,
-	0xbf85001d, 0xd8d80000,
-	0x01000000, 0xbf8c0000,
+	0xbefe03c1, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850004, 0xbef003ff,
+	0x00000200, 0xbeff0380,
+	0xbf820003, 0xbef003ff,
+	0x00000400, 0xbeff03c1,
+	0xb97b3a05, 0x807b817b,
+	0x8f7b827b, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf85006b, 0xbef603ff,
+	0x01000000, 0xbefc0384,
+	0xbf0a7b7c, 0xbf8400fa,
+	0xb97af803, 0x8a7a7aff,
+	0x10000000, 0xbf850050,
+	0x7e008700, 0x7e028701,
+	0x7e048702, 0x7e068703,
+	0xbe840380, 0xd7600000,
+	0x00000900, 0x80048104,
+	0xd7600001, 0x00000900,
+	0x80048104, 0xd7600002,
+	0x00000900, 0x80048104,
+	0xd7600003, 0x00000900,
+	0x80048104, 0xf469003a,
+	0xe0000000, 0x80709070,
+	0xbf06a004, 0xbf84ffef,
 	0xbe840380, 0xd7600000,
 	0x00000901, 0x80048104,
 	0xd7600001, 0x00000901,
@@ -986,32 +1032,39 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0xd7600003, 0x00000901,
 	0x80048104, 0xf469003a,
 	0xe0000000, 0x80709070,
-	0xbf06c004, 0xbf84ffef,
-	0x807cff7c, 0x00000100,
-	0xd5250000, 0x0001ff00,
-	0x00000100, 0xbf0a7b7c,
-	0xbf85ffe4, 0xbf820011,
-	0xbe8303ff, 0x00000100,
-	0xbf800000, 0xbf800000,
-	0xbf800000, 0xd8d80000,
-	0x01000000, 0xbf8c0000,
-	0xe0704000, 0x705d0100,
-	0x807c037c, 0x80700370,
-	0xd5250000, 0x0001ff00,
-	0x00000100, 0xbf0a7b7c,
-	0xbf85fff4, 0xbefe03c1,
-	0x907c9973, 0x877c817c,
-	0xbf06817c, 0xbf850004,
-	0xbef003ff, 0x00000200,
-	0xbeff0380, 0xbf820003,
-	0xbef003ff, 0x00000400,
-	0xbeff03c1, 0xb97b3a05,
-	0x807b817b, 0x8f7b827b,
-	0x907c9973, 0x877c817c,
-	0xbf06817c, 0xbf85006b,
+	0xbf06a004, 0xbf84ffef,
+	0xbe840380, 0xd7600000,
+	0x00000902, 0x80048104,
+	0xd7600001, 0x00000902,
+	0x80048104, 0xd7600002,
+	0x00000902, 0x80048104,
+	0xd7600003, 0x00000902,
+	0x80048104, 0xf469003a,
+	0xe0000000, 0x80709070,
+	0xbf06a004, 0xbf84ffef,
+	0xbe840380, 0xd7600000,
+	0x00000903, 0x80048104,
+	0xd7600001, 0x00000903,
+	0x80048104, 0xd7600002,
+	0x00000903, 0x80048104,
+	0xd7600003, 0x00000903,
+	0x80048104, 0xf469003a,
+	0xe0000000, 0x80709070,
+	0xbf06a004, 0xbf84ffef,
+	0x807c847c, 0xbf0a7b7c,
+	0xbf85ffb1, 0xbf8200a6,
+	0x7e008700, 0x7e028701,
+	0x7e048702, 0x7e068703,
+	0xe0704000, 0x705d0000,
+	0xe0704080, 0x705d0100,
+	0xe0704100, 0x705d0200,
+	0xe0704180, 0x705d0300,
+	0x807c847c, 0x8070ff70,
+	0x00000200, 0xbf0a7b7c,
+	0xbf85ffef, 0xbf820094,
 	0xbef603ff, 0x01000000,
 	0xbefc0384, 0xbf0a7b7c,
-	0xbf8400fa, 0xb97af803,
+	0xbf840065, 0xb97af803,
 	0x8a7a7aff, 0x10000000,
 	0xbf850050, 0x7e008700,
 	0x7e028701, 0x7e048702,
@@ -1023,7 +1076,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x80048104, 0xd7600003,
 	0x00000900, 0x80048104,
 	0xf469003a, 0xe0000000,
-	0x80709070, 0xbf06a004,
+	0x80709070, 0xbf06c004,
 	0xbf84ffef, 0xbe840380,
 	0xd7600000, 0x00000901,
 	0x80048104, 0xd7600001,
@@ -1032,7 +1085,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x80048104, 0xd7600003,
 	0x00000901, 0x80048104,
 	0xf469003a, 0xe0000000,
-	0x80709070, 0xbf06a004,
+	0x80709070, 0xbf06c004,
 	0xbf84ffef, 0xbe840380,
 	0xd7600000, 0x00000902,
 	0x80048104, 0xd7600001,
@@ -1041,7 +1094,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x80048104, 0xd7600003,
 	0x00000902, 0x80048104,
 	0xf469003a, 0xe0000000,
-	0x80709070, 0xbf06a004,
+	0x80709070, 0xbf06c004,
 	0xbf84ffef, 0xbe840380,
 	0xd7600000, 0x00000903,
 	0x80048104, 0xd7600001,
@@ -1050,25 +1103,24 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x80048104, 0xd7600003,
 	0x00000903, 0x80048104,
 	0xf469003a, 0xe0000000,
-	0x80709070, 0xbf06a004,
+	0x80709070, 0xbf06c004,
 	0xbf84ffef, 0x807c847c,
 	0xbf0a7b7c, 0xbf85ffb1,
-	0xbf8200a6, 0x7e008700,
+	0xbf82003b, 0x7e008700,
 	0x7e028701, 0x7e048702,
 	0x7e068703, 0xe0704000,
-	0x705d0000, 0xe0704080,
-	0x705d0100, 0xe0704100,
-	0x705d0200, 0xe0704180,
+	0x705d0000, 0xe0704100,
+	0x705d0100, 0xe0704200,
+	0x705d0200, 0xe0704300,
 	0x705d0300, 0x807c847c,
-	0x8070ff70, 0x00000200,
+	0x8070ff70, 0x00000400,
 	0xbf0a7b7c, 0xbf85ffef,
-	0xbf820094, 0xbef603ff,
-	0x01000000, 0xbefc0384,
-	0xbf0a7b7c, 0xbf840065,
-	0xb97af803, 0x8a7a7aff,
-	0x10000000, 0xbf850050,
-	0x7e008700, 0x7e028701,
-	0x7e048702, 0x7e068703,
+	0xb97b1e06, 0x877bc17b,
+	0xbf840027, 0x8f7b837b,
+	0x807b7c7b, 0xbefe03c1,
+	0xbeff0380, 0xb97af803,
+	0x8a7a7aff, 0x10000000,
+	0xbf850017, 0x7e008700,
 	0xbe840380, 0xd7600000,
 	0x00000900, 0x80048104,
 	0xd7600001, 0x00000900,
@@ -1078,78 +1130,25 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x80048104, 0xf469003a,
 	0xe0000000, 0x80709070,
 	0xbf06c004, 0xbf84ffef,
-	0xbe840380, 0xd7600000,
-	0x00000901, 0x80048104,
-	0xd7600001, 0x00000901,
-	0x80048104, 0xd7600002,
-	0x00000901, 0x80048104,
-	0xd7600003, 0x00000901,
-	0x80048104, 0xf469003a,
-	0xe0000000, 0x80709070,
-	0xbf06c004, 0xbf84ffef,
-	0xbe840380, 0xd7600000,
-	0x00000902, 0x80048104,
-	0xd7600001, 0x00000902,
-	0x80048104, 0xd7600002,
-	0x00000902, 0x80048104,
-	0xd7600003, 0x00000902,
-	0x80048104, 0xf469003a,
-	0xe0000000, 0x80709070,
-	0xbf06c004, 0xbf84ffef,
-	0xbe840380, 0xd7600000,
-	0x00000903, 0x80048104,
-	0xd7600001, 0x00000903,
-	0x80048104, 0xd7600002,
-	0x00000903, 0x80048104,
-	0xd7600003, 0x00000903,
-	0x80048104, 0xf469003a,
-	0xe0000000, 0x80709070,
-	0xbf06c004, 0xbf84ffef,
-	0x807c847c, 0xbf0a7b7c,
-	0xbf85ffb1, 0xbf82003b,
-	0x7e008700, 0x7e028701,
-	0x7e048702, 0x7e068703,
-	0xe0704000, 0x705d0000,
-	0xe0704100, 0x705d0100,
-	0xe0704200, 0x705d0200,
-	0xe0704300, 0x705d0300,
-	0x807c847c, 0x8070ff70,
-	0x00000400, 0xbf0a7b7c,
-	0xbf85ffef, 0xb97b1e06,
-	0x877bc17b, 0xbf840027,
-	0x8f7b837b, 0x807b7c7b,
-	0xbefe03c1, 0xbeff0380,
-	0xb97af803, 0x8a7a7aff,
-	0x10000000, 0xbf850017,
-	0x7e008700, 0xbe840380,
-	0xd7600000, 0x00000900,
-	0x80048104, 0xd7600001,
-	0x00000900, 0x80048104,
-	0xd7600002, 0x00000900,
-	0x80048104, 0xd7600003,
-	0x00000900, 0x80048104,
-	0xf469003a, 0xe0000000,
-	0x80709070, 0xbf06c004,
-	0xbf84ffef, 0x807c817c,
-	0xbf0a7b7c, 0xbf85ffea,
-	0xbf820008, 0x7e008700,
-	0xe0704000, 0x705d0000,
-	0x807c817c, 0x8070ff70,
-	0x00000080, 0xbf0a7b7c,
-	0xbf85fff8, 0xbf82013e,
-	0xbef4037e, 0x8775ff7f,
-	0x0000ffff, 0x8875ff75,
-	0x00040000, 0xbef60380,
-	0xbef703ff, 0x10807fac,
-	0xb97202dc, 0x8f729972,
-	0x876eff7f, 0x04000000,
-	0xbf840034, 0xbefe03c1,
-	0x907c9972, 0x877c817c,
-	0xbf06817c, 0xbf850002,
-	0xbeff0380, 0xbf820001,
-	0xbeff03c1, 0xb96f4306,
-	0x876fc16f, 0xbf840029,
-	0x8f6f866f, 0x8f6f826f,
+	0x807c817c, 0xbf0a7b7c,
+	0xbf85ffea, 0xbf820008,
+	0x7e008700, 0xe0704000,
+	0x705d0000, 0x807c817c,
+	0x8070ff70, 0x00000080,
+	0xbf0a7b7c, 0xbf85fff8,
+	0xbf82013d, 0xbef4037e,
+	0x8775ff7f, 0x0000ffff,
+	0x8875ff75, 0x00040000,
+	0xbef60380, 0xbef703ff,
+	0x10807fac, 0xb97202dc,
+	0x8f729972, 0x876eff7f,
+	0x04000000, 0xbf840033,
+	0xbefe03c1, 0x907c9972,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0xbeff0380,
+	0xbf820001, 0xbeff03c1,
+	0xb96f4306, 0x876fc16f,
+	0xbf840028, 0x8f6f886f,
 	0xbef6036f, 0xb9783a05,
 	0x80788178, 0xbf0d9972,
 	0xbf850002, 0x8f788978,
@@ -2273,7 +2272,7 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = {
 };
 
 static const uint32_t cwsr_trap_gfx10_hex[] = {
-	0xbf820001, 0xbf820221,
+	0xbf820001, 0xbf820220,
 	0xb0804004, 0xb978f802,
 	0x8a78ff78, 0x00020006,
 	0xb97bf803, 0x876eff78,
@@ -2472,94 +2471,93 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xbf850002, 0xbeff0380,
 	0xbf820001, 0xbeff03c1,
 	0xb97b4306, 0x877bc17b,
-	0xbf840044, 0xbf8a0000,
+	0xbf840043, 0xbf8a0000,
 	0x877aff6d, 0x80000000,
-	0xbf840040, 0x8f7b867b,
-	0x8f7b827b, 0xbef6037b,
-	0xb9703a05, 0x80708170,
-	0xbf0d9973, 0xbf850002,
-	0x8f708970, 0xbf820001,
-	0x8f708a70, 0xb97a1e06,
-	0x8f7a8a7a, 0x80707a70,
-	0x8070ff70, 0x00000200,
-	0x8070ff70, 0x00000080,
-	0xbef603ff, 0x01000000,
-	0xd7650000, 0x000100c1,
-	0xd7660000, 0x000200c1,
-	0x16000084, 0x907c9973,
-	0x877c817c, 0xbf06817c,
-	0xbefc0380, 0xbf850012,
-	0xbe8303ff, 0x00000080,
+	0xbf84003f, 0x8f7b887b,
+	0xbef6037b, 0xb9703a05,
+	0x80708170, 0xbf0d9973,
+	0xbf850002, 0x8f708970,
+	0xbf820001, 0x8f708a70,
+	0xb97a1e06, 0x8f7a8a7a,
+	0x80707a70, 0x8070ff70,
+	0x00000200, 0x8070ff70,
+	0x00000080, 0xbef603ff,
+	0x01000000, 0xd7650000,
+	0x000100c1, 0xd7660000,
+	0x000200c1, 0x16000084,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbefc0380,
+	0xbf850012, 0xbe8303ff,
+	0x00000080, 0xbf800000,
 	0xbf800000, 0xbf800000,
-	0xbf800000, 0xd8d80000,
-	0x01000000, 0xbf8c0000,
-	0xe0704000, 0x705d0100,
-	0x807c037c, 0x80700370,
-	0xd5250000, 0x0001ff00,
-	0x00000080, 0xbf0a7b7c,
-	0xbf85fff4, 0xbf820011,
-	0xbe8303ff, 0x00000100,
+	0xd8d80000, 0x01000000,
+	0xbf8c0000, 0xe0704000,
+	0x705d0100, 0x807c037c,
+	0x80700370, 0xd5250000,
+	0x0001ff00, 0x00000080,
+	0xbf0a7b7c, 0xbf85fff4,
+	0xbf820011, 0xbe8303ff,
+	0x00000100, 0xbf800000,
 	0xbf800000, 0xbf800000,
-	0xbf800000, 0xd8d80000,
-	0x01000000, 0xbf8c0000,
-	0xe0704000, 0x705d0100,
-	0x807c037c, 0x80700370,
-	0xd5250000, 0x0001ff00,
-	0x00000100, 0xbf0a7b7c,
-	0xbf85fff4, 0xbefe03c1,
-	0x907c9973, 0x877c817c,
-	0xbf06817c, 0xbf850004,
-	0xbef003ff, 0x00000200,
-	0xbeff0380, 0xbf820003,
-	0xbef003ff, 0x00000400,
-	0xbeff03c1, 0xb97b3a05,
-	0x807b817b, 0x8f7b827b,
-	0x907c9973, 0x877c817c,
-	0xbf06817c, 0xbf850017,
+	0xd8d80000, 0x01000000,
+	0xbf8c0000, 0xe0704000,
+	0x705d0100, 0x807c037c,
+	0x80700370, 0xd5250000,
+	0x0001ff00, 0x00000100,
+	0xbf0a7b7c, 0xbf85fff4,
+	0xbefe03c1, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850004, 0xbef003ff,
+	0x00000200, 0xbeff0380,
+	0xbf820003, 0xbef003ff,
+	0x00000400, 0xbeff03c1,
+	0xb97b3a05, 0x807b817b,
+	0x8f7b827b, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850017, 0xbef603ff,
+	0x01000000, 0xbefc0384,
+	0xbf0a7b7c, 0xbf840037,
+	0x7e008700, 0x7e028701,
+	0x7e048702, 0x7e068703,
+	0xe0704000, 0x705d0000,
+	0xe0704080, 0x705d0100,
+	0xe0704100, 0x705d0200,
+	0xe0704180, 0x705d0300,
+	0x807c847c, 0x8070ff70,
+	0x00000200, 0xbf0a7b7c,
+	0xbf85ffef, 0xbf820025,
 	0xbef603ff, 0x01000000,
 	0xbefc0384, 0xbf0a7b7c,
-	0xbf840037, 0x7e008700,
+	0xbf840011, 0x7e008700,
 	0x7e028701, 0x7e048702,
 	0x7e068703, 0xe0704000,
-	0x705d0000, 0xe0704080,
-	0x705d0100, 0xe0704100,
-	0x705d0200, 0xe0704180,
+	0x705d0000, 0xe0704100,
+	0x705d0100, 0xe0704200,
+	0x705d0200, 0xe0704300,
 	0x705d0300, 0x807c847c,
-	0x8070ff70, 0x00000200,
+	0x8070ff70, 0x00000400,
 	0xbf0a7b7c, 0xbf85ffef,
-	0xbf820025, 0xbef603ff,
-	0x01000000, 0xbefc0384,
-	0xbf0a7b7c, 0xbf840011,
-	0x7e008700, 0x7e028701,
-	0x7e048702, 0x7e068703,
+	0xb97b1e06, 0x877bc17b,
+	0xbf84000c, 0x8f7b837b,
+	0x807b7c7b, 0xbefe03c1,
+	0xbeff0380, 0x7e008700,
 	0xe0704000, 0x705d0000,
-	0xe0704100, 0x705d0100,
-	0xe0704200, 0x705d0200,
-	0xe0704300, 0x705d0300,
-	0x807c847c, 0x8070ff70,
-	0x00000400, 0xbf0a7b7c,
-	0xbf85ffef, 0xb97b1e06,
-	0x877bc17b, 0xbf84000c,
-	0x8f7b837b, 0x807b7c7b,
-	0xbefe03c1, 0xbeff0380,
-	0x7e008700, 0xe0704000,
-	0x705d0000, 0x807c817c,
-	0x8070ff70, 0x00000080,
-	0xbf0a7b7c, 0xbf85fff8,
-	0xbf820135, 0xbef4037e,
-	0x8775ff7f, 0x0000ffff,
-	0x8875ff75, 0x00040000,
-	0xbef60380, 0xbef703ff,
-	0x10807fac, 0xb97202dc,
-	0x8f729972, 0x876eff7f,
-	0x04000000, 0xbf840034,
-	0xbefe03c1, 0x907c9972,
-	0x877c817c, 0xbf06817c,
-	0xbf850002, 0xbeff0380,
-	0xbf820001, 0xbeff03c1,
-	0xb96f4306, 0x876fc16f,
-	0xbf840029, 0x8f6f866f,
-	0x8f6f826f, 0xbef6036f,
+	0x807c817c, 0x8070ff70,
+	0x00000080, 0xbf0a7b7c,
+	0xbf85fff8, 0xbf820134,
+	0xbef4037e, 0x8775ff7f,
+	0x0000ffff, 0x8875ff75,
+	0x00040000, 0xbef60380,
+	0xbef703ff, 0x10807fac,
+	0xb97202dc, 0x8f729972,
+	0x876eff7f, 0x04000000,
+	0xbf840033, 0xbefe03c1,
+	0x907c9972, 0x877c817c,
+	0xbf06817c, 0xbf850002,
+	0xbeff0380, 0xbf820001,
+	0xbeff03c1, 0xb96f4306,
+	0x876fc16f, 0xbf840028,
+	0x8f6f886f, 0xbef6036f,
 	0xb9783a05, 0x80788178,
 	0xbf0d9972, 0xbf850002,
 	0x8f788978, 0xbf820001,
@@ -2707,7 +2705,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 };
 
 static const uint32_t cwsr_trap_gfx11_hex[] = {
-	0xbfa00001, 0xbfa00225,
+	0xbfa00001, 0xbfa00224,
 	0xb0804006, 0xb8f8f802,
 	0x9178ff78, 0x00020006,
 	0xb8fbf803, 0xbf0d9e6d,
@@ -2908,94 +2906,93 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
 	0xbfa20002, 0xbeff0080,
 	0xbfa00001, 0xbeff00c1,
 	0xb8fb4306, 0x8b7bc17b,
-	0xbfa10044, 0xbfbd0000,
+	0xbfa10043, 0xbfbd0000,
 	0x8b7aff6d, 0x80000000,
-	0xbfa10040, 0x847b867b,
-	0x847b827b, 0xbef6007b,
-	0xb8f03b05, 0x80708170,
-	0xbf0d9973, 0xbfa20002,
-	0x84708970, 0xbfa00001,
-	0x84708a70, 0xb8fa1e06,
-	0x847a8a7a, 0x80707a70,
-	0x8070ff70, 0x00000200,
-	0x8070ff70, 0x00000080,
-	0xbef600ff, 0x01000000,
-	0xd71f0000, 0x000100c1,
-	0xd7200000, 0x000200c1,
-	0x16000084, 0x857d9973,
-	0x8b7d817d, 0xbf06817d,
-	0xbefd0080, 0xbfa20012,
-	0xbe8300ff, 0x00000080,
+	0xbfa1003f, 0x847b887b,
+	0xbef6007b, 0xb8f03b05,
+	0x80708170, 0xbf0d9973,
+	0xbfa20002, 0x84708970,
+	0xbfa00001, 0x84708a70,
+	0xb8fa1e06, 0x847a8a7a,
+	0x80707a70, 0x8070ff70,
+	0x00000200, 0x8070ff70,
+	0x00000080, 0xbef600ff,
+	0x01000000, 0xd71f0000,
+	0x000100c1, 0xd7200000,
+	0x000200c1, 0x16000084,
+	0x857d9973, 0x8b7d817d,
+	0xbf06817d, 0xbefd0080,
+	0xbfa20012, 0xbe8300ff,
+	0x00000080, 0xbf800000,
 	0xbf800000, 0xbf800000,
-	0xbf800000, 0xd8d80000,
-	0x01000000, 0xbf890000,
-	0xe0685000, 0x701d0100,
-	0x807d037d, 0x80700370,
-	0xd5250000, 0x0001ff00,
-	0x00000080, 0xbf0a7b7d,
-	0xbfa2fff4, 0xbfa00011,
-	0xbe8300ff, 0x00000100,
+	0xd8d80000, 0x01000000,
+	0xbf890000, 0xe0685000,
+	0x701d0100, 0x807d037d,
+	0x80700370, 0xd5250000,
+	0x0001ff00, 0x00000080,
+	0xbf0a7b7d, 0xbfa2fff4,
+	0xbfa00011, 0xbe8300ff,
+	0x00000100, 0xbf800000,
 	0xbf800000, 0xbf800000,
-	0xbf800000, 0xd8d80000,
-	0x01000000, 0xbf890000,
-	0xe0685000, 0x701d0100,
-	0x807d037d, 0x80700370,
-	0xd5250000, 0x0001ff00,
-	0x00000100, 0xbf0a7b7d,
-	0xbfa2fff4, 0xbefe00c1,
-	0x857d9973, 0x8b7d817d,
-	0xbf06817d, 0xbfa20004,
-	0xbef000ff, 0x00000200,
-	0xbeff0080, 0xbfa00003,
-	0xbef000ff, 0x00000400,
-	0xbeff00c1, 0xb8fb3b05,
-	0x807b817b, 0x847b827b,
-	0x857d9973, 0x8b7d817d,
-	0xbf06817d, 0xbfa20017,
+	0xd8d80000, 0x01000000,
+	0xbf890000, 0xe0685000,
+	0x701d0100, 0x807d037d,
+	0x80700370, 0xd5250000,
+	0x0001ff00, 0x00000100,
+	0xbf0a7b7d, 0xbfa2fff4,
+	0xbefe00c1, 0x857d9973,
+	0x8b7d817d, 0xbf06817d,
+	0xbfa20004, 0xbef000ff,
+	0x00000200, 0xbeff0080,
+	0xbfa00003, 0xbef000ff,
+	0x00000400, 0xbeff00c1,
+	0xb8fb3b05, 0x807b817b,
+	0x847b827b, 0x857d9973,
+	0x8b7d817d, 0xbf06817d,
+	0xbfa20017, 0xbef600ff,
+	0x01000000, 0xbefd0084,
+	0xbf0a7b7d, 0xbfa10037,
+	0x7e008700, 0x7e028701,
+	0x7e048702, 0x7e068703,
+	0xe0685000, 0x701d0000,
+	0xe0685080, 0x701d0100,
+	0xe0685100, 0x701d0200,
+	0xe0685180, 0x701d0300,
+	0x807d847d, 0x8070ff70,
+	0x00000200, 0xbf0a7b7d,
+	0xbfa2ffef, 0xbfa00025,
 	0xbef600ff, 0x01000000,
 	0xbefd0084, 0xbf0a7b7d,
-	0xbfa10037, 0x7e008700,
+	0xbfa10011, 0x7e008700,
 	0x7e028701, 0x7e048702,
 	0x7e068703, 0xe0685000,
-	0x701d0000, 0xe0685080,
-	0x701d0100, 0xe0685100,
-	0x701d0200, 0xe0685180,
+	0x701d0000, 0xe0685100,
+	0x701d0100, 0xe0685200,
+	0x701d0200, 0xe0685300,
 	0x701d0300, 0x807d847d,
-	0x8070ff70, 0x00000200,
+	0x8070ff70, 0x00000400,
 	0xbf0a7b7d, 0xbfa2ffef,
-	0xbfa00025, 0xbef600ff,
-	0x01000000, 0xbefd0084,
-	0xbf0a7b7d, 0xbfa10011,
-	0x7e008700, 0x7e028701,
-	0x7e048702, 0x7e068703,
+	0xb8fb1e06, 0x8b7bc17b,
+	0xbfa1000c, 0x847b837b,
+	0x807b7d7b, 0xbefe00c1,
+	0xbeff0080, 0x7e008700,
 	0xe0685000, 0x701d0000,
-	0xe0685100, 0x701d0100,
-	0xe0685200, 0x701d0200,
-	0xe0685300, 0x701d0300,
-	0x807d847d, 0x8070ff70,
-	0x00000400, 0xbf0a7b7d,
-	0xbfa2ffef, 0xb8fb1e06,
-	0x8b7bc17b, 0xbfa1000c,
-	0x847b837b, 0x807b7d7b,
-	0xbefe00c1, 0xbeff0080,
-	0x7e008700, 0xe0685000,
-	0x701d0000, 0x807d817d,
-	0x8070ff70, 0x00000080,
-	0xbf0a7b7d, 0xbfa2fff8,
-	0xbfa00140, 0xbef4007e,
-	0x8b75ff7f, 0x0000ffff,
-	0x8c75ff75, 0x00040000,
-	0xbef60080, 0xbef700ff,
-	0x10807fac, 0xb8f202dc,
-	0x84729972, 0x8b6eff7f,
-	0x04000000, 0xbfa1003a,
-	0xbefe00c1, 0x857d9972,
-	0x8b7d817d, 0xbf06817d,
-	0xbfa20002, 0xbeff0080,
-	0xbfa00001, 0xbeff00c1,
-	0xb8ef4306, 0x8b6fc16f,
-	0xbfa1002f, 0x846f866f,
-	0x846f826f, 0xbef6006f,
+	0x807d817d, 0x8070ff70,
+	0x00000080, 0xbf0a7b7d,
+	0xbfa2fff8, 0xbfa0013f,
+	0xbef4007e, 0x8b75ff7f,
+	0x0000ffff, 0x8c75ff75,
+	0x00040000, 0xbef60080,
+	0xbef700ff, 0x10807fac,
+	0xb8f202dc, 0x84729972,
+	0x8b6eff7f, 0x04000000,
+	0xbfa10039, 0xbefe00c1,
+	0x857d9972, 0x8b7d817d,
+	0xbf06817d, 0xbfa20002,
+	0xbeff0080, 0xbfa00001,
+	0xbeff00c1, 0xb8ef4306,
+	0x8b6fc16f, 0xbfa1002e,
+	0x846f886f, 0xbef6006f,
 	0xb8f83b05, 0x80788178,
 	0xbf0d9972, 0xbfa20002,
 	0x84788978, 0xbfa00001,
@@ -3638,14 +3635,14 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = {
 };
 
 static const uint32_t cwsr_trap_gfx12_hex[] = {
-	0xbfa00001, 0xbfa00243,
+	0xbfa00001, 0xbfa00247,
 	0xb0804009, 0xb8f8f804,
 	0x9178ff78, 0x00008c00,
 	0xb8fbf811, 0x8b6eff78,
 	0x00004000, 0xbfa10008,
 	0x8b6eff7b, 0x00000080,
 	0xbfa20018, 0x8b6ea07b,
-	0xbfa20041, 0xbf830010,
+	0xbfa20042, 0xbf830010,
 	0xb8fbf811, 0xbfa0fffb,
 	0x8b6eff7b, 0x00000bd0,
 	0xbfa20010, 0xb8eef812,
@@ -3656,7 +3653,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0xf0000000, 0xbfa20005,
 	0x8b6fff6f, 0x00000200,
 	0xbfa20002, 0x8b6ea07b,
-	0xbfa2002b, 0xbefa4d82,
+	0xbfa2002c, 0xbefa4d82,
 	0xbf8a0000, 0x84fa887a,
 	0xbf0d8f7b, 0xbfa10002,
 	0x8c7bff7b, 0xffff0000,
@@ -3677,120 +3674,156 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x806c846c, 0x826d806d,
 	0x8b6dff6d, 0x0000ffff,
 	0x8bfe7e7e, 0x8bea6a6a,
-	0xb978f804, 0xbe804a6c,
-	0x8b6dff6d, 0x0000ffff,
-	0xbefa0080, 0xb97a0151,
-	0xbeee007e, 0xbeef007f,
-	0xbefe0180, 0xbefe4d84,
-	0xbf8a0000, 0x8b7aff7f,
-	0x04000000, 0x847a857a,
-	0x8c6d7a6d, 0xbefa007e,
-	0x8b7bff7f, 0x0000ffff,
-	0xbefe00c1, 0xbeff00c1,
+	0x85788978, 0xb9783244,
+	0xbe804a6c, 0x8b6dff6d,
+	0x0000ffff, 0xbefa0080,
+	0xb97a0151, 0xbeee007e,
+	0xbeef007f, 0xbefe0180,
+	0xbefe4d84, 0xbf8a0000,
+	0x8b7aff7f, 0x04000000,
+	0x847a857a, 0x8c6d7a6d,
+	0xbefa007e, 0x8b7bff7f,
+	0x0000ffff, 0xbefe00c1,
+	0xbeff00c1, 0xee0a407a,
+	0x000c0000, 0x00000000,
+	0x7e000280, 0xbefe007a,
+	0xbeff007b, 0xb8fb0742,
+	0x847b997b, 0xb8fa3b05,
+	0x807a817a, 0xbf0d997b,
+	0xbfa20002, 0x847a897a,
+	0xbfa00001, 0x847a8a7a,
+	0xb8fb1e06, 0x847b8a7b,
+	0x807a7b7a, 0x8b7bff7f,
+	0x0000ffff, 0x807aff7a,
+	0x00000200, 0x807a7e7a,
+	0x827b807b, 0xd7610000,
+	0x00010870, 0xd7610000,
+	0x00010a71, 0xd7610000,
+	0x00010c72, 0xd7610000,
+	0x00010e73, 0xd7610000,
+	0x00011074, 0xd7610000,
+	0x00011275, 0xd7610000,
+	0x00011476, 0xd7610000,
+	0x00011677, 0xd7610000,
+	0x00011a79, 0xd7610000,
+	0x00011c7e, 0xd7610000,
+	0x00011e7f, 0xbefe00ff,
+	0x00003fff, 0xbeff0080,
 	0xee0a407a, 0x000c0000,
-	0x00000000, 0x7e000280,
-	0xbefe007a, 0xbeff007b,
-	0xb8fb0742, 0x847b997b,
-	0xb8fa3b05, 0x807a817a,
-	0xbf0d997b, 0xbfa20002,
-	0x847a897a, 0xbfa00001,
-	0x847a8a7a, 0xb8fb1e06,
-	0x847b8a7b, 0x807a7b7a,
-	0x8b7bff7f, 0x0000ffff,
-	0x807aff7a, 0x00000200,
-	0x807a7e7a, 0x827b807b,
-	0xd7610000, 0x00010870,
-	0xd7610000, 0x00010a71,
-	0xd7610000, 0x00010c72,
-	0xd7610000, 0x00010e73,
-	0xd7610000, 0x00011074,
-	0xd7610000, 0x00011275,
-	0xd7610000, 0x00011476,
-	0xd7610000, 0x00011677,
-	0xd7610000, 0x00011a79,
-	0xd7610000, 0x00011c7e,
-	0xd7610000, 0x00011e7f,
-	0xbefe00ff, 0x00003fff,
-	0xbeff0080, 0xee0a407a,
-	0x000c0000, 0x00004000,
-	0xd760007a, 0x00011d00,
-	0xd760007b, 0x00011f00,
-	0xbefe007a, 0xbeff007b,
-	0xbef4007e, 0x8b75ff7f,
-	0x0000ffff, 0x8c75ff75,
-	0x00040000, 0xbef60080,
-	0xbef700ff, 0x10807fac,
-	0xbef1007d, 0xbef00080,
-	0xb8f30742, 0x84739973,
-	0xbefe00c1, 0x857d9973,
-	0x8b7d817d, 0xbf06817d,
-	0xbfa20002, 0xbeff0080,
-	0xbfa00002, 0xbeff00c1,
-	0xbfa0000c, 0xbef600ff,
-	0x01000000, 0xc4068070,
-	0x008ce801, 0x00008000,
-	0xc4068070, 0x008ce802,
+	0x00004000, 0xd760007a,
+	0x00011d00, 0xd760007b,
+	0x00011f00, 0xbefe007a,
+	0xbeff007b, 0xbef4007e,
+	0x8b75ff7f, 0x0000ffff,
+	0x8c75ff75, 0x00040000,
+	0xbef60080, 0xbef700ff,
+	0x10807fac, 0xbef1007d,
+	0xbef00080, 0xb8f30742,
+	0x84739973, 0xbefe00c1,
+	0x857d9973, 0x8b7d817d,
+	0xbf06817d, 0xbfa20002,
+	0xbeff0080, 0xbfa00002,
+	0xbeff00c1, 0xbfa0000c,
+	0xbef600ff, 0x01000000,
+	0xc4068070, 0x008ce801,
+	0x00008000, 0xc4068070,
+	0x008ce802, 0x00010000,
+	0xc4068070, 0x008ce803,
+	0x00018000, 0xbfa0000b,
+	0xbef600ff, 0x01000000,
+	0xc4068070, 0x008ce801,
 	0x00010000, 0xc4068070,
-	0x008ce803, 0x00018000,
-	0xbfa0000b, 0xbef600ff,
-	0x01000000, 0xc4068070,
-	0x008ce801, 0x00010000,
+	0x008ce802, 0x00020000,
+	0xc4068070, 0x008ce803,
+	0x00030000, 0xb8f03b05,
+	0x80708170, 0xbf0d9973,
+	0xbfa20002, 0x84708970,
+	0xbfa00001, 0x84708a70,
+	0xb8fa1e06, 0x847a8a7a,
+	0x80707a70, 0x8070ff70,
+	0x00000200, 0xbef600ff,
+	0x01000000, 0x7e000280,
+	0x7e020280, 0x7e040280,
+	0xbefd0080, 0xbe804ec2,
+	0xbf94fffe, 0xb8faf804,
+	0x8b7a847a, 0x91788478,
+	0x8c787a78, 0xd7610002,
+	0x0000fa71, 0x807d817d,
+	0xd7610002, 0x0000fa6c,
+	0x807d817d, 0x917aff6d,
+	0x80000000, 0xd7610002,
+	0x0000fa7a, 0x807d817d,
+	0xd7610002, 0x0000fa6e,
+	0x807d817d, 0xd7610002,
+	0x0000fa6f, 0x807d817d,
+	0xd7610002, 0x0000fa78,
+	0x807d817d, 0xb8faf811,
+	0xd7610002, 0x0000fa7a,
+	0x807d817d, 0xd7610002,
+	0x0000fa7b, 0x807d817d,
+	0xb8f1f801, 0xd7610002,
+	0x0000fa71, 0x807d817d,
+	0xb8f1f814, 0xd7610002,
+	0x0000fa71, 0x807d817d,
+	0xb8f1f815, 0xd7610002,
+	0x0000fa71, 0x807d817d,
+	0xb8f1f812, 0xd7610002,
+	0x0000fa71, 0x807d817d,
+	0xb8f1f813, 0xd7610002,
+	0x0000fa71, 0x807d817d,
+	0xb8faf802, 0xd7610002,
+	0x0000fa7a, 0x807d817d,
+	0xbefa50c1, 0xbfc70000,
+	0xd7610002, 0x0000fa7a,
+	0x807d817d, 0xbefe00ff,
+	0x0000ffff, 0xbeff0080,
 	0xc4068070, 0x008ce802,
-	0x00020000, 0xc4068070,
-	0x008ce803, 0x00030000,
+	0x00000000, 0xbefe00c1,
 	0xb8f03b05, 0x80708170,
 	0xbf0d9973, 0xbfa20002,
 	0x84708970, 0xbfa00001,
 	0x84708a70, 0xb8fa1e06,
 	0x847a8a7a, 0x80707a70,
-	0x8070ff70, 0x00000200,
 	0xbef600ff, 0x01000000,
-	0x7e000280, 0x7e020280,
-	0x7e040280, 0xbefd0080,
-	0xbe804ec2, 0xbf94fffe,
-	0xd7610002, 0x0000fa71,
-	0x807d817d, 0xd7610002,
-	0x0000fa6c, 0x807d817d,
-	0x917aff6d, 0x80000000,
-	0xd7610002, 0x0000fa7a,
-	0x807d817d, 0xd7610002,
-	0x0000fa6e, 0x807d817d,
-	0xd7610002, 0x0000fa6f,
-	0x807d817d, 0xd7610002,
-	0x0000fa78, 0x807d817d,
-	0xb8faf811, 0xd7610002,
-	0x0000fa7a, 0x807d817d,
-	0xd7610002, 0x0000fa7b,
-	0x807d817d, 0xb8f1f801,
-	0xd7610002, 0x0000fa71,
-	0x807d817d, 0xb8f1f814,
-	0xd7610002, 0x0000fa71,
-	0x807d817d, 0xb8f1f815,
-	0xd7610002, 0x0000fa71,
-	0x807d817d, 0xb8f1f812,
-	0xd7610002, 0x0000fa71,
-	0x807d817d, 0xb8f1f813,
-	0xd7610002, 0x0000fa71,
-	0x807d817d, 0xb8faf802,
-	0xd7610002, 0x0000fa7a,
-	0x807d817d, 0xbefa50c1,
-	0xbfc70000, 0xd7610002,
-	0x0000fa7a, 0x807d817d,
-	0xbefe00ff, 0x0000ffff,
-	0xbeff0080, 0xc4068070,
+	0xbef90080, 0xbefd0080,
+	0xbf800000, 0xbe804100,
+	0xbe824102, 0xbe844104,
+	0xbe864106, 0xbe884108,
+	0xbe8a410a, 0xbe8c410c,
+	0xbe8e410e, 0xd7610002,
+	0x0000f200, 0x80798179,
+	0xd7610002, 0x0000f201,
+	0x80798179, 0xd7610002,
+	0x0000f202, 0x80798179,
+	0xd7610002, 0x0000f203,
+	0x80798179, 0xd7610002,
+	0x0000f204, 0x80798179,
+	0xd7610002, 0x0000f205,
+	0x80798179, 0xd7610002,
+	0x0000f206, 0x80798179,
+	0xd7610002, 0x0000f207,
+	0x80798179, 0xd7610002,
+	0x0000f208, 0x80798179,
+	0xd7610002, 0x0000f209,
+	0x80798179, 0xd7610002,
+	0x0000f20a, 0x80798179,
+	0xd7610002, 0x0000f20b,
+	0x80798179, 0xd7610002,
+	0x0000f20c, 0x80798179,
+	0xd7610002, 0x0000f20d,
+	0x80798179, 0xd7610002,
+	0x0000f20e, 0x80798179,
+	0xd7610002, 0x0000f20f,
+	0x80798179, 0xbf06a079,
+	0xbfa10007, 0xc4068070,
 	0x008ce802, 0x00000000,
-	0xbefe00c1, 0xb8f03b05,
-	0x80708170, 0xbf0d9973,
-	0xbfa20002, 0x84708970,
-	0xbfa00001, 0x84708a70,
-	0xb8fa1e06, 0x847a8a7a,
-	0x80707a70, 0xbef600ff,
-	0x01000000, 0xbef90080,
-	0xbefd0080, 0xbf800000,
+	0x8070ff70, 0x00000080,
+	0xbef90080, 0x7e040280,
+	0x807d907d, 0xbf0aff7d,
+	0x00000060, 0xbfa2ffbb,
 	0xbe804100, 0xbe824102,
 	0xbe844104, 0xbe864106,
 	0xbe884108, 0xbe8a410a,
-	0xbe8c410c, 0xbe8e410e,
 	0xd7610002, 0x0000f200,
 	0x80798179, 0xd7610002,
 	0x0000f201, 0x80798179,
@@ -3809,49 +3842,15 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0xd7610002, 0x0000f20a,
 	0x80798179, 0xd7610002,
 	0x0000f20b, 0x80798179,
-	0xd7610002, 0x0000f20c,
-	0x80798179, 0xd7610002,
-	0x0000f20d, 0x80798179,
-	0xd7610002, 0x0000f20e,
-	0x80798179, 0xd7610002,
-	0x0000f20f, 0x80798179,
-	0xbf06a079, 0xbfa10007,
 	0xc4068070, 0x008ce802,
-	0x00000000, 0x8070ff70,
-	0x00000080, 0xbef90080,
-	0x7e040280, 0x807d907d,
-	0xbf0aff7d, 0x00000060,
-	0xbfa2ffbb, 0xbe804100,
-	0xbe824102, 0xbe844104,
-	0xbe864106, 0xbe884108,
-	0xbe8a410a, 0xd7610002,
-	0x0000f200, 0x80798179,
-	0xd7610002, 0x0000f201,
-	0x80798179, 0xd7610002,
-	0x0000f202, 0x80798179,
-	0xd7610002, 0x0000f203,
-	0x80798179, 0xd7610002,
-	0x0000f204, 0x80798179,
-	0xd7610002, 0x0000f205,
-	0x80798179, 0xd7610002,
-	0x0000f206, 0x80798179,
-	0xd7610002, 0x0000f207,
-	0x80798179, 0xd7610002,
-	0x0000f208, 0x80798179,
-	0xd7610002, 0x0000f209,
-	0x80798179, 0xd7610002,
-	0x0000f20a, 0x80798179,
-	0xd7610002, 0x0000f20b,
-	0x80798179, 0xc4068070,
-	0x008ce802, 0x00000000,
-	0xbefe00c1, 0x857d9973,
-	0x8b7d817d, 0xbf06817d,
-	0xbfa20002, 0xbeff0080,
-	0xbfa00001, 0xbeff00c1,
-	0xb8fb4306, 0x8b7bc17b,
-	0xbfa10045, 0x8b7aff6d,
-	0x80000000, 0xbfa10042,
-	0x847b867b, 0x847b827b,
+	0x00000000, 0xbefe00c1,
+	0x857d9973, 0x8b7d817d,
+	0xbf06817d, 0xbfa20002,
+	0xbeff0080, 0xbfa00001,
+	0xbeff00c1, 0xb8fb4306,
+	0x8b7bc17b, 0xbfa10044,
+	0x8b7aff6d, 0x80000000,
+	0xbfa10041, 0x847b897b,
 	0xbef6007b, 0xb8f03b05,
 	0x80708170, 0xbf0d9973,
 	0xbfa20002, 0x84708970,
@@ -3928,189 +3927,191 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 	0x00000000, 0x807d817d,
 	0x8070ff70, 0x00000080,
 	0xbf0a7b7d, 0xbfa2fff7,
-	0xbfa0016b, 0xbef4007e,
+	0xbfa0016e, 0xbef4007e,
 	0x8b75ff7f, 0x0000ffff,
 	0x8c75ff75, 0x00040000,
 	0xbef60080, 0xbef700ff,
 	0x10807fac, 0xbef1007f,
 	0xb8f20742, 0x84729972,
 	0x8b6eff7f, 0x04000000,
-	0xbfa1003c, 0xbefe00c1,
+	0xbfa1003b, 0xbefe00c1,
 	0x857d9972, 0x8b7d817d,
 	0xbf06817d, 0xbfa20002,
 	0xbeff0080, 0xbfa00001,
 	0xbeff00c1, 0xb8ef4306,
-	0x8b6fc16f, 0xbfa10031,
-	0x846f866f, 0x846f826f,
-	0xbef6006f, 0xb8f83b05,
-	0x80788178, 0xbf0d9972,
-	0xbfa20002, 0x84788978,
-	0xbfa00001, 0x84788a78,
-	0xb8ee1e06, 0x846e8a6e,
-	0x80786e78, 0x8078ff78,
-	0x00000200, 0x8078ff78,
-	0x00000080, 0xbef600ff,
-	0x01000000, 0x857d9972,
-	0x8b7d817d, 0xbf06817d,
-	0xbefd0080, 0xbfa2000d,
-	0xc4050078, 0x0080e800,
-	0x00000000, 0xbf8a0000,
-	0xdac00000, 0x00000000,
-	0x807dff7d, 0x00000080,
-	0x8078ff78, 0x00000080,
-	0xbf0a6f7d, 0xbfa2fff4,
-	0xbfa0000c, 0xc4050078,
-	0x0080e800, 0x00000000,
-	0xbf8a0000, 0xdac00000,
-	0x00000000, 0x807dff7d,
-	0x00000100, 0x8078ff78,
-	0x00000100, 0xbf0a6f7d,
-	0xbfa2fff4, 0xbef80080,
-	0xbefe00c1, 0x857d9972,
-	0x8b7d817d, 0xbf06817d,
-	0xbfa20002, 0xbeff0080,
-	0xbfa00001, 0xbeff00c1,
-	0xb8ef3b05, 0x806f816f,
-	0x846f826f, 0x857d9972,
-	0x8b7d817d, 0xbf06817d,
-	0xbfa2002c, 0xbef600ff,
-	0x01000000, 0xbeee0078,
-	0x8078ff78, 0x00000200,
-	0xbefd0084, 0xbf0a6f7d,
-	0xbfa10061, 0xc4050078,
-	0x008ce800, 0x00000000,
-	0xc4050078, 0x008ce801,
-	0x00008000, 0xc4050078,
-	0x008ce802, 0x00010000,
-	0xc4050078, 0x008ce803,
-	0x00018000, 0xbf8a0000,
-	0x7e008500, 0x7e028501,
-	0x7e048502, 0x7e068503,
-	0x807d847d, 0x8078ff78,
-	0x00000200, 0xbf0a6f7d,
-	0xbfa2ffea, 0xc405006e,
-	0x008ce800, 0x00000000,
-	0xc405006e, 0x008ce801,
-	0x00008000, 0xc405006e,
-	0x008ce802, 0x00010000,
-	0xc405006e, 0x008ce803,
-	0x00018000, 0xbf8a0000,
-	0xbfa0003d, 0xbef600ff,
-	0x01000000, 0xbeee0078,
-	0x8078ff78, 0x00000400,
-	0xbefd0084, 0xbf0a6f7d,
-	0xbfa10016, 0xc4050078,
-	0x008ce800, 0x00000000,
-	0xc4050078, 0x008ce801,
-	0x00010000, 0xc4050078,
-	0x008ce802, 0x00020000,
-	0xc4050078, 0x008ce803,
-	0x00030000, 0xbf8a0000,
-	0x7e008500, 0x7e028501,
-	0x7e048502, 0x7e068503,
-	0x807d847d, 0x8078ff78,
-	0x00000400, 0xbf0a6f7d,
-	0xbfa2ffea, 0xb8ef1e06,
-	0x8b6fc16f, 0xbfa1000f,
-	0x846f836f, 0x806f7d6f,
-	0xbefe00c1, 0xbeff0080,
-	0xc4050078, 0x008ce800,
-	0x00000000, 0xbf8a0000,
-	0x7e008500, 0x807d817d,
-	0x8078ff78, 0x00000080,
-	0xbf0a6f7d, 0xbfa2fff6,
-	0xbeff00c1, 0xc405006e,
-	0x008ce800, 0x00000000,
-	0xc405006e, 0x008ce801,
-	0x00010000, 0xc405006e,
-	0x008ce802, 0x00020000,
-	0xc405006e, 0x008ce803,
-	0x00030000, 0xbf8a0000,
+	0x8b6fc16f, 0xbfa10030,
+	0x846f896f, 0xbef6006f,
 	0xb8f83b05, 0x80788178,
 	0xbf0d9972, 0xbfa20002,
 	0x84788978, 0xbfa00001,
 	0x84788a78, 0xb8ee1e06,
 	0x846e8a6e, 0x80786e78,
 	0x8078ff78, 0x00000200,
-	0x80f8ff78, 0x00000050,
+	0x8078ff78, 0x00000080,
 	0xbef600ff, 0x01000000,
-	0xbefd00ff, 0x0000006c,
-	0x80f89078, 0xf462403a,
-	0xf0000000, 0xbf8a0000,
-	0x80fd847d, 0xbf800000,
-	0xbe804300, 0xbe824302,
-	0x80f8a078, 0xf462603a,
-	0xf0000000, 0xbf8a0000,
-	0x80fd887d, 0xbf800000,
-	0xbe804300, 0xbe824302,
-	0xbe844304, 0xbe864306,
-	0x80f8c078, 0xf462803a,
-	0xf0000000, 0xbf8a0000,
-	0x80fd907d, 0xbf800000,
-	0xbe804300, 0xbe824302,
-	0xbe844304, 0xbe864306,
-	0xbe884308, 0xbe8a430a,
-	0xbe8c430c, 0xbe8e430e,
-	0xbf06807d, 0xbfa1fff0,
-	0xb980f801, 0x00000000,
-	0xb8f83b05, 0x80788178,
-	0xbf0d9972, 0xbfa20002,
-	0x84788978, 0xbfa00001,
-	0x84788a78, 0xb8ee1e06,
-	0x846e8a6e, 0x80786e78,
+	0x857d9972, 0x8b7d817d,
+	0xbf06817d, 0xbefd0080,
+	0xbfa2000d, 0xc4050078,
+	0x0080e800, 0x00000000,
+	0xbf8a0000, 0xdac00000,
+	0x00000000, 0x807dff7d,
+	0x00000080, 0x8078ff78,
+	0x00000080, 0xbf0a6f7d,
+	0xbfa2fff4, 0xbfa0000c,
+	0xc4050078, 0x0080e800,
+	0x00000000, 0xbf8a0000,
+	0xdac00000, 0x00000000,
+	0x807dff7d, 0x00000100,
+	0x8078ff78, 0x00000100,
+	0xbf0a6f7d, 0xbfa2fff4,
+	0xbef80080, 0xbefe00c1,
+	0x857d9972, 0x8b7d817d,
+	0xbf06817d, 0xbfa20002,
+	0xbeff0080, 0xbfa00001,
+	0xbeff00c1, 0xb8ef3b05,
+	0x806f816f, 0x846f826f,
+	0x857d9972, 0x8b7d817d,
+	0xbf06817d, 0xbfa2002c,
+	0xbef600ff, 0x01000000,
+	0xbeee0078, 0x8078ff78,
+	0x00000200, 0xbefd0084,
+	0xbf0a6f7d, 0xbfa10061,
+	0xc4050078, 0x008ce800,
+	0x00000000, 0xc4050078,
+	0x008ce801, 0x00008000,
+	0xc4050078, 0x008ce802,
+	0x00010000, 0xc4050078,
+	0x008ce803, 0x00018000,
+	0xbf8a0000, 0x7e008500,
+	0x7e028501, 0x7e048502,
+	0x7e068503, 0x807d847d,
 	0x8078ff78, 0x00000200,
+	0xbf0a6f7d, 0xbfa2ffea,
+	0xc405006e, 0x008ce800,
+	0x00000000, 0xc405006e,
+	0x008ce801, 0x00008000,
+	0xc405006e, 0x008ce802,
+	0x00010000, 0xc405006e,
+	0x008ce803, 0x00018000,
+	0xbf8a0000, 0xbfa0003d,
 	0xbef600ff, 0x01000000,
-	0xbeff0071, 0xf4621bfa,
+	0xbeee0078, 0x8078ff78,
+	0x00000400, 0xbefd0084,
+	0xbf0a6f7d, 0xbfa10016,
+	0xc4050078, 0x008ce800,
+	0x00000000, 0xc4050078,
+	0x008ce801, 0x00010000,
+	0xc4050078, 0x008ce802,
+	0x00020000, 0xc4050078,
+	0x008ce803, 0x00030000,
+	0xbf8a0000, 0x7e008500,
+	0x7e028501, 0x7e048502,
+	0x7e068503, 0x807d847d,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7d, 0xbfa2ffea,
+	0xb8ef1e06, 0x8b6fc16f,
+	0xbfa1000f, 0x846f836f,
+	0x806f7d6f, 0xbefe00c1,
+	0xbeff0080, 0xc4050078,
+	0x008ce800, 0x00000000,
+	0xbf8a0000, 0x7e008500,
+	0x807d817d, 0x8078ff78,
+	0x00000080, 0xbf0a6f7d,
+	0xbfa2fff6, 0xbeff00c1,
+	0xc405006e, 0x008ce800,
+	0x00000000, 0xc405006e,
+	0x008ce801, 0x00010000,
+	0xc405006e, 0x008ce802,
+	0x00020000, 0xc405006e,
+	0x008ce803, 0x00030000,
+	0xbf8a0000, 0xb8f83b05,
+	0x80788178, 0xbf0d9972,
+	0xbfa20002, 0x84788978,
+	0xbfa00001, 0x84788a78,
+	0xb8ee1e06, 0x846e8a6e,
+	0x80786e78, 0x8078ff78,
+	0x00000200, 0x80f8ff78,
+	0x00000050, 0xbef600ff,
+	0x01000000, 0xbefd00ff,
+	0x0000006c, 0x80f89078,
+	0xf462403a, 0xf0000000,
+	0xbf8a0000, 0x80fd847d,
+	0xbf800000, 0xbe804300,
+	0xbe824302, 0x80f8a078,
+	0xf462603a, 0xf0000000,
+	0xbf8a0000, 0x80fd887d,
+	0xbf800000, 0xbe804300,
+	0xbe824302, 0xbe844304,
+	0xbe864306, 0x80f8c078,
+	0xf462803a, 0xf0000000,
+	0xbf8a0000, 0x80fd907d,
+	0xbf800000, 0xbe804300,
+	0xbe824302, 0xbe844304,
+	0xbe864306, 0xbe884308,
+	0xbe8a430a, 0xbe8c430c,
+	0xbe8e430e, 0xbf06807d,
+	0xbfa1fff0, 0xb980f801,
+	0x00000000, 0xb8f83b05,
+	0x80788178, 0xbf0d9972,
+	0xbfa20002, 0x84788978,
+	0xbfa00001, 0x84788a78,
+	0xb8ee1e06, 0x846e8a6e,
+	0x80786e78, 0x8078ff78,
+	0x00000200, 0xbef600ff,
+	0x01000000, 0xbeff0071,
+	0xf4621bfa, 0xf0000000,
+	0x80788478, 0xf4621b3a,
 	0xf0000000, 0x80788478,
-	0xf4621b3a, 0xf0000000,
-	0x80788478, 0xf4621b7a,
+	0xf4621b7a, 0xf0000000,
+	0x80788478, 0xf4621c3a,
 	0xf0000000, 0x80788478,
-	0xf4621c3a, 0xf0000000,
-	0x80788478, 0xf4621c7a,
+	0xf4621c7a, 0xf0000000,
+	0x80788478, 0xf4621eba,
 	0xf0000000, 0x80788478,
-	0xf4621eba, 0xf0000000,
-	0x80788478, 0xf4621efa,
+	0xf4621efa, 0xf0000000,
+	0x80788478, 0xf4621e7a,
 	0xf0000000, 0x80788478,
-	0xf4621e7a, 0xf0000000,
-	0x80788478, 0xf4621cfa,
+	0xf4621cfa, 0xf0000000,
+	0x80788478, 0xf4621bba,
 	0xf0000000, 0x80788478,
+	0xbf8a0000, 0xb96ef814,
 	0xf4621bba, 0xf0000000,
 	0x80788478, 0xbf8a0000,
-	0xb96ef814, 0xf4621bba,
+	0xb96ef815, 0xf4621bba,
 	0xf0000000, 0x80788478,
-	0xbf8a0000, 0xb96ef815,
+	0xbf8a0000, 0xb96ef812,
 	0xf4621bba, 0xf0000000,
 	0x80788478, 0xbf8a0000,
-	0xb96ef812, 0xf4621bba,
+	0xb96ef813, 0x8b6eff7f,
+	0x04000000, 0xbfa1000d,
+	0x80788478, 0xf4621bba,
 	0xf0000000, 0x80788478,
-	0xbf8a0000, 0xb96ef813,
-	0x8b6eff7f, 0x04000000,
-	0xbfa1000d, 0x80788478,
-	0xf4621bba, 0xf0000000,
-	0x80788478, 0xbf8a0000,
-	0xbf0d806e, 0xbfa10006,
-	0x856e906e, 0x8b6e6e6e,
-	0xbfa10003, 0xbe804ec1,
-	0x816ec16e, 0xbfa0fffb,
+	0xbf8a0000, 0xbf0d806e,
+	0xbfa10006, 0x856e906e,
+	0x8b6e6e6e, 0xbfa10003,
+	0xbe804ec1, 0x816ec16e,
+	0xbfa0fffb, 0xbefd006f,
+	0xbefe0070, 0xbeff0071,
+	0xb97b2011, 0x857b867b,
+	0xb97b0191, 0x857b827b,
+	0xb97bba11, 0xb973f801,
+	0xb8ee3b05, 0x806e816e,
+	0xbf0d9972, 0xbfa20002,
+	0x846e896e, 0xbfa00001,
+	0x846e8a6e, 0xb8ef1e06,
+	0x846f8a6f, 0x806e6f6e,
+	0x806eff6e, 0x00000200,
+	0x806e746e, 0x826f8075,
+	0x8b6fff6f, 0x0000ffff,
+	0xf4605c37, 0xf8000050,
+	0xf4605d37, 0xf8000060,
+	0xf4601e77, 0xf8000074,
+	0xbf8a0000, 0x8b6dff6d,
+	0x0000ffff, 0x8bfe7e7e,
+	0x8bea6a6a, 0xb97af804,
 	0xbe804ec2, 0xbf94fffe,
-	0xbefd006f, 0xbefe0070,
-	0xbeff0071, 0xb97bf811,
-	0xb973f801, 0xb8ee3b05,
-	0x806e816e, 0xbf0d9972,
-	0xbfa20002, 0x846e896e,
-	0xbfa00001, 0x846e8a6e,
-	0xb8ef1e06, 0x846f8a6f,
-	0x806e6f6e, 0x806eff6e,
-	0x00000200, 0x806e746e,
-	0x826f8075, 0x8b6fff6f,
-	0x0000ffff, 0xf4605c37,
-	0xf8000050, 0xf4605d37,
-	0xf8000060, 0xf4601e77,
-	0xf8000074, 0xbf8a0000,
-	0x8b6dff6d, 0x0000ffff,
-	0x8bfe7e7e, 0x8bea6a6a,
-	0xb97af804, 0xbe804a6c,
-	0xbfb10000, 0xbf9f0000,
+	0xbe804a6c, 0xbfb10000,
 	0xbf9f0000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
+	0xbf9f0000, 0x00000000,
 };
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index 77ae25b6753c..18e012e04493 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -75,17 +75,22 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK			= 0x20000
 var SQ_WAVE_STATUS_TRAP_EN_SHIFT		= 6
 var SQ_WAVE_IB_STS2_WAVE64_SHIFT		= 11
 var SQ_WAVE_IB_STS2_WAVE64_SIZE			= 1
+var SQ_WAVE_LDS_ALLOC_GRANULARITY		= 8
 var S_STATUS_HWREG				= HW_REG_STATUS
 var S_STATUS_ALWAYS_CLEAR_MASK			= SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK
 var S_STATUS_HALT_MASK				= SQ_WAVE_STATUS_HALT_MASK
 var S_SAVE_PC_HI_TRAP_ID_MASK			= 0x00FF0000
 var S_SAVE_PC_HI_HT_MASK			= 0x01000000
 #else
+var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK	= 0x4
+var SQ_WAVE_STATE_PRIV_SCC_SHIFT		= 9
 var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK		= 0xC00
 var SQ_WAVE_STATE_PRIV_HALT_MASK		= 0x4000
 var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK		= 0x8000
+var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT		= 15
 var SQ_WAVE_STATUS_WAVE64_SHIFT			= 29
 var SQ_WAVE_STATUS_WAVE64_SIZE			= 1
+var SQ_WAVE_LDS_ALLOC_GRANULARITY		= 9
 var S_STATUS_HWREG				= HW_REG_WAVE_STATE_PRIV
 var S_STATUS_ALWAYS_CLEAR_MASK			= SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
 var S_STATUS_HALT_MASK				= SQ_WAVE_STATE_PRIV_HALT_MASK
@@ -149,8 +154,10 @@ var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK	= 0x10
 var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT	= 5
 var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK	= 0x20
 var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK	= 0x40
+var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT	= 6
 var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK	= 0x80
 var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK	= 0x100
+var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT	= 8
 var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK	= 0x200
 var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK	= 0x800
 var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK		= 0x80
@@ -430,7 +437,16 @@ L_EXIT_TRAP:
 	// Restore SQ_WAVE_STATUS.
 	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
 	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
+
+#if ASIC_FAMILY < CHIP_GFX12
 	s_setreg_b32	hwreg(S_STATUS_HWREG), s_save_status
+#else
+	// STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
+	// Only restore fields which the trap handler changes.
+	s_lshr_b32	s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_SCC_SHIFT
+	s_setreg_b32	hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
+		SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status
+#endif
 
 	s_rfe_b64	[ttmp0, ttmp1]
 
@@ -622,8 +638,15 @@ L_SAVE_HWREG:
 
 #if ASIC_FAMILY >= CHIP_GFX12
 	// Ensure no further changes to barrier or LDS state.
+	// STATE_PRIV.BARRIER_COMPLETE may change up to this point.
 	s_barrier_signal	-2
 	s_barrier_wait	-2
+
+	// Re-read final state of BARRIER_COMPLETE field for save.
+	s_getreg_b32	s_save_tmp, hwreg(S_STATUS_HWREG)
+	s_and_b32	s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
+	s_andn2_b32	s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
+	s_or_b32	s_save_status, s_save_status, s_save_tmp
 #endif
 
 	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
@@ -764,8 +787,7 @@ L_SAVE_LDS_NORMAL:
 
 	// first wave do LDS save;
 
-	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 6			//LDS size in dwords = lds_size * 64dw
-	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//LDS size in bytes
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
 	s_mov_b32	s_save_buf_rsrc2, s_save_alloc_size			//NUM_RECORDS in bytes
 
 	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
@@ -1050,8 +1072,7 @@ L_RESTORE_LDS_NORMAL:
 	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
 	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
 	s_cbranch_scc0	L_RESTORE_VGPR						//no lds used? jump to L_RESTORE_VGPR
-	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 6		//LDS size in dwords = lds_size * 64dw
-	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//LDS size in bytes
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
 	s_mov_b32	s_restore_buf_rsrc2, s_restore_alloc_size		//NUM_RECORDS in bytes
 
 	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
@@ -1338,9 +1359,6 @@ L_BARRIER_RESTORE_LOOP:
 	s_branch	L_BARRIER_RESTORE_LOOP
 
 L_SKIP_BARRIER_RESTORE:
-	// Make barrier and LDS state visible to all waves in the group.
-	s_barrier_signal	-2
-	s_barrier_wait	-2
 #endif
 
 	s_mov_b32	m0, s_restore_m0
@@ -1351,7 +1369,17 @@ L_SKIP_BARRIER_RESTORE:
 	s_setreg_b32	hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
 #endif
 
+#if ASIC_FAMILY < CHIP_GFX12
 	s_setreg_b32	hwreg(S_TRAPSTS_HWREG), s_restore_trapsts
+#else
+	// EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed.
+	// Only restore the other fields to avoid clobbering them.
+	s_setreg_b32	hwreg(S_TRAPSTS_HWREG, 0, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT), s_restore_trapsts
+	s_lshr_b32	s_restore_trapsts, s_restore_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
+	s_setreg_b32	hwreg(S_TRAPSTS_HWREG, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT, 1), s_restore_trapsts
+	s_lshr_b32	s_restore_trapsts, s_restore_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
+	s_setreg_b32	hwreg(S_TRAPSTS_HWREG, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT, 32 - SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT), s_restore_trapsts
+#endif
 	s_setreg_b32	hwreg(HW_REG_MODE), s_restore_mode
 
 	// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
@@ -1389,6 +1417,14 @@ L_RETURN_WITHOUT_PRIV:
 #endif
 
 	s_setreg_b32	hwreg(S_STATUS_HWREG), s_restore_status			// SCC is included, which is changed by previous salu
+
+#if ASIC_FAMILY >= CHIP_GFX12
+	// Make barrier and LDS state visible to all waves in the group.
+	// STATE_PRIV.BARRIER_COMPLETE may change after this point.
+	s_barrier_signal	-2
+	s_barrier_wait	-2
+#endif
+
 	s_rfe_b64	s_restore_pc_lo						//Return to the main shader program and resume execution
 
 L_END_PGM:
@@ -1501,11 +1537,6 @@ function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
 end
 #endif
 
-function get_lds_size_bytes(s_lds_size_byte)
-	s_getreg_b32	s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
-	s_lshl_b32	s_lds_size_byte, s_lds_size_byte, 8			//LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
-end
-
 function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
 	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
 	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source
  2024-05-23 14:08 [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source Jay Cornwall
  2024-05-23 14:08 ` [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions Jay Cornwall
  2024-05-23 14:08 ` [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes Jay Cornwall
@ 2024-05-23 18:27 ` Alex Deucher
  2024-05-23 18:41 ` Lancelot SIX
  3 siblings, 0 replies; 9+ messages in thread
From: Alex Deucher @ 2024-05-23 18:27 UTC (permalink / raw)
  To: Jay Cornwall; +Cc: amd-gfx, Lancelot Six

Series is:
Acked-by: Alex Deucher <alexander.deucher@amd.com>

On Thu, May 23, 2024 at 10:27 AM Jay Cornwall <jay.cornwall@amd.com> wrote:
>
> Source and binary have become mismatched during branch activity.
>
> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
> Cc: Lancelot Six <lancelot.six@amd.com>
> ---
>  .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 57 ++++++++-----------
>  1 file changed, 24 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> index 73d3772cdb76..11d076eb770c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> @@ -718,12 +718,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>         0xf4051ebd, 0xfa000008,
>         0xbf8cc07f, 0x87ee6e6e,
>         0xbf840001, 0xbe80206e,
> -       0x876eff6d, 0x01ff0000,
> -       0xbf850005, 0x8878ff78,
> -       0x00002000, 0x80ec886c,
> -       0x82ed806d, 0xbf820005,
> -       0x876eff6d, 0x01000000,
> -       0xbf850002, 0x806c846c,
> +       0x876eff6d, 0x00ff0000,
> +       0xbf850008, 0x876eff6d,
> +       0x01000000, 0xbf850007,
> +       0x8878ff78, 0x00002000,
> +       0x80ec886c, 0x82ed806d,
> +       0xbf820002, 0x806c846c,
>         0x826d806d, 0x876dff6d,
>         0x0000ffff, 0x907a8977,
>         0x877bff7a, 0x003f8000,
> @@ -1136,7 +1136,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>         0xe0704000, 0x705d0000,
>         0x807c817c, 0x8070ff70,
>         0x00000080, 0xbf0a7b7c,
> -       0xbf85fff8, 0xbf820144,
> +       0xbf85fff8, 0xbf82013e,
>         0xbef4037e, 0x8775ff7f,
>         0x0000ffff, 0x8875ff75,
>         0x00040000, 0xbef60380,
> @@ -1276,10 +1276,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>         0x80788478, 0xbf8cc07f,
>         0xb9eef815, 0xbefc036f,
>         0xbefe0370, 0xbeff0371,
> -       0x876f7bff, 0x000003ff,
> -       0xb9ef4803, 0xb9f9f816,
> -       0x876f7bff, 0xfffff800,
> -       0x906f8b6f, 0xb9efa2c3,
> +       0xb9f9f816, 0xb9fbf803,
>         0xb9f3f801, 0xb96e3a05,
>         0x806e816e, 0xbf0d9972,
>         0xbf850002, 0x8f6e896e,
> @@ -2309,12 +2306,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>         0xf4051ebd, 0xfa000008,
>         0xbf8cc07f, 0x87ee6e6e,
>         0xbf840001, 0xbe80206e,
> -       0x876eff6d, 0x01ff0000,
> -       0xbf850005, 0x8878ff78,
> -       0x00002000, 0x80ec886c,
> -       0x82ed806d, 0xbf820005,
> -       0x876eff6d, 0x01000000,
> -       0xbf850002, 0x806c846c,
> +       0x876eff6d, 0x00ff0000,
> +       0xbf850008, 0x876eff6d,
> +       0x01000000, 0xbf850007,
> +       0x8878ff78, 0x00002000,
> +       0x80ec886c, 0x82ed806d,
> +       0xbf820002, 0x806c846c,
>         0x826d806d, 0x876dff6d,
>         0x0000ffff, 0x87fe7e7e,
>         0x87ea6a6a, 0xb9f8f802,
> @@ -2549,7 +2546,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>         0x705d0000, 0x807c817c,
>         0x8070ff70, 0x00000080,
>         0xbf0a7b7c, 0xbf85fff8,
> -       0xbf82013b, 0xbef4037e,
> +       0xbf820135, 0xbef4037e,
>         0x8775ff7f, 0x0000ffff,
>         0x8875ff75, 0x00040000,
>         0xbef60380, 0xbef703ff,
> @@ -2688,10 +2685,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>         0xf0000000, 0x80788478,
>         0xbf8cc07f, 0xb9eef815,
>         0xbefc036f, 0xbefe0370,
> -       0xbeff0371, 0x876f7bff,
> -       0x000003ff, 0xb9ef4803,
> -       0x876f7bff, 0xfffff800,
> -       0x906f8b6f, 0xb9efa2c3,
> +       0xbeff0371, 0xb9fbf803,
>         0xb9f3f801, 0xb96e3a05,
>         0x806e816e, 0xbf0d9972,
>         0xbf850002, 0x8f6e896e,
> @@ -2749,11 +2743,11 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>         0xf8000008, 0xbf89fc07,
>         0x8bee6e6e, 0xbfa10001,
>         0xbe80486e, 0x8b6eff6d,
> -       0x01ff0000, 0xbfa20005,
> -       0x8c78ff78, 0x00002000,
> -       0x80ec886c, 0x82ed806d,
> -       0xbfa00005, 0x8b6eff6d,
> -       0x01000000, 0xbfa20002,
> +       0x00ff0000, 0xbfa20008,
> +       0x8b6eff6d, 0x01000000,
> +       0xbfa20007, 0x8c78ff78,
> +       0x00002000, 0x80ec886c,
> +       0x82ed806d, 0xbfa00002,
>         0x806c846c, 0x826d806d,
>         0x8b6dff6d, 0x0000ffff,
>         0x8bfe7e7e, 0x8bea6a6a,
> @@ -2988,7 +2982,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>         0x701d0000, 0x807d817d,
>         0x8070ff70, 0x00000080,
>         0xbf0a7b7d, 0xbfa2fff8,
> -       0xbfa00146, 0xbef4007e,
> +       0xbfa00140, 0xbef4007e,
>         0x8b75ff7f, 0x0000ffff,
>         0x8c75ff75, 0x00040000,
>         0xbef60080, 0xbef700ff,
> @@ -3130,10 +3124,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>         0xf0000000, 0x80788478,
>         0xbf89fc07, 0xb96ef815,
>         0xbefd006f, 0xbefe0070,
> -       0xbeff0071, 0x8b6f7bff,
> -       0x000003ff, 0xb96f4803,
> -       0x8b6f7bff, 0xfffff800,
> -       0x856f8b6f, 0xb96fa2c3,
> +       0xbeff0071, 0xb97bf803,
>         0xb973f801, 0xb8ee3b05,
>         0x806e816e, 0xbf0d9972,
>         0xbfa20002, 0x846e896e,
> @@ -4119,7 +4110,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>         0x8b6dff6d, 0x0000ffff,
>         0x8bfe7e7e, 0x8bea6a6a,
>         0xb97af804, 0xbe804a6c,
> -       0xbfb00000, 0xbf9f0000,
> +       0xbfb10000, 0xbf9f0000,
>         0xbf9f0000, 0xbf9f0000,
>         0xbf9f0000, 0xbf9f0000,
>  };
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
  2024-05-23 14:08 ` [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes Jay Cornwall
@ 2024-05-23 18:37   ` Lancelot SIX
  2024-05-23 19:31     ` Jay Cornwall
  0 siblings, 1 reply; 9+ messages in thread
From: Lancelot SIX @ 2024-05-23 18:37 UTC (permalink / raw)
  To: Jay Cornwall, amd-gfx

Hi Jay,

I have added a couple of (minor) comments below.

On 23/05/2024 15:08, Jay Cornwall wrote:
> Fix LDS size interpretation: 512 bytes (>= gfx12) vs 256 (< gfx12).
> 
> Ensure STATE_PRIV.BARRIER_COMPLETE cannot change after reading or
> before writing. Other waves in the threadgroup may cause this field
> to assert if they complete the barrier.
> 
> Do not overwrite EXCP_FLAG_PRIV.{SAVE_CONTEXT,HOST_TRAP} when
> restoring this register. Both of these fields can assert while the
> wavefront is running the trap handler.
> 
> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
> Cc: Lancelot Six <lancelot.six@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 1191 +++++++++--------
>   .../amd/amdkfd/cwsr_trap_handler_gfx10.asm    |   55 +-
>   2 files changed, 639 insertions(+), 607 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
> index 77ae25b6753c..18e012e04493 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
> +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
> @@ -75,17 +75,22 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK			= 0x20000
>   var SQ_WAVE_STATUS_TRAP_EN_SHIFT		= 6
>   var SQ_WAVE_IB_STS2_WAVE64_SHIFT		= 11
>   var SQ_WAVE_IB_STS2_WAVE64_SIZE			= 1
> +var SQ_WAVE_LDS_ALLOC_GRANULARITY		= 8
>   var S_STATUS_HWREG				= HW_REG_STATUS
>   var S_STATUS_ALWAYS_CLEAR_MASK			= SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK
>   var S_STATUS_HALT_MASK				= SQ_WAVE_STATUS_HALT_MASK
>   var S_SAVE_PC_HI_TRAP_ID_MASK			= 0x00FF0000
>   var S_SAVE_PC_HI_HT_MASK			= 0x01000000
>   #else
> +var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK	= 0x4
> +var SQ_WAVE_STATE_PRIV_SCC_SHIFT		= 9
>   var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK		= 0xC00
>   var SQ_WAVE_STATE_PRIV_HALT_MASK		= 0x4000
>   var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK		= 0x8000
> +var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT		= 15
>   var SQ_WAVE_STATUS_WAVE64_SHIFT			= 29
>   var SQ_WAVE_STATUS_WAVE64_SIZE			= 1
> +var SQ_WAVE_LDS_ALLOC_GRANULARITY		= 9
>   var S_STATUS_HWREG				= HW_REG_WAVE_STATE_PRIV
>   var S_STATUS_ALWAYS_CLEAR_MASK			= SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
>   var S_STATUS_HALT_MASK				= SQ_WAVE_STATE_PRIV_HALT_MASK
> @@ -149,8 +154,10 @@ var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK	= 0x10
>   var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT	= 5
>   var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK	= 0x20
>   var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK	= 0x40
> +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT	= 6
>   var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK	= 0x80
>   var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK	= 0x100
> +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT	= 8
>   var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK	= 0x200
>   var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK	= 0x800
>   var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK		= 0x80
> @@ -430,7 +437,16 @@ L_EXIT_TRAP:
>   	// Restore SQ_WAVE_STATUS.
>   	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
>   	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
> +
> +#if ASIC_FAMILY < CHIP_GFX12
>   	s_setreg_b32	hwreg(S_STATUS_HWREG), s_save_status
> +#else
> +	// STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
> +	// Only restore fields which the trap handler changes.
> +	s_lshr_b32	s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_SCC_SHIFT
> +	s_setreg_b32	hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
> +		SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status
> +#endif
>   
>   	s_rfe_b64	[ttmp0, ttmp1]
>   
> @@ -622,8 +638,15 @@ L_SAVE_HWREG:
>   
>   #if ASIC_FAMILY >= CHIP_GFX12
>   	// Ensure no further changes to barrier or LDS state.
> +	// STATE_PRIV.BARRIER_COMPLETE may change up to this point.
>   	s_barrier_signal	-2
>   	s_barrier_wait	-2
> +
> +	// Re-read final state of BARRIER_COMPLETE field for save.
> +	s_getreg_b32	s_save_tmp, hwreg(S_STATUS_HWREG)
> +	s_and_b32	s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
> +	s_andn2_b32	s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK

Even if BARRIER_COMPLETE can be asserted while we are in the trap 
handler, I do not think it can be cleared.  That being said, it might be 
easier to just replace the bit, making it clearer.

> +	s_or_b32	s_save_status, s_save_status, s_save_tmp
>   #endif
>   
>   	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
> @@ -764,8 +787,7 @@ L_SAVE_LDS_NORMAL:
>   
>   	// first wave do LDS save;
>   
> -	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 6			//LDS size in dwords = lds_size * 64dw
> -	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//LDS size in bytes
> +	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
>   	s_mov_b32	s_save_buf_rsrc2, s_save_alloc_size			//NUM_RECORDS in bytes
>   
>   	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
> @@ -1050,8 +1072,7 @@ L_RESTORE_LDS_NORMAL:
>   	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
>   	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
>   	s_cbranch_scc0	L_RESTORE_VGPR						//no lds used? jump to L_RESTORE_VGPR
> -	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 6		//LDS size in dwords = lds_size * 64dw
> -	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//LDS size in bytes
> +	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, SQ_WAVE_LDS_ALLOC_GRANULARITY
>   	s_mov_b32	s_restore_buf_rsrc2, s_restore_alloc_size		//NUM_RECORDS in bytes
>   
>   	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
> @@ -1338,9 +1359,6 @@ L_BARRIER_RESTORE_LOOP:
>   	s_branch	L_BARRIER_RESTORE_LOOP
>   
>   L_SKIP_BARRIER_RESTORE:
> -	// Make barrier and LDS state visible to all waves in the group.
> -	s_barrier_signal	-2
> -	s_barrier_wait	-2
>   #endif
>   
>   	s_mov_b32	m0, s_restore_m0
> @@ -1351,7 +1369,17 @@ L_SKIP_BARRIER_RESTORE:
>   	s_setreg_b32	hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
>   #endif
>   
> +#if ASIC_FAMILY < CHIP_GFX12
>   	s_setreg_b32	hwreg(S_TRAPSTS_HWREG), s_restore_trapsts

Wouldn't other gfx1x architectures have a similar issue when writing 
TRAPSTS here?  That is, if TRAPSTS.SAVECTX is set while we are restoring, 
wouldn't we lose it?

And for gfx11, there is TRAPSTS.HOST_TRAP that could have the same issue 
to some degree (not sure if we would lose the host trap completely, or 
re-enter with trap ID + HT bit set in ttmp1).

That is not a regression, nor something this patch claims to address, so 
maybe it can be a separate patch.

Best,
Lancelot.

> +#else
> +	// EXCP_FLAG_PRIV.SAVE_CONTEXT and HOST_TRAP may have changed.
> +	// Only restore the other fields to avoid clobbering them.
> +	s_setreg_b32	hwreg(S_TRAPSTS_HWREG, 0, SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT), s_restore_trapsts
> +	s_lshr_b32	s_restore_trapsts, s_restore_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
> +	s_setreg_b32	hwreg(S_TRAPSTS_HWREG, SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT, 1), s_restore_trapsts
> +	s_lshr_b32	s_restore_trapsts, s_restore_trapsts, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT - SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT
> +	s_setreg_b32	hwreg(S_TRAPSTS_HWREG, SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT, 32 - SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT), s_restore_trapsts
> +#endif
>   	s_setreg_b32	hwreg(HW_REG_MODE), s_restore_mode
>   
>   	// Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
> @@ -1389,6 +1417,14 @@ L_RETURN_WITHOUT_PRIV:
>   #endif
>   
>   	s_setreg_b32	hwreg(S_STATUS_HWREG), s_restore_status			// SCC is included, which is changed by previous salu
> +
> +#if ASIC_FAMILY >= CHIP_GFX12
> +	// Make barrier and LDS state visible to all waves in the group.
> +	// STATE_PRIV.BARRIER_COMPLETE may change after this point.
> +	s_barrier_signal	-2
> +	s_barrier_wait	-2
> +#endif
> +
>   	s_rfe_b64	s_restore_pc_lo						//Return to the main shader program and resume execution
>   
>   L_END_PGM:
> @@ -1501,11 +1537,6 @@ function write_vgprs_to_mem_with_sqc_w64(vgpr0, n_vgprs, s_rsrc, s_mem_offset)
>   end
>   #endif
>   
> -function get_lds_size_bytes(s_lds_size_byte)
> -	s_getreg_b32	s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
> -	s_lshl_b32	s_lds_size_byte, s_lds_size_byte, 8			//LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
> -end
> -
>   function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
>   	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
>   	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source
  2024-05-23 14:08 [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source Jay Cornwall
                   ` (2 preceding siblings ...)
  2024-05-23 18:27 ` [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source Alex Deucher
@ 2024-05-23 18:41 ` Lancelot SIX
  3 siblings, 0 replies; 9+ messages in thread
From: Lancelot SIX @ 2024-05-23 18:41 UTC (permalink / raw)
  To: Jay Cornwall, amd-gfx



On 23/05/2024 15:08, Jay Cornwall wrote:
> Source and binary have become mismatched during branch activity.
> 
> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
> Cc: Lancelot Six <lancelot.six@amd.com>

Thanks for doing this.

This matches what I have when rebuilding the trap handlers.

Reviewed-by: Lancelot Six <lancelot.six@amd.com>

> ---
>   .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 57 ++++++++-----------
>   1 file changed, 24 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> index 73d3772cdb76..11d076eb770c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> @@ -718,12 +718,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0xf4051ebd, 0xfa000008,
>   	0xbf8cc07f, 0x87ee6e6e,
>   	0xbf840001, 0xbe80206e,
> -	0x876eff6d, 0x01ff0000,
> -	0xbf850005, 0x8878ff78,
> -	0x00002000, 0x80ec886c,
> -	0x82ed806d, 0xbf820005,
> -	0x876eff6d, 0x01000000,
> -	0xbf850002, 0x806c846c,
> +	0x876eff6d, 0x00ff0000,
> +	0xbf850008, 0x876eff6d,
> +	0x01000000, 0xbf850007,
> +	0x8878ff78, 0x00002000,
> +	0x80ec886c, 0x82ed806d,
> +	0xbf820002, 0x806c846c,
>   	0x826d806d, 0x876dff6d,
>   	0x0000ffff, 0x907a8977,
>   	0x877bff7a, 0x003f8000,
> @@ -1136,7 +1136,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0xe0704000, 0x705d0000,
>   	0x807c817c, 0x8070ff70,
>   	0x00000080, 0xbf0a7b7c,
> -	0xbf85fff8, 0xbf820144,
> +	0xbf85fff8, 0xbf82013e,
>   	0xbef4037e, 0x8775ff7f,
>   	0x0000ffff, 0x8875ff75,
>   	0x00040000, 0xbef60380,
> @@ -1276,10 +1276,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0x80788478, 0xbf8cc07f,
>   	0xb9eef815, 0xbefc036f,
>   	0xbefe0370, 0xbeff0371,
> -	0x876f7bff, 0x000003ff,
> -	0xb9ef4803, 0xb9f9f816,
> -	0x876f7bff, 0xfffff800,
> -	0x906f8b6f, 0xb9efa2c3,
> +	0xb9f9f816, 0xb9fbf803,
>   	0xb9f3f801, 0xb96e3a05,
>   	0x806e816e, 0xbf0d9972,
>   	0xbf850002, 0x8f6e896e,
> @@ -2309,12 +2306,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0xf4051ebd, 0xfa000008,
>   	0xbf8cc07f, 0x87ee6e6e,
>   	0xbf840001, 0xbe80206e,
> -	0x876eff6d, 0x01ff0000,
> -	0xbf850005, 0x8878ff78,
> -	0x00002000, 0x80ec886c,
> -	0x82ed806d, 0xbf820005,
> -	0x876eff6d, 0x01000000,
> -	0xbf850002, 0x806c846c,
> +	0x876eff6d, 0x00ff0000,
> +	0xbf850008, 0x876eff6d,
> +	0x01000000, 0xbf850007,
> +	0x8878ff78, 0x00002000,
> +	0x80ec886c, 0x82ed806d,
> +	0xbf820002, 0x806c846c,
>   	0x826d806d, 0x876dff6d,
>   	0x0000ffff, 0x87fe7e7e,
>   	0x87ea6a6a, 0xb9f8f802,
> @@ -2549,7 +2546,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0x705d0000, 0x807c817c,
>   	0x8070ff70, 0x00000080,
>   	0xbf0a7b7c, 0xbf85fff8,
> -	0xbf82013b, 0xbef4037e,
> +	0xbf820135, 0xbef4037e,
>   	0x8775ff7f, 0x0000ffff,
>   	0x8875ff75, 0x00040000,
>   	0xbef60380, 0xbef703ff,
> @@ -2688,10 +2685,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0xf0000000, 0x80788478,
>   	0xbf8cc07f, 0xb9eef815,
>   	0xbefc036f, 0xbefe0370,
> -	0xbeff0371, 0x876f7bff,
> -	0x000003ff, 0xb9ef4803,
> -	0x876f7bff, 0xfffff800,
> -	0x906f8b6f, 0xb9efa2c3,
> +	0xbeff0371, 0xb9fbf803,
>   	0xb9f3f801, 0xb96e3a05,
>   	0x806e816e, 0xbf0d9972,
>   	0xbf850002, 0x8f6e896e,
> @@ -2749,11 +2743,11 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0xf8000008, 0xbf89fc07,
>   	0x8bee6e6e, 0xbfa10001,
>   	0xbe80486e, 0x8b6eff6d,
> -	0x01ff0000, 0xbfa20005,
> -	0x8c78ff78, 0x00002000,
> -	0x80ec886c, 0x82ed806d,
> -	0xbfa00005, 0x8b6eff6d,
> -	0x01000000, 0xbfa20002,
> +	0x00ff0000, 0xbfa20008,
> +	0x8b6eff6d, 0x01000000,
> +	0xbfa20007, 0x8c78ff78,
> +	0x00002000, 0x80ec886c,
> +	0x82ed806d, 0xbfa00002,
>   	0x806c846c, 0x826d806d,
>   	0x8b6dff6d, 0x0000ffff,
>   	0x8bfe7e7e, 0x8bea6a6a,
> @@ -2988,7 +2982,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0x701d0000, 0x807d817d,
>   	0x8070ff70, 0x00000080,
>   	0xbf0a7b7d, 0xbfa2fff8,
> -	0xbfa00146, 0xbef4007e,
> +	0xbfa00140, 0xbef4007e,
>   	0x8b75ff7f, 0x0000ffff,
>   	0x8c75ff75, 0x00040000,
>   	0xbef60080, 0xbef700ff,
> @@ -3130,10 +3124,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0xf0000000, 0x80788478,
>   	0xbf89fc07, 0xb96ef815,
>   	0xbefd006f, 0xbefe0070,
> -	0xbeff0071, 0x8b6f7bff,
> -	0x000003ff, 0xb96f4803,
> -	0x8b6f7bff, 0xfffff800,
> -	0x856f8b6f, 0xb96fa2c3,
> +	0xbeff0071, 0xb97bf803,
>   	0xb973f801, 0xb8ee3b05,
>   	0x806e816e, 0xbf0d9972,
>   	0xbfa20002, 0x846e896e,
> @@ -4119,7 +4110,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x8b6dff6d, 0x0000ffff,
>   	0x8bfe7e7e, 0x8bea6a6a,
>   	0xb97af804, 0xbe804a6c,
> -	0xbfb00000, 0xbf9f0000,
> +	0xbfb10000, 0xbf9f0000,
>   	0xbf9f0000, 0xbf9f0000,
>   	0xbf9f0000, 0xbf9f0000,
>   };

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions
  2024-05-23 14:08 ` [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions Jay Cornwall
@ 2024-05-23 18:43   ` Lancelot SIX
  0 siblings, 0 replies; 9+ messages in thread
From: Lancelot SIX @ 2024-05-23 18:43 UTC (permalink / raw)
  To: Jay Cornwall, amd-gfx



On 23/05/2024 15:08, Jay Cornwall wrote:
> Newer assemblers reject S_WAITCNT. All instances of S_WAITCNT can be
> replaced by S_WAITCNT 0 (< gfx12) or S_WAIT_IDLE (>= gfx12) since
> there is no concurrency of different memory instruction classes.
> 
> Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
> Cc: Lancelot Six <lancelot.six@amd.com>

Thanks, that looks good to me.

Reviewed-by: Lancelot Six <lancelot.six@amd.com>

> ---
>   .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 140 +++++++++---------
>   .../amd/amdkfd/cwsr_trap_handler_gfx10.asm    |  52 +++----
>   2 files changed, 97 insertions(+), 95 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> index 11d076eb770c..d61b2c3bd0ac 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
> @@ -711,12 +711,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0xbf0d8f7b, 0xbf840002,
>   	0x887bff7b, 0xffff0000,
>   	0xf4011bbd, 0xfa000010,
> -	0xbf8cc07f, 0x8f6e976e,
> +	0xbf8c0000, 0x8f6e976e,
>   	0x8a77ff77, 0x00800000,
>   	0x88776e77, 0xf4051bbd,
> -	0xfa000000, 0xbf8cc07f,
> +	0xfa000000, 0xbf8c0000,
>   	0xf4051ebd, 0xfa000008,
> -	0xbf8cc07f, 0x87ee6e6e,
> +	0xbf8c0000, 0x87ee6e6e,
>   	0xbf840001, 0xbe80206e,
>   	0x876eff6d, 0x00ff0000,
>   	0xbf850008, 0x876eff6d,
> @@ -1185,7 +1185,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0x785d0000, 0xe0304080,
>   	0x785d0100, 0xe0304100,
>   	0x785d0200, 0xe0304180,
> -	0x785d0300, 0xbf8c3f70,
> +	0x785d0300, 0xbf8c0000,
>   	0x7e008500, 0x7e028501,
>   	0x7e048502, 0x7e068503,
>   	0x807c847c, 0x8078ff78,
> @@ -1194,7 +1194,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0x6e5d0000, 0xe0304080,
>   	0x6e5d0100, 0xe0304100,
>   	0x6e5d0200, 0xe0304180,
> -	0x6e5d0300, 0xbf8c3f70,
> +	0x6e5d0300, 0xbf8c0000,
>   	0xbf820034, 0xbef603ff,
>   	0x01000000, 0xbeee0378,
>   	0x8078ff78, 0x00000400,
> @@ -1203,7 +1203,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0x785d0000, 0xe0304100,
>   	0x785d0100, 0xe0304200,
>   	0x785d0200, 0xe0304300,
> -	0x785d0300, 0xbf8c3f70,
> +	0x785d0300, 0xbf8c0000,
>   	0x7e008500, 0x7e028501,
>   	0x7e048502, 0x7e068503,
>   	0x807c847c, 0x8078ff78,
> @@ -1213,7 +1213,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0x8f6f836f, 0x806f7c6f,
>   	0xbefe03c1, 0xbeff0380,
>   	0xe0304000, 0x785d0000,
> -	0xbf8c3f70, 0x7e008500,
> +	0xbf8c0000, 0x7e008500,
>   	0x807c817c, 0x8078ff78,
>   	0x00000080, 0xbf0a6f7c,
>   	0xbf85fff7, 0xbeff03c1,
> @@ -1221,7 +1221,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0xe0304100, 0x6e5d0100,
>   	0xe0304200, 0x6e5d0200,
>   	0xe0304300, 0x6e5d0300,
> -	0xbf8c3f70, 0xb9783a05,
> +	0xbf8c0000, 0xb9783a05,
>   	0x80788178, 0xbf0d9972,
>   	0xbf850002, 0x8f788978,
>   	0xbf820001, 0x8f788a78,
> @@ -1232,16 +1232,16 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0x01000000, 0xbefc03ff,
>   	0x0000006c, 0x80f89078,
>   	0xf429003a, 0xf0000000,
> -	0xbf8cc07f, 0x80fc847c,
> +	0xbf8c0000, 0x80fc847c,
>   	0xbf800000, 0xbe803100,
>   	0xbe823102, 0x80f8a078,
>   	0xf42d003a, 0xf0000000,
> -	0xbf8cc07f, 0x80fc887c,
> +	0xbf8c0000, 0x80fc887c,
>   	0xbf800000, 0xbe803100,
>   	0xbe823102, 0xbe843104,
>   	0xbe863106, 0x80f8c078,
>   	0xf431003a, 0xf0000000,
> -	0xbf8cc07f, 0x80fc907c,
> +	0xbf8c0000, 0x80fc907c,
>   	0xbf800000, 0xbe803100,
>   	0xbe823102, 0xbe843104,
>   	0xbe863106, 0xbe883108,
> @@ -1271,9 +1271,9 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0xf4211cfa, 0xf0000000,
>   	0x80788478, 0xf4211bba,
>   	0xf0000000, 0x80788478,
> -	0xbf8cc07f, 0xb9eef814,
> +	0xbf8c0000, 0xb9eef814,
>   	0xf4211bba, 0xf0000000,
> -	0x80788478, 0xbf8cc07f,
> +	0x80788478, 0xbf8c0000,
>   	0xb9eef815, 0xbefc036f,
>   	0xbefe0370, 0xbeff0371,
>   	0xb9f9f816, 0xb9fbf803,
> @@ -1288,7 +1288,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
>   	0x0000ffff, 0xf4091c37,
>   	0xfa000050, 0xf4091d37,
>   	0xfa000060, 0xf4011e77,
> -	0xfa000074, 0xbf8cc07f,
> +	0xfa000074, 0xbf8c0000,
>   	0x906e8977, 0x876fff6e,
>   	0x003f8000, 0x906e8677,
>   	0x876eff6e, 0x02000000,
> @@ -2299,12 +2299,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0xbf0d8f7b, 0xbf840002,
>   	0x887bff7b, 0xffff0000,
>   	0xf4011bbd, 0xfa000010,
> -	0xbf8cc07f, 0x8f6e976e,
> +	0xbf8c0000, 0x8f6e976e,
>   	0x8a77ff77, 0x00800000,
>   	0x88776e77, 0xf4051bbd,
> -	0xfa000000, 0xbf8cc07f,
> +	0xfa000000, 0xbf8c0000,
>   	0xf4051ebd, 0xfa000008,
> -	0xbf8cc07f, 0x87ee6e6e,
> +	0xbf8c0000, 0x87ee6e6e,
>   	0xbf840001, 0xbe80206e,
>   	0x876eff6d, 0x00ff0000,
>   	0xbf850008, 0x876eff6d,
> @@ -2319,7 +2319,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0x0000ffff, 0xbefa0380,
>   	0xb9fa0283, 0xbeee037e,
>   	0xbeef037f, 0xbefe0480,
> -	0xbf900004, 0xbf8cc07f,
> +	0xbf900004, 0xbf8c0000,
>   	0x877aff7f, 0x04000000,
>   	0x8f7a857a, 0x886d7a6d,
>   	0x7e008200, 0xbefa037e,
> @@ -2595,7 +2595,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0xe0304080, 0x785d0100,
>   	0xe0304100, 0x785d0200,
>   	0xe0304180, 0x785d0300,
> -	0xbf8c3f70, 0x7e008500,
> +	0xbf8c0000, 0x7e008500,
>   	0x7e028501, 0x7e048502,
>   	0x7e068503, 0x807c847c,
>   	0x8078ff78, 0x00000200,
> @@ -2604,7 +2604,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0xe0304080, 0x6e5d0100,
>   	0xe0304100, 0x6e5d0200,
>   	0xe0304180, 0x6e5d0300,
> -	0xbf8c3f70, 0xbf820034,
> +	0xbf8c0000, 0xbf820034,
>   	0xbef603ff, 0x01000000,
>   	0xbeee0378, 0x8078ff78,
>   	0x00000400, 0xbefc0384,
> @@ -2613,7 +2613,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0xe0304100, 0x785d0100,
>   	0xe0304200, 0x785d0200,
>   	0xe0304300, 0x785d0300,
> -	0xbf8c3f70, 0x7e008500,
> +	0xbf8c0000, 0x7e008500,
>   	0x7e028501, 0x7e048502,
>   	0x7e068503, 0x807c847c,
>   	0x8078ff78, 0x00000400,
> @@ -2622,7 +2622,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0xbf84000e, 0x8f6f836f,
>   	0x806f7c6f, 0xbefe03c1,
>   	0xbeff0380, 0xe0304000,
> -	0x785d0000, 0xbf8c3f70,
> +	0x785d0000, 0xbf8c0000,
>   	0x7e008500, 0x807c817c,
>   	0x8078ff78, 0x00000080,
>   	0xbf0a6f7c, 0xbf85fff7,
> @@ -2630,7 +2630,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0x6e5d0000, 0xe0304100,
>   	0x6e5d0100, 0xe0304200,
>   	0x6e5d0200, 0xe0304300,
> -	0x6e5d0300, 0xbf8c3f70,
> +	0x6e5d0300, 0xbf8c0000,
>   	0xb9783a05, 0x80788178,
>   	0xbf0d9972, 0xbf850002,
>   	0x8f788978, 0xbf820001,
> @@ -2641,16 +2641,16 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0xbef603ff, 0x01000000,
>   	0xbefc03ff, 0x0000006c,
>   	0x80f89078, 0xf429003a,
> -	0xf0000000, 0xbf8cc07f,
> +	0xf0000000, 0xbf8c0000,
>   	0x80fc847c, 0xbf800000,
>   	0xbe803100, 0xbe823102,
>   	0x80f8a078, 0xf42d003a,
> -	0xf0000000, 0xbf8cc07f,
> +	0xf0000000, 0xbf8c0000,
>   	0x80fc887c, 0xbf800000,
>   	0xbe803100, 0xbe823102,
>   	0xbe843104, 0xbe863106,
>   	0x80f8c078, 0xf431003a,
> -	0xf0000000, 0xbf8cc07f,
> +	0xf0000000, 0xbf8c0000,
>   	0x80fc907c, 0xbf800000,
>   	0xbe803100, 0xbe823102,
>   	0xbe843104, 0xbe863106,
> @@ -2680,10 +2680,10 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0x80788478, 0xf4211cfa,
>   	0xf0000000, 0x80788478,
>   	0xf4211bba, 0xf0000000,
> -	0x80788478, 0xbf8cc07f,
> +	0x80788478, 0xbf8c0000,
>   	0xb9eef814, 0xf4211bba,
>   	0xf0000000, 0x80788478,
> -	0xbf8cc07f, 0xb9eef815,
> +	0xbf8c0000, 0xb9eef815,
>   	0xbefc036f, 0xbefe0370,
>   	0xbeff0371, 0xb9fbf803,
>   	0xb9f3f801, 0xb96e3a05,
> @@ -2697,7 +2697,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
>   	0x0000ffff, 0xf4091c37,
>   	0xfa000050, 0xf4091d37,
>   	0xfa000060, 0xf4011e77,
> -	0xfa000074, 0xbf8cc07f,
> +	0xfa000074, 0xbf8c0000,
>   	0x876dff6d, 0x0000ffff,
>   	0x87fe7e7e, 0x87ea6a6a,
>   	0xb9faf802, 0xbe80226c,
> @@ -2731,16 +2731,16 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0x8b6eff6e, 0x00000800,
>   	0xbfa20003, 0x8b6eff7b,
>   	0x00000400, 0xbfa2002a,
> -	0xbefa4d82, 0xbf89fc07,
> +	0xbefa4d82, 0xbf890000,
>   	0x84fa887a, 0xbf0d8f7b,
>   	0xbfa10002, 0x8c7bff7b,
>   	0xffff0000, 0xf4005bbd,
> -	0xf8000010, 0xbf89fc07,
> +	0xf8000010, 0xbf890000,
>   	0x846e976e, 0x9177ff77,
>   	0x00800000, 0x8c776e77,
>   	0xf4045bbd, 0xf8000000,
> -	0xbf89fc07, 0xf4045ebd,
> -	0xf8000008, 0xbf89fc07,
> +	0xbf890000, 0xf4045ebd,
> +	0xf8000008, 0xbf890000,
>   	0x8bee6e6e, 0xbfa10001,
>   	0xbe80486e, 0x8b6eff6d,
>   	0x00ff0000, 0xbfa20008,
> @@ -2756,7 +2756,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0xbefa0080, 0xb97a0283,
>   	0xbeee007e, 0xbeef007f,
>   	0xbefe0180, 0xbefe4d84,
> -	0xbf89fc07, 0x8b7aff7f,
> +	0xbf890000, 0x8b7aff7f,
>   	0x04000000, 0x847a857a,
>   	0x8c6d7a6d, 0xbefa007e,
>   	0x8b7bff7f, 0x0000ffff,
> @@ -3007,13 +3007,13 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0x857d9972, 0x8b7d817d,
>   	0xbf06817d, 0xbefd0080,
>   	0xbfa2000c, 0xe0500000,
> -	0x781d0000, 0xbf8903f7,
> +	0x781d0000, 0xbf890000,
>   	0xdac00000, 0x00000000,
>   	0x807dff7d, 0x00000080,
>   	0x8078ff78, 0x00000080,
>   	0xbf0a6f7d, 0xbfa2fff5,
>   	0xbfa0000b, 0xe0500000,
> -	0x781d0000, 0xbf8903f7,
> +	0x781d0000, 0xbf890000,
>   	0xdac00000, 0x00000000,
>   	0x807dff7d, 0x00000100,
>   	0x8078ff78, 0x00000100,
> @@ -3034,7 +3034,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0xe0505080, 0x781d0100,
>   	0xe0505100, 0x781d0200,
>   	0xe0505180, 0x781d0300,
> -	0xbf8903f7, 0x7e008500,
> +	0xbf890000, 0x7e008500,
>   	0x7e028501, 0x7e048502,
>   	0x7e068503, 0x807d847d,
>   	0x8078ff78, 0x00000200,
> @@ -3043,7 +3043,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0xe0505080, 0x6e1d0100,
>   	0xe0505100, 0x6e1d0200,
>   	0xe0505180, 0x6e1d0300,
> -	0xbf8903f7, 0xbfa00034,
> +	0xbf890000, 0xbfa00034,
>   	0xbef600ff, 0x01000000,
>   	0xbeee0078, 0x8078ff78,
>   	0x00000400, 0xbefd0084,
> @@ -3052,7 +3052,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0xe0505100, 0x781d0100,
>   	0xe0505200, 0x781d0200,
>   	0xe0505300, 0x781d0300,
> -	0xbf8903f7, 0x7e008500,
> +	0xbf890000, 0x7e008500,
>   	0x7e028501, 0x7e048502,
>   	0x7e068503, 0x807d847d,
>   	0x8078ff78, 0x00000400,
> @@ -3061,7 +3061,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0xbfa1000e, 0x846f836f,
>   	0x806f7d6f, 0xbefe00c1,
>   	0xbeff0080, 0xe0505000,
> -	0x781d0000, 0xbf8903f7,
> +	0x781d0000, 0xbf890000,
>   	0x7e008500, 0x807d817d,
>   	0x8078ff78, 0x00000080,
>   	0xbf0a6f7d, 0xbfa2fff7,
> @@ -3069,7 +3069,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0x6e1d0000, 0xe0505100,
>   	0x6e1d0100, 0xe0505200,
>   	0x6e1d0200, 0xe0505300,
> -	0x6e1d0300, 0xbf8903f7,
> +	0x6e1d0300, 0xbf890000,
>   	0xb8f83b05, 0x80788178,
>   	0xbf0d9972, 0xbfa20002,
>   	0x84788978, 0xbfa00001,
> @@ -3080,16 +3080,16 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0xbef600ff, 0x01000000,
>   	0xbefd00ff, 0x0000006c,
>   	0x80f89078, 0xf428403a,
> -	0xf0000000, 0xbf89fc07,
> +	0xf0000000, 0xbf890000,
>   	0x80fd847d, 0xbf800000,
>   	0xbe804300, 0xbe824302,
>   	0x80f8a078, 0xf42c403a,
> -	0xf0000000, 0xbf89fc07,
> +	0xf0000000, 0xbf890000,
>   	0x80fd887d, 0xbf800000,
>   	0xbe804300, 0xbe824302,
>   	0xbe844304, 0xbe864306,
>   	0x80f8c078, 0xf430403a,
> -	0xf0000000, 0xbf89fc07,
> +	0xf0000000, 0xbf890000,
>   	0x80fd907d, 0xbf800000,
>   	0xbe804300, 0xbe824302,
>   	0xbe844304, 0xbe864306,
> @@ -3119,10 +3119,10 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0x80788478, 0xf4205cfa,
>   	0xf0000000, 0x80788478,
>   	0xf4205bba, 0xf0000000,
> -	0x80788478, 0xbf89fc07,
> +	0x80788478, 0xbf890000,
>   	0xb96ef814, 0xf4205bba,
>   	0xf0000000, 0x80788478,
> -	0xbf89fc07, 0xb96ef815,
> +	0xbf890000, 0xb96ef815,
>   	0xbefd006f, 0xbefe0070,
>   	0xbeff0071, 0xb97bf803,
>   	0xb973f801, 0xb8ee3b05,
> @@ -3136,7 +3136,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
>   	0x0000ffff, 0xf4085c37,
>   	0xf8000050, 0xf4085d37,
>   	0xf8000060, 0xf4005e77,
> -	0xf8000074, 0xbf89fc07,
> +	0xf8000074, 0xbf890000,
>   	0x8b6dff6d, 0x0000ffff,
>   	0x8bfe7e7e, 0x8bea6a6a,
>   	0xb8eef802, 0xbf0d866e,
> @@ -3657,16 +3657,16 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x8b6fff6f, 0x00000200,
>   	0xbfa20002, 0x8b6ea07b,
>   	0xbfa2002b, 0xbefa4d82,
> -	0xbf89fc07, 0x84fa887a,
> +	0xbf8a0000, 0x84fa887a,
>   	0xbf0d8f7b, 0xbfa10002,
>   	0x8c7bff7b, 0xffff0000,
>   	0xf4601bbd, 0xf8000010,
> -	0xbf89fc07, 0x846e976e,
> +	0xbf8a0000, 0x846e976e,
>   	0x9177ff77, 0x00800000,
>   	0x8c776e77, 0xf4603bbd,
> -	0xf8000000, 0xbf89fc07,
> +	0xf8000000, 0xbf8a0000,
>   	0xf4603ebd, 0xf8000008,
> -	0xbf89fc07, 0x8bee6e6e,
> +	0xbf8a0000, 0x8bee6e6e,
>   	0xbfa10001, 0xbe80486e,
>   	0x8b6eff6d, 0xf0000000,
>   	0xbfa20009, 0xb8eef811,
> @@ -3682,7 +3682,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0xbefa0080, 0xb97a0151,
>   	0xbeee007e, 0xbeef007f,
>   	0xbefe0180, 0xbefe4d84,
> -	0xbf89fc07, 0x8b7aff7f,
> +	0xbf8a0000, 0x8b7aff7f,
>   	0x04000000, 0x847a857a,
>   	0x8c6d7a6d, 0xbefa007e,
>   	0x8b7bff7f, 0x0000ffff,
> @@ -3869,7 +3869,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x00000080, 0xbf800000,
>   	0xbf800000, 0xbf800000,
>   	0xd8d80000, 0x01000000,
> -	0xbf890000, 0xc4068070,
> +	0xbf8a0000, 0xc4068070,
>   	0x008ce801, 0x00000000,
>   	0x807d037d, 0x80700370,
>   	0xd5250000, 0x0001ff00,
> @@ -3878,7 +3878,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0xbe8300ff, 0x00000100,
>   	0xbf800000, 0xbf800000,
>   	0xbf800000, 0xd8d80000,
> -	0x01000000, 0xbf890000,
> +	0x01000000, 0xbf8a0000,
>   	0xc4068070, 0x008ce801,
>   	0x00000000, 0x807d037d,
>   	0x80700370, 0xd5250000,
> @@ -3954,14 +3954,14 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x8b7d817d, 0xbf06817d,
>   	0xbefd0080, 0xbfa2000d,
>   	0xc4050078, 0x0080e800,
> -	0x00000000, 0xbf8903f7,
> +	0x00000000, 0xbf8a0000,
>   	0xdac00000, 0x00000000,
>   	0x807dff7d, 0x00000080,
>   	0x8078ff78, 0x00000080,
>   	0xbf0a6f7d, 0xbfa2fff4,
>   	0xbfa0000c, 0xc4050078,
>   	0x0080e800, 0x00000000,
> -	0xbf8903f7, 0xdac00000,
> +	0xbf8a0000, 0xdac00000,
>   	0x00000000, 0x807dff7d,
>   	0x00000100, 0x8078ff78,
>   	0x00000100, 0xbf0a6f7d,
> @@ -3983,7 +3983,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x00008000, 0xc4050078,
>   	0x008ce802, 0x00010000,
>   	0xc4050078, 0x008ce803,
> -	0x00018000, 0xbf8903f7,
> +	0x00018000, 0xbf8a0000,
>   	0x7e008500, 0x7e028501,
>   	0x7e048502, 0x7e068503,
>   	0x807d847d, 0x8078ff78,
> @@ -3994,7 +3994,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x00008000, 0xc405006e,
>   	0x008ce802, 0x00010000,
>   	0xc405006e, 0x008ce803,
> -	0x00018000, 0xbf8903f7,
> +	0x00018000, 0xbf8a0000,
>   	0xbfa0003d, 0xbef600ff,
>   	0x01000000, 0xbeee0078,
>   	0x8078ff78, 0x00000400,
> @@ -4005,7 +4005,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x00010000, 0xc4050078,
>   	0x008ce802, 0x00020000,
>   	0xc4050078, 0x008ce803,
> -	0x00030000, 0xbf8903f7,
> +	0x00030000, 0xbf8a0000,
>   	0x7e008500, 0x7e028501,
>   	0x7e048502, 0x7e068503,
>   	0x807d847d, 0x8078ff78,
> @@ -4015,7 +4015,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x846f836f, 0x806f7d6f,
>   	0xbefe00c1, 0xbeff0080,
>   	0xc4050078, 0x008ce800,
> -	0x00000000, 0xbf8903f7,
> +	0x00000000, 0xbf8a0000,
>   	0x7e008500, 0x807d817d,
>   	0x8078ff78, 0x00000080,
>   	0xbf0a6f7d, 0xbfa2fff6,
> @@ -4025,7 +4025,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x00010000, 0xc405006e,
>   	0x008ce802, 0x00020000,
>   	0xc405006e, 0x008ce803,
> -	0x00030000, 0xbf8903f7,
> +	0x00030000, 0xbf8a0000,
>   	0xb8f83b05, 0x80788178,
>   	0xbf0d9972, 0xbfa20002,
>   	0x84788978, 0xbfa00001,
> @@ -4036,16 +4036,16 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0xbef600ff, 0x01000000,
>   	0xbefd00ff, 0x0000006c,
>   	0x80f89078, 0xf462403a,
> -	0xf0000000, 0xbf89fc07,
> +	0xf0000000, 0xbf8a0000,
>   	0x80fd847d, 0xbf800000,
>   	0xbe804300, 0xbe824302,
>   	0x80f8a078, 0xf462603a,
> -	0xf0000000, 0xbf89fc07,
> +	0xf0000000, 0xbf8a0000,
>   	0x80fd887d, 0xbf800000,
>   	0xbe804300, 0xbe824302,
>   	0xbe844304, 0xbe864306,
>   	0x80f8c078, 0xf462803a,
> -	0xf0000000, 0xbf89fc07,
> +	0xf0000000, 0xbf8a0000,
>   	0x80fd907d, 0xbf800000,
>   	0xbe804300, 0xbe824302,
>   	0xbe844304, 0xbe864306,
> @@ -4075,19 +4075,19 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x80788478, 0xf4621cfa,
>   	0xf0000000, 0x80788478,
>   	0xf4621bba, 0xf0000000,
> -	0x80788478, 0xbf89fc07,
> +	0x80788478, 0xbf8a0000,
>   	0xb96ef814, 0xf4621bba,
>   	0xf0000000, 0x80788478,
> -	0xbf89fc07, 0xb96ef815,
> +	0xbf8a0000, 0xb96ef815,
>   	0xf4621bba, 0xf0000000,
> -	0x80788478, 0xbf89fc07,
> +	0x80788478, 0xbf8a0000,
>   	0xb96ef812, 0xf4621bba,
>   	0xf0000000, 0x80788478,
> -	0xbf89fc07, 0xb96ef813,
> +	0xbf8a0000, 0xb96ef813,
>   	0x8b6eff7f, 0x04000000,
>   	0xbfa1000d, 0x80788478,
>   	0xf4621bba, 0xf0000000,
> -	0x80788478, 0xbf89fc07,
> +	0x80788478, 0xbf8a0000,
>   	0xbf0d806e, 0xbfa10006,
>   	0x856e906e, 0x8b6e6e6e,
>   	0xbfa10003, 0xbe804ec1,
> @@ -4106,7 +4106,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
>   	0x0000ffff, 0xf4605c37,
>   	0xf8000050, 0xf4605d37,
>   	0xf8000060, 0xf4601e77,
> -	0xf8000074, 0xbf89fc07,
> +	0xf8000074, 0xbf8a0000,
>   	0x8b6dff6d, 0x0000ffff,
>   	0x8bfe7e7e, 0x8bea6a6a,
>   	0xb97af804, 0xbe804a6c,
> diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
> index cb619e49228c..77ae25b6753c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
> +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
> @@ -55,9 +55,11 @@
>   #if ASIC_FAMILY < CHIP_GFX12
>   #define S_COHERENCE glc:1
>   #define V_COHERENCE slc:1 glc:1
> +#define S_WAITCNT_0 s_waitcnt 0
>   #else
>   #define S_COHERENCE scope:SCOPE_SYS
>   #define V_COHERENCE scope:SCOPE_SYS
> +#define S_WAITCNT_0 s_wait_idle
>   
>   #define HW_REG_SHADER_FLAT_SCRATCH_LO HW_REG_WAVE_SCRATCH_BASE_LO
>   #define HW_REG_SHADER_FLAT_SCRATCH_HI HW_REG_WAVE_SCRATCH_BASE_HI
> @@ -364,7 +366,7 @@ L_FETCH_2ND_TRAP:
>   	// ttmp12 holds SQ_WAVE_STATUS
>   #if HAVE_SENDMSG_RTN
>   	s_sendmsg_rtn_b64       [ttmp14, ttmp15], sendmsg(MSG_RTN_GET_TMA)
> -	s_waitcnt       lgkmcnt(0)
> +	S_WAITCNT_0
>   #else
>   	s_getreg_b32	ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
>   	s_getreg_b32	ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
> @@ -377,15 +379,15 @@ L_FETCH_2ND_TRAP:
>   L_NO_SIGN_EXTEND_TMA:
>   
>   	s_load_dword    ttmp2, [ttmp14, ttmp15], 0x10 S_COHERENCE		// debug trap enabled flag
> -	s_waitcnt       lgkmcnt(0)
> +	S_WAITCNT_0
>   	s_lshl_b32      ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT
>   	s_andn2_b32     ttmp11, ttmp11, TTMP11_DEBUG_TRAP_ENABLED_MASK
>   	s_or_b32        ttmp11, ttmp11, ttmp2
>   
>   	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 S_COHERENCE	// second-level TBA
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 S_COHERENCE	// second-level TMA
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   
>   	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
>   	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler not been set
> @@ -460,7 +462,7 @@ L_SLEEP:
>   	s_sleep		0x2
>   	s_cbranch_execz	L_SLEEP
>   #else
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   #endif
>   
>   	// Save first_wave flag so we can clear high bits of save address.
> @@ -794,7 +796,7 @@ L_SAVE_LDS_W32:
>   
>   L_SAVE_LDS_LOOP_SQC_W32:
>   	ds_read_b32	v1, v0
> -	s_waitcnt	0
> +	S_WAITCNT_0
>   
>   	write_vgprs_to_mem_with_sqc_w32(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)
>   
> @@ -814,7 +816,7 @@ L_SAVE_LDS_WITH_TCP_W32:
>   	s_nop		0
>   L_SAVE_LDS_LOOP_W32:
>   	ds_read_b32	v1, v0
> -	s_waitcnt	0
> +	S_WAITCNT_0
>   	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
>   
>   	s_add_u32	m0, m0, s3						//every buffer_store_lds does 128 bytes
> @@ -832,7 +834,7 @@ L_SAVE_LDS_W64:
>   
>   L_SAVE_LDS_LOOP_SQC_W64:
>   	ds_read_b32	v1, v0
> -	s_waitcnt	0
> +	S_WAITCNT_0
>   
>   	write_vgprs_to_mem_with_sqc_w64(v1, 1, s_save_buf_rsrc0, s_save_mem_offset)
>   
> @@ -852,7 +854,7 @@ L_SAVE_LDS_WITH_TCP_W64:
>   	s_nop		0
>   L_SAVE_LDS_LOOP_W64:
>   	ds_read_b32	v1, v0
> -	s_waitcnt	0
> +	S_WAITCNT_0
>   	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset V_COHERENCE
>   
>   	s_add_u32	m0, m0, s3						//every buffer_store_lds does 256 bytes
> @@ -1073,7 +1075,7 @@ L_RESTORE_LDS_LOOP_W32:
>   	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
>   #else
>   	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
> -	s_waitcnt	vmcnt(0)
> +	S_WAITCNT_0
>   	ds_store_addtid_b32     v0
>   #endif
>   	s_add_u32	m0, m0, 128						// 128 DW
> @@ -1087,7 +1089,7 @@ L_RESTORE_LDS_LOOP_W64:
>   	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
>   #else
>   	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset
> -	s_waitcnt	vmcnt(0)
> +	S_WAITCNT_0
>   	ds_store_addtid_b32     v0
>   #endif
>   	s_add_u32	m0, m0, 256						// 256 DW
> @@ -1132,7 +1134,7 @@ L_RESTORE_VGPR_WAVE32_LOOP:
>   	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128
>   	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*2
>   	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:128*3
> -	s_waitcnt	vmcnt(0)
> +	S_WAITCNT_0
>   	v_movreld_b32	v0, v0							//v[0+m0] = v0
>   	v_movreld_b32	v1, v1
>   	v_movreld_b32	v2, v2
> @@ -1147,7 +1149,7 @@ L_RESTORE_VGPR_WAVE32_LOOP:
>   	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128
>   	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*2
>   	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:128*3
> -	s_waitcnt	vmcnt(0)
> +	S_WAITCNT_0
>   
>   	s_branch	L_RESTORE_SGPR
>   
> @@ -1166,7 +1168,7 @@ L_RESTORE_VGPR_WAVE64_LOOP:
>   	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256
>   	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*2
>   	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE offset:256*3
> -	s_waitcnt	vmcnt(0)
> +	S_WAITCNT_0
>   	v_movreld_b32	v0, v0							//v[0+m0] = v0
>   	v_movreld_b32	v1, v1
>   	v_movreld_b32	v2, v2
> @@ -1189,7 +1191,7 @@ L_RESTORE_SHARED_VGPR:
>   	s_mov_b32	exec_hi, 0x00000000
>   L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
>   	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset V_COHERENCE
> -	s_waitcnt	vmcnt(0)
> +	S_WAITCNT_0
>   	v_movreld_b32	v0, v0							//v[0+m0] = v0
>   	s_add_u32	m0, m0, 1						//next vgpr index
>   	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128
> @@ -1204,7 +1206,7 @@ L_RESTORE_V0:
>   	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256
>   	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*2
>   	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save V_COHERENCE offset:256*3
> -	s_waitcnt	vmcnt(0)
> +	S_WAITCNT_0
>   
>   	/* restore SGPRs */
>   	//will be 2+8+16*6
> @@ -1221,7 +1223,7 @@ L_RESTORE_SGPR:
>   	s_mov_b32	m0, s_sgpr_save_num
>   
>   	read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   
>   	s_sub_u32	m0, m0, 4						// Restore from S[0] to S[104]
>   	s_nop		0							// hazard SALU M0=> S_MOVREL
> @@ -1230,7 +1232,7 @@ L_RESTORE_SGPR:
>   	s_movreld_b64	s2, s2
>   
>   	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   
>   	s_sub_u32	m0, m0, 8						// Restore from S[0] to S[96]
>   	s_nop		0							// hazard SALU M0=> S_MOVREL
> @@ -1242,7 +1244,7 @@ L_RESTORE_SGPR:
>   
>    L_RESTORE_SGPR_LOOP:
>   	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   
>   	s_sub_u32	m0, m0, 16						// Restore from S[n] to S[0]
>   	s_nop		0							// hazard SALU M0=> S_MOVREL
> @@ -1291,22 +1293,22 @@ L_RESTORE_HWREG:
>   	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
>   	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
>   	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   
>   	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch
>   
>   	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
> -	s_waitcnt	lgkmcnt(0)						//from now on, it is safe to restore STATUS and IB_STS
> +	S_WAITCNT_0
>   
>   	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch
>   
>   #if ASIC_FAMILY >= CHIP_GFX12
>   	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   	s_setreg_b32	hwreg(HW_REG_WAVE_EXCP_FLAG_USER), s_restore_tmp
>   
>   	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   	s_setreg_b32	hwreg(HW_REG_WAVE_TRAP_CTRL), s_restore_tmp
>   
>   	// Only the first wave needs to restore the workgroup barrier.
> @@ -1317,7 +1319,7 @@ L_RESTORE_HWREG:
>   	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 4
>   
>   	read_hwreg_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset)
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   
>   	s_bitcmp1_b32	s_restore_tmp, BARRIER_STATE_VALID_OFFSET
>   	s_cbranch_scc0	L_SKIP_BARRIER_RESTORE
> @@ -1364,7 +1366,7 @@ L_SKIP_BARRIER_RESTORE:
>   	s_load_dwordx4	[ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 S_COHERENCE
>   	s_load_dwordx4	[ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 S_COHERENCE
>   	s_load_dword	ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 S_COHERENCE
> -	s_waitcnt	lgkmcnt(0)
> +	S_WAITCNT_0
>   
>   #if HAVE_XNACK
>   	restore_ib_sts(s_restore_tmp, s_restore_m0)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
  2024-05-23 18:37   ` Lancelot SIX
@ 2024-05-23 19:31     ` Jay Cornwall
  2024-05-23 20:41       ` Lancelot SIX
  0 siblings, 1 reply; 9+ messages in thread
From: Jay Cornwall @ 2024-05-23 19:31 UTC (permalink / raw)
  To: Lancelot SIX, amd-gfx

On 5/23/2024 13:37, Lancelot SIX wrote:

>> @@ -622,8 +638,15 @@ L_SAVE_HWREG:
>>   #if ASIC_FAMILY >= CHIP_GFX12
>>       // Ensure no further changes to barrier or LDS state.
>> +    // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
>>       s_barrier_signal    -2
>>       s_barrier_wait    -2
>> +
>> +    // Re-read final state of BARRIER_COMPLETE field for save.
>> +    s_getreg_b32    s_save_tmp, hwreg(S_STATUS_HWREG)
>> +    s_and_b32    s_save_tmp, s_save_tmp, 
>> SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
>> +    s_andn2_b32    s_save_status, s_save_status, 
>> SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
> 
> Even if BARRIER_COMPLETE can be asserted while we are in the trap 
> handler, I do not think it can be cleared.  That being said, it might be 
> easier to just replace the bit, making it clearer.

Yes, I chose to structure it this way to make the intent clearer. We 
don't gain much from dropping the s_andn2. Most of the time spent in the 
save handler is stalled on memory instructions.

>> @@ -1351,7 +1369,17 @@ L_SKIP_BARRIER_RESTORE:
>>       s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), 
>> s_restore_xnack_mask
>>   #endif
>> +#if ASIC_FAMILY < CHIP_GFX12
>>       s_setreg_b32    hwreg(S_TRAPSTS_HWREG), s_restore_trapsts
> 
> Wouldn't other gfx1x architectures have a similar issue when writing 
> TRAPSTS here?  That is if TRAPSTS.SAVECTX is set while we are restoring, 
> wouldn't we lose it?
> 
> And for gfx11, there is TRAPSTS.HOST_TRAP that could have the same issue 
> to some degree (not sure if we would lose the host trap completely, or 
> re-enter with trap ID + HT bit set in ttmp1).

Prior to gfx12, context save and host trap exceptions are not delivered 
to a wave until STATUS.PRIV=0, i.e. it leaves the trap handler.

The changes needed for gfx12 are due to a design change in this area. 
Exceptions are now flagged immediately and cause re-entry to the trap if 
any are non-zero.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
  2024-05-23 19:31     ` Jay Cornwall
@ 2024-05-23 20:41       ` Lancelot SIX
  0 siblings, 0 replies; 9+ messages in thread
From: Lancelot SIX @ 2024-05-23 20:41 UTC (permalink / raw)
  To: Jay Cornwall, amd-gfx



On 23/05/2024 20:31, Jay Cornwall wrote:
> On 5/23/2024 13:37, Lancelot SIX wrote:
> 
>>> @@ -622,8 +638,15 @@ L_SAVE_HWREG:
>>>   #if ASIC_FAMILY >= CHIP_GFX12
>>>       // Ensure no further changes to barrier or LDS state.
>>> +    // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
>>>       s_barrier_signal    -2
>>>       s_barrier_wait    -2
>>> +
>>> +    // Re-read final state of BARRIER_COMPLETE field for save.
>>> +    s_getreg_b32    s_save_tmp, hwreg(S_STATUS_HWREG)
>>> +    s_and_b32    s_save_tmp, s_save_tmp, 
>>> SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
>>> +    s_andn2_b32    s_save_status, s_save_status, 
>>> SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
>>
>> Even if BARRIER_COMPLETE can be asserted while we are in the trap 
>> handler, I do not think it can be cleared.  That being said, it might 
>> be easier to just replace the bit, making it clearer.
> 
> Yes, I chose to structure it this way to make the intent clearer. We 
> don't gain much from dropping the s_andn2. Most of the time spent in the 
> save handler is stalled on memory instructions.
> 
>>> @@ -1351,7 +1369,17 @@ L_SKIP_BARRIER_RESTORE:
>>>       s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), 
>>> s_restore_xnack_mask
>>>   #endif
>>> +#if ASIC_FAMILY < CHIP_GFX12
>>>       s_setreg_b32    hwreg(S_TRAPSTS_HWREG), s_restore_trapsts
>>
>> Wouldn't other gfx1x architectures have a similar issue when writing 
>> TRAPSTS here?  That is if TRAPSTS.SAVECTX is set while we are 
>> restoring, wouldn't we lose it?
>>
>> And for gfx11, there is TRAPSTS.HOST_TRAP that could have the same 
>> issue to some degree (not sure if we would lose the host trap 
>> completely, or re-enter with trap ID + HT bit set in ttmp1).
> 
> Prior to gfx12, context save and host trap exceptions are not delivered 
> to a wave until STATUS.PRIV=0, i.e. it leaves the trap handler.
> 
> The changes needed for gfx12 are due to a design change in this area. 
> Exceptions are now flagged immediately and cause re-entry to the trap if 
> any are non-zero.

Thanks for the clarifications.  The patch looks good to me.

Reviewed-by: Lancelot Six <lancelot.six@amd.com>

Best,
Lancelot.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2024-05-23 20:41 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-23 14:08 [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source Jay Cornwall
2024-05-23 14:08 ` [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions Jay Cornwall
2024-05-23 18:43   ` Lancelot SIX
2024-05-23 14:08 ` [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes Jay Cornwall
2024-05-23 18:37   ` Lancelot SIX
2024-05-23 19:31     ` Jay Cornwall
2024-05-23 20:41       ` Lancelot SIX
2024-05-23 18:27 ` [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source Alex Deucher
2024-05-23 18:41 ` Lancelot SIX

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox