All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
@ 2026-06-29  0:50 James Zhu
  2026-06-29  9:57 ` Christian König
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: James Zhu @ 2026-06-29  0:50 UTC (permalink / raw)
  To: amd-gfx
  Cc: christian.koenig, Felix.kuehling, Yifan1.Zhang, philip.yang,
	Harish.Kasiviswanathan, Bob.Zhou, jamesz, Claude Opus 4 . 6,
	Yifan Zhang

After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
unsafe because VRAM is accessed via MMIO (PCI BAR mappings).

Direct writes like 'tma[2] = enabled' and memcpy() can fail or
produce incorrect results on non-x86 architectures because:
- MMIO requires specific accessor functions (writeq/readq)
- Compiler optimizations may generate invalid instruction sequences
- No guarantee of proper memory barriers or atomic access

This patch converts CWSR buffer access to use struct iosys_map,
which automatically handles both system memory (GTT) and MMIO
(VRAM) correctly by:
- Using writeq/writel/memcpy_toio for MMIO regions
- Using WRITE_ONCE/memcpy for system memory
- Providing proper memory barriers and access guarantees

Changes:
- Replace void *cwsr_kaddr with struct iosys_map cwsr_map
- Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
- Use iosys_map_wr() for writing trap handler addresses and flags
- Use iosys_map_memcpy_to() for copying CWSR ISA code

This ensures correct operation on all architectures while maintaining
backward compatibility with older GPUs and APUs that use GTT.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
 2 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ad4897f094a2..6e559aab4009 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -32,6 +32,7 @@
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
+#include <linux/iosys-map.h>
 #include <uapi/linux/kfd_ioctl.h>
 #include <linux/idr.h>
 #include <linux/kfifo.h>
@@ -710,7 +711,7 @@ struct qcm_process_device {
 
 	/* CWSR memory */
 	struct kgd_mem *cwsr_mem;
-	void *cwsr_kaddr;
+	struct iosys_map cwsr_map;
 	uint64_t cwsr_base;
 	uint64_t tba_addr;
 	uint64_t tma_addr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 8e701dcda8ec..7fd65c31afa2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -33,6 +33,7 @@
 #include <linux/mman.h>
 #include <linux/file.h>
 #include <linux/pm_runtime.h>
+#include <drm/ttm/ttm_bo.h>
 #include "amdgpu_amdkfd.h"
 #include "amdgpu.h"
 #include "amdgpu_reset.h"
@@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
 					       NULL);
 }
 
+static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
+			struct kfd_process_device *pdd, struct iosys_map *map)
+{
+	struct kfd_node *dev = pdd->dev;
+
+	if (map && !iosys_map_is_null(map)) {
+		amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
+		iosys_map_clear(map);
+	}
+
+	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
+	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
+					       NULL);
+}
+
 /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
  *	This function should be only called right after the process
  *	is created and when kfd_processes_mutex is still being held
@@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 		if (pdd->drm_file)
 			fput(pdd->drm_file);
 
-		if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
-			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
+		if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
+			free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
 				get_order(KFD_CWSR_TBA_TMA_SIZE));
 
 		idr_destroy(&pdd->alloc_idr);
@@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
 	void *kaddr;
 	int ret;
 
-	if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
+	if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
 		return 0;
 
 	if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
@@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
 		return ret;
 
 	qpd->cwsr_mem = mem;
-	qpd->cwsr_kaddr = kaddr;
+
+	/* Set up iosys_map based on whether memory is MMIO or system memory */
+	if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
+		iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
+	else
+		iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
+
 	qpd->tba_addr = qpd->cwsr_base;
 
-	memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
+	/* Copy CWSR ISA to buffer using appropriate accessor */
+	iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
+			    dev->kfd->cwsr_isa_size);
 
 	kfd_process_set_trap_debug_flag(&pdd->qpd,
 					pdd->process->debug_trap_enabled);
 
 	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
-	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
-		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
+	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
+		 qpd->tba_addr, qpd->tma_addr,
+		 qpd->cwsr_map.is_iomem ? "iomem" : "system",
+		 qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
+					  qpd->cwsr_map.vaddr);
 
 	return 0;
 }
@@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
 	struct kfd_node *dev = pdd->dev;
 	struct qcm_process_device *qpd = &pdd->qpd;
 
-	if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
+	if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
 		return;
 
-	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
+	kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
 }
 
 void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr)
 {
-	if (qpd->cwsr_kaddr) {
+	if (!iosys_map_is_null(&qpd->cwsr_map)) {
 		/* KFD trap handler is bound, record as second-level TBA/TMA
 		 * in first-level TMA. First-level trap will jump to second.
 		 */
-		uint64_t *tma =
-			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
-		tma[0] = tba_addr;
-		tma[1] = tma_addr;
+		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
+			     uint64_t, tba_addr);
+		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
+			     uint64_t, tma_addr);
 	} else {
 		/* No trap handler bound, bind as first-level TBA/TMA. */
 		qpd->tba_addr = tba_addr;
@@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
 void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
 				     bool enabled)
 {
-	if (qpd->cwsr_kaddr) {
-		uint64_t *tma =
-			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
-		tma[2] = enabled;
+	if (!iosys_map_is_null(&qpd->cwsr_map)) {
+		iosys_map_wr(&qpd->cwsr_map,
+			     KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
+			     uint64_t, enabled);
 	}
 }
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-06-29 19:43 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-29  0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
2026-06-29  9:57 ` Christian König
2026-06-29 17:54 ` Kuehling, Felix
2026-06-29 18:37 ` Mario Limonciello
2026-06-29 19:43   ` James Zhu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.