All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
@ 2026-06-29  0:50 James Zhu
  2026-06-29  9:57 ` Christian König
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: James Zhu @ 2026-06-29  0:50 UTC (permalink / raw)
  To: amd-gfx
  Cc: christian.koenig, Felix.kuehling, Yifan1.Zhang, philip.yang,
	Harish.Kasiviswanathan, Bob.Zhou, jamesz, Claude Opus 4 . 6,
	Yifan Zhang

After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
unsafe because VRAM is accessed via MMIO (PCI BAR mappings).

Direct writes like 'tma[2] = enabled' and memcpy() can fail or
produce incorrect results on non-x86 architectures because:
- MMIO requires specific accessor functions (writeq/readq)
- Compiler optimizations may generate invalid instruction sequences
- No guarantee of proper memory barriers or atomic access

This patch converts CWSR buffer access to use struct iosys_map,
which automatically handles both system memory (GTT) and MMIO
(VRAM) correctly by:
- Using writeq/writel/memcpy_toio for MMIO regions
- Using WRITE_ONCE/memcpy for system memory
- Providing proper memory barriers and access guarantees

Changes:
- Replace void *cwsr_kaddr with struct iosys_map cwsr_map
- Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
- Use iosys_map_wr() for writing trap handler addresses and flags
- Use iosys_map_memcpy_to() for copying CWSR ISA code

This ensures correct operation on all architectures while maintaining
backward compatibility with older GPUs and APUs that use GTT.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
 2 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ad4897f094a2..6e559aab4009 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -32,6 +32,7 @@
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
+#include <linux/iosys-map.h>
 #include <uapi/linux/kfd_ioctl.h>
 #include <linux/idr.h>
 #include <linux/kfifo.h>
@@ -710,7 +711,7 @@ struct qcm_process_device {
 
 	/* CWSR memory */
 	struct kgd_mem *cwsr_mem;
-	void *cwsr_kaddr;
+	struct iosys_map cwsr_map;
 	uint64_t cwsr_base;
 	uint64_t tba_addr;
 	uint64_t tma_addr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 8e701dcda8ec..7fd65c31afa2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -33,6 +33,7 @@
 #include <linux/mman.h>
 #include <linux/file.h>
 #include <linux/pm_runtime.h>
+#include <drm/ttm/ttm_bo.h>
 #include "amdgpu_amdkfd.h"
 #include "amdgpu.h"
 #include "amdgpu_reset.h"
@@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
 					       NULL);
 }
 
+static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
+			struct kfd_process_device *pdd, struct iosys_map *map)
+{
+	struct kfd_node *dev = pdd->dev;
+
+	if (map && !iosys_map_is_null(map)) {
+		amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
+		iosys_map_clear(map);
+	}
+
+	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
+	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
+					       NULL);
+}
+
 /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
  *	This function should be only called right after the process
  *	is created and when kfd_processes_mutex is still being held
@@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 		if (pdd->drm_file)
 			fput(pdd->drm_file);
 
-		if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
-			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
+		if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
+			free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
 				get_order(KFD_CWSR_TBA_TMA_SIZE));
 
 		idr_destroy(&pdd->alloc_idr);
@@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
 	void *kaddr;
 	int ret;
 
-	if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
+	if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
 		return 0;
 
 	if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
@@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
 		return ret;
 
 	qpd->cwsr_mem = mem;
-	qpd->cwsr_kaddr = kaddr;
+
+	/* Set up iosys_map based on whether memory is MMIO or system memory */
+	if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
+		iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
+	else
+		iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
+
 	qpd->tba_addr = qpd->cwsr_base;
 
-	memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
+	/* Copy CWSR ISA to buffer using appropriate accessor */
+	iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
+			    dev->kfd->cwsr_isa_size);
 
 	kfd_process_set_trap_debug_flag(&pdd->qpd,
 					pdd->process->debug_trap_enabled);
 
 	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
-	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
-		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
+	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
+		 qpd->tba_addr, qpd->tma_addr,
+		 qpd->cwsr_map.is_iomem ? "iomem" : "system",
+		 qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
+					  qpd->cwsr_map.vaddr);
 
 	return 0;
 }
@@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
 	struct kfd_node *dev = pdd->dev;
 	struct qcm_process_device *qpd = &pdd->qpd;
 
-	if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
+	if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
 		return;
 
-	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
+	kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
 }
 
 void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
 				  uint64_t tba_addr,
 				  uint64_t tma_addr)
 {
-	if (qpd->cwsr_kaddr) {
+	if (!iosys_map_is_null(&qpd->cwsr_map)) {
 		/* KFD trap handler is bound, record as second-level TBA/TMA
 		 * in first-level TMA. First-level trap will jump to second.
 		 */
-		uint64_t *tma =
-			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
-		tma[0] = tba_addr;
-		tma[1] = tma_addr;
+		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
+			     uint64_t, tba_addr);
+		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
+			     uint64_t, tma_addr);
 	} else {
 		/* No trap handler bound, bind as first-level TBA/TMA. */
 		qpd->tba_addr = tba_addr;
@@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
 void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
 				     bool enabled)
 {
-	if (qpd->cwsr_kaddr) {
-		uint64_t *tma =
-			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
-		tma[2] = enabled;
+	if (!iosys_map_is_null(&qpd->cwsr_map)) {
+		iosys_map_wr(&qpd->cwsr_map,
+			     KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
+			     uint64_t, enabled);
 	}
 }
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
  2026-06-29  0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
@ 2026-06-29  9:57 ` Christian König
  2026-06-29 17:54 ` Kuehling, Felix
  2026-06-29 18:37 ` Mario Limonciello
  2 siblings, 0 replies; 5+ messages in thread
From: Christian König @ 2026-06-29  9:57 UTC (permalink / raw)
  To: James Zhu, amd-gfx
  Cc: Felix.kuehling, Yifan1.Zhang, philip.yang, Harish.Kasiviswanathan,
	Bob.Zhou, jamesz, Claude Opus 4 . 6

On 6/29/26 02:50, James Zhu wrote:
> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
> 
> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
> produce incorrect results on non-x86 architectures because:
> - MMIO requires specific accessor functions (writeq/readq)
> - Compiler optimizations may generate invalid instruction sequences
> - No guarantee of proper memory barriers or atomic access
> 
> This patch converts CWSR buffer access to use struct iosys_map,
> which automatically handles both system memory (GTT) and MMIO
> (VRAM) correctly by:
> - Using writeq/writel/memcpy_toio for MMIO regions
> - Using WRITE_ONCE/memcpy for system memory
> - Providing proper memory barriers and access guarantees
> 
> Changes:
> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
> - Use iosys_map_wr() for writing trap handler addresses and flags
> - Use iosys_map_memcpy_to() for copying CWSR ISA code
> 
> This ensures correct operation on all architectures while maintaining
> backward compatibility with older GPUs and APUs that use GTT.
> 
> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>

Looks valid of hand, but I have no time for an in deep review. 
Somebody who knows the code should take another look as well.

But feel free to add Acked-by: Christian König <christian.koenig@amd.com>.

Regards,
Christian.

> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
>  2 files changed, 47 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ad4897f094a2..6e559aab4009 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -32,6 +32,7 @@
>  #include <linux/atomic.h>
>  #include <linux/workqueue.h>
>  #include <linux/spinlock.h>
> +#include <linux/iosys-map.h>
>  #include <uapi/linux/kfd_ioctl.h>
>  #include <linux/idr.h>
>  #include <linux/kfifo.h>
> @@ -710,7 +711,7 @@ struct qcm_process_device {
>  
>  	/* CWSR memory */
>  	struct kgd_mem *cwsr_mem;
> -	void *cwsr_kaddr;
> +	struct iosys_map cwsr_map;
>  	uint64_t cwsr_base;
>  	uint64_t tba_addr;
>  	uint64_t tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8e701dcda8ec..7fd65c31afa2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -33,6 +33,7 @@
>  #include <linux/mman.h>
>  #include <linux/file.h>
>  #include <linux/pm_runtime.h>
> +#include <drm/ttm/ttm_bo.h>
>  #include "amdgpu_amdkfd.h"
>  #include "amdgpu.h"
>  #include "amdgpu_reset.h"
> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
>  					       NULL);
>  }
>  
> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
> +			struct kfd_process_device *pdd, struct iosys_map *map)
> +{
> +	struct kfd_node *dev = pdd->dev;
> +
> +	if (map && !iosys_map_is_null(map)) {
> +		amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
> +		iosys_map_clear(map);
> +	}
> +
> +	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
> +	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
> +					       NULL);
> +}
> +
>  /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
>   *	This function should be only called right after the process
>   *	is created and when kfd_processes_mutex is still being held
> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
>  		if (pdd->drm_file)
>  			fput(pdd->drm_file);
>  
> -		if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
> -			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> +		if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
> +			free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
>  				get_order(KFD_CWSR_TBA_TMA_SIZE));
>  
>  		idr_destroy(&pdd->alloc_idr);
> @@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>  	void *kaddr;
>  	int ret;
>  
> -	if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
> +	if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>  		return 0;
>  
>  	if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
> @@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>  		return ret;
>  
>  	qpd->cwsr_mem = mem;
> -	qpd->cwsr_kaddr = kaddr;
> +
> +	/* Set up iosys_map based on whether memory is MMIO or system memory */
> +	if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
> +		iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
> +	else
> +		iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
> +
>  	qpd->tba_addr = qpd->cwsr_base;
>  
> -	memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
> +	/* Copy CWSR ISA to buffer using appropriate accessor */
> +	iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
> +			    dev->kfd->cwsr_isa_size);
>  
>  	kfd_process_set_trap_debug_flag(&pdd->qpd,
>  					pdd->process->debug_trap_enabled);
>  
>  	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> -	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> -		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> +	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
> +		 qpd->tba_addr, qpd->tma_addr,
> +		 qpd->cwsr_map.is_iomem ? "iomem" : "system",
> +		 qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
> +					  qpd->cwsr_map.vaddr);
>  
>  	return 0;
>  }
> @@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
>  	struct kfd_node *dev = pdd->dev;
>  	struct qcm_process_device *qpd = &pdd->qpd;
>  
> -	if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
> +	if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>  		return;
>  
> -	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
> +	kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
>  }
>  
>  void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>  				  uint64_t tba_addr,
>  				  uint64_t tma_addr)
>  {
> -	if (qpd->cwsr_kaddr) {
> +	if (!iosys_map_is_null(&qpd->cwsr_map)) {
>  		/* KFD trap handler is bound, record as second-level TBA/TMA
>  		 * in first-level TMA. First-level trap will jump to second.
>  		 */
> -		uint64_t *tma =
> -			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> -		tma[0] = tba_addr;
> -		tma[1] = tma_addr;
> +		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
> +			     uint64_t, tba_addr);
> +		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
> +			     uint64_t, tma_addr);
>  	} else {
>  		/* No trap handler bound, bind as first-level TBA/TMA. */
>  		qpd->tba_addr = tba_addr;
> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
>  void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>  				     bool enabled)
>  {
> -	if (qpd->cwsr_kaddr) {
> -		uint64_t *tma =
> -			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> -		tma[2] = enabled;
> +	if (!iosys_map_is_null(&qpd->cwsr_map)) {
> +		iosys_map_wr(&qpd->cwsr_map,
> +			     KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
> +			     uint64_t, enabled);
>  	}
>  }
>  


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
  2026-06-29  0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
  2026-06-29  9:57 ` Christian König
@ 2026-06-29 17:54 ` Kuehling, Felix
  2026-06-29 18:37 ` Mario Limonciello
  2 siblings, 0 replies; 5+ messages in thread
From: Kuehling, Felix @ 2026-06-29 17:54 UTC (permalink / raw)
  To: James Zhu, amd-gfx
  Cc: christian.koenig, Yifan1.Zhang, philip.yang,
	Harish.Kasiviswanathan, Bob.Zhou, jamesz

On 2026-06-28 20:50, James Zhu wrote:
> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
>
> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
> produce incorrect results on non-x86 architectures because:
> - MMIO requires specific accessor functions (writeq/readq)
> - Compiler optimizations may generate invalid instruction sequences
> - No guarantee of proper memory barriers or atomic access
>
> This patch converts CWSR buffer access to use struct iosys_map,
> which automatically handles both system memory (GTT) and MMIO
> (VRAM) correctly by:
> - Using writeq/writel/memcpy_toio for MMIO regions
> - Using WRITE_ONCE/memcpy for system memory
> - Providing proper memory barriers and access guarantees
>
> Changes:
> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
> - Use iosys_map_wr() for writing trap handler addresses and flags
> - Use iosys_map_memcpy_to() for copying CWSR ISA code
>
> This ensures correct operation on all architectures while maintaining
> backward compatibility with older GPUs and APUs that use GTT.
>
> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
>   2 files changed, 47 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ad4897f094a2..6e559aab4009 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -32,6 +32,7 @@
>   #include <linux/atomic.h>
>   #include <linux/workqueue.h>
>   #include <linux/spinlock.h>
> +#include <linux/iosys-map.h>
>   #include <uapi/linux/kfd_ioctl.h>
>   #include <linux/idr.h>
>   #include <linux/kfifo.h>
> @@ -710,7 +711,7 @@ struct qcm_process_device {
>   
>   	/* CWSR memory */
>   	struct kgd_mem *cwsr_mem;
> -	void *cwsr_kaddr;
> +	struct iosys_map cwsr_map;
>   	uint64_t cwsr_base;
>   	uint64_t tba_addr;
>   	uint64_t tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8e701dcda8ec..7fd65c31afa2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -33,6 +33,7 @@
>   #include <linux/mman.h>
>   #include <linux/file.h>
>   #include <linux/pm_runtime.h>
> +#include <drm/ttm/ttm_bo.h>
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu.h"
>   #include "amdgpu_reset.h"
> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
>   					       NULL);
>   }
>   
> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
> +			struct kfd_process_device *pdd, struct iosys_map *map)
> +{
> +	struct kfd_node *dev = pdd->dev;
> +
> +	if (map && !iosys_map_is_null(map)) {
> +		amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
> +		iosys_map_clear(map);
> +	}
> +
> +	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
> +	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
> +					       NULL);
> +}
> +
>   /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
>    *	This function should be only called right after the process
>    *	is created and when kfd_processes_mutex is still being held
> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
>   		if (pdd->drm_file)
>   			fput(pdd->drm_file);
>   
> -		if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
> -			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> +		if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
> +			free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
>   				get_order(KFD_CWSR_TBA_TMA_SIZE));
>   
>   		idr_destroy(&pdd->alloc_idr);
> @@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>   	void *kaddr;
>   	int ret;
>   
> -	if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
> +	if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>   		return 0;
>   
>   	if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
> @@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>   		return ret;
>   
>   	qpd->cwsr_mem = mem;
> -	qpd->cwsr_kaddr = kaddr;
> +
> +	/* Set up iosys_map based on whether memory is MMIO or system memory */
> +	if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
> +		iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
> +	else
> +		iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
> +
>   	qpd->tba_addr = qpd->cwsr_base;
>   
> -	memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
> +	/* Copy CWSR ISA to buffer using appropriate accessor */
> +	iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
> +			    dev->kfd->cwsr_isa_size);
>   
>   	kfd_process_set_trap_debug_flag(&pdd->qpd,
>   					pdd->process->debug_trap_enabled);
>   
>   	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> -	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> -		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> +	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
> +		 qpd->tba_addr, qpd->tma_addr,
> +		 qpd->cwsr_map.is_iomem ? "iomem" : "system",
> +		 qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
> +					  qpd->cwsr_map.vaddr);
>   
>   	return 0;
>   }
> @@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
>   	struct kfd_node *dev = pdd->dev;
>   	struct qcm_process_device *qpd = &pdd->qpd;
>   
> -	if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
> +	if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>   		return;
>   
> -	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
> +	kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
>   }
>   
>   void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr)
>   {
> -	if (qpd->cwsr_kaddr) {
> +	if (!iosys_map_is_null(&qpd->cwsr_map)) {
>   		/* KFD trap handler is bound, record as second-level TBA/TMA
>   		 * in first-level TMA. First-level trap will jump to second.
>   		 */
> -		uint64_t *tma =
> -			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> -		tma[0] = tba_addr;
> -		tma[1] = tma_addr;
> +		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
> +			     uint64_t, tba_addr);
> +		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
> +			     uint64_t, tma_addr);
>   	} else {
>   		/* No trap handler bound, bind as first-level TBA/TMA. */
>   		qpd->tba_addr = tba_addr;
> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
>   void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>   				     bool enabled)
>   {
> -	if (qpd->cwsr_kaddr) {
> -		uint64_t *tma =
> -			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> -		tma[2] = enabled;
> +	if (!iosys_map_is_null(&qpd->cwsr_map)) {
> +		iosys_map_wr(&qpd->cwsr_map,
> +			     KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
> +			     uint64_t, enabled);
>   	}
>   }
>   

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
  2026-06-29  0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
  2026-06-29  9:57 ` Christian König
  2026-06-29 17:54 ` Kuehling, Felix
@ 2026-06-29 18:37 ` Mario Limonciello
  2026-06-29 19:43   ` James Zhu
  2 siblings, 1 reply; 5+ messages in thread
From: Mario Limonciello @ 2026-06-29 18:37 UTC (permalink / raw)
  To: James Zhu, amd-gfx
  Cc: christian.koenig, Felix.kuehling, Yifan1.Zhang, philip.yang,
	Harish.Kasiviswanathan, Bob.Zhou, jamesz, Claude Opus 4 . 6



On 6/28/26 19:50, James Zhu wrote:
> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
> 
> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
> produce incorrect results on non-x86 architectures because:
> - MMIO requires specific accessor functions (writeq/readq)
> - Compiler optimizations may generate invalid instruction sequences
> - No guarantee of proper memory barriers or atomic access
> 
> This patch converts CWSR buffer access to use struct iosys_map,
> which automatically handles both system memory (GTT) and MMIO
> (VRAM) correctly by:
> - Using writeq/writel/memcpy_toio for MMIO regions
> - Using WRITE_ONCE/memcpy for system memory
> - Providing proper memory barriers and access guarantees
> 
> Changes:
> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
> - Use iosys_map_wr() for writing trap handler addresses and flags
> - Use iosys_map_memcpy_to() for copying CWSR ISA code
> 
> This ensures correct operation on all architectures while maintaining
> backward compatibility with older GPUs and APUs that use GTT.
> 
> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
The correct tags would be Co-developed-by and Assisted-by.
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
>   2 files changed, 47 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ad4897f094a2..6e559aab4009 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -32,6 +32,7 @@
>   #include <linux/atomic.h>
>   #include <linux/workqueue.h>
>   #include <linux/spinlock.h>
> +#include <linux/iosys-map.h>
>   #include <uapi/linux/kfd_ioctl.h>
>   #include <linux/idr.h>
>   #include <linux/kfifo.h>
> @@ -710,7 +711,7 @@ struct qcm_process_device {
>   
>   	/* CWSR memory */
>   	struct kgd_mem *cwsr_mem;
> -	void *cwsr_kaddr;
> +	struct iosys_map cwsr_map;
>   	uint64_t cwsr_base;
>   	uint64_t tba_addr;
>   	uint64_t tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8e701dcda8ec..7fd65c31afa2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -33,6 +33,7 @@
>   #include <linux/mman.h>
>   #include <linux/file.h>
>   #include <linux/pm_runtime.h>
> +#include <drm/ttm/ttm_bo.h>
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu.h"
>   #include "amdgpu_reset.h"
> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
>   					       NULL);
>   }
>   
> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
> +			struct kfd_process_device *pdd, struct iosys_map *map)
> +{
> +	struct kfd_node *dev = pdd->dev;
> +
> +	if (map && !iosys_map_is_null(map)) {
> +		amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
> +		iosys_map_clear(map);
> +	}
> +
> +	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
> +	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
> +					       NULL);
> +}
> +
>   /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
>    *	This function should be only called right after the process
>    *	is created and when kfd_processes_mutex is still being held
> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
>   		if (pdd->drm_file)
>   			fput(pdd->drm_file);
>   
> -		if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
> -			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> +		if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
> +			free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
>   				get_order(KFD_CWSR_TBA_TMA_SIZE));
>   
>   		idr_destroy(&pdd->alloc_idr);
> @@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>   	void *kaddr;
>   	int ret;
>   
> -	if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
> +	if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>   		return 0;
>   
>   	if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
> @@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>   		return ret;
>   
>   	qpd->cwsr_mem = mem;
> -	qpd->cwsr_kaddr = kaddr;
> +
> +	/* Set up iosys_map based on whether memory is MMIO or system memory */
> +	if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
> +		iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
> +	else
> +		iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
> +
>   	qpd->tba_addr = qpd->cwsr_base;
>   
> -	memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
> +	/* Copy CWSR ISA to buffer using appropriate accessor */
> +	iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
> +			    dev->kfd->cwsr_isa_size);
>   
>   	kfd_process_set_trap_debug_flag(&pdd->qpd,
>   					pdd->process->debug_trap_enabled);
>   
>   	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> -	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> -		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> +	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
> +		 qpd->tba_addr, qpd->tma_addr,
> +		 qpd->cwsr_map.is_iomem ? "iomem" : "system",
> +		 qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
> +					  qpd->cwsr_map.vaddr);
>   
>   	return 0;
>   }
> @@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
>   	struct kfd_node *dev = pdd->dev;
>   	struct qcm_process_device *qpd = &pdd->qpd;
>   
> -	if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
> +	if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>   		return;
>   
> -	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
> +	kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
>   }
>   
>   void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr)
>   {
> -	if (qpd->cwsr_kaddr) {
> +	if (!iosys_map_is_null(&qpd->cwsr_map)) {
>   		/* KFD trap handler is bound, record as second-level TBA/TMA
>   		 * in first-level TMA. First-level trap will jump to second.
>   		 */
> -		uint64_t *tma =
> -			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> -		tma[0] = tba_addr;
> -		tma[1] = tma_addr;
> +		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
> +			     uint64_t, tba_addr);
> +		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
> +			     uint64_t, tma_addr);
>   	} else {
>   		/* No trap handler bound, bind as first-level TBA/TMA. */
>   		qpd->tba_addr = tba_addr;
> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
>   void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>   				     bool enabled)
>   {
> -	if (qpd->cwsr_kaddr) {
> -		uint64_t *tma =
> -			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> -		tma[2] = enabled;
> +	if (!iosys_map_is_null(&qpd->cwsr_map)) {
> +		iosys_map_wr(&qpd->cwsr_map,
> +			     KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
> +			     uint64_t, enabled);
>   	}
>   }
>   


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
  2026-06-29 18:37 ` Mario Limonciello
@ 2026-06-29 19:43   ` James Zhu
  0 siblings, 0 replies; 5+ messages in thread
From: James Zhu @ 2026-06-29 19:43 UTC (permalink / raw)
  To: Mario Limonciello, James Zhu, amd-gfx
  Cc: christian.koenig, Felix.kuehling, Yifan1.Zhang, philip.yang,
	Harish.Kasiviswanathan, Bob.Zhou, Claude Opus 4 . 6


On 2026-06-29 14:37, Mario Limonciello wrote:
>
>
> On 6/28/26 19:50, James Zhu wrote:
>> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
>> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
>> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
>>
>> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
>> produce incorrect results on non-x86 architectures because:
>> - MMIO requires specific accessor functions (writeq/readq)
>> - Compiler optimizations may generate invalid instruction sequences
>> - No guarantee of proper memory barriers or atomic access
>>
>> This patch converts CWSR buffer access to use struct iosys_map,
>> which automatically handles both system memory (GTT) and MMIO
>> (VRAM) correctly by:
>> - Using writeq/writel/memcpy_toio for MMIO regions
>> - Using WRITE_ONCE/memcpy for system memory
>> - Providing proper memory barriers and access guarantees
>>
>> Changes:
>> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
>> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
>> - Use iosys_map_wr() for writing trap handler addresses and flags
>> - Use iosys_map_memcpy_to() for copying CWSR ISA code
>>
>> This ensures correct operation on all architectures while maintaining
>> backward compatibility with older GPUs and APUs that use GTT.
>>
>> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
>> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
> The correct tags would be Co-developed-by and Assisted-by.
JZ] Thanks! I will correct it.
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
>>   2 files changed, 47 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index ad4897f094a2..6e559aab4009 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -32,6 +32,7 @@
>>   #include <linux/atomic.h>
>>   #include <linux/workqueue.h>
>>   #include <linux/spinlock.h>
>> +#include <linux/iosys-map.h>
>>   #include <uapi/linux/kfd_ioctl.h>
>>   #include <linux/idr.h>
>>   #include <linux/kfifo.h>
>> @@ -710,7 +711,7 @@ struct qcm_process_device {
>>         /* CWSR memory */
>>       struct kgd_mem *cwsr_mem;
>> -    void *cwsr_kaddr;
>> +    struct iosys_map cwsr_map;
>>       uint64_t cwsr_base;
>>       uint64_t tba_addr;
>>       uint64_t tma_addr;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 8e701dcda8ec..7fd65c31afa2 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -33,6 +33,7 @@
>>   #include <linux/mman.h>
>>   #include <linux/file.h>
>>   #include <linux/pm_runtime.h>
>> +#include <drm/ttm/ttm_bo.h>
>>   #include "amdgpu_amdkfd.h"
>>   #include "amdgpu.h"
>>   #include "amdgpu_reset.h"
>> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct 
>> kgd_mem *mem,
>>                              NULL);
>>   }
>>   +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
>> +            struct kfd_process_device *pdd, struct iosys_map *map)
>> +{
>> +    struct kfd_node *dev = pdd->dev;
>> +
>> +    if (map && !iosys_map_is_null(map)) {
>> +        amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
>> +        iosys_map_clear(map);
>> +    }
>> +
>> +    amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, 
>> pdd->drm_priv);
>> +    amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, 
>> pdd->drm_priv,
>> +                           NULL);
>> +}
>> +
>>   /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
>>    *    This function should be only called right after the process
>>    *    is created and when kfd_processes_mutex is still being held
>> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct 
>> kfd_process *p)
>>           if (pdd->drm_file)
>>               fput(pdd->drm_file);
>>   -        if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
>> -            free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
>> +        if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && 
>> !pdd->qpd.cwsr_base)
>> +            free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
>>                   get_order(KFD_CWSR_TBA_TMA_SIZE));
>>             idr_destroy(&pdd->alloc_idr);
>> @@ -1501,7 +1517,7 @@ static int 
>> kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>>       void *kaddr;
>>       int ret;
>>   -    if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || 
>> !qpd->cwsr_base)
>> +    if (!dev->kfd->cwsr_enabled || 
>> !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>>           return 0;
>>         if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && 
>> !dev->adev->apu_prefer_gtt)
>> @@ -1516,17 +1532,28 @@ static int 
>> kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>>           return ret;
>>         qpd->cwsr_mem = mem;
>> -    qpd->cwsr_kaddr = kaddr;
>> +
>> +    /* Set up iosys_map based on whether memory is MMIO or system 
>> memory */
>> +    if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
>> +        iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
>> +    else
>> +        iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
>> +
>>       qpd->tba_addr = qpd->cwsr_base;
>>   -    memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, 
>> dev->kfd->cwsr_isa_size);
>> +    /* Copy CWSR ISA to buffer using appropriate accessor */
>> +    iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
>> +                dev->kfd->cwsr_isa_size);
>>         kfd_process_set_trap_debug_flag(&pdd->qpd,
>>                       pdd->process->debug_trap_enabled);
>>         qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
>> -    pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
>> -         qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>> +    pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for 
>> pqm.\n",
>> +         qpd->tba_addr, qpd->tma_addr,
>> +         qpd->cwsr_map.is_iomem ? "iomem" : "system",
>> +         qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
>> +                      qpd->cwsr_map.vaddr);
>>         return 0;
>>   }
>> @@ -1536,24 +1563,24 @@ static void 
>> kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
>>       struct kfd_node *dev = pdd->dev;
>>       struct qcm_process_device *qpd = &pdd->qpd;
>>   -    if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || 
>> !qpd->cwsr_base)
>> +    if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) 
>> || !qpd->cwsr_base)
>>           return;
>>   -    kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
>> +    kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
>>   }
>>     void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>>                     uint64_t tba_addr,
>>                     uint64_t tma_addr)
>>   {
>> -    if (qpd->cwsr_kaddr) {
>> +    if (!iosys_map_is_null(&qpd->cwsr_map)) {
>>           /* KFD trap handler is bound, record as second-level TBA/TMA
>>            * in first-level TMA. First-level trap will jump to second.
>>            */
>> -        uint64_t *tma =
>> -            (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
>> -        tma[0] = tba_addr;
>> -        tma[1] = tma_addr;
>> +        iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
>> +                 uint64_t, tba_addr);
>> +        iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + 
>> sizeof(uint64_t),
>> +                 uint64_t, tma_addr);
>>       } else {
>>           /* No trap handler bound, bind as first-level TBA/TMA. */
>>           qpd->tba_addr = tba_addr;
>> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct 
>> kfd_process *p, bool supported)
>>   void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>>                        bool enabled)
>>   {
>> -    if (qpd->cwsr_kaddr) {
>> -        uint64_t *tma =
>> -            (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
>> -        tma[2] = enabled;
>> +    if (!iosys_map_is_null(&qpd->cwsr_map)) {
>> +        iosys_map_wr(&qpd->cwsr_map,
>> +                 KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
>> +                 uint64_t, enabled);
>>       }
>>   }
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-06-29 19:43 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-29  0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
2026-06-29  9:57 ` Christian König
2026-06-29 17:54 ` Kuehling, Felix
2026-06-29 18:37 ` Mario Limonciello
2026-06-29 19:43   ` James Zhu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.