* [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
@ 2026-06-29 0:50 James Zhu
2026-06-29 9:57 ` Christian König
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: James Zhu @ 2026-06-29 0:50 UTC (permalink / raw)
To: amd-gfx
Cc: christian.koenig, Felix.kuehling, Yifan1.Zhang, philip.yang,
Harish.Kasiviswanathan, Bob.Zhou, jamesz, Claude Opus 4 . 6,
Yifan Zhang
After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
Direct writes like 'tma[2] = enabled' and memcpy() can fail or
produce incorrect results on non-x86 architectures because:
- MMIO requires specific accessor functions (writeq/readq)
- Compiler optimizations may generate invalid instruction sequences
- No guarantee of proper memory barriers or atomic access
This patch converts CWSR buffer access to use struct iosys_map,
which automatically handles both system memory (GTT) and MMIO
(VRAM) correctly by:
- Using writeq/writel/memcpy_toio for MMIO regions
- Using WRITE_ONCE/memcpy for system memory
- Providing proper memory barriers and access guarantees
Changes:
- Replace void *cwsr_kaddr with struct iosys_map cwsr_map
- Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
- Use iosys_map_wr() for writing trap handler addresses and flags
- Use iosys_map_memcpy_to() for copying CWSR ISA code
This ensures correct operation on all architectures while maintaining
backward compatibility with older GPUs and APUs that use GTT.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
Signed-off-by: James Zhu <James.Zhu@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +-
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
2 files changed, 47 insertions(+), 19 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ad4897f094a2..6e559aab4009 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -32,6 +32,7 @@
#include <linux/atomic.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
+#include <linux/iosys-map.h>
#include <uapi/linux/kfd_ioctl.h>
#include <linux/idr.h>
#include <linux/kfifo.h>
@@ -710,7 +711,7 @@ struct qcm_process_device {
/* CWSR memory */
struct kgd_mem *cwsr_mem;
- void *cwsr_kaddr;
+ struct iosys_map cwsr_map;
uint64_t cwsr_base;
uint64_t tba_addr;
uint64_t tma_addr;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 8e701dcda8ec..7fd65c31afa2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -33,6 +33,7 @@
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/pm_runtime.h>
+#include <drm/ttm/ttm_bo.h>
#include "amdgpu_amdkfd.h"
#include "amdgpu.h"
#include "amdgpu_reset.h"
@@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
NULL);
}
+static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
+ struct kfd_process_device *pdd, struct iosys_map *map)
+{
+ struct kfd_node *dev = pdd->dev;
+
+ if (map && !iosys_map_is_null(map)) {
+ amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
+ iosys_map_clear(map);
+ }
+
+ amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
+ amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
+ NULL);
+}
+
/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
* This function should be only called right after the process
* is created and when kfd_processes_mutex is still being held
@@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
if (pdd->drm_file)
fput(pdd->drm_file);
- if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
- free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
+ if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
+ free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
get_order(KFD_CWSR_TBA_TMA_SIZE));
idr_destroy(&pdd->alloc_idr);
@@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
void *kaddr;
int ret;
- if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
+ if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
return 0;
if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
@@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
return ret;
qpd->cwsr_mem = mem;
- qpd->cwsr_kaddr = kaddr;
+
+ /* Set up iosys_map based on whether memory is MMIO or system memory */
+ if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
+ iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
+ else
+ iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
+
qpd->tba_addr = qpd->cwsr_base;
- memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
+ /* Copy CWSR ISA to buffer using appropriate accessor */
+ iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
+ dev->kfd->cwsr_isa_size);
kfd_process_set_trap_debug_flag(&pdd->qpd,
pdd->process->debug_trap_enabled);
qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
- pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
- qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
+ pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
+ qpd->tba_addr, qpd->tma_addr,
+ qpd->cwsr_map.is_iomem ? "iomem" : "system",
+ qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
+ qpd->cwsr_map.vaddr);
return 0;
}
@@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
struct kfd_node *dev = pdd->dev;
struct qcm_process_device *qpd = &pdd->qpd;
- if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
+ if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
return;
- kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
+ kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
}
void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
uint64_t tba_addr,
uint64_t tma_addr)
{
- if (qpd->cwsr_kaddr) {
+ if (!iosys_map_is_null(&qpd->cwsr_map)) {
/* KFD trap handler is bound, record as second-level TBA/TMA
* in first-level TMA. First-level trap will jump to second.
*/
- uint64_t *tma =
- (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
- tma[0] = tba_addr;
- tma[1] = tma_addr;
+ iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
+ uint64_t, tba_addr);
+ iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
+ uint64_t, tma_addr);
} else {
/* No trap handler bound, bind as first-level TBA/TMA. */
qpd->tba_addr = tba_addr;
@@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
bool enabled)
{
- if (qpd->cwsr_kaddr) {
- uint64_t *tma =
- (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
- tma[2] = enabled;
+ if (!iosys_map_is_null(&qpd->cwsr_map)) {
+ iosys_map_wr(&qpd->cwsr_map,
+ KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
+ uint64_t, enabled);
}
}
--
2.43.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
2026-06-29 0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
@ 2026-06-29 9:57 ` Christian König
2026-06-29 17:54 ` Kuehling, Felix
2026-06-29 18:37 ` Mario Limonciello
2 siblings, 0 replies; 5+ messages in thread
From: Christian König @ 2026-06-29 9:57 UTC (permalink / raw)
To: James Zhu, amd-gfx
Cc: Felix.kuehling, Yifan1.Zhang, philip.yang, Harish.Kasiviswanathan,
Bob.Zhou, jamesz, Claude Opus 4 . 6
On 6/29/26 02:50, James Zhu wrote:
> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
>
> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
> produce incorrect results on non-x86 architectures because:
> - MMIO requires specific accessor functions (writeq/readq)
> - Compiler optimizations may generate invalid instruction sequences
> - No guarantee of proper memory barriers or atomic access
>
> This patch converts CWSR buffer access to use struct iosys_map,
> which automatically handles both system memory (GTT) and MMIO
> (VRAM) correctly by:
> - Using writeq/writel/memcpy_toio for MMIO regions
> - Using WRITE_ONCE/memcpy for system memory
> - Providing proper memory barriers and access guarantees
>
> Changes:
> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
> - Use iosys_map_wr() for writing trap handler addresses and flags
> - Use iosys_map_memcpy_to() for copying CWSR ISA code
>
> This ensures correct operation on all architectures while maintaining
> backward compatibility with older GPUs and APUs that use GTT.
>
> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
Looks valid offhand, but I have no time for an in-depth review.
Somebody who knows the code should take another look as well.
But feel free to add Acked-by: Christian König <christian.koenig@amd.com>.
Regards,
Christian.
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
> 2 files changed, 47 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ad4897f094a2..6e559aab4009 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -32,6 +32,7 @@
> #include <linux/atomic.h>
> #include <linux/workqueue.h>
> #include <linux/spinlock.h>
> +#include <linux/iosys-map.h>
> #include <uapi/linux/kfd_ioctl.h>
> #include <linux/idr.h>
> #include <linux/kfifo.h>
> @@ -710,7 +711,7 @@ struct qcm_process_device {
>
> /* CWSR memory */
> struct kgd_mem *cwsr_mem;
> - void *cwsr_kaddr;
> + struct iosys_map cwsr_map;
> uint64_t cwsr_base;
> uint64_t tba_addr;
> uint64_t tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8e701dcda8ec..7fd65c31afa2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -33,6 +33,7 @@
> #include <linux/mman.h>
> #include <linux/file.h>
> #include <linux/pm_runtime.h>
> +#include <drm/ttm/ttm_bo.h>
> #include "amdgpu_amdkfd.h"
> #include "amdgpu.h"
> #include "amdgpu_reset.h"
> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
> NULL);
> }
>
> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
> + struct kfd_process_device *pdd, struct iosys_map *map)
> +{
> + struct kfd_node *dev = pdd->dev;
> +
> + if (map && !iosys_map_is_null(map)) {
> + amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
> + iosys_map_clear(map);
> + }
> +
> + amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
> + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
> + NULL);
> +}
> +
> /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
> * This function should be only called right after the process
> * is created and when kfd_processes_mutex is still being held
> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
> if (pdd->drm_file)
> fput(pdd->drm_file);
>
> - if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
> - free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> + if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
> + free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
> get_order(KFD_CWSR_TBA_TMA_SIZE));
>
> idr_destroy(&pdd->alloc_idr);
> @@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> void *kaddr;
> int ret;
>
> - if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
> + if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
> return 0;
>
> if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
> @@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> return ret;
>
> qpd->cwsr_mem = mem;
> - qpd->cwsr_kaddr = kaddr;
> +
> + /* Set up iosys_map based on whether memory is MMIO or system memory */
> + if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
> + iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
> + else
> + iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
> +
> qpd->tba_addr = qpd->cwsr_base;
>
> - memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
> + /* Copy CWSR ISA to buffer using appropriate accessor */
> + iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
> + dev->kfd->cwsr_isa_size);
>
> kfd_process_set_trap_debug_flag(&pdd->qpd,
> pdd->process->debug_trap_enabled);
>
> qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> - pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> - qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
> + qpd->tba_addr, qpd->tma_addr,
> + qpd->cwsr_map.is_iomem ? "iomem" : "system",
> + qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
> + qpd->cwsr_map.vaddr);
>
> return 0;
> }
> @@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
> struct kfd_node *dev = pdd->dev;
> struct qcm_process_device *qpd = &pdd->qpd;
>
> - if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
> + if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
> return;
>
> - kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
> + kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
> }
>
> void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
> uint64_t tba_addr,
> uint64_t tma_addr)
> {
> - if (qpd->cwsr_kaddr) {
> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
> /* KFD trap handler is bound, record as second-level TBA/TMA
> * in first-level TMA. First-level trap will jump to second.
> */
> - uint64_t *tma =
> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> - tma[0] = tba_addr;
> - tma[1] = tma_addr;
> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
> + uint64_t, tba_addr);
> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
> + uint64_t, tma_addr);
> } else {
> /* No trap handler bound, bind as first-level TBA/TMA. */
> qpd->tba_addr = tba_addr;
> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
> void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
> bool enabled)
> {
> - if (qpd->cwsr_kaddr) {
> - uint64_t *tma =
> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> - tma[2] = enabled;
> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
> + iosys_map_wr(&qpd->cwsr_map,
> + KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
> + uint64_t, enabled);
> }
> }
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
2026-06-29 0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
2026-06-29 9:57 ` Christian König
@ 2026-06-29 17:54 ` Kuehling, Felix
2026-06-29 18:37 ` Mario Limonciello
2 siblings, 0 replies; 5+ messages in thread
From: Kuehling, Felix @ 2026-06-29 17:54 UTC (permalink / raw)
To: James Zhu, amd-gfx
Cc: christian.koenig, Yifan1.Zhang, philip.yang,
Harish.Kasiviswanathan, Bob.Zhou, jamesz
On 2026-06-28 20:50, James Zhu wrote:
> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
>
> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
> produce incorrect results on non-x86 architectures because:
> - MMIO requires specific accessor functions (writeq/readq)
> - Compiler optimizations may generate invalid instruction sequences
> - No guarantee of proper memory barriers or atomic access
>
> This patch converts CWSR buffer access to use struct iosys_map,
> which automatically handles both system memory (GTT) and MMIO
> (VRAM) correctly by:
> - Using writeq/writel/memcpy_toio for MMIO regions
> - Using WRITE_ONCE/memcpy for system memory
> - Providing proper memory barriers and access guarantees
>
> Changes:
> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
> - Use iosys_map_wr() for writing trap handler addresses and flags
> - Use iosys_map_memcpy_to() for copying CWSR ISA code
>
> This ensures correct operation on all architectures while maintaining
> backward compatibility with older GPUs and APUs that use GTT.
>
> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
> 2 files changed, 47 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ad4897f094a2..6e559aab4009 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -32,6 +32,7 @@
> #include <linux/atomic.h>
> #include <linux/workqueue.h>
> #include <linux/spinlock.h>
> +#include <linux/iosys-map.h>
> #include <uapi/linux/kfd_ioctl.h>
> #include <linux/idr.h>
> #include <linux/kfifo.h>
> @@ -710,7 +711,7 @@ struct qcm_process_device {
>
> /* CWSR memory */
> struct kgd_mem *cwsr_mem;
> - void *cwsr_kaddr;
> + struct iosys_map cwsr_map;
> uint64_t cwsr_base;
> uint64_t tba_addr;
> uint64_t tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8e701dcda8ec..7fd65c31afa2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -33,6 +33,7 @@
> #include <linux/mman.h>
> #include <linux/file.h>
> #include <linux/pm_runtime.h>
> +#include <drm/ttm/ttm_bo.h>
> #include "amdgpu_amdkfd.h"
> #include "amdgpu.h"
> #include "amdgpu_reset.h"
> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
> NULL);
> }
>
> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
> + struct kfd_process_device *pdd, struct iosys_map *map)
> +{
> + struct kfd_node *dev = pdd->dev;
> +
> + if (map && !iosys_map_is_null(map)) {
> + amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
> + iosys_map_clear(map);
> + }
> +
> + amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
> + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
> + NULL);
> +}
> +
> /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
> * This function should be only called right after the process
> * is created and when kfd_processes_mutex is still being held
> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
> if (pdd->drm_file)
> fput(pdd->drm_file);
>
> - if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
> - free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> + if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
> + free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
> get_order(KFD_CWSR_TBA_TMA_SIZE));
>
> idr_destroy(&pdd->alloc_idr);
> @@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> void *kaddr;
> int ret;
>
> - if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
> + if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
> return 0;
>
> if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
> @@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> return ret;
>
> qpd->cwsr_mem = mem;
> - qpd->cwsr_kaddr = kaddr;
> +
> + /* Set up iosys_map based on whether memory is MMIO or system memory */
> + if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
> + iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
> + else
> + iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
> +
> qpd->tba_addr = qpd->cwsr_base;
>
> - memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
> + /* Copy CWSR ISA to buffer using appropriate accessor */
> + iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
> + dev->kfd->cwsr_isa_size);
>
> kfd_process_set_trap_debug_flag(&pdd->qpd,
> pdd->process->debug_trap_enabled);
>
> qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> - pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> - qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
> + qpd->tba_addr, qpd->tma_addr,
> + qpd->cwsr_map.is_iomem ? "iomem" : "system",
> + qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
> + qpd->cwsr_map.vaddr);
>
> return 0;
> }
> @@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
> struct kfd_node *dev = pdd->dev;
> struct qcm_process_device *qpd = &pdd->qpd;
>
> - if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
> + if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
> return;
>
> - kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
> + kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
> }
>
> void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
> uint64_t tba_addr,
> uint64_t tma_addr)
> {
> - if (qpd->cwsr_kaddr) {
> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
> /* KFD trap handler is bound, record as second-level TBA/TMA
> * in first-level TMA. First-level trap will jump to second.
> */
> - uint64_t *tma =
> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> - tma[0] = tba_addr;
> - tma[1] = tma_addr;
> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
> + uint64_t, tba_addr);
> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
> + uint64_t, tma_addr);
> } else {
> /* No trap handler bound, bind as first-level TBA/TMA. */
> qpd->tba_addr = tba_addr;
> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
> void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
> bool enabled)
> {
> - if (qpd->cwsr_kaddr) {
> - uint64_t *tma =
> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> - tma[2] = enabled;
> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
> + iosys_map_wr(&qpd->cwsr_map,
> + KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
> + uint64_t, enabled);
> }
> }
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
2026-06-29 0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
2026-06-29 9:57 ` Christian König
2026-06-29 17:54 ` Kuehling, Felix
@ 2026-06-29 18:37 ` Mario Limonciello
2026-06-29 19:43 ` James Zhu
2 siblings, 1 reply; 5+ messages in thread
From: Mario Limonciello @ 2026-06-29 18:37 UTC (permalink / raw)
To: James Zhu, amd-gfx
Cc: christian.koenig, Felix.kuehling, Yifan1.Zhang, philip.yang,
Harish.Kasiviswanathan, Bob.Zhou, jamesz, Claude Opus 4 . 6
On 6/28/26 19:50, James Zhu wrote:
> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
>
> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
> produce incorrect results on non-x86 architectures because:
> - MMIO requires specific accessor functions (writeq/readq)
> - Compiler optimizations may generate invalid instruction sequences
> - No guarantee of proper memory barriers or atomic access
>
> This patch converts CWSR buffer access to use struct iosys_map,
> which automatically handles both system memory (GTT) and MMIO
> (VRAM) correctly by:
> - Using writeq/writel/memcpy_toio for MMIO regions
> - Using WRITE_ONCE/memcpy for system memory
> - Providing proper memory barriers and access guarantees
>
> Changes:
> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
> - Use iosys_map_wr() for writing trap handler addresses and flags
> - Use iosys_map_memcpy_to() for copying CWSR ISA code
>
> This ensures correct operation on all architectures while maintaining
> backward compatibility with older GPUs and APUs that use GTT.
>
> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
The correct tags would be Co-developed-by and Assisted-by.
> Signed-off-by: James Zhu <James.Zhu@amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
> 2 files changed, 47 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ad4897f094a2..6e559aab4009 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -32,6 +32,7 @@
> #include <linux/atomic.h>
> #include <linux/workqueue.h>
> #include <linux/spinlock.h>
> +#include <linux/iosys-map.h>
> #include <uapi/linux/kfd_ioctl.h>
> #include <linux/idr.h>
> #include <linux/kfifo.h>
> @@ -710,7 +711,7 @@ struct qcm_process_device {
>
> /* CWSR memory */
> struct kgd_mem *cwsr_mem;
> - void *cwsr_kaddr;
> + struct iosys_map cwsr_map;
> uint64_t cwsr_base;
> uint64_t tba_addr;
> uint64_t tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8e701dcda8ec..7fd65c31afa2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -33,6 +33,7 @@
> #include <linux/mman.h>
> #include <linux/file.h>
> #include <linux/pm_runtime.h>
> +#include <drm/ttm/ttm_bo.h>
> #include "amdgpu_amdkfd.h"
> #include "amdgpu.h"
> #include "amdgpu_reset.h"
> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
> NULL);
> }
>
> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
> + struct kfd_process_device *pdd, struct iosys_map *map)
> +{
> + struct kfd_node *dev = pdd->dev;
> +
> + if (map && !iosys_map_is_null(map)) {
> + amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
> + iosys_map_clear(map);
> + }
> +
> + amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
> + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
> + NULL);
> +}
> +
> /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
> * This function should be only called right after the process
> * is created and when kfd_processes_mutex is still being held
> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
> if (pdd->drm_file)
> fput(pdd->drm_file);
>
> - if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
> - free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> + if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
> + free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
> get_order(KFD_CWSR_TBA_TMA_SIZE));
>
> idr_destroy(&pdd->alloc_idr);
> @@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> void *kaddr;
> int ret;
>
> - if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
> + if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
> return 0;
>
> if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
> @@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> return ret;
>
> qpd->cwsr_mem = mem;
> - qpd->cwsr_kaddr = kaddr;
> +
> + /* Set up iosys_map based on whether memory is MMIO or system memory */
> + if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
> + iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
> + else
> + iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
> +
> qpd->tba_addr = qpd->cwsr_base;
>
> - memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
> + /* Copy CWSR ISA to buffer using appropriate accessor */
> + iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
> + dev->kfd->cwsr_isa_size);
>
> kfd_process_set_trap_debug_flag(&pdd->qpd,
> pdd->process->debug_trap_enabled);
>
> qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> - pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> - qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
> + qpd->tba_addr, qpd->tma_addr,
> + qpd->cwsr_map.is_iomem ? "iomem" : "system",
> + qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
> + qpd->cwsr_map.vaddr);
>
> return 0;
> }
> @@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
> struct kfd_node *dev = pdd->dev;
> struct qcm_process_device *qpd = &pdd->qpd;
>
> - if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
> + if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
> return;
>
> - kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
> + kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
> }
>
> void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
> uint64_t tba_addr,
> uint64_t tma_addr)
> {
> - if (qpd->cwsr_kaddr) {
> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
> /* KFD trap handler is bound, record as second-level TBA/TMA
> * in first-level TMA. First-level trap will jump to second.
> */
> - uint64_t *tma =
> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> - tma[0] = tba_addr;
> - tma[1] = tma_addr;
> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
> + uint64_t, tba_addr);
> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
> + uint64_t, tma_addr);
> } else {
> /* No trap handler bound, bind as first-level TBA/TMA. */
> qpd->tba_addr = tba_addr;
> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
> void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
> bool enabled)
> {
> - if (qpd->cwsr_kaddr) {
> - uint64_t *tma =
> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> - tma[2] = enabled;
> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
> + iosys_map_wr(&qpd->cwsr_map,
> + KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
> + uint64_t, enabled);
> }
> }
>
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
2026-06-29 18:37 ` Mario Limonciello
@ 2026-06-29 19:43 ` James Zhu
0 siblings, 0 replies; 5+ messages in thread
From: James Zhu @ 2026-06-29 19:43 UTC (permalink / raw)
To: Mario Limonciello, James Zhu, amd-gfx
Cc: christian.koenig, Felix.kuehling, Yifan1.Zhang, philip.yang,
Harish.Kasiviswanathan, Bob.Zhou, Claude Opus 4 . 6
On 2026-06-29 14:37, Mario Limonciello wrote:
>
>
> On 6/28/26 19:50, James Zhu wrote:
>> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
>> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
>> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
>>
>> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
>> produce incorrect results on non-x86 architectures because:
>> - MMIO requires specific accessor functions (writeq/readq)
>> - Compiler optimizations may generate invalid instruction sequences
>> - No guarantee of proper memory barriers or atomic access
>>
>> This patch converts CWSR buffer access to use struct iosys_map,
>> which automatically handles both system memory (GTT) and MMIO
>> (VRAM) correctly by:
>> - Using writeq/writel/memcpy_toio for MMIO regions
>> - Using WRITE_ONCE/memcpy for system memory
>> - Providing proper memory barriers and access guarantees
>>
>> Changes:
>> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
>> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
>> - Use iosys_map_wr() for writing trap handler addresses and flags
>> - Use iosys_map_memcpy_to() for copying CWSR ISA code
>>
>> This ensures correct operation on all architectures while maintaining
>> backward compatibility with older GPUs and APUs that use GTT.
>>
>> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
>> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
> The correct tags would be Co-developed-by and Assisted-by.
[JZ] Thanks! I will correct it.
>> Signed-off-by: James Zhu <James.Zhu@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +-
>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
>> 2 files changed, 47 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index ad4897f094a2..6e559aab4009 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -32,6 +32,7 @@
>> #include <linux/atomic.h>
>> #include <linux/workqueue.h>
>> #include <linux/spinlock.h>
>> +#include <linux/iosys-map.h>
>> #include <uapi/linux/kfd_ioctl.h>
>> #include <linux/idr.h>
>> #include <linux/kfifo.h>
>> @@ -710,7 +711,7 @@ struct qcm_process_device {
>> /* CWSR memory */
>> struct kgd_mem *cwsr_mem;
>> - void *cwsr_kaddr;
>> + struct iosys_map cwsr_map;
>> uint64_t cwsr_base;
>> uint64_t tba_addr;
>> uint64_t tma_addr;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 8e701dcda8ec..7fd65c31afa2 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -33,6 +33,7 @@
>> #include <linux/mman.h>
>> #include <linux/file.h>
>> #include <linux/pm_runtime.h>
>> +#include <drm/ttm/ttm_bo.h>
>> #include "amdgpu_amdkfd.h"
>> #include "amdgpu.h"
>> #include "amdgpu_reset.h"
>> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct
>> kgd_mem *mem,
>> NULL);
>> }
>> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
>> + struct kfd_process_device *pdd, struct iosys_map *map)
>> +{
>> + struct kfd_node *dev = pdd->dev;
>> +
>> + if (map && !iosys_map_is_null(map)) {
>> + amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
>> + iosys_map_clear(map);
>> + }
>> +
>> + amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem,
>> pdd->drm_priv);
>> + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem,
>> pdd->drm_priv,
>> + NULL);
>> +}
>> +
>> /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
>> * This function should be only called right after the process
>> * is created and when kfd_processes_mutex is still being held
>> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct
>> kfd_process *p)
>> if (pdd->drm_file)
>> fput(pdd->drm_file);
>> - if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
>> - free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
>> + if (!iosys_map_is_null(&pdd->qpd.cwsr_map) &&
>> !pdd->qpd.cwsr_base)
>> + free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
>> get_order(KFD_CWSR_TBA_TMA_SIZE));
>> idr_destroy(&pdd->alloc_idr);
>> @@ -1501,7 +1517,7 @@ static int
>> kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>> void *kaddr;
>> int ret;
>> - if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr ||
>> !qpd->cwsr_base)
>> + if (!dev->kfd->cwsr_enabled ||
>> !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>> return 0;
>> if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) &&
>> !dev->adev->apu_prefer_gtt)
>> @@ -1516,17 +1532,28 @@ static int
>> kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>> return ret;
>> qpd->cwsr_mem = mem;
>> - qpd->cwsr_kaddr = kaddr;
>> +
>> + /* Set up iosys_map based on whether memory is MMIO or system
>> memory */
>> + if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
>> + iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
>> + else
>> + iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
>> +
>> qpd->tba_addr = qpd->cwsr_base;
>> - memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa,
>> dev->kfd->cwsr_isa_size);
>> + /* Copy CWSR ISA to buffer using appropriate accessor */
>> + iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
>> + dev->kfd->cwsr_isa_size);
>> kfd_process_set_trap_debug_flag(&pdd->qpd,
>> pdd->process->debug_trap_enabled);
>> qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
>> - pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
>> - qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
>> + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for
>> pqm.\n",
>> + qpd->tba_addr, qpd->tma_addr,
>> + qpd->cwsr_map.is_iomem ? "iomem" : "system",
>> + qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
>> + qpd->cwsr_map.vaddr);
>> return 0;
>> }
>> @@ -1536,24 +1563,24 @@ static void
>> kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
>> struct kfd_node *dev = pdd->dev;
>> struct qcm_process_device *qpd = &pdd->qpd;
>> - if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr ||
>> !qpd->cwsr_base)
>> + if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map)
>> || !qpd->cwsr_base)
>> return;
>> - kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
>> + kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
>> }
>> void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>> uint64_t tba_addr,
>> uint64_t tma_addr)
>> {
>> - if (qpd->cwsr_kaddr) {
>> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
>> /* KFD trap handler is bound, record as second-level TBA/TMA
>> * in first-level TMA. First-level trap will jump to second.
>> */
>> - uint64_t *tma =
>> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
>> - tma[0] = tba_addr;
>> - tma[1] = tma_addr;
>> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
>> + uint64_t, tba_addr);
>> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET +
>> sizeof(uint64_t),
>> + uint64_t, tma_addr);
>> } else {
>> /* No trap handler bound, bind as first-level TBA/TMA. */
>> qpd->tba_addr = tba_addr;
>> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct
>> kfd_process *p, bool supported)
>> void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>> bool enabled)
>> {
>> - if (qpd->cwsr_kaddr) {
>> - uint64_t *tma =
>> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
>> - tma[2] = enabled;
>> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
>> + iosys_map_wr(&qpd->cwsr_map,
>> + KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
>> + uint64_t, enabled);
>> }
>> }
>
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2026-06-29 19:43 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-06-29 0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
2026-06-29 9:57 ` Christian König
2026-06-29 17:54 ` Kuehling, Felix
2026-06-29 18:37 ` Mario Limonciello
2026-06-29 19:43 ` James Zhu
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.