From: "Christian König" <christian.koenig@amd.com>
To: James Zhu <James.Zhu@amd.com>, amd-gfx@lists.freedesktop.org
Cc: Felix.kuehling@amd.com, Yifan1.Zhang@amd.com,
philip.yang@amd.com, Harish.Kasiviswanathan@amd.com,
Bob.Zhou@amd.com, jamesz@amd.com,
"Claude Opus 4 . 6" <noreply@anthropic.com>
Subject: Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
Date: Mon, 29 Jun 2026 11:57:08 +0200 [thread overview]
Message-ID: <bdc2d660-2e57-4a70-b34a-d28a78b7536d@amd.com> (raw)
In-Reply-To: <20260629005028.2907470-1-James.Zhu@amd.com>
On 6/29/26 02:50, James Zhu wrote:
> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
>
> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
> produce incorrect results on non-x86 architectures because:
> - MMIO requires specific accessor functions (writeq/readq)
> - Compiler optimizations may generate invalid instruction sequences
> - No guarantee of proper memory barriers or atomic access
>
> This patch converts CWSR buffer access to use struct iosys_map,
> which automatically handles both system memory (GTT) and MMIO
> (VRAM) correctly by:
> - Using writeq/writel/memcpy_toio for MMIO regions
> - Using WRITE_ONCE/memcpy for system memory
> - Providing proper memory barriers and access guarantees
>
> Changes:
> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
> - Use iosys_map_wr() for writing trap handler addresses and flags
> - Use iosys_map_memcpy_to() for copying CWSR ISA code
>
> This ensures correct operation on all architectures while maintaining
> backward compatibility with older GPUs and APUs that use GTT.
>
> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>
Looks valid of hand, but I have no time for an in deep review.
Somebody who knows the code should take another look as well.
But feel free to add Acked-by: Christian König <christian.koenig@amd.com>.
Regards,
Christian.
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
> 2 files changed, 47 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ad4897f094a2..6e559aab4009 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -32,6 +32,7 @@
> #include <linux/atomic.h>
> #include <linux/workqueue.h>
> #include <linux/spinlock.h>
> +#include <linux/iosys-map.h>
> #include <uapi/linux/kfd_ioctl.h>
> #include <linux/idr.h>
> #include <linux/kfifo.h>
> @@ -710,7 +711,7 @@ struct qcm_process_device {
>
> /* CWSR memory */
> struct kgd_mem *cwsr_mem;
> - void *cwsr_kaddr;
> + struct iosys_map cwsr_map;
> uint64_t cwsr_base;
> uint64_t tba_addr;
> uint64_t tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8e701dcda8ec..7fd65c31afa2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -33,6 +33,7 @@
> #include <linux/mman.h>
> #include <linux/file.h>
> #include <linux/pm_runtime.h>
> +#include <drm/ttm/ttm_bo.h>
> #include "amdgpu_amdkfd.h"
> #include "amdgpu.h"
> #include "amdgpu_reset.h"
> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
> NULL);
> }
>
> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
> + struct kfd_process_device *pdd, struct iosys_map *map)
> +{
> + struct kfd_node *dev = pdd->dev;
> +
> + if (map && !iosys_map_is_null(map)) {
> + amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
> + iosys_map_clear(map);
> + }
> +
> + amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
> + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
> + NULL);
> +}
> +
> /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
> * This function should be only called right after the process
> * is created and when kfd_processes_mutex is still being held
> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
> if (pdd->drm_file)
> fput(pdd->drm_file);
>
> - if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
> - free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> + if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
> + free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
> get_order(KFD_CWSR_TBA_TMA_SIZE));
>
> idr_destroy(&pdd->alloc_idr);
> @@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> void *kaddr;
> int ret;
>
> - if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
> + if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
> return 0;
>
> if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
> @@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
> return ret;
>
> qpd->cwsr_mem = mem;
> - qpd->cwsr_kaddr = kaddr;
> +
> + /* Set up iosys_map based on whether memory is MMIO or system memory */
> + if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
> + iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
> + else
> + iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
> +
> qpd->tba_addr = qpd->cwsr_base;
>
> - memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
> + /* Copy CWSR ISA to buffer using appropriate accessor */
> + iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
> + dev->kfd->cwsr_isa_size);
>
> kfd_process_set_trap_debug_flag(&pdd->qpd,
> pdd->process->debug_trap_enabled);
>
> qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> - pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> - qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
> + qpd->tba_addr, qpd->tma_addr,
> + qpd->cwsr_map.is_iomem ? "iomem" : "system",
> + qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
> + qpd->cwsr_map.vaddr);
>
> return 0;
> }
> @@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
> struct kfd_node *dev = pdd->dev;
> struct qcm_process_device *qpd = &pdd->qpd;
>
> - if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
> + if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
> return;
>
> - kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
> + kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
> }
>
> void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
> uint64_t tba_addr,
> uint64_t tma_addr)
> {
> - if (qpd->cwsr_kaddr) {
> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
> /* KFD trap handler is bound, record as second-level TBA/TMA
> * in first-level TMA. First-level trap will jump to second.
> */
> - uint64_t *tma =
> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> - tma[0] = tba_addr;
> - tma[1] = tma_addr;
> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
> + uint64_t, tba_addr);
> + iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
> + uint64_t, tma_addr);
> } else {
> /* No trap handler bound, bind as first-level TBA/TMA. */
> qpd->tba_addr = tba_addr;
> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
> void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
> bool enabled)
> {
> - if (qpd->cwsr_kaddr) {
> - uint64_t *tma =
> - (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> - tma[2] = enabled;
> + if (!iosys_map_is_null(&qpd->cwsr_map)) {
> + iosys_map_wr(&qpd->cwsr_map,
> + KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
> + uint64_t, enabled);
> }
> }
>
next prev parent reply other threads:[~2026-06-29 9:57 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-29 0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
2026-06-29 9:57 ` Christian König [this message]
2026-06-29 17:54 ` Kuehling, Felix
2026-06-29 18:37 ` Mario Limonciello
2026-06-29 19:43 ` James Zhu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=bdc2d660-2e57-4a70-b34a-d28a78b7536d@amd.com \
--to=christian.koenig@amd.com \
--cc=Bob.Zhou@amd.com \
--cc=Felix.kuehling@amd.com \
--cc=Harish.Kasiviswanathan@amd.com \
--cc=James.Zhu@amd.com \
--cc=Yifan1.Zhang@amd.com \
--cc=amd-gfx@lists.freedesktop.org \
--cc=jamesz@amd.com \
--cc=noreply@anthropic.com \
--cc=philip.yang@amd.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.