All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Kuehling, Felix" <felix.kuehling@amd.com>
To: James Zhu <James.Zhu@amd.com>, amd-gfx@lists.freedesktop.org
Cc: christian.koenig@amd.com, Yifan1.Zhang@amd.com,
	philip.yang@amd.com, Harish.Kasiviswanathan@amd.com,
	Bob.Zhou@amd.com, jamesz@amd.com
Subject: Re: [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access
Date: Mon, 29 Jun 2026 13:54:36 -0400	[thread overview]
Message-ID: <7f1ec051-74e2-4ac9-8ac8-9725ca82bb80@amd.com> (raw)
In-Reply-To: <20260629005028.2907470-1-James.Zhu@amd.com>

On 2026-06-28 20:50, James Zhu wrote:
> After moving TBA/TMA from GTT to VRAM for GFX9.4.2+ in commit
> 5088a1ba6d6d, direct pointer dereferences to CWSR buffers became
> unsafe because VRAM is accessed via MMIO (PCI BAR mappings).
>
> Direct writes like 'tma[2] = enabled' and memcpy() can fail or
> produce incorrect results on non-x86 architectures because:
> - MMIO requires specific accessor functions (writeq/readq)
> - Compiler optimizations may generate invalid instruction sequences
> - No guarantee of proper memory barriers or atomic access
>
> This patch converts CWSR buffer access to use struct iosys_map,
> which automatically handles both system memory (GTT) and MMIO
> (VRAM) correctly by:
> - Using writeq/writel/memcpy_toio for MMIO regions
> - Using WRITE_ONCE/memcpy for system memory
> - Providing proper memory barriers and access guarantees
>
> Changes:
> - Replace void *cwsr_kaddr with struct iosys_map cwsr_map
> - Detect MMIO vs system memory using TTM_BO_MAP_IOMEM_MASK
> - Use iosys_map_wr() for writing trap handler addresses and flags
> - Use iosys_map_memcpy_to() for copying CWSR ISA code
>
> This ensures correct operation on all architectures while maintaining
> backward compatibility with older GPUs and APUs that use GTT.
>
> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
> Co-Authored-By: Yifan Zhang <yifan1.zhang@amd.com>
> Signed-off-by: James Zhu <James.Zhu@amd.com>

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>


> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  3 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 63 +++++++++++++++++-------
>   2 files changed, 47 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index ad4897f094a2..6e559aab4009 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -32,6 +32,7 @@
>   #include <linux/atomic.h>
>   #include <linux/workqueue.h>
>   #include <linux/spinlock.h>
> +#include <linux/iosys-map.h>
>   #include <uapi/linux/kfd_ioctl.h>
>   #include <linux/idr.h>
>   #include <linux/kfifo.h>
> @@ -710,7 +711,7 @@ struct qcm_process_device {
>   
>   	/* CWSR memory */
>   	struct kgd_mem *cwsr_mem;
> -	void *cwsr_kaddr;
> +	struct iosys_map cwsr_map;
>   	uint64_t cwsr_base;
>   	uint64_t tba_addr;
>   	uint64_t tma_addr;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8e701dcda8ec..7fd65c31afa2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -33,6 +33,7 @@
>   #include <linux/mman.h>
>   #include <linux/file.h>
>   #include <linux/pm_runtime.h>
> +#include <drm/ttm/ttm_bo.h>
>   #include "amdgpu_amdkfd.h"
>   #include "amdgpu.h"
>   #include "amdgpu_reset.h"
> @@ -745,6 +746,21 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem,
>   					       NULL);
>   }
>   
> +static void kfd_process_free_gpuvm_map(struct kgd_mem *mem,
> +			struct kfd_process_device *pdd, struct iosys_map *map)
> +{
> +	struct kfd_node *dev = pdd->dev;
> +
> +	if (map && !iosys_map_is_null(map)) {
> +		amdgpu_amdkfd_gpuvm_unmap_bo_from_kernel(mem);
> +		iosys_map_clear(map);
> +	}
> +
> +	amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->adev, mem, pdd->drm_priv);
> +	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, mem, pdd->drm_priv,
> +					       NULL);
> +}
> +
>   /* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
>    *	This function should be only called right after the process
>    *	is created and when kfd_processes_mutex is still being held
> @@ -1192,8 +1208,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
>   		if (pdd->drm_file)
>   			fput(pdd->drm_file);
>   
> -		if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
> -			free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> +		if (!iosys_map_is_null(&pdd->qpd.cwsr_map) && !pdd->qpd.cwsr_base)
> +			free_pages((unsigned long)pdd->qpd.cwsr_map.vaddr,
>   				get_order(KFD_CWSR_TBA_TMA_SIZE));
>   
>   		idr_destroy(&pdd->alloc_idr);
> @@ -1501,7 +1517,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>   	void *kaddr;
>   	int ret;
>   
> -	if (!dev->kfd->cwsr_enabled || qpd->cwsr_kaddr || !qpd->cwsr_base)
> +	if (!dev->kfd->cwsr_enabled || !iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>   		return 0;
>   
>   	if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 2) && !dev->adev->apu_prefer_gtt)
> @@ -1516,17 +1532,28 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>   		return ret;
>   
>   	qpd->cwsr_mem = mem;
> -	qpd->cwsr_kaddr = kaddr;
> +
> +	/* Set up iosys_map based on whether memory is MMIO or system memory */
> +	if (mem->bo->kmap.bo_kmap_type & TTM_BO_MAP_IOMEM_MASK)
> +		iosys_map_set_vaddr_iomem(&qpd->cwsr_map, kaddr);
> +	else
> +		iosys_map_set_vaddr(&qpd->cwsr_map, kaddr);
> +
>   	qpd->tba_addr = qpd->cwsr_base;
>   
> -	memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
> +	/* Copy CWSR ISA to buffer using appropriate accessor */
> +	iosys_map_memcpy_to(&qpd->cwsr_map, 0, dev->kfd->cwsr_isa,
> +			    dev->kfd->cwsr_isa_size);
>   
>   	kfd_process_set_trap_debug_flag(&pdd->qpd,
>   					pdd->process->debug_trap_enabled);
>   
>   	qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> -	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> -		 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> +	pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_map:%s at %p for pqm.\n",
> +		 qpd->tba_addr, qpd->tma_addr,
> +		 qpd->cwsr_map.is_iomem ? "iomem" : "system",
> +		 qpd->cwsr_map.is_iomem ? (void *)qpd->cwsr_map.vaddr_iomem :
> +					  qpd->cwsr_map.vaddr);
>   
>   	return 0;
>   }
> @@ -1536,24 +1563,24 @@ static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd)
>   	struct kfd_node *dev = pdd->dev;
>   	struct qcm_process_device *qpd = &pdd->qpd;
>   
> -	if (!dev->kfd->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base)
> +	if (!dev->kfd->cwsr_enabled || iosys_map_is_null(&qpd->cwsr_map) || !qpd->cwsr_base)
>   		return;
>   
> -	kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, &qpd->cwsr_kaddr);
> +	kfd_process_free_gpuvm_map(qpd->cwsr_mem, pdd, &qpd->cwsr_map);
>   }
>   
>   void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
>   				  uint64_t tba_addr,
>   				  uint64_t tma_addr)
>   {
> -	if (qpd->cwsr_kaddr) {
> +	if (!iosys_map_is_null(&qpd->cwsr_map)) {
>   		/* KFD trap handler is bound, record as second-level TBA/TMA
>   		 * in first-level TMA. First-level trap will jump to second.
>   		 */
> -		uint64_t *tma =
> -			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> -		tma[0] = tba_addr;
> -		tma[1] = tma_addr;
> +		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET,
> +			     uint64_t, tba_addr);
> +		iosys_map_wr(&qpd->cwsr_map, KFD_CWSR_TMA_OFFSET + sizeof(uint64_t),
> +			     uint64_t, tma_addr);
>   	} else {
>   		/* No trap handler bound, bind as first-level TBA/TMA. */
>   		qpd->tba_addr = tba_addr;
> @@ -1619,10 +1646,10 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
>   void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
>   				     bool enabled)
>   {
> -	if (qpd->cwsr_kaddr) {
> -		uint64_t *tma =
> -			(uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
> -		tma[2] = enabled;
> +	if (!iosys_map_is_null(&qpd->cwsr_map)) {
> +		iosys_map_wr(&qpd->cwsr_map,
> +			     KFD_CWSR_TMA_OFFSET + 2 * sizeof(uint64_t),
> +			     uint64_t, enabled);
>   	}
>   }
>   

  parent reply	other threads:[~2026-06-29 17:54 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-29  0:50 [PATCH] drm/amdkfd: use iosys_map for CWSR buffer access James Zhu
2026-06-29  9:57 ` Christian König
2026-06-29 17:54 ` Kuehling, Felix [this message]
2026-06-29 18:37 ` Mario Limonciello
2026-06-29 19:43   ` James Zhu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=7f1ec051-74e2-4ac9-8ac8-9725ca82bb80@amd.com \
    --to=felix.kuehling@amd.com \
    --cc=Bob.Zhou@amd.com \
    --cc=Harish.Kasiviswanathan@amd.com \
    --cc=James.Zhu@amd.com \
    --cc=Yifan1.Zhang@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    --cc=jamesz@amd.com \
    --cc=philip.yang@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.