AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 01/11] drm/amdgpu/sdma: add SDMA usermode-queue doorbell pool infra
@ 2026-04-27  8:34 Jesse Zhang
  2026-04-27  8:34 ` [PATCH v2 02/11] drm/amdgpu/userq: route SDMA UMQ doorbells through the kernel pool Jesse Zhang
                   ` (10 more replies)
  0 siblings, 11 replies; 14+ messages in thread
From: Jesse Zhang @ 2026-04-27  8:34 UTC (permalink / raw)
  To: amd-gfx; +Cc: Alexander.Deucher, Christian Koenig, Jesse Zhang

Add a per-device qword-slot pool covering the firmware-managed NBIO
SDMA decode window (BAR dwords [sdma_engine[0],
sdma_engine[0] + sdma_doorbell_range * num_instances)) — the only
range whose writes are routed to the SDMA back-end.  Kernel SDMA ring
slots are pre-masked at init.

The window is exposed to userspace as a custom drm_gem_object: no TTM
backing, custom .mmap callback that does io_remap_pfn_range from the
SDMA decode window's BAR address.  Per-fpriv GEM handles for that BO
can be minted on demand via amdgpu_sdma_userq_doorbell_create_handle()
so userspace mmap()s through the standard drm_gem_mmap path — no
file_operations override and no fixed mmap pgoff sentinel.

Slots are allocated/freed via amdgpu_sdma_userq_doorbell_alloc/free.
The init/fini and the AMDGPU_INFO_USERQ_DOORBELL ioctl that uses
create_handle land in subsequent patches.

Suggested-by: Prike Liang <Prike.Liang@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 164 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  55 ++++++++
 2 files changed, 219 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 321310ba2c08..1c61761c0046 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -22,6 +22,8 @@
  */
 
 #include <linux/firmware.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_file.h>
 #include "amdgpu.h"
 #include "amdgpu_sdma.h"
 #include "amdgpu_ras.h"
@@ -200,6 +202,168 @@ void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
 	       sizeof(struct amdgpu_sdma_instance) * AMDGPU_MAX_SDMA_INSTANCES);
 }
 
+static int amdgpu_sdma_userq_db_obj_mmap(struct drm_gem_object *obj,
+					 struct vm_area_struct *vma)
+{
+	struct amdgpu_sdma_userq_db_obj *db = to_amdgpu_sdma_userq_db(obj);
+
+	/*
+	 * db->phys_base >> PAGE_SHIFT silently rounds down to the
+	 * containing page; if the SDMA decode window does not start on a
+	 * page boundary that would expose the doorbells that precede it.
+	 * Refuse the mapping instead of leaking adjacent doorbell space.
+	 */
+	if (!PAGE_ALIGNED(db->phys_base))
+		return -EINVAL;
+
+	/*
+	 * Cap the mapping at the page-rounded window size.  NOTE(review):
+	 * when db->size is not a page multiple the trailing page slack
+	 * still maps dwords past the window — confirm the range ends on a
+	 * page boundary on all supported ASICs.
+	 */
+	if (vma->vm_end - vma->vm_start > round_up(db->size, PAGE_SIZE))
+		return -EINVAL;
+
+	vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
+		     VM_DONTDUMP | VM_PFNMAP);
+	/* Doorbells are MMIO: must be mapped uncached. */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	return io_remap_pfn_range(vma, vma->vm_start,
+				  db->phys_base >> PAGE_SHIFT,
+				  vma->vm_end - vma->vm_start,
+				  vma->vm_page_prot);
+}
+
+/* Called when the last reference to the doorbell-window GEM object is
+ * dropped: release GEM core state (mmap offset etc.), then free the
+ * embedding amdgpu_sdma_userq_db_obj allocated in doorbell_init. */
+static void amdgpu_sdma_userq_db_obj_free(struct drm_gem_object *obj)
+{
+	struct amdgpu_sdma_userq_db_obj *db = to_amdgpu_sdma_userq_db(obj);
+
+	drm_gem_object_release(obj);
+	kfree(db);
+}
+
+/* Bare GEM object (no TTM backing): .mmap remaps the doorbell BAR window
+ * directly, .free tears down the wrapper struct. */
+static const struct drm_gem_object_funcs amdgpu_sdma_userq_db_obj_funcs = {
+	.free = amdgpu_sdma_userq_db_obj_free,
+	.mmap = amdgpu_sdma_userq_db_obj_mmap,
+};
+
+/*
+ * Set up the per-device SDMA UMQ doorbell pool: one qword-slot bitmap
+ * covering the SDMA decode window plus a private GEM object that exposes
+ * the window to userspace.  Returns 0 when the device has no SDMA UMQ
+ * support (nothing to do) or on success; negative errno otherwise.
+ */
+int amdgpu_sdma_userq_doorbell_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_sdma_userq_db_obj *db;
+	u32 base_dw, size_dw, nslots, ring_dw;
+	int i, r;
+
+	if (!adev->userq_funcs[AMDGPU_HW_IP_DMA])
+		return 0;
+
+	/* sdma_engine[] indices are qwords; << 1 converts to BAR dwords. */
+	base_dw = adev->doorbell_index.sdma_engine[0] << 1;
+	size_dw = adev->doorbell_index.sdma_doorbell_range *
+		  adev->sdma.num_instances;
+	nslots  = size_dw / 2;	/* qword slots */
+	if (!nslots)
+		return 0;
+
+	db = kzalloc(sizeof(*db), GFP_KERNEL);
+	if (!db)
+		return -ENOMEM;
+
+	db->phys_base = adev->doorbell.base +
+			(resource_size_t)base_dw * sizeof(u32);
+	db->size      = size_dw * sizeof(u32);
+	/* funcs must be set before the first drm_gem_object_put below. */
+	db->base.funcs = &amdgpu_sdma_userq_db_obj_funcs;
+
+	drm_gem_private_object_init(adev_to_drm(adev), &db->base,
+				    round_up(db->size, PAGE_SIZE));
+	r = drm_gem_create_mmap_offset(&db->base);
+	if (r) {
+		drm_gem_object_put(&db->base);
+		return r;
+	}
+
+	mutex_init(&adev->sdma.userq_db_mutex);
+	adev->sdma.userq_db_bitmap = bitmap_zalloc(nslots, GFP_KERNEL);
+	if (!adev->sdma.userq_db_bitmap) {
+		/* Unwind the mutex_init() above on the error path. */
+		mutex_destroy(&adev->sdma.userq_db_mutex);
+		drm_gem_object_put(&db->base);
+		return -ENOMEM;
+	}
+
+	adev->sdma.userq_db_obj    = db;
+	adev->sdma.userq_db_nslots = nslots;
+
+	/*
+	 * Mask out the qword slots used by the kernel SDMA rings
+	 * (sdma_engine[i] << 1 in absolute BAR dwords => qword slot
+	 * (sdma_engine[i] - sdma_engine[0]) within this window).
+	 */
+	for (i = 0; i < adev->sdma.num_instances; i++) {
+		ring_dw = adev->doorbell_index.sdma_engine[i] << 1;
+		if (ring_dw >= base_dw && ring_dw < base_dw + size_dw)
+			set_bit((ring_dw - base_dw) / 2,
+				adev->sdma.userq_db_bitmap);
+	}
+
+	dev_info(adev->dev,
+		 "SDMA UMQ doorbell pool: %u qword slots in BAR dword [%u, %u)\n",
+		 nslots, base_dw, base_dw + size_dw);
+	return 0;
+}
+
+/*
+ * Tear down the SDMA UMQ doorbell pool.  Safe to call when init was
+ * skipped or failed: init only publishes userq_db_obj on full success,
+ * so a NULL obj means there is nothing to undo.
+ */
+void amdgpu_sdma_userq_doorbell_fini(struct amdgpu_device *adev)
+{
+	if (!adev->sdma.userq_db_obj)
+		return;
+	bitmap_free(adev->sdma.userq_db_bitmap);
+	adev->sdma.userq_db_bitmap = NULL;
+	adev->sdma.userq_db_nslots = 0;
+	/* Drop the init-time reference; the object is only freed once any
+	 * userspace handles minted via create_handle are gone as well. */
+	drm_gem_object_put(&adev->sdma.userq_db_obj->base);
+	adev->sdma.userq_db_obj = NULL;
+	/* Pairs with mutex_init() in amdgpu_sdma_userq_doorbell_init(). */
+	mutex_destroy(&adev->sdma.userq_db_mutex);
+}
+
+/*
+ * Allocate one qword doorbell slot.  On success, *out_slot receives the
+ * slot id (also the qword index inside the userspace mmap of the window
+ * BO) which the caller passes back to free.
+ */
+int amdgpu_sdma_userq_doorbell_alloc(struct amdgpu_device *adev, u32 *out_slot)
+{
+	u32 slot;
+
+	/* Pool was never initialized (no SDMA UMQ support) or is empty. */
+	if (!adev->sdma.userq_db_obj || !adev->sdma.userq_db_nslots)
+		return -ENODEV;
+
+	mutex_lock(&adev->sdma.userq_db_mutex);
+	slot = find_first_zero_bit(adev->sdma.userq_db_bitmap,
+				   adev->sdma.userq_db_nslots);
+	if (slot >= adev->sdma.userq_db_nslots) {
+		/* Every slot taken: kernel rings + already-allocated UMQs. */
+		mutex_unlock(&adev->sdma.userq_db_mutex);
+		return -ENOSPC;
+	}
+	set_bit(slot, adev->sdma.userq_db_bitmap);
+	mutex_unlock(&adev->sdma.userq_db_mutex);
+
+	*out_slot = slot;
+	return 0;
+}
+
+/* Return a slot handed out by amdgpu_sdma_userq_doorbell_alloc().
+ * Tolerates calls after fini (obj NULL) and out-of-range slot ids.
+ * NOTE(review): a bogus in-range id would clear a kernel-ring mask bit —
+ * callers must only pass ids they got from _alloc(). */
+void amdgpu_sdma_userq_doorbell_free(struct amdgpu_device *adev, u32 slot)
+{
+	if (!adev->sdma.userq_db_obj)
+		return;
+	if (slot >= adev->sdma.userq_db_nslots)
+		return;
+	mutex_lock(&adev->sdma.userq_db_mutex);
+	clear_bit(slot, adev->sdma.userq_db_bitmap);
+	mutex_unlock(&adev->sdma.userq_db_mutex);
+}
+
+/*
+ * Mint a per-fpriv GEM handle for the per-device SDMA UMQ doorbell BO.
+ * Userspace then uses standard GEM_MMAP / mmap() on /dev/dri/cardN to
+ * obtain a CPU pointer to the routable doorbell window.
+ */
+int amdgpu_sdma_userq_doorbell_create_handle(struct amdgpu_device *adev,
+					     struct drm_file *filp,
+					     u32 *handle, u32 *size_bytes)
+{
+	int r;
+
+	if (!adev->sdma.userq_db_obj)
+		return -ENODEV;
+
+	/* The handle takes its own GEM reference, so the window BO stays
+	 * alive until both the handle and the init-time ref are dropped. */
+	r = drm_gem_handle_create(filp, &adev->sdma.userq_db_obj->base, handle);
+	if (r)
+		return r;
+
+	/* Exact window size in bytes; the mmap itself is page-rounded. */
+	*size_bytes = adev->sdma.userq_db_obj->size;
+	return 0;
+}
+
 int amdgpu_sdma_init_microcode(struct amdgpu_device *adev,
 			       u32 instance, bool duplicate)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 2bf365609775..93a7eb9746d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -146,6 +146,20 @@ struct amdgpu_sdma {
 	bool			disable_uq;
 	void (*get_csa_info)(struct amdgpu_device *adev,
 			     struct amdgpu_sdma_csa_info *csa_info);
+
+	/*
+	 * SDMA usermode-queue doorbell pool.  The window covers
+	 * BAR dwords [sdma_engine[0], sdma_engine[0] +
+	 * sdma_doorbell_range * num_instances) — the only range that NBIO
+	 * routes to the SDMA back-end.  Each bit in the bitmap represents
+	 * one qword slot; kernel SDMA ring slots are pre-masked at init.
+	 * The window is exposed to userspace as a custom drm_gem_object
+	 * (userq_db_obj) that is mmap'd via standard GEM_MMAP.
+	 */
+	struct amdgpu_sdma_userq_db_obj *userq_db_obj;
+	struct mutex		userq_db_mutex;
+	unsigned long		*userq_db_bitmap;
+	u32			userq_db_nslots;	/* qword slots */
 };
 
 /*
@@ -185,6 +199,38 @@ struct amdgpu_buffer_funcs {
 				 uint32_t byte_count);
 };
 
+/*
+ * SDMA usermode-queue doorbell pool.
+ *
+ * The pool re-uses qword doorbell slots inside the firmware-managed NBIO
+ * SDMA decode window (BAR dwords [sdma_engine[0],
+ * sdma_engine[0] + sdma_doorbell_range * num_instances)) — that range is
+ * the only one whose writes are routed to the SDMA back-end.  The kernel
+ * SDMA ring slots are pre-marked so they keep working alongside any
+ * number of SDMA UMQs.
+ *
+ * The window is exposed to userspace via a per-device drm_gem_object that
+ * userspace mmap()s through the standard GEM_MMAP path; per-fpriv handles
+ * are minted on demand by the AMDGPU_INFO_SDMA_USERQ_DOORBELL ioctl.  No
+ * file_operations override and no fixed mmap pgoff sentinel.
+ *
+ * FIXME: KFD's SDMA queue doorbells (kgd_*_hqd_sdma_get_doorbell on chips
+ * with a non-stub implementation, e.g. gfx9.4.3) are computed
+ * from the same adev->doorbell_index.sdma_engine[] array and would
+ * overlap with this pool.  On gfx12 the kgd hook stubs to 0, so there is
+ * no immediate conflict.  A shared per-adev allocator that both
+ * KFD and amdgpu UMQ call into is the longer-term fix.
+ */
+
+/* GEM wrapper for the per-device SDMA UMQ doorbell window.  Freed from
+ * the object's .free callback when the last reference (init-time ref
+ * plus any userspace handles) is dropped. */
+struct amdgpu_sdma_userq_db_obj {
+	struct drm_gem_object	base;
+	resource_size_t		phys_base;	/* BAR phys addr of window start */
+	u32			size;		/* window size in bytes */
+};
+
+/* Recover the wrapper from the embedded drm_gem_object. */
+#define to_amdgpu_sdma_userq_db(_obj) \
+	container_of(_obj, struct amdgpu_sdma_userq_db_obj, base)
+
 int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id,
 			     bool caller_handles_kernel_queues);
 
@@ -205,6 +251,15 @@ int amdgpu_sdma_process_ecc_irq(struct amdgpu_device *adev,
 				      struct amdgpu_iv_entry *entry);
 int amdgpu_sdma_init_microcode(struct amdgpu_device *adev, u32 instance,
 			       bool duplicate);
+struct drm_file;
+struct amdgpu_sdma_userq_db_obj;
+int amdgpu_sdma_userq_doorbell_init(struct amdgpu_device *adev);
+void amdgpu_sdma_userq_doorbell_fini(struct amdgpu_device *adev);
+int amdgpu_sdma_userq_doorbell_alloc(struct amdgpu_device *adev, u32 *out_slot);
+void amdgpu_sdma_userq_doorbell_free(struct amdgpu_device *adev, u32 slot);
+int amdgpu_sdma_userq_doorbell_create_handle(struct amdgpu_device *adev,
+					     struct drm_file *filp,
+					     u32 *handle, u32 *size_bytes);
 void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device *adev,
         bool duplicate);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2026-04-28  9:39 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-27  8:34 [PATCH v2 01/11] drm/amdgpu/sdma: add SDMA usermode-queue doorbell pool infra Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 02/11] drm/amdgpu/userq: route SDMA UMQ doorbells through the kernel pool Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 03/11] drm/amdgpu/gem: only enforce amdgpu_bo access checks on amdgpu_bo objects Jesse Zhang
2026-04-27  8:39   ` Christian König
2026-04-27  8:34 ` [PATCH v2 04/11] drm/amdgpu/sdma7: register SDMA UMQ doorbell pool Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 05/11] drm/amdgpu/sdma6: " Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 06/11] drm/amdgpu: add AMDGPU_INFO_USERQ_DOORBELL ioctl Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 07/11] drm/amdgpu/mes: add NOTIFY_WORK_ON_UNMAPPED_QUEUE op + ADD_QUEUE fields Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 08/11] drm/amdgpu/mes11: plumb unmap_flag_addr + NOTIFY_WORK_ON_UNMAPPED_QUEUE Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 09/11] drm/amdgpu/mes12: plumb is_user_mode_submission, unmap_flag_addr, NOTIFY Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 10/11] drm/amdgpu/mes_userqueue: mark SDMA UMQs as user-mode submission Jesse Zhang
2026-04-27  8:34 ` [PATCH v2 11/11] drm/amdgpu/userq_fence: wake gangs-out SDMA UMQs via NOTIFY Jesse Zhang
2026-04-27  8:42 ` [PATCH v2 01/11] drm/amdgpu/sdma: add SDMA usermode-queue doorbell pool infra Christian König
2026-04-28  9:39   ` Zhang, Jesse(Jie)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox