AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Jesse.Zhang <Jesse.Zhang@amd.com>
To: <amd-gfx@lists.freedesktop.org>
Cc: <Alexander.Deucher@amd.com>,
	Christian Koenig <christian.koenig@amd.com>,
	 Jesse.Zhang <Jesse.Zhang@amd.com>,
	Alex Deucher <alexander.deucher@amd.com>
Subject: [PATCH 2/3] drm/amdgpu: Add user queue reset mask support
Date: Fri, 24 Oct 2025 17:43:41 +0800	[thread overview]
Message-ID: <20251024094445.3090110-2-Jesse.Zhang@amd.com> (raw)
In-Reply-To: <20251024094445.3090110-1-Jesse.Zhang@amd.com>

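This commit tracks the reset capabilities of user mode queues for each
IP block (GFX, Compute, SDMA) in a new per-device array,
adev->userq_supported_reset[], populated during gfx v11/v12 and
sdma v6/v7 sw_init, and exposes them to userspace.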

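Userspace can query these capabilities through the new
gfx_userq_reset_mask, compute_userq_reset_mask and sdma_userq_reset_mask
sysfs files. In addition, amdgpu_userq_detect_and_reset_queues() now
skips any queue type that does not advertise AMDGPU_RESET_TYPE_PER_QUEUE,
so per-queue resets are only attempted when supported by the hardware
and driver.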

Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 44 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 21 +++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 13 +++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c     | 17 +++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c     | 12 ++++++
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c     | 34 ++++++++++-------
 drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c     | 24 ++++++++----
 9 files changed, 163 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d0fb4eb1d7c4..48b21863065e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1182,6 +1182,7 @@ struct amdgpu_device {
 	 * Value: struct amdgpu_usermode_queue
 	 */
 	struct xarray userq_doorbell_xa;
+	u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX];
 
 	/* df */
 	struct amdgpu_df                df;
@@ -1612,6 +1613,8 @@ struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
 						  struct amdgpu_ring *ring,
 						  struct amdgpu_job *job);
 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
+ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
+				    int ring_type);
 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8480b72258f2..a0064c5314df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
 	if (!ring || !ring->adev)
 		return size;
 
-	if (amdgpu_device_should_recover_gpu(ring->adev))
+	if (amdgpu_device_should_recover_gpu(ring->adev) &&
+	    unlikely(!ring->adev->debug_disable_gpu_ring_reset))
 		size |= AMDGPU_RESET_TYPE_FULL;
 
 	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
@@ -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
 	return size;
 }
 
+ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev, int ring_type)
+{
+	ssize_t size = 0;
+
+	if (!adev || !adev->userq_funcs[ring_type])
+		return size;
+
+	if (amdgpu_device_should_recover_gpu(adev) &&
+	    unlikely(!adev->debug_disable_gpu_ring_reset))
+		size |= AMDGPU_RESET_TYPE_FULL;
+
+	return size;
+}
+
 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
 {
 	ssize_t size = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 3d24f9cd750a..5597753ec61a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1826,6 +1826,32 @@ static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
 	return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
 }
 
+static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	if (!adev)
+		return -ENODEV;
+
+	return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_GFX]);
+}
+
+static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	if (!adev)
+		return -ENODEV;
+
+	return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]);
+}
+
 static DEVICE_ATTR(run_cleaner_shader, 0200,
 		   NULL, amdgpu_gfx_set_run_cleaner_shader);
 
@@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
 static DEVICE_ATTR(compute_reset_mask, 0444,
 		   amdgpu_gfx_get_compute_reset_mask, NULL);
 
+static DEVICE_ATTR(gfx_userq_reset_mask, 0444,
+		   amdgpu_userq_get_gfx_reset_mask, NULL);
+
+static DEVICE_ATTR(compute_userq_reset_mask, 0444,
+		   amdgpu_userq_get_compute_reset_mask, NULL);
+
 static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
@@ -1928,6 +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev)
 			return r;
 	}
 
+	if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) {
+		r = device_create_file(adev->dev, &dev_attr_gfx_userq_reset_mask);
+		if (r)
+			return r;
+	}
+
+	if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) {
+		r = device_create_file(adev->dev, &dev_attr_compute_userq_reset_mask);
+		if (r)
+			return r;
+	}
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8b8a04138711..2fb288b2bfc4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev,
 static DEVICE_ATTR(sdma_reset_mask, 0444,
 		   amdgpu_get_sdma_reset_mask, NULL);
 
+static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = drm_to_adev(ddev);
+
+	if (!adev)
+		return -ENODEV;
+
+	return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_DMA]);
+}
+static DEVICE_ATTR(sdma_userq_reset_mask, 0444,
+		   amdgpu_get_sdma_userq_reset_mask, NULL);
+
 int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
 {
 	int r = 0;
@@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
 			return r;
 	}
 
+	if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) {
+		r = device_create_file(adev->dev, &dev_attr_sdma_userq_reset_mask);
+		if (r)
+			return r;
+	}
+
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 188de848c229..15ae72e2d679 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
 	return userq_ip_mask;
 }
 
+bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
+                                         int ring_type,
+                                         int reset_type)
+{
+    if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
+        return false;
+
+    return (adev->userq_supported_reset[ring_type] & reset_type) != 0;
+}
+
 static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
 {
 	if (amdgpu_device_should_recover_gpu(adev)) {
@@ -94,6 +104,9 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
 		int ring_type = queue_types[i];
 		const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
 
+		if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE))
+				continue;
+
 		if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
 		    funcs && funcs->detect_and_reset) {
 			r = funcs->detect_and_reset(adev, ring_type);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 252517ce5d5a..82b7c365d720 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
 	adev->gfx.compute_supported_reset =
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+	adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
+			amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
+	adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
+			amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
+
 	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
 	case IP_VERSION(11, 0, 0):
 	case IP_VERSION(11, 0, 2):
@@ -1824,12 +1829,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
 		    !amdgpu_sriov_vf(adev)) {
 			adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
 			adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
 		}
 		break;
 	default:
 		if (!amdgpu_sriov_vf(adev)) {
 			adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
 			adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
 		}
 		break;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 35d5a7e99a7c..c5ac42a30789 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
 	adev->gfx.compute_supported_reset =
 		amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+	adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
+		amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
+	adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
+		amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
+
 	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
 	case IP_VERSION(12, 0, 0):
 	case IP_VERSION(12, 0, 1):
@@ -1551,6 +1556,13 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
 		    !amdgpu_sriov_vf(adev)) {
 			adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
 			adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
 		}
 		break;
 	default:
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index db6e41967f12..8850eaf8d2c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block)
 			return r;
 	}
 
-	adev->sdma.supported_reset =
-		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
-	switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
-	case IP_VERSION(6, 0, 0):
-	case IP_VERSION(6, 0, 2):
-	case IP_VERSION(6, 0, 3):
-		if ((adev->sdma.instance[0].fw_version >= 21) &&
-		    !amdgpu_sriov_vf(adev))
-			adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
-		break;
-	default:
-		break;
-	}
 
 	if (amdgpu_sdma_ras_sw_init(adev)) {
 		dev_err(adev->dev, "Failed to initialize sdma ras block!\n");
@@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block)
 		break;
 	}
 
+	adev->sdma.supported_reset =
+		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+	adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
+		amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
+
+	switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
+	case IP_VERSION(6, 0, 0):
+	case IP_VERSION(6, 0, 2):
+	case IP_VERSION(6, 0, 3):
+		if ((adev->sdma.instance[0].fw_version >= 21) &&
+		    !amdgpu_sriov_vf(adev)) {
+			adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+			if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
+			    adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
+				adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
+		}
+		break;
+	default:
+		break;
+	}
 	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
 	if (r)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 326ecc8d37d2..9de46ac8b1db 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block)
 			return r;
 	}
 
-	adev->sdma.supported_reset =
-		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
-	if (!amdgpu_sriov_vf(adev))
-		adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
-
-	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
-	if (r)
-		return r;
 	/* Allocate memory for SDMA IP Dump buffer */
 	ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), GFP_KERNEL);
 	if (ptr)
@@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block)
 		break;
 	}
 
+	adev->sdma.supported_reset =
+		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
+	adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
+		amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
+
+	if (!amdgpu_sriov_vf(adev)) {
+		adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+		if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
+		    adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
+			adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= AMDGPU_RESET_TYPE_PER_QUEUE;
+
+	}
+	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
+	if (r)
+		return r;
+
 	return r;
 }
 
-- 
2.49.0


  reply	other threads:[~2025-10-24  9:45 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-24  9:43 [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Jesse.Zhang
2025-10-24  9:43 ` Jesse.Zhang [this message]
2025-10-29 18:13   ` [PATCH 2/3] drm/amdgpu: Add user queue reset mask support Alex Deucher
2025-10-30  2:08     ` Zhang, Jesse(Jie)
2025-10-30 12:41       ` Alex Deucher
2025-10-24  9:43 ` [PATCH 3/3] drm/amdgpu: use irq-safe lock in amdgpu_userq_fence_driver_process Jesse.Zhang
2025-10-28  4:58   ` Zhang, Jesse(Jie)
2025-10-29 18:09   ` Alex Deucher
2025-10-29 18:09 ` [PATCH 1/3] drm/amdgpu: Implement user queue reset functionality Alex Deucher

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251024094445.3090110-2-Jesse.Zhang@amd.com \
    --to=jesse.zhang@amd.com \
    --cc=Alexander.Deucher@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=christian.koenig@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox