* [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask
@ 2024-10-29 7:14 Jesse.zhang@amd.com
2024-10-29 7:29 ` Huang, Tim
2024-10-29 7:57 ` Lazar, Lijo
0 siblings, 2 replies; 5+ messages in thread
From: Jesse.zhang@amd.com @ 2024-10-29 7:14 UTC (permalink / raw)
To: amd-gfx
Cc: Alexander.Deucher, Christian Koenig, Tim.Huang,
Jesse.zhang@amd.com, Jesse Zhang
From: "Jesse.zhang@amd.com" <Jesse.zhang@amd.com>
Add two sysfs interfaces for gfx and compute:
gfx_reset_mask
compute_reset_mask
These interfaces are read-only and show the resets supported by the IP.
For example, full adapter reset (mode1/mode2/BACO/etc),
soft reset, queue reset, and pipe reset.
v2: the sysfs node returns a text string instead of raw flags (Christian)
v3: add a generic helper which takes the ring as a parameter
and print the strings in the order the resets are applied (Christian)
check amdgpu_gpu_recovery before creating the sysfs files themselves,
and initialize supported_reset_types in the IP version files (Lijo)
v4: fix uninitialized variables (Tim)
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 66 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +++
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 ++++++++
drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 17 ++++++
9 files changed, 184 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 48c9b9b06905..aea1031d7b84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
#define AMDGPU_RESET_VCE (1 << 13)
#define AMDGPU_RESET_VCE1 (1 << 14)
+/* Reset type bits stored in gfx_supported_reset / compute_supported_reset and printed by amdgpu_show_reset_mask() */
+#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */
+#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
+#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
+#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
+
/* max cursor sizes (in pixels) */
#define CIK_CURSOR_WIDTH 128
#define CIK_CURSOR_HEIGHT 128
@@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev);
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
struct dma_fence *gang);
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
/* atpx handler */
#if defined(CONFIG_VGA_SWITCHEROO)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef715b2bbcdb..cd1e3f018893 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
}
return ret;
}
+
+/**
+ * amdgpu_get_soft_full_reset_mask - collect the reset types usable on a ring
+ * @ring: ring to query; may be NULL or a ring slot that was never set up
+ *
+ * Builds a mask of AMDGPU_RESET_TYPE_* bits:
+ *  - AMDGPU_RESET_TYPE_FULL when amdgpu_device_should_recover_gpu() says so;
+ *  - AMDGPU_RESET_TYPE_SOFT_RESET when debug_disable_soft_recovery is clear,
+ *    amdgpu_sriov_vf() is false and the ring has a soft_recovery callback.
+ *
+ * Return: the mask, or 0 when @ring cannot be queried.
+ */
+ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev;
+	ssize_t mask = 0;
+
+	/*
+	 * Callers pass &adev->gfx.<type>_ring[0], which is never NULL even
+	 * when no ring of that type exists. Also reject a ring whose adev
+	 * pointer is unset (presumably an unused, zeroed slot) before it is
+	 * dereferenced below.
+	 */
+	if (!ring || !ring->adev)
+		return mask;
+
+	adev = ring->adev;
+
+	if (amdgpu_device_should_recover_gpu(adev))
+		mask |= AMDGPU_RESET_TYPE_FULL;
+
+	/*
+	 * No unlikely() around the debug flag test: the flag being clear is
+	 * presumably the common case, so the hint would be inverted.
+	 */
+	if (!adev->debug_disable_soft_recovery && !amdgpu_sriov_vf(adev) &&
+	    ring->funcs && ring->funcs->soft_recovery)
+		mask |= AMDGPU_RESET_TYPE_SOFT_RESET;
+
+	return mask;
+}
+
+/**
+ * amdgpu_show_reset_mask - format a reset mask for a sysfs show() callback
+ * @buf: sysfs output buffer
+ * @supported_reset: mask of AMDGPU_RESET_TYPE_* bits
+ *
+ * Emits one space-terminated word per set bit, in the fixed order
+ * "soft", "queue", "pipe", "full", followed by a newline. An empty mask is
+ * reported as "unsupported" instead of a bare newline so that readers of the
+ * file get an explicit answer.
+ *
+ * Return: number of bytes written to @buf.
+ */
+ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
+{
+	ssize_t size = 0;
+
+	if (!supported_reset)
+		return sysfs_emit_at(buf, size, "unsupported\n");
+
+	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
+		size += sysfs_emit_at(buf, size, "soft ");
+
+	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
+		size += sysfs_emit_at(buf, size, "queue ");
+
+	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
+		size += sysfs_emit_at(buf, size, "pipe ");
+
+	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
+		size += sysfs_emit_at(buf, size, "full ");
+
+	size += sysfs_emit_at(buf, size, "\n");
+	return size;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e96984c53e72..6de1f3bf6863 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
return count;
}
+/*
+ * sysfs show() callback for gfx_reset_mask: prints the reset types recorded
+ * in adev->gfx.gfx_supported_reset via amdgpu_show_reset_mask().
+ * Returns -ENODEV when no amdgpu device is found for @dev.
+ */
+static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
+					     struct device_attribute *attr,
+					     char *buf)
+{
+	struct amdgpu_device *adev = drm_to_adev(dev_get_drvdata(dev));
+
+	if (adev)
+		return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
+
+	return -ENODEV;
+}
+
+/*
+ * sysfs show() callback for compute_reset_mask: prints the reset types
+ * recorded in adev->gfx.compute_supported_reset via amdgpu_show_reset_mask().
+ * Returns -ENODEV when no amdgpu device is found for @dev.
+ */
+static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
+						 struct device_attribute *attr,
+						 char *buf)
+{
+	struct amdgpu_device *adev = drm_to_adev(dev_get_drvdata(dev));
+
+	if (adev)
+		return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
+
+	return -ENODEV;
+}
+
static DEVICE_ATTR(run_cleaner_shader, 0200,
NULL, amdgpu_gfx_set_run_cleaner_shader);
@@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition, 0644,
static DEVICE_ATTR(available_compute_partition, 0444,
amdgpu_gfx_get_available_compute_partition, NULL);
+static DEVICE_ATTR(gfx_reset_mask, 0444,
+ amdgpu_gfx_get_gfx_reset_mask, NULL);
+
+static DEVICE_ATTR(compute_reset_mask, 0444,
+ amdgpu_gfx_get_compute_reset_mask, NULL);
+
int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
{
struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
@@ -1702,6 +1734,40 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
cleaner_shader_size);
}
+/**
+ * amdgpu_gfx_sysfs_reset_mask_init - create the gfx/compute reset mask files
+ * @adev: amdgpu_device pointer
+ *
+ * Creates gfx_reset_mask when the device has gfx rings and
+ * compute_reset_mask when it has compute rings. Nothing is created when
+ * amdgpu_gpu_recovery is 0.
+ *
+ * Return: 0 on success, or the error from device_create_file(). On error no
+ * file created by this function is left behind.
+ */
+int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev)
+{
+	int r;
+
+	if (!amdgpu_gpu_recovery)
+		return 0;
+
+	if (adev->gfx.num_gfx_rings) {
+		r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
+		if (r)
+			return r;
+	}
+
+	if (adev->gfx.num_compute_rings) {
+		r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
+		if (r)
+			goto err_remove_gfx;
+	}
+
+	return 0;
+
+err_remove_gfx:
+	/* Undo the first file so a failed init does not leak a sysfs node. */
+	if (adev->gfx.num_gfx_rings)
+		device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
+	return r;
+}
+
+/**
+ * amdgpu_gfx_sysfs_reset_mask_fini - remove the gfx/compute reset mask files
+ * @adev: amdgpu_device pointer
+ *
+ * Mirrors amdgpu_gfx_sysfs_reset_mask_init(): does nothing when
+ * amdgpu_gpu_recovery is 0, otherwise removes gfx_reset_mask if the device
+ * has gfx rings and compute_reset_mask if it has compute rings.
+ */
+void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev)
+{
+	if (amdgpu_gpu_recovery) {
+		if (adev->gfx.num_gfx_rings)
+			device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
+
+		if (adev->gfx.num_compute_rings)
+			device_remove_file(adev->dev, &dev_attr_compute_reset_mask);
+	}
+}
+
/**
* amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
* @adev: amdgpu_device pointer
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index f710178a21bc..fb0e1adf6766 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -424,6 +424,8 @@ struct amdgpu_gfx {
/* reset mask */
uint32_t grbm_soft_reset;
uint32_t srbm_soft_reset;
+ uint32_t gfx_supported_reset;
+ uint32_t compute_supported_reset;
/* gfx off */
bool gfx_off_state; /* true: enabled, false: disabled */
@@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev);
void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work);
void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring);
+int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
+void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 9da95b25e158..e2b2cdab423b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
}
}
}
+ /* TODO: Add queue reset mask when FW fully supports it */
+ adev->gfx.gfx_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
+ adev->gfx.compute_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0);
if (r) {
@@ -4854,6 +4859,9 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
gfx_v10_0_alloc_ip_dump(adev);
r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
+ if (r)
+ return r;
+ r = amdgpu_gfx_sysfs_reset_mask_init(adev);
if (r)
return r;
return 0;
@@ -4896,6 +4904,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block)
amdgpu_gfx_kiq_fini(adev, 0);
amdgpu_gfx_cleaner_shader_sw_fini(adev);
+ amdgpu_gfx_sysfs_reset_mask_fini(adev);
gfx_v10_0_pfp_fini(adev);
gfx_v10_0_ce_fini(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 5aff8f72de9c..ec24e8d019b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1683,6 +1683,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
}
}
+ adev->gfx.gfx_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
+ adev->gfx.compute_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+ switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+ case IP_VERSION(11, 0, 0):
+ case IP_VERSION(11, 0, 2):
+ case IP_VERSION(11, 0, 3):
+ if ((adev->gfx.me_fw_version >= 2280) &&
+ (adev->gfx.mec_fw_version >= 2410)) {
+ adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+ adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+ }
+ break;
+ default:
+ break;
+ }
+
if (!adev->enable_mes_kiq) {
r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0);
if (r) {
@@ -1721,6 +1739,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
if (r)
return r;
+ r = amdgpu_gfx_sysfs_reset_mask_init (adev);
+ if (r)
+ return r;
+
return 0;
}
@@ -1783,6 +1805,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block)
gfx_v11_0_free_microcode(adev);
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
+ amdgpu_gfx_sysfs_reset_mask_fini(adev);
kfree(adev->gfx.ip_dump_core);
kfree(adev->gfx.ip_dump_compute_queues);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 9fec28d8a5fc..f5ffa2d8b22a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
}
}
+ /* TODO: Add queue reset mask when FW fully supports it */
+ adev->gfx.gfx_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
+ adev->gfx.compute_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+
if (!adev->enable_mes_kiq) {
r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0);
if (r) {
@@ -1467,6 +1473,9 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
gfx_v12_0_alloc_ip_dump(adev);
r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
+ if (r)
+ return r;
+ r = amdgpu_gfx_sysfs_reset_mask_init(adev);
if (r)
return r;
@@ -1530,6 +1539,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block)
gfx_v12_0_free_microcode(adev);
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
+ amdgpu_gfx_sysfs_reset_mask_fini(adev);
kfree(adev->gfx.ip_dump_core);
kfree(adev->gfx.ip_dump_compute_queues);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index b4c4b9916289..94007a9ed54b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2362,6 +2362,12 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
}
}
+ /* TODO: Add queue reset mask when FW fully supports it */
+ adev->gfx.gfx_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
+ adev->gfx.compute_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+
r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0);
if (r) {
DRM_ERROR("Failed to init KIQ BOs!\n");
@@ -2391,6 +2397,9 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
gfx_v9_0_alloc_ip_dump(adev);
r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
+ if (r)
+ return r;
+ r = amdgpu_gfx_sysfs_reset_mask_init(adev);
if (r)
return r;
@@ -2419,6 +2428,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block)
amdgpu_gfx_kiq_fini(adev, 0);
amdgpu_gfx_cleaner_shader_sw_fini(adev);
+ amdgpu_gfx_sysfs_reset_mask_fini(adev);
gfx_v9_0_mec_fini(adev);
amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 016290f00592..028fda13ac50 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
return r;
}
+ adev->gfx.compute_supported_reset =
+ amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
+ switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+ case IP_VERSION(9, 4, 3):
+ case IP_VERSION(9, 4, 4):
+ if (adev->gfx.mec_fw_version >= 155) {
+ adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
+ adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE;
+ }
+ break;
+ default:
+ break;
+ }
r = gfx_v9_4_3_gpu_early_init(adev);
if (r)
return r;
@@ -1175,6 +1188,9 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
if (r)
return r;
+ r = amdgpu_gfx_sysfs_reset_mask_init(adev);
+ if (r)
+ return r;
return 0;
}
@@ -1200,6 +1216,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block)
gfx_v9_4_3_free_microcode(adev);
amdgpu_gfx_sysfs_fini(adev);
amdgpu_gfx_sysfs_isolation_shader_fini(adev);
+ amdgpu_gfx_sysfs_reset_mask_fini(adev);
kfree(adev->gfx.ip_dump_core);
kfree(adev->gfx.ip_dump_compute_queues);
--
2.25.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* RE: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask
2024-10-29 7:14 [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask Jesse.zhang@amd.com
@ 2024-10-29 7:29 ` Huang, Tim
2024-10-29 7:57 ` Lazar, Lijo
1 sibling, 0 replies; 5+ messages in thread
From: Huang, Tim @ 2024-10-29 7:29 UTC (permalink / raw)
To: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander, Koenig, Christian, Zhang, Jesse(Jie),
Zhang, Jesse(Jie)
[AMD Official Use Only - AMD Internal Distribution Only]
This Series is,
Reviewed-by: Tim Huang <tim.huang@amd.com>
> -----Original Message-----
> From: Jesse.zhang@amd.com <jesse.zhang@amd.com>
> Sent: Tuesday, October 29, 2024 3:15 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>; Huang, Tim <Tim.Huang@amd.com>; Zhang,
> Jesse(Jie) <Jesse.Zhang@amd.com>; Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>
> Subject: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset
> mask
>
> From: "Jesse.zhang@amd.com" <Jesse.zhang@amd.com>
>
> Add two sysfs interfaces for gfx and compute:
> gfx_reset_mask
> compute_reset_mask
>
> These interfaces are read-only and show the resets supported by the IP.
> For example, full adapter reset (mode1/mode2/BACO/etc), soft reset, queue
> reset, and pipe reset.
>
> V2: the sysfs node returns a text string instead of some flags (Christian)
> v3: add a generic helper which takes the ring as parameter
> and print the strings in the order they are applied (Christian)
>
> check amdgpu_gpu_recovery before creating sysfs file itself,
> and initialize supported_reset_types in IP version files (Lijo)
> v4: Fixing uninitialized variables (Tim)
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> Suggested-by:Alex
> Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 66
> ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +++
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 ++++++++
> drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 17 ++++++
> 9 files changed, 184 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 48c9b9b06905..aea1031d7b84 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
> #define AMDGPU_RESET_VCE (1 << 13)
> #define AMDGPU_RESET_VCE1 (1 << 14)
>
> +/* reset mask */
> +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset,
> +mode1/mode2/BACO/etc. */ #define AMDGPU_RESET_TYPE_SOFT_RESET (1
> << 1)
> +/* IP level soft reset */ #define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2)
> +/* per queue */ #define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe
> +*/
> +
> /* max cursor sizes (in pixels) */
> #define CIK_CURSOR_WIDTH 128
> #define CIK_CURSOR_HEIGHT 128
> @@ -1466,6 +1472,8 @@ struct dma_fence
> *amdgpu_device_get_gang(struct amdgpu_device *adev); struct dma_fence
> *amdgpu_device_switch_gang(struct amdgpu_device *adev,
> struct dma_fence *gang);
> bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
>
> /* atpx handler */
> #if defined(CONFIG_VGA_SWITCHEROO)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ef715b2bbcdb..cd1e3f018893 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct
> amdgpu_device *adev,
> }
> return ret;
> }
> +
> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) {
> + ssize_t size = 0;
> +
> + if (!ring)
> + return size;
> +
> + if (amdgpu_device_should_recover_gpu(ring->adev))
> + size |= AMDGPU_RESET_TYPE_FULL;
> +
> + if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
> + !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
> + size |= AMDGPU_RESET_TYPE_SOFT_RESET;
> +
> + return size;
> +}
> +
> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) {
> + ssize_t size = 0;
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
> + size += sysfs_emit_at(buf, size, "soft ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
> + size += sysfs_emit_at(buf, size, "queue ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
> + size += sysfs_emit_at(buf, size, "pipe ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_FULL)
> + size += sysfs_emit_at(buf, size, "full ");
> +
> + size += sysfs_emit_at(buf, size, "\n");
> + return size;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index e96984c53e72..6de1f3bf6863 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1588,6 +1588,32 @@ static ssize_t
> amdgpu_gfx_set_enforce_isolation(struct device *dev,
> return count;
> }
>
> +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset); }
> +
> +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf,
> adev->gfx.compute_supported_reset);
> +}
> +
> static DEVICE_ATTR(run_cleaner_shader, 0200,
> NULL, amdgpu_gfx_set_run_cleaner_shader);
>
> @@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition,
> 0644, static DEVICE_ATTR(available_compute_partition, 0444,
> amdgpu_gfx_get_available_compute_partition, NULL);
>
> +static DEVICE_ATTR(gfx_reset_mask, 0444,
> + amdgpu_gfx_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_reset_mask, 0444,
> + amdgpu_gfx_get_compute_reset_mask, NULL);
> +
> int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) {
> struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1702,6
> +1734,40 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device
> *adev,
> cleaner_shader_size);
> }
>
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) {
> + int r = 0;
> +
> + if (!amdgpu_gpu_recovery)
> + return r;
> +
> + if (adev->gfx.num_gfx_rings) {
> + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + if (adev->gfx.num_compute_rings) {
> + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + return r;
> +}
> +
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) {
> + if (!amdgpu_gpu_recovery)
> + return;
> +
> + if (adev->gfx.num_gfx_rings)
> + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
> +
> + if (adev->gfx.num_compute_rings)
> + device_remove_file(adev->dev, &dev_attr_compute_reset_mask); }
> +
> /**
> * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD
> (Graphics Driver)
> * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index f710178a21bc..fb0e1adf6766 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -424,6 +424,8 @@ struct amdgpu_gfx {
> /* reset mask */
> uint32_t grbm_soft_reset;
> uint32_t srbm_soft_reset;
> + uint32_t gfx_supported_reset;
> + uint32_t compute_supported_reset;
>
> /* gfx off */
> bool gfx_off_state; /* true:
> enabled, false: disabled */
> @@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct
> amdgpu_device *adev); void amdgpu_gfx_enforce_isolation_handler(struct
> work_struct *work); void
> amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
> void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring);
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev); void
> +amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
>
> static inline const char *amdgpu_gfx_compute_mode_desc(int mode) { diff
> --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 9da95b25e158..e2b2cdab423b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> }
> }
> }
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>
> r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -4854,6 +4859,9 @@ static int gfx_v10_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> gfx_v10_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
> return 0;
> @@ -4896,6 +4904,7 @@ static int gfx_v10_0_sw_fini(struct
> amdgpu_ip_block *ip_block)
> amdgpu_gfx_kiq_fini(adev, 0);
>
> amdgpu_gfx_cleaner_shader_sw_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> gfx_v10_0_pfp_fini(adev);
> gfx_v10_0_ce_fini(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 5aff8f72de9c..ec24e8d019b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1683,6 +1683,24 @@ static int gfx_v11_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> }
> }
>
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(11, 0, 0):
> + case IP_VERSION(11, 0, 2):
> + case IP_VERSION(11, 0, 3):
> + if ((adev->gfx.me_fw_version >= 2280) &&
> + (adev->gfx.mec_fw_version >= 2410)) {
> + adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> + adev->gfx.gfx_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> + }
> + break;
> + default:
> + break;
> + }
> +
> if (!adev->enable_mes_kiq) {
> r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -1721,6 +1739,10 @@ static int gfx_v11_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init (adev);
> + if (r)
> + return r;
> +
> return 0;
> }
>
> @@ -1783,6 +1805,7 @@ static int gfx_v11_0_sw_fini(struct
> amdgpu_ip_block *ip_block)
> gfx_v11_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 9fec28d8a5fc..f5ffa2d8b22a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> }
> }
>
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +
> if (!adev->enable_mes_kiq) {
> r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -1467,6 +1473,9 @@ static int gfx_v12_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> gfx_v12_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
>
> @@ -1530,6 +1539,7 @@ static int gfx_v12_0_sw_fini(struct
> amdgpu_ip_block *ip_block)
> gfx_v12_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b4c4b9916289..94007a9ed54b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2362,6 +2362,12 @@ static int gfx_v9_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> }
> }
>
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +
> r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0);
> if (r) {
> DRM_ERROR("Failed to init KIQ BOs!\n"); @@ -2391,6 +2397,9 @@
> static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
> gfx_v9_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
>
> @@ -2419,6 +2428,7 @@ static int gfx_v9_0_sw_fini(struct
> amdgpu_ip_block *ip_block)
> amdgpu_gfx_kiq_fini(adev, 0);
>
> amdgpu_gfx_cleaner_shader_sw_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> gfx_v9_0_mec_fini(adev);
> amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 016290f00592..028fda13ac50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct
> amdgpu_ip_block *ip_block)
> return r;
> }
>
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(9, 4, 3):
> + case IP_VERSION(9, 4, 4):
> + if (adev->gfx.mec_fw_version >= 155) {
> + adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> + adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_PIPE;
> + }
> + break;
> + default:
> + break;
> + }
> r = gfx_v9_4_3_gpu_early_init(adev);
> if (r)
> return r;
> @@ -1175,6 +1188,9 @@ static int gfx_v9_4_3_sw_init(struct
> amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> return 0;
> }
>
> @@ -1200,6 +1216,7 @@ static int gfx_v9_4_3_sw_fini(struct
> amdgpu_ip_block *ip_block)
> gfx_v9_4_3_free_microcode(adev);
> amdgpu_gfx_sysfs_fini(adev);
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> --
> 2.25.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask
2024-10-29 7:14 [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask Jesse.zhang@amd.com
2024-10-29 7:29 ` Huang, Tim
@ 2024-10-29 7:57 ` Lazar, Lijo
2024-10-29 8:25 ` Zhang, Jesse(Jie)
1 sibling, 1 reply; 5+ messages in thread
From: Lazar, Lijo @ 2024-10-29 7:57 UTC (permalink / raw)
To: Jesse.zhang@amd.com, amd-gfx
Cc: Alexander.Deucher, Christian Koenig, Tim.Huang
On 10/29/2024 12:44 PM, Jesse.zhang@amd.com wrote:
> From: "Jesse.zhang@amd.com" <Jesse.zhang@amd.com>
>
> Add two sysfs interfaces for gfx and compute:
> gfx_reset_mask
> compute_reset_mask
>
> These interfaces are read-only and show the resets supported by the IP.
> For example, full adapter reset (mode1/mode2/BACO/etc),
> soft reset, queue reset, and pipe reset.
>
> V2: the sysfs node returns a text string instead of some flags (Christian)
> v3: add a generic helper which takes the ring as parameter
> and print the strings in the order they are applied (Christian)
>
> check amdgpu_gpu_recovery before creating sysfs file itself,
> and initialize supported_reset_types in IP version files (Lijo)
> v4: Fixing uninitialized variables (Tim)
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> Suggested-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 66 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +++
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 ++++++++
> drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 17 ++++++
> 9 files changed, 184 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 48c9b9b06905..aea1031d7b84 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
> #define AMDGPU_RESET_VCE (1 << 13)
> #define AMDGPU_RESET_VCE1 (1 << 14)
>
> +/* reset mask */
> +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */
> +#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
> +#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
> +#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
> +
> /* max cursor sizes (in pixels) */
> #define CIK_CURSOR_WIDTH 128
> #define CIK_CURSOR_HEIGHT 128
> @@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev);
> struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
> struct dma_fence *gang);
> bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
>
> /* atpx handler */
> #if defined(CONFIG_VGA_SWITCHEROO)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ef715b2bbcdb..cd1e3f018893 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
> }
> return ret;
> }
> +
> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
> +{
> + ssize_t size = 0;
> +
> + if (!ring)
> + return size;
> +
> + if (amdgpu_device_should_recover_gpu(ring->adev))
> + size |= AMDGPU_RESET_TYPE_FULL;
> +
> + if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
> + !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
> + size |= AMDGPU_RESET_TYPE_SOFT_RESET;
> +
> + return size;
> +}
> +
> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
> +{
> + ssize_t size = 0;
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
> + size += sysfs_emit_at(buf, size, "soft ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
> + size += sysfs_emit_at(buf, size, "queue ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
> + size += sysfs_emit_at(buf, size, "pipe ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_FULL)
> + size += sysfs_emit_at(buf, size, "full ");
> +
> + size += sysfs_emit_at(buf, size, "\n");
Is there an expectation of having "Unsupported" when no reset is
supported (supported_reset == 0)?
Thanks,
Lijo
> + return size;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index e96984c53e72..6de1f3bf6863 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
> return count;
> }
>
> +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
> +}
> +
> +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
> +}
> +
> static DEVICE_ATTR(run_cleaner_shader, 0200,
> NULL, amdgpu_gfx_set_run_cleaner_shader);
>
> @@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition, 0644,
> static DEVICE_ATTR(available_compute_partition, 0444,
> amdgpu_gfx_get_available_compute_partition, NULL);
>
> +static DEVICE_ATTR(gfx_reset_mask, 0444,
> + amdgpu_gfx_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_reset_mask, 0444,
> + amdgpu_gfx_get_compute_reset_mask, NULL);
> +
> int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev)
> {
> struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
> @@ -1702,6 +1734,40 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
> cleaner_shader_size);
> }
>
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev)
> +{
> + int r = 0;
> +
> + if (!amdgpu_gpu_recovery)
> + return r;
> +
> + if (adev->gfx.num_gfx_rings) {
> + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + if (adev->gfx.num_compute_rings) {
> + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + return r;
> +}
> +
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev)
> +{
> + if (!amdgpu_gpu_recovery)
> + return;
> +
> + if (adev->gfx.num_gfx_rings)
> + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
> +
> + if (adev->gfx.num_compute_rings)
> + device_remove_file(adev->dev, &dev_attr_compute_reset_mask);
> +}
> +
> /**
> * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
> * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index f710178a21bc..fb0e1adf6766 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -424,6 +424,8 @@ struct amdgpu_gfx {
> /* reset mask */
> uint32_t grbm_soft_reset;
> uint32_t srbm_soft_reset;
> + uint32_t gfx_supported_reset;
> + uint32_t compute_supported_reset;
>
> /* gfx off */
> bool gfx_off_state; /* true: enabled, false: disabled */
> @@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev);
> void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work);
> void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
> void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring);
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
>
> static inline const char *amdgpu_gfx_compute_mode_desc(int mode)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 9da95b25e158..e2b2cdab423b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
> }
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>
> r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -4854,6 +4859,9 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
> gfx_v10_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
> return 0;
> @@ -4896,6 +4904,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block)
> amdgpu_gfx_kiq_fini(adev, 0);
>
> amdgpu_gfx_cleaner_shader_sw_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> gfx_v10_0_pfp_fini(adev);
> gfx_v10_0_ce_fini(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 5aff8f72de9c..ec24e8d019b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1683,6 +1683,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
>
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(11, 0, 0):
> + case IP_VERSION(11, 0, 2):
> + case IP_VERSION(11, 0, 3):
> + if ((adev->gfx.me_fw_version >= 2280) &&
> + (adev->gfx.mec_fw_version >= 2410)) {
> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + }
> + break;
> + default:
> + break;
> + }
> +
> if (!adev->enable_mes_kiq) {
> r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -1721,6 +1739,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init (adev);
> + if (r)
> + return r;
> +
> return 0;
> }
>
> @@ -1783,6 +1805,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v11_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 9fec28d8a5fc..f5ffa2d8b22a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
>
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +
> if (!adev->enable_mes_kiq) {
> r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -1467,6 +1473,9 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
> gfx_v12_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
>
> @@ -1530,6 +1539,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v12_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b4c4b9916289..94007a9ed54b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2362,6 +2362,12 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
>
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +
> r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0);
> if (r) {
> DRM_ERROR("Failed to init KIQ BOs!\n");
> @@ -2391,6 +2397,9 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
> gfx_v9_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
>
> @@ -2419,6 +2428,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block)
> amdgpu_gfx_kiq_fini(adev, 0);
>
> amdgpu_gfx_cleaner_shader_sw_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> gfx_v9_0_mec_fini(adev);
> amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 016290f00592..028fda13ac50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
> return r;
> }
>
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(9, 4, 3):
> + case IP_VERSION(9, 4, 4):
> + if (adev->gfx.mec_fw_version >= 155) {
> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE;
> + }
> + break;
> + default:
> + break;
> + }
> r = gfx_v9_4_3_gpu_early_init(adev);
> if (r)
> return r;
> @@ -1175,6 +1188,9 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> return 0;
> }
>
> @@ -1200,6 +1216,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v9_4_3_free_microcode(adev);
> amdgpu_gfx_sysfs_fini(adev);
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask
2024-10-29 7:57 ` Lazar, Lijo
@ 2024-10-29 8:25 ` Zhang, Jesse(Jie)
2024-10-29 8:47 ` Lazar, Lijo
0 siblings, 1 reply; 5+ messages in thread
From: Zhang, Jesse(Jie) @ 2024-10-29 8:25 UTC (permalink / raw)
To: Lazar, Lijo, amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander, Koenig, Christian, Huang, Tim
[AMD Official Use Only - AMD Internal Distribution Only]
Hi Lijo,
-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@amd.com>
Sent: Tuesday, October 29, 2024 3:58 PM
To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Huang, Tim <Tim.Huang@amd.com>
Subject: Re: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask
On 10/29/2024 12:44 PM, Jesse.zhang@amd.com wrote:
> From: "Jesse.zhang@amd.com" <Jesse.zhang@amd.com>
>
> Add two sysfs interfaces for gfx and compute:
> gfx_reset_mask
> compute_reset_mask
>
> These interfaces are read-only and show the resets supported by the IP.
> For example, full adapter reset (mode1/mode2/BACO/etc), soft reset,
> queue reset, and pipe reset.
>
> V2: the sysfs node returns a text string instead of some flags
> (Christian)
> v3: add a generic helper which takes the ring as parameter
> and print the strings in the order they are applied (Christian)
>
> check amdgpu_gpu_recovery before creating sysfs file itself,
> and initialize supported_reset_types in IP version files (Lijo)
> v4: Fixing uninitialized variables (Tim)
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
> Suggested-by: Alex Deucher <alexander.deucher@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 66 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +++
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 ++++++++
> drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 17 ++++++
> 9 files changed, 184 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 48c9b9b06905..aea1031d7b84 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
> #define AMDGPU_RESET_VCE (1 << 13)
> #define AMDGPU_RESET_VCE1 (1 << 14)
>
> +/* reset mask */
> +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */
> +#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
> +#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
> +#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
> +
> /* max cursor sizes (in pixels) */
> #define CIK_CURSOR_WIDTH 128
> #define CIK_CURSOR_HEIGHT 128
> @@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct
> amdgpu_device *adev); struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
> struct dma_fence *gang);
> bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
>
> /* atpx handler */
> #if defined(CONFIG_VGA_SWITCHEROO)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index ef715b2bbcdb..cd1e3f018893 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
> }
> return ret;
> }
> +
> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) {
> + ssize_t size = 0;
> +
> + if (!ring)
> + return size;
> +
> + if (amdgpu_device_should_recover_gpu(ring->adev))
> + size |= AMDGPU_RESET_TYPE_FULL;
> +
> + if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
> + !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
> + size |= AMDGPU_RESET_TYPE_SOFT_RESET;
> +
> + return size;
> +}
> +
> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) {
> + ssize_t size = 0;
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
> + size += sysfs_emit_at(buf, size, "soft ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
> + size += sysfs_emit_at(buf, size, "queue ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
> + size += sysfs_emit_at(buf, size, "pipe ");
> +
> + if (supported_reset & AMDGPU_RESET_TYPE_FULL)
> + size += sysfs_emit_at(buf, size, "full ");
> +
> + size += sysfs_emit_at(buf, size, "\n");
Is there an expectation of having "Unsupported" when no reset is supported (supported_reset == 0)?
Yes, I will add it.
Thanks
Jesse
Thanks,
Lijo
> + return size;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index e96984c53e72..6de1f3bf6863 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
> return count;
> }
>
> +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset); }
> +
> +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf,
> +adev->gfx.compute_supported_reset);
> +}
> +
> static DEVICE_ATTR(run_cleaner_shader, 0200,
> NULL, amdgpu_gfx_set_run_cleaner_shader);
>
> @@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition,
> 0644, static DEVICE_ATTR(available_compute_partition, 0444,
> amdgpu_gfx_get_available_compute_partition, NULL);
>
> +static DEVICE_ATTR(gfx_reset_mask, 0444,
> + amdgpu_gfx_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_reset_mask, 0444,
> + amdgpu_gfx_get_compute_reset_mask, NULL);
> +
> int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) {
> struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1702,6 +1734,40
> @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
> cleaner_shader_size);
> }
>
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) {
> + int r = 0;
> +
> + if (!amdgpu_gpu_recovery)
> + return r;
> +
> + if (adev->gfx.num_gfx_rings) {
> + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + if (adev->gfx.num_compute_rings) {
> + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + return r;
> +}
> +
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) {
> + if (!amdgpu_gpu_recovery)
> + return;
> +
> + if (adev->gfx.num_gfx_rings)
> + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
> +
> + if (adev->gfx.num_compute_rings)
> + device_remove_file(adev->dev, &dev_attr_compute_reset_mask); }
> +
> /**
> * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
> * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index f710178a21bc..fb0e1adf6766 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -424,6 +424,8 @@ struct amdgpu_gfx {
> /* reset mask */
> uint32_t grbm_soft_reset;
> uint32_t srbm_soft_reset;
> + uint32_t gfx_supported_reset;
> + uint32_t compute_supported_reset;
>
> /* gfx off */
> bool gfx_off_state; /* true: enabled, false: disabled */
> @@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct
> amdgpu_device *adev); void
> amdgpu_gfx_enforce_isolation_handler(struct work_struct *work); void
> amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
> void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring
> *ring);
> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
>
> static inline const char *amdgpu_gfx_compute_mode_desc(int mode) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 9da95b25e158..e2b2cdab423b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
> }
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>
> r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -4854,6 +4859,9 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
> gfx_v10_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
> return 0;
> @@ -4896,6 +4904,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block)
> amdgpu_gfx_kiq_fini(adev, 0);
>
> amdgpu_gfx_cleaner_shader_sw_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> gfx_v10_0_pfp_fini(adev);
> gfx_v10_0_ce_fini(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 5aff8f72de9c..ec24e8d019b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1683,6 +1683,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
>
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(11, 0, 0):
> + case IP_VERSION(11, 0, 2):
> + case IP_VERSION(11, 0, 3):
> + if ((adev->gfx.me_fw_version >= 2280) &&
> + (adev->gfx.mec_fw_version >= 2410)) {
> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + }
> + break;
> + default:
> + break;
> + }
> +
> if (!adev->enable_mes_kiq) {
> r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -1721,6 +1739,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init (adev);
> + if (r)
> + return r;
> +
> return 0;
> }
>
> @@ -1783,6 +1805,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v11_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 9fec28d8a5fc..f5ffa2d8b22a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
>
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +
> if (!adev->enable_mes_kiq) {
> r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0);
> if (r) {
> @@ -1467,6 +1473,9 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
> gfx_v12_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
>
> @@ -1530,6 +1539,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v12_0_free_microcode(adev);
>
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b4c4b9916289..94007a9ed54b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -2362,6 +2362,12 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
> }
> }
>
> + /* TODO: Add queue reset mask when FW fully supports it */
> + adev->gfx.gfx_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +
> r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0);
> if (r) {
> DRM_ERROR("Failed to init KIQ BOs!\n"); @@ -2391,6 +2397,9 @@
> static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
> gfx_v9_0_alloc_ip_dump(adev);
>
> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
> + if (r)
> + return r;
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> if (r)
> return r;
>
> @@ -2419,6 +2428,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block)
> amdgpu_gfx_kiq_fini(adev, 0);
>
> amdgpu_gfx_cleaner_shader_sw_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> gfx_v9_0_mec_fini(adev);
> amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 016290f00592..028fda13ac50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
> return r;
> }
>
> + adev->gfx.compute_supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> + case IP_VERSION(9, 4, 3):
> + case IP_VERSION(9, 4, 4):
> + if (adev->gfx.mec_fw_version >= 155) {
> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE;
> + }
> + break;
> + default:
> + break;
> + }
> r = gfx_v9_4_3_gpu_early_init(adev);
> if (r)
> return r;
> @@ -1175,6 +1188,9 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
> if (r)
> return r;
>
> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> return 0;
> }
>
> @@ -1200,6 +1216,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block)
> gfx_v9_4_3_free_microcode(adev);
> amdgpu_gfx_sysfs_fini(adev);
> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>
> kfree(adev->gfx.ip_dump_core);
> kfree(adev->gfx.ip_dump_compute_queues);
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask
2024-10-29 8:25 ` Zhang, Jesse(Jie)
@ 2024-10-29 8:47 ` Lazar, Lijo
0 siblings, 0 replies; 5+ messages in thread
From: Lazar, Lijo @ 2024-10-29 8:47 UTC (permalink / raw)
To: Zhang, Jesse(Jie), amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander, Koenig, Christian, Huang, Tim
On 10/29/2024 1:55 PM, Zhang, Jesse(Jie) wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> Hi Lijo,
>
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@amd.com>
> Sent: Tuesday, October 29, 2024 3:58 PM
> To: Zhang, Jesse(Jie) <Jesse.Zhang@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; Huang, Tim <Tim.Huang@amd.com>
> Subject: Re: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask
>
>
>
> On 10/29/2024 12:44 PM, Jesse.zhang@amd.com wrote:
>> From: "Jesse.zhang@amd.com" <Jesse.zhang@amd.com>
>>
>> Add two sysfs interfaces for gfx and compute:
>> gfx_reset_mask
>> compute_reset_mask
>>
>> These interfaces are read-only and show the resets supported by the IP.
>> For example, full adapter reset (mode1/mode2/BACO/etc), soft reset,
>> queue reset, and pipe reset.
>>
>> V2: the sysfs node returns a text string instead of some flags
>> (Christian)
>> v3: add a generic helper which takes the ring as parameter
>> and print the strings in the order they are applied (Christian)
>>
>> check amdgpu_gpu_recovery before creating sysfs file itself,
>> and initialize supported_reset_types in IP version files (Lijo)
>> v4: Fixing uninitialized variables (Tim)
>>
>> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
>> Suggested-by: Alex Deucher <alexander.deucher@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 66 ++++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++
>> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +++
>> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 ++++++++
>> drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++
>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++
>> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 17 ++++++
>> 9 files changed, 184 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 48c9b9b06905..aea1031d7b84 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -300,6 +300,12 @@ extern int amdgpu_wbrf;
>> #define AMDGPU_RESET_VCE (1 << 13)
>> #define AMDGPU_RESET_VCE1 (1 << 14)
>>
>> +/* reset mask */
>> +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */
>> +#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */
>> +#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */
>> +#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */
>> +
>> /* max cursor sizes (in pixels) */
>> #define CIK_CURSOR_WIDTH 128
>> #define CIK_CURSOR_HEIGHT 128
>> @@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct
>> amdgpu_device *adev); struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
>> struct dma_fence *gang);
>> bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
>> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
>> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
>>
>> /* atpx handler */
>> #if defined(CONFIG_VGA_SWITCHEROO)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index ef715b2bbcdb..cd1e3f018893 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
>> }
>> return ret;
>> }
>> +
>> +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) {
>> + ssize_t size = 0;
>> +
>> + if (!ring)
>> + return size;
>> +
>> + if (amdgpu_device_should_recover_gpu(ring->adev))
>> + size |= AMDGPU_RESET_TYPE_FULL;
>> +
>> + if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
>> + !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
>> + size |= AMDGPU_RESET_TYPE_SOFT_RESET;
>> +
>> + return size;
>> +}
>> +
>> +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) {
>> + ssize_t size = 0;
>> +
>> + if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
>> + size += sysfs_emit_at(buf, size, "soft ");
>> +
>> + if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
>> + size += sysfs_emit_at(buf, size, "queue ");
>> +
>> + if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
>> + size += sysfs_emit_at(buf, size, "pipe ");
>> +
>> + if (supported_reset & AMDGPU_RESET_TYPE_FULL)
>> + size += sysfs_emit_at(buf, size, "full ");
>> +
>> + size += sysfs_emit_at(buf, size, "\n");
>
> Is there an expectation of having "Unsupported" when no reset is supported (supported_reset == 0)?
> Yes, will add it.
>
I asked that for clarification. Now I see that the sysfs file is not created
when recovery is not enabled. In that case, maybe you could also avoid creating
the sysfs file when supported_reset == 0. Alternatively, create it anyway and
show "unsupported" when gpu_recovery is disabled or supported_reset == 0.
Thanks,
Lijo
> Thanks
> Jesse
>
> Thanks,
> Lijo
>
>> + return size;
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index e96984c53e72..6de1f3bf6863 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
>> return count;
>> }
>>
>> +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct drm_device *ddev = dev_get_drvdata(dev);
>> + struct amdgpu_device *adev = drm_to_adev(ddev);
>> +
>> + if (!adev)
>> + return -ENODEV;
>> +
>> + return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
>> +}
>> +
>> +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct drm_device *ddev = dev_get_drvdata(dev);
>> + struct amdgpu_device *adev = drm_to_adev(ddev);
>> +
>> + if (!adev)
>> + return -ENODEV;
>> +
>> + return amdgpu_show_reset_mask(buf,
>> +adev->gfx.compute_supported_reset);
>> +}
>> +
>> static DEVICE_ATTR(run_cleaner_shader, 0200,
>> NULL, amdgpu_gfx_set_run_cleaner_shader);
>>
>> @@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition,
>> 0644, static DEVICE_ATTR(available_compute_partition, 0444,
>> amdgpu_gfx_get_available_compute_partition, NULL);
>>
>> +static DEVICE_ATTR(gfx_reset_mask, 0444,
>> + amdgpu_gfx_get_gfx_reset_mask, NULL);
>> +
>> +static DEVICE_ATTR(compute_reset_mask, 0444,
>> + amdgpu_gfx_get_compute_reset_mask, NULL);
>> +
>> int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) {
>> struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1702,6 +1734,40
>> @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
>> cleaner_shader_size);
>> }
>>
>> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) {
>> + int r = 0;
>> +
>> + if (!amdgpu_gpu_recovery)
>> + return r;
>> +
>> + if (adev->gfx.num_gfx_rings) {
>> + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask);
>> + if (r)
>> + return r;
>> + }
>> +
>> + if (adev->gfx.num_compute_rings) {
>> + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask);
>> + if (r)
>> + return r;
>> + }
>> +
>> + return r;
>> +}
>> +
>> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) {
>> + if (!amdgpu_gpu_recovery)
>> + return;
>> +
>> + if (adev->gfx.num_gfx_rings)
>> + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask);
>> +
>> + if (adev->gfx.num_compute_rings)
>> + device_remove_file(adev->dev, &dev_attr_compute_reset_mask); }
>> +
>> /**
>> * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver)
>> * @adev: amdgpu_device pointer
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index f710178a21bc..fb0e1adf6766 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -424,6 +424,8 @@ struct amdgpu_gfx {
>> /* reset mask */
>> uint32_t grbm_soft_reset;
>> uint32_t srbm_soft_reset;
>> + uint32_t gfx_supported_reset;
>> + uint32_t compute_supported_reset;
>>
>> /* gfx off */
>> bool gfx_off_state; /* true: enabled, false: disabled */
>> @@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct
>> amdgpu_device *adev); void
>> amdgpu_gfx_enforce_isolation_handler(struct work_struct *work); void
>> amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring);
>> void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring
>> *ring);
>> +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev);
>> +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev);
>>
>> static inline const char *amdgpu_gfx_compute_mode_desc(int mode) {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 9da95b25e158..e2b2cdab423b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
>> }
>> }
>> }
>> + /* TODO: Add queue reset mask when FW fully supports it */
>> + adev->gfx.gfx_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>> + adev->gfx.compute_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>>
>> r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0);
>> if (r) {
>> @@ -4854,6 +4859,9 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block)
>> gfx_v10_0_alloc_ip_dump(adev);
>>
>> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
>> + if (r)
>> + return r;
>> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
>> if (r)
>> return r;
>> return 0;
>> @@ -4896,6 +4904,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block)
>> amdgpu_gfx_kiq_fini(adev, 0);
>>
>> amdgpu_gfx_cleaner_shader_sw_fini(adev);
>> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>> gfx_v10_0_pfp_fini(adev);
>> gfx_v10_0_ce_fini(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> index 5aff8f72de9c..ec24e8d019b3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
>> @@ -1683,6 +1683,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
>> }
>> }
>>
>> + adev->gfx.gfx_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>> + adev->gfx.compute_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>> + case IP_VERSION(11, 0, 0):
>> + case IP_VERSION(11, 0, 2):
>> + case IP_VERSION(11, 0, 3):
>> + if ((adev->gfx.me_fw_version >= 2280) &&
>> + (adev->gfx.mec_fw_version >= 2410)) {
>> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>> + adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>> + }
>> + break;
>> + default:
>> + break;
>> + }
>> +
>> if (!adev->enable_mes_kiq) {
>> r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0);
>> if (r) {
>> @@ -1721,6 +1739,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
>> if (r)
>> return r;
>>
>> + r = amdgpu_gfx_sysfs_reset_mask_init (adev);
>> + if (r)
>> + return r;
>> +
>> return 0;
>> }
>>
>> @@ -1783,6 +1805,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block)
>> gfx_v11_0_free_microcode(adev);
>>
>> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
>> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>> kfree(adev->gfx.ip_dump_core);
>> kfree(adev->gfx.ip_dump_compute_queues);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
>> index 9fec28d8a5fc..f5ffa2d8b22a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
>> @@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
>> }
>> }
>>
>> + /* TODO: Add queue reset mask when FW fully supports it */
>> + adev->gfx.gfx_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>> + adev->gfx.compute_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>> +
>> if (!adev->enable_mes_kiq) {
>> r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0);
>> if (r) {
>> @@ -1467,6 +1473,9 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block)
>> gfx_v12_0_alloc_ip_dump(adev);
>>
>> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
>> + if (r)
>> + return r;
>> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
>> if (r)
>> return r;
>>
>> @@ -1530,6 +1539,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block)
>> gfx_v12_0_free_microcode(adev);
>>
>> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
>> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>> kfree(adev->gfx.ip_dump_core);
>> kfree(adev->gfx.ip_dump_compute_queues);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index b4c4b9916289..94007a9ed54b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -2362,6 +2362,12 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
>> }
>> }
>>
>> + /* TODO: Add queue reset mask when FW fully supports it */
>> + adev->gfx.gfx_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>> + adev->gfx.compute_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>> +
>> r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0);
>> if (r) {
>> DRM_ERROR("Failed to init KIQ BOs!\n");
>> @@ -2391,6 +2397,9 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block)
>> gfx_v9_0_alloc_ip_dump(adev);
>>
>> r = amdgpu_gfx_sysfs_isolation_shader_init(adev);
>> + if (r)
>> + return r;
>> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
>> if (r)
>> return r;
>>
>> @@ -2419,6 +2428,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block)
>> amdgpu_gfx_kiq_fini(adev, 0);
>>
>> amdgpu_gfx_cleaner_shader_sw_fini(adev);
>> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>> gfx_v9_0_mec_fini(adev);
>> amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
>> index 016290f00592..028fda13ac50 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
>> @@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
>> return r;
>> }
>>
>> + adev->gfx.compute_supported_reset =
>> + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
>> + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>> + case IP_VERSION(9, 4, 3):
>> + case IP_VERSION(9, 4, 4):
>> + if (adev->gfx.mec_fw_version >= 155) {
>> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
>> + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE;
>> + }
>> + break;
>> + default:
>> + break;
>> + }
>> r = gfx_v9_4_3_gpu_early_init(adev);
>> if (r)
>> return r;
>> @@ -1175,6 +1188,9 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block)
>> if (r)
>> return r;
>>
>> + r = amdgpu_gfx_sysfs_reset_mask_init(adev);
>> + if (r)
>> + return r;
>> return 0;
>> }
>>
>> @@ -1200,6 +1216,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block)
>> gfx_v9_4_3_free_microcode(adev);
>> amdgpu_gfx_sysfs_fini(adev);
>> amdgpu_gfx_sysfs_isolation_shader_fini(adev);
>> + amdgpu_gfx_sysfs_reset_mask_fini(adev);
>>
>> kfree(adev->gfx.ip_dump_core);
>> kfree(adev->gfx.ip_dump_compute_queues);
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-10-29 8:47 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-10-29 7:14 [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask Jesse.zhang@amd.com
2024-10-29 7:29 ` Huang, Tim
2024-10-29 7:57 ` Lazar, Lijo
2024-10-29 8:25 ` Zhang, Jesse(Jie)
2024-10-29 8:47 ` Lazar, Lijo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox