AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/amd/ras: Update ras command context structure name
@ 2025-10-17  7:51 YiPeng Chai
  2025-10-17  7:51 ` [PATCH 2/5] drm/amd/ras: Update function and remove redundant code YiPeng Chai
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: YiPeng Chai @ 2025-10-17  7:51 UTC (permalink / raw)
  To: amd-gfx
  Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, Stanley.Yang, Jinzhou.Su,
	YiPeng Chai, Tao Zhou

Based on how this structure is actually used, it
is more appropriate to call it a context; having
"ioctl" in the structure name can easily cause
misunderstanding.

V2:
  Update commit message content.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c  | 14 ++++++-------
 .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h  |  2 +-
 drivers/gpu/drm/amd/ras/rascore/ras_cmd.c     | 20 +++++++++----------
 drivers/gpu/drm/amd/ras/rascore/ras_cmd.h     |  8 ++++----
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
index 4706e737969a..6a281ad8e255 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
@@ -37,7 +37,7 @@
 #define AMDGPU_RAS_TYPE_VF       0x3
 
 static int amdgpu_ras_query_interface_info(struct ras_core_context *ras_core,
-			struct ras_cmd_ioctl *cmd)
+			struct ras_cmd_ctx *cmd)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
 	struct ras_query_interface_info_rsp *output_data =
@@ -77,7 +77,7 @@ static struct ras_core_context *ras_cmd_get_ras_core(uint64_t dev_handle)
 }
 
 static int amdgpu_ras_get_devices_info(struct ras_core_context *ras_core,
-			struct ras_cmd_ioctl *cmd)
+			struct ras_cmd_ctx *cmd)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
 	struct ras_cmd_devices_info_rsp *output_data =
@@ -146,7 +146,7 @@ static uint64_t local_addr_to_xgmi_global_addr(struct ras_core_context *ras_core
 }
 
 static int amdgpu_ras_inject_error(struct ras_core_context *ras_core,
-			struct ras_cmd_ioctl *cmd, void *data)
+			struct ras_cmd_ctx *cmd, void *data)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
 	struct ras_cmd_inject_error_req *req =
@@ -189,7 +189,7 @@ static int amdgpu_ras_inject_error(struct ras_core_context *ras_core,
 }
 
 static int amdgpu_ras_get_ras_safe_fb_addr_ranges(struct ras_core_context *ras_core,
-	struct ras_cmd_ioctl *cmd, void *data)
+	struct ras_cmd_ctx *cmd, void *data)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
 	struct ras_cmd_dev_handle *input_data =
@@ -259,7 +259,7 @@ static int ras_translate_fb_address(struct ras_core_context *ras_core,
 }
 
 static int amdgpu_ras_translate_fb_address(struct ras_core_context *ras_core,
-				struct ras_cmd_ioctl *cmd, void *data)
+				struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_translate_fb_address_req *req_buff =
 			(struct ras_cmd_translate_fb_address_req *)cmd->input_buff_raw;
@@ -291,7 +291,7 @@ static struct ras_cmd_func_map amdgpu_ras_cmd_maps[] = {
 	{RAS_CMD__TRANSLATE_FB_ADDRESS, amdgpu_ras_translate_fb_address},
 };
 
-int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ioctl *cmd, void *data)
+int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_func_map *ras_cmd = NULL;
 	int i, res;
@@ -314,7 +314,7 @@ int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ioct
 int amdgpu_ras_cmd_ioctl_handler(struct ras_core_context *ras_core,
 			uint8_t *cmd_buf, uint32_t buf_size)
 {
-	struct ras_cmd_ioctl *cmd = (struct ras_cmd_ioctl *)cmd_buf;
+	struct ras_cmd_ctx *cmd = (struct ras_cmd_ctx *)cmd_buf;
 	struct ras_core_context *cmd_core = NULL;
 	struct ras_cmd_dev_handle *cmd_handle = NULL;
 	int timeout = 60;
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h
index 7017198f1bac..73832c28cb55 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h
@@ -48,7 +48,7 @@ struct ras_cmd_translate_memory_fd_rsp {
 };
 
 int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core,
-		struct ras_cmd_ioctl *cmd, void *data);
+		struct ras_cmd_ctx *cmd, void *data);
 int amdgpu_ras_cmd_ioctl_handler(struct ras_core_context *ras_core,
 			uint8_t *cmd_buf, uint32_t buf_size);
 
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
index 6fe3b115986c..94e6d7420d94 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
@@ -43,7 +43,7 @@ static int ras_cmd_remove_device(struct ras_core_context *ras_core)
 }
 
 static int ras_get_block_ecc_info(struct ras_core_context *ras_core,
-				struct ras_cmd_ioctl *cmd, void *data)
+				struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_block_ecc_info_req *input_data =
 			(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
@@ -119,7 +119,7 @@ static int ras_cmd_get_group_bad_pages(struct ras_core_context *ras_core,
 }
 
 static int ras_cmd_get_bad_pages(struct ras_core_context *ras_core,
-				struct ras_cmd_ioctl *cmd, void *data)
+				struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_bad_pages_info_req *input_data =
 			(struct ras_cmd_bad_pages_info_req *)cmd->input_buff_raw;
@@ -141,7 +141,7 @@ static int ras_cmd_get_bad_pages(struct ras_core_context *ras_core,
 }
 
 static int ras_cmd_clear_bad_page_info(struct ras_core_context *ras_core,
-				struct ras_cmd_ioctl *cmd, void *data)
+				struct ras_cmd_ctx *cmd, void *data)
 {
 	if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
 		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
@@ -156,7 +156,7 @@ static int ras_cmd_clear_bad_page_info(struct ras_core_context *ras_core,
 }
 
 static int ras_cmd_reset_all_error_counts(struct ras_core_context *ras_core,
-				struct ras_cmd_ioctl *cmd, void *data)
+				struct ras_cmd_ctx *cmd, void *data)
 {
 	if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
 		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
@@ -171,7 +171,7 @@ static int ras_cmd_reset_all_error_counts(struct ras_core_context *ras_core,
 }
 
 static int ras_cmd_get_cper_snapshot(struct ras_core_context *ras_core,
-			struct ras_cmd_ioctl *cmd, void *data)
+			struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_cper_snapshot_rsp *output_data =
 			(struct ras_cmd_cper_snapshot_rsp *)cmd->output_buff_raw;
@@ -193,7 +193,7 @@ static int ras_cmd_get_cper_snapshot(struct ras_core_context *ras_core,
 }
 
 static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
-			struct ras_cmd_ioctl *cmd, void *data)
+			struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_cper_record_req *req =
 			(struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
@@ -253,7 +253,7 @@ static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
 }
 
 static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core,
-	struct ras_cmd_ioctl *cmd, void *data)
+	struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_batch_trace_snapshot_rsp *rsp =
 			(struct ras_cmd_batch_trace_snapshot_rsp *)cmd->output_buff_raw;
@@ -275,7 +275,7 @@ static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core,
 }
 
 static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
-	struct ras_cmd_ioctl *cmd, void *data)
+	struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_batch_trace_record_req *input_data =
 			(struct ras_cmd_batch_trace_record_req *)cmd->input_buff_raw;
@@ -400,7 +400,7 @@ static enum ras_ta_error_type __get_ras_ta_err_type(enum ras_ecc_err_type error)
 }
 
 static int ras_cmd_inject_error(struct ras_core_context *ras_core,
-			struct ras_cmd_ioctl *cmd, void *data)
+			struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_inject_error_req *req =
 		(struct ras_cmd_inject_error_req *)cmd->input_buff_raw;
@@ -441,7 +441,7 @@ static struct ras_cmd_func_map ras_cmd_maps[] = {
 };
 
 int rascore_handle_cmd(struct ras_core_context *ras_core,
-		struct ras_cmd_ioctl *cmd, void *data)
+		struct ras_cmd_ctx *cmd, void *data)
 {
 	struct ras_cmd_func_map *ras_cmd = NULL;
 	int i;
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
index 6df8c70f5ad8..751ed50b9584 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
@@ -101,7 +101,7 @@ enum ras_error_type {
 };
 
 struct ras_core_context;
-struct ras_cmd_ioctl;
+struct ras_cmd_ctx;
 
 struct ras_cmd_mgr {
 	struct list_head head;
@@ -112,7 +112,7 @@ struct ras_cmd_mgr {
 struct ras_cmd_func_map {
 	uint32_t cmd_id;
 	int (*func)(struct ras_core_context *ras_core,
-			struct ras_cmd_ioctl *cmd, void *data);
+			struct ras_cmd_ctx *cmd, void *data);
 };
 
 struct ras_device_bdf {
@@ -133,7 +133,7 @@ struct ras_cmd_param {
 };
 
 #pragma pack(push, 8)
-struct ras_cmd_ioctl {
+struct ras_cmd_ctx {
 	uint32_t magic;
 	union {
 		struct {
@@ -414,7 +414,7 @@ struct ras_cmd_batch_trace_record_rsp {
 
 int ras_cmd_init(struct ras_core_context *ras_core);
 int ras_cmd_fini(struct ras_core_context *ras_core);
-int rascore_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ioctl *cmd, void *data);
+int rascore_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data);
 uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core);
 int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
 	struct ras_query_interface_info_rsp *rsp);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/5] drm/amd/ras: Update function and remove redundant code
  2025-10-17  7:51 [PATCH 1/5] drm/amd/ras: Update ras command context structure name YiPeng Chai
@ 2025-10-17  7:51 ` YiPeng Chai
  2025-10-17  7:51 ` [PATCH 3/5] drm/amdgpu: ras module supports error injection YiPeng Chai
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: YiPeng Chai @ 2025-10-17  7:51 UTC (permalink / raw)
  To: amd-gfx
  Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, Stanley.Yang, Jinzhou.Su,
	YiPeng Chai, Tao Zhou

Update the command submission function and remove redundant code:
1. Rework the command submission function so that it
   can be used internally.
2. Remove unused functions that were previously
   prepared for ioctl.

V2:
  Update commit message content.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c  | 110 +++---------------
 .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h  |   3 +-
 .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c  |  31 +++++
 .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h  |   3 +
 drivers/gpu/drm/amd/ras/rascore/ras_cmd.h     |   3 +-
 5 files changed, 53 insertions(+), 97 deletions(-)

diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
index 6a281ad8e255..78419b7f7729 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
@@ -36,67 +36,6 @@
 #define AMDGPU_RAS_TYPE_AMDGPU   0x2
 #define AMDGPU_RAS_TYPE_VF       0x3
 
-static int amdgpu_ras_query_interface_info(struct ras_core_context *ras_core,
-			struct ras_cmd_ctx *cmd)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
-	struct ras_query_interface_info_rsp *output_data =
-		(struct ras_query_interface_info_rsp *)cmd->output_buff_raw;
-	int ret;
-
-	if (cmd->input_size != sizeof(struct ras_query_interface_info_req))
-		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
-
-	ret = ras_cmd_query_interface_info(ras_core, output_data);
-	if (!ret) {
-		output_data->plat_major_ver = 0;
-		output_data->plat_minor_ver = 0;
-
-		output_data->interface_type = amdgpu_sriov_vf(adev) ?
-			RAS_CMD_INTERFACE_TYPE_VF : RAS_CMD_INTERFACE_TYPE_AMDGPU;
-
-		cmd->output_size = sizeof(struct ras_query_interface_info_rsp);
-	}
-
-	return ret;
-}
-
-static struct ras_core_context *ras_cmd_get_ras_core(uint64_t dev_handle)
-{
-	struct ras_core_context *ras_core;
-
-	if (!dev_handle || (dev_handle == RAS_CMD_DEV_HANDLE_MAGIC))
-		return NULL;
-
-	ras_core = (struct ras_core_context *)(uintptr_t)(dev_handle ^ RAS_CMD_DEV_HANDLE_MAGIC);
-
-	if (ras_cmd_get_dev_handle(ras_core) == dev_handle)
-		return ras_core;
-
-	return NULL;
-}
-
-static int amdgpu_ras_get_devices_info(struct ras_core_context *ras_core,
-			struct ras_cmd_ctx *cmd)
-{
-	struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
-	struct ras_cmd_devices_info_rsp *output_data =
-			(struct ras_cmd_devices_info_rsp *)cmd->output_buff_raw;
-	struct ras_cmd_dev_info *dev_info;
-
-	dev_info = &output_data->devs[0];
-	dev_info->dev_handle = ras_cmd_get_dev_handle(ras_core);
-	dev_info->oam_id = adev->smuio.funcs->get_socket_id(adev);
-	dev_info->ecc_enabled = 1;
-	dev_info->ecc_supported = 1;
-
-	output_data->dev_num = 1;
-	output_data->version = 0;
-	cmd->output_size = sizeof(struct ras_cmd_devices_info_rsp);
-
-	return 0;
-}
-
 static int amdgpu_ras_trigger_error_prepare(struct ras_core_context *ras_core,
 			struct ras_cmd_inject_error_req *block_info)
 {
@@ -311,51 +250,34 @@ int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx
 	return res;
 }
 
-int amdgpu_ras_cmd_ioctl_handler(struct ras_core_context *ras_core,
-			uint8_t *cmd_buf, uint32_t buf_size)
+int amdgpu_ras_submit_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd)
 {
-	struct ras_cmd_ctx *cmd = (struct ras_cmd_ctx *)cmd_buf;
-	struct ras_core_context *cmd_core = NULL;
-	struct ras_cmd_dev_handle *cmd_handle = NULL;
+	struct ras_core_context *cmd_core = ras_core;
 	int timeout = 60;
 	int res;
 
 	cmd->cmd_res = RAS_CMD__ERROR_INVALID_CMD;
 	cmd->output_size = 0;
 
-	if (!ras_core_is_enabled(ras_core))
+	if (!ras_core_is_enabled(cmd_core))
 		return RAS_CMD__ERROR_ACCESS_DENIED;
 
-	if (cmd->cmd_id == RAS_CMD__QUERY_INTERFACE_INFO) {
-		cmd->cmd_res = amdgpu_ras_query_interface_info(ras_core, cmd);
-	} else if (cmd->cmd_id == RAS_CMD__GET_DEVICES_INFO) {
-		cmd->cmd_res = amdgpu_ras_get_devices_info(ras_core, cmd);
-	} else {
-		cmd_handle = (struct ras_cmd_dev_handle *)cmd->input_buff_raw;
-		cmd_core = ras_cmd_get_ras_core(cmd_handle->dev_handle);
-		if (!cmd_core)
-			return RAS_CMD__ERROR_INVALID_INPUT_DATA;
-
-		while (ras_core_gpu_in_reset(cmd_core)) {
-			msleep(1000);
-			if (!timeout--)
-				return RAS_CMD__ERROR_TIMEOUT;
-		}
-
-
-		if (!ras_core_is_enabled(cmd_core))
-			return RAS_CMD__ERROR_ACCESS_DENIED;
+	while (ras_core_gpu_in_reset(cmd_core)) {
+		msleep(1000);
+		if (!timeout--)
+			return RAS_CMD__ERROR_TIMEOUT;
+	}
 
-		res = amdgpu_ras_handle_cmd(cmd_core, cmd, NULL);
-		if (res == RAS_CMD__ERROR_UKNOWN_CMD)
-			res = rascore_handle_cmd(cmd_core, cmd, NULL);
+	res = amdgpu_ras_handle_cmd(cmd_core, cmd, NULL);
+	if (res == RAS_CMD__ERROR_UKNOWN_CMD)
+		res = rascore_handle_cmd(cmd_core, cmd, NULL);
 
-		cmd->cmd_res = res;
-	}
+	cmd->cmd_res = res;
 
-	if ((cmd->cmd_res == RAS_CMD__SUCCESS) &&
-	    ((cmd->output_size + sizeof(*cmd)) > buf_size)) {
-		RAS_INFO("Insufficient command buffer size 0x%x!\n", buf_size);
+	if (cmd->output_size > cmd->output_buf_size) {
+		RAS_DEV_ERR(cmd_core->dev,
+			"Output size 0x%x exceeds output buffer size 0x%x!\n",
+			cmd->output_size, cmd->output_buf_size);
 		return RAS_CMD__SUCCESS_EXEED_BUFFER;
 	}
 
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h
index 73832c28cb55..5973b156cc85 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h
@@ -49,7 +49,6 @@ struct ras_cmd_translate_memory_fd_rsp {
 
 int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core,
 		struct ras_cmd_ctx *cmd, void *data);
-int amdgpu_ras_cmd_ioctl_handler(struct ras_core_context *ras_core,
-			uint8_t *cmd_buf, uint32_t buf_size);
+int amdgpu_ras_submit_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd);
 
 #endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
index 13c207c8a843..8007e49951d8 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
@@ -578,3 +578,34 @@ bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev)
 
 	return ras_core_gpu_is_rma(ras_mgr->ras_core);
 }
+
+int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
+			uint32_t cmd_id, void *input, uint32_t input_size,
+			void *output, uint32_t out_size)
+{
+	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+	struct ras_cmd_ctx *cmd_ctx;
+	uint32_t ctx_buf_size = PAGE_SIZE;
+	int ret;
+
+	if (!amdgpu_ras_mgr_is_ready(adev))
+		return -EPERM;
+
+	cmd_ctx = kzalloc(ctx_buf_size, GFP_KERNEL);
+	if (!cmd_ctx)
+		return -ENOMEM;
+
+	cmd_ctx->cmd_id = cmd_id;
+
+	memcpy(cmd_ctx->input_buff_raw, input, input_size);
+	cmd_ctx->input_size = input_size;
+	cmd_ctx->output_buf_size = ctx_buf_size - sizeof(*cmd_ctx);
+
+	ret = amdgpu_ras_submit_cmd(ras_mgr->ras_core, cmd_ctx);
+	if (!ret && !cmd_ctx->cmd_res && output && (out_size == cmd_ctx->output_size))
+		memcpy(output, cmd_ctx->output_buff_raw, cmd_ctx->output_size);
+
+	kfree(cmd_ctx);
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
index 814b65ef1c62..42f190a8feb9 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
@@ -72,4 +72,7 @@ int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev, uint32_t *nps_m
 bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev,
 			uint64_t addr);
 bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev);
+int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
+		uint32_t cmd_id, void *input, uint32_t input_size,
+		void *output, uint32_t out_size);
 #endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
index 751ed50b9584..48a0715eb821 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
@@ -153,7 +153,8 @@ struct ras_cmd_ctx {
 	uint32_t cmd_res;
 	uint32_t input_size;
 	uint32_t output_size;
-	uint32_t reserved[6];
+	uint32_t output_buf_size;
+	uint32_t reserved[5];
 	uint8_t  input_buff_raw[RAS_CMD_MAX_IN_SIZE];
 	uint8_t  output_buff_raw[];
 };
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/5] drm/amdgpu: ras module supports error injection
  2025-10-17  7:51 [PATCH 1/5] drm/amd/ras: Update ras command context structure name YiPeng Chai
  2025-10-17  7:51 ` [PATCH 2/5] drm/amd/ras: Update function and remove redundant code YiPeng Chai
@ 2025-10-17  7:51 ` YiPeng Chai
  2025-10-17  7:51 ` [PATCH 4/5] drm/amdgpu: query bad page info of ras module YiPeng Chai
  2025-10-17  7:51 ` [PATCH 5/5] drm/amdgpu: query block error count " YiPeng Chai
  3 siblings, 0 replies; 6+ messages in thread
From: YiPeng Chai @ 2025-10-17  7:51 UTC (permalink / raw)
  To: amd-gfx
  Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, Stanley.Yang, Jinzhou.Su,
	YiPeng Chai, Tao Zhou

ras module supports error injection.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6a1a9278cd6a..566143ff9ccd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1597,6 +1597,27 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int amdgpu_uniras_error_inject(struct amdgpu_device *adev,
+		struct ras_inject_if *info)
+{
+	struct ras_cmd_inject_error_req inject_req;
+	struct ras_cmd_inject_error_rsp rsp;
+
+	if (!info)
+		return -EINVAL;
+
+	memset(&inject_req, 0, sizeof(inject_req));
+	inject_req.block_id = info->head.block;
+	inject_req.subblock_id = info->head.sub_block_index;
+	inject_req.address = info->address;
+	inject_req.error_type = info->head.type;
+	inject_req.instance_mask = info->instance_mask;
+	inject_req.value = info->value;
+
+	return amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__INJECT_ERROR,
+			&inject_req, sizeof(inject_req), &rsp, sizeof(rsp));
+}
+
 /* wrapper of psp_ras_trigger_error */
 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 		struct ras_inject_if *info)
@@ -1614,6 +1635,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 							info->head.block,
 							info->head.sub_block_index);
 
+	if (amdgpu_uniras_enabled(adev))
+		return amdgpu_uniras_error_inject(adev, info);
+
 	/* inject on guest isn't allowed, return success directly */
 	if (amdgpu_sriov_vf(adev))
 		return 0;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/5] drm/amdgpu: query bad page info of ras module
  2025-10-17  7:51 [PATCH 1/5] drm/amd/ras: Update ras command context structure name YiPeng Chai
  2025-10-17  7:51 ` [PATCH 2/5] drm/amd/ras: Update function and remove redundant code YiPeng Chai
  2025-10-17  7:51 ` [PATCH 3/5] drm/amdgpu: ras module supports error injection YiPeng Chai
@ 2025-10-17  7:51 ` YiPeng Chai
  2025-10-17  7:51 ` [PATCH 5/5] drm/amdgpu: query block error count " YiPeng Chai
  3 siblings, 0 replies; 6+ messages in thread
From: YiPeng Chai @ 2025-10-17  7:51 UTC (permalink / raw)
  To: amd-gfx
  Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, Stanley.Yang, Jinzhou.Su,
	YiPeng Chai, Tao Zhou

Query bad page info of ras module.

V2:
  Update code to reuse bad page output code.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 142 ++++++++++++++++--------
 1 file changed, 98 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 566143ff9ccd..5d5e1c0154b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1782,7 +1782,9 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 /* sysfs begin */
 
 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
-		struct ras_badpage **bps, unsigned int *count);
+		struct ras_badpage *bps, uint32_t count, uint32_t start);
+static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
+		struct ras_badpage *bps, uint32_t count, uint32_t start);
 
 static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
 {
@@ -1840,19 +1842,50 @@ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
 	unsigned int end = div64_ul(ppos + count - 1, element_size);
 	ssize_t s = 0;
 	struct ras_badpage *bps = NULL;
-	unsigned int bps_count = 0;
+	int bps_count = 0, i, status;
+	uint64_t address;
 
 	memset(buf, 0, count);
 
-	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
+	bps_count = end - start;
+	bps = kmalloc_array(bps_count, sizeof(*bps), GFP_KERNEL);
+	if (!bps)
+		return 0;
+
+	memset(bps, 0, sizeof(*bps) * bps_count);
+
+	if (amdgpu_uniras_enabled(adev))
+		bps_count = amdgpu_uniras_badpages_read(adev, bps, bps_count, start);
+	else
+		bps_count = amdgpu_ras_badpages_read(adev, bps, bps_count, start);
+
+	if (bps_count <= 0) {
+		kfree(bps);
 		return 0;
+	}
+
+	for (i = 0; i < bps_count; i++) {
+		address = ((uint64_t)bps[i].bp) << AMDGPU_GPU_PAGE_SHIFT;
+		if (amdgpu_ras_check_critical_address(adev, address))
+			continue;
+
+		bps[i].size = AMDGPU_GPU_PAGE_SIZE;
+
+		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
+					address);
+		if (status == -EBUSY)
+			bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
+		else if (status == -ENOENT)
+			bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
+		else
+			bps[i].flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED;
 
-	for (; start < end && start < bps_count; start++)
 		s += scnprintf(&buf[s], element_size + 1,
 				"0x%08x : 0x%08x : %1s\n",
-				bps[start].bp,
-				bps[start].size,
-				amdgpu_ras_badpage_flags_str(bps[start].flags));
+				bps[i].bp,
+				bps[i].size,
+				amdgpu_ras_badpage_flags_str(bps[i].flags));
+	}
 
 	kfree(bps);
 
@@ -2645,62 +2678,83 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
 	}
 }
 
-/* recovery begin */
-
-/* return 0 on success.
- * caller need free bps.
- */
 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
-		struct ras_badpage **bps, unsigned int *count)
+		struct ras_badpage *bps, uint32_t count, uint32_t start)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
-	int i = 0;
-	int ret = 0, status;
+	int r = 0;
+	uint32_t i;
 
 	if (!con || !con->eh_data || !bps || !count)
 		return -EINVAL;
 
 	mutex_lock(&con->recovery_lock);
 	data = con->eh_data;
-	if (!data || data->count == 0) {
-		*bps = NULL;
-		ret = -EINVAL;
-		goto out;
+	if (start < data->count) {
+		for (i = start; i < data->count; i++) {
+			if (!data->bps[i].ts)
+				continue;
+
+			bps[r].bp = data->bps[i].retired_page;
+			r++;
+			if (r >= count)
+				break;
+		}
 	}
+	mutex_unlock(&con->recovery_lock);
 
-	*bps = kmalloc_array(data->count, sizeof(struct ras_badpage), GFP_KERNEL);
-	if (!*bps) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	return r;
+}
 
-	for (; i < data->count; i++) {
-		if (!data->bps[i].ts)
-			continue;
+static int amdgpu_uniras_badpages_read(struct amdgpu_device *adev,
+		struct ras_badpage *bps, uint32_t count, uint32_t start)
+{
+	struct ras_cmd_bad_pages_info_req cmd_input;
+	struct ras_cmd_bad_pages_info_rsp *output;
+	uint32_t group, start_group, end_group;
+	uint32_t pos, pos_in_group;
+	int r = 0, i;
 
-		(*bps)[i] = (struct ras_badpage){
-			.bp = data->bps[i].retired_page,
-			.size = AMDGPU_GPU_PAGE_SIZE,
-			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
-		};
+	if (!bps || !count)
+		return -EINVAL;
 
-		if (amdgpu_ras_check_critical_address(adev,
-			data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
-			continue;
+	output = kmalloc(sizeof(*output), GFP_KERNEL);
+	if (!output)
+		return -ENOMEM;
 
-		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
-				data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
-		if (status == -EBUSY)
-			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
-		else if (status == -ENOENT)
-			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
+	memset(&cmd_input, 0, sizeof(cmd_input));
+
+	start_group = start / RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
+	end_group = (start + count + RAS_CMD_MAX_BAD_PAGES_PER_GROUP - 1) /
+				RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
+
+	pos = start;
+	for (group = start_group; group < end_group; group++) {
+		memset(output, 0, sizeof(*output));
+		cmd_input.group_index = group;
+		if (amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BAD_PAGES,
+			&cmd_input, sizeof(cmd_input), output, sizeof(*output)))
+			goto out;
+
+		if (pos >= output->bp_total_cnt)
+			goto out;
+
+		pos_in_group = pos - group * RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
+		for (i = pos_in_group; i < output->bp_in_group; i++, pos++) {
+			if (!output->records[i].ts)
+				continue;
+
+			bps[r].bp = output->records[i].retired_page;
+			r++;
+			if (r >= count)
+				goto out;
+		}
 	}
 
-	*count = con->bad_page_num;
 out:
-	mutex_unlock(&con->recovery_lock);
-	return ret;
+	kfree(output);
+	return r;
 }
 
 static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/5] drm/amdgpu: query block error count of ras module
  2025-10-17  7:51 [PATCH 1/5] drm/amd/ras: Update ras command context structure name YiPeng Chai
                   ` (2 preceding siblings ...)
  2025-10-17  7:51 ` [PATCH 4/5] drm/amdgpu: query bad page info of ras module YiPeng Chai
@ 2025-10-17  7:51 ` YiPeng Chai
  2025-10-20  2:38   ` Zhang, Hawking
  3 siblings, 1 reply; 6+ messages in thread
From: YiPeng Chai @ 2025-10-17  7:51 UTC (permalink / raw)
  To: amd-gfx
  Cc: Hawking.Zhang, Tao.Zhou1, Candice.Li, Stanley.Yang, Jinzhou.Su,
	YiPeng Chai, Tao Zhou

Query block error count of ras module.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 ++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5d5e1c0154b2..3150d736a4e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1543,9 +1543,36 @@ static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev,
 	return ret;
 }
 
+static int amdgpu_uniras_query_block_ecc(struct amdgpu_device *adev,
+			struct ras_query_if *info)
+{
+	struct ras_cmd_block_ecc_info_req req = {0};
+	struct ras_cmd_block_ecc_info_rsp rsp = {0};
+	int ret;
+
+	if (!info)
+		return -EINVAL;
+
+	req.block_id = info->head.block;
+	req.subblock_id = info->head.sub_block_index;
+
+	ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BLOCK_ECC_STATUS,
+				&req, sizeof(req), &rsp, sizeof(rsp));
+	if (!ret) {
+		info->ce_count = rsp.ce_count;
+		info->ue_count = rsp.ue_count;
+		info->de_count = rsp.de_count;
+	}
+
+	return ret;
+}
+
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
 {
-	return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
+	if (amdgpu_uniras_enabled(adev))
+		return amdgpu_uniras_query_block_ecc(adev, info);
+	else
+		return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
 }
 
 int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* RE: [PATCH 5/5] drm/amdgpu: query block error count of ras module
  2025-10-17  7:51 ` [PATCH 5/5] drm/amdgpu: query block error count " YiPeng Chai
@ 2025-10-20  2:38   ` Zhang, Hawking
  0 siblings, 0 replies; 6+ messages in thread
From: Zhang, Hawking @ 2025-10-20  2:38 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao, Li, Candice, Yang, Stanley, Su, Joe, Zhou1, Tao

[AMD Official Use Only - AMD Internal Distribution Only]

Series is

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>

Regards,
Hawking
-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@amd.com>
Sent: Friday, October 17, 2025 15:52
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice <Candice.Li@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>; Su, Joe <Jinzhou.Su@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 5/5] drm/amdgpu: query block error count of ras module

Query block error count of ras module.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 ++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5d5e1c0154b2..3150d736a4e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1543,9 +1543,36 @@ static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev,
        return ret;
 }

+static int amdgpu_uniras_query_block_ecc(struct amdgpu_device *adev,
+                       struct ras_query_if *info)
+{
+       struct ras_cmd_block_ecc_info_req req = {0};
+       struct ras_cmd_block_ecc_info_rsp rsp = {0};
+       int ret;
+
+       if (!info)
+               return -EINVAL;
+
+       req.block_id = info->head.block;
+       req.subblock_id = info->head.sub_block_index;
+
+       ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__GET_BLOCK_ECC_STATUS,
+                               &req, sizeof(req), &rsp, sizeof(rsp));
+       if (!ret) {
+               info->ce_count = rsp.ce_count;
+               info->ue_count = rsp.ue_count;
+               info->de_count = rsp.de_count;
+       }
+
+       return ret;
+}
+
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)  {
-       return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID);
+       if (amdgpu_uniras_enabled(adev))
+               return amdgpu_uniras_query_block_ecc(adev, info);
+       else
+               return amdgpu_ras_query_error_status_with_event(adev, info,
+RAS_EVENT_TYPE_INVALID);
 }

 int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
--
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-10-20  2:38 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-17  7:51 [PATCH 1/5] drm/amd/ras: Update ras command context structure name YiPeng Chai
2025-10-17  7:51 ` [PATCH 2/5] drm/amd/ras: Update function and remove redundant code YiPeng Chai
2025-10-17  7:51 ` [PATCH 3/5] drm/amdgpu: ras module supports error injection YiPeng Chai
2025-10-17  7:51 ` [PATCH 4/5] drm/amdgpu: query bad page info of ras module YiPeng Chai
2025-10-17  7:51 ` [PATCH 5/5] drm/amdgpu: query block error count " YiPeng Chai
2025-10-20  2:38   ` Zhang, Hawking

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox