* [PATCH v2] tools/drm_ras: tool to communicate with DRM Netlink Subsystem
@ 2025-11-14 10:07 Ravi Kishore Koppuravuri
2025-11-14 16:40 ` Rodrigo Vivi
0 siblings, 1 reply; 3+ messages in thread
From: Ravi Kishore Koppuravuri @ 2025-11-14 10:07 UTC (permalink / raw)
To: igt-dev
Cc: Ravi Kishore Koppuravuri, Tauro Riana, Iddamsetty Aravind,
Gupta Anshuman, Vivi Rodrigo
User space tool for querying GPU health monitoring RAS events via
Generic Netlink Socket interface from Kernel's DRM Netlink Subsystem.
Available Commands are
- List Nodes
- Get Error Counters
- Query Error Counter
Signed-off-by: Ravi Kishore Koppuravuri <ravi.kishore.koppuravuri@intel.com>
Cc: Tauro Riana <riana.tauro@intel.com>
Cc: Iddamsetty Aravind <aravind.iddamsetty@intel.com>
Cc: Gupta Anshuman <anshuman.gupta@intel.com>
Cc: Vivi Rodrigo <rodrigo.vivi@intel.com>
---
V1 -> V2:
- Removed device_id from the input parameters
- Updated help() function
- Incorporated error handling logic
---
---
include/drm-uapi/drm_netlink.h | 79 +++++++
meson.build | 5 +-
tools/drm_ras.c | 421 +++++++++++++++++++++++++++++++++
tools/meson.build | 5 +
4 files changed, 509 insertions(+), 1 deletion(-)
create mode 100644 include/drm-uapi/drm_netlink.h
create mode 100644 tools/drm_ras.c
diff --git a/include/drm-uapi/drm_netlink.h b/include/drm-uapi/drm_netlink.h
new file mode 100644
index 000000000..af893aa36
--- /dev/null
+++ b/include/drm-uapi/drm_netlink.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/drm_ras.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _LINUX_DRM_RAS_H
+#define _LINUX_DRM_RAS_H
+
+#define DRM_RAS_GENL_NAME "drm-ras"
+#define DRM_RAS_FAMILY_VERSION 1
+
+/*
+ * Type of the node. Currently, only error-counter nodes are supported, which
+ * expose reliability counters for a hardware/software component.
+ */
+enum drm_ras_node_type {
+ DRM_RAS_NODE_TYPE_ERROR_COUNTER = 1,
+};
+
+enum {
+ /* Unique identifier for the node*/
+ DRM_RAS_NODE_ATTR_NODE_ID = 1,
+
+ /* Device name chosen by the driver at the time of registration */
+ DRM_RAS_NODE_ATTR_DEVICE_NAME,
+
+ /* Node name chosen by the driver at registration to identify RAS node inside the device */
+ DRM_RAS_NODE_ATTR_NODE_NAME,
+
+ /* Type of the node, identifying its function */
+ DRM_RAS_NODE_ATTR_NODE_TYPE,
+
+ __DRM_RAS_NODE_ATTR_MAX,
+ DRM_RAS_NODE_ATTR_MAX = (__DRM_RAS_NODE_ATTR_MAX - 1)
+};
+
+enum {
+ /* Node ID targeted by this error counter operation */
+ DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID = 1,
+
+ /* Unique identifier for a specific error counter within an node */
+ DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID,
+
+ /* Name of the requested error counter */
+ DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME,
+
+ /* Current value of the requested error counter */
+ DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE,
+
+ __DRM_RAS_ERROR_COUNTER_ATTR_MAX,
+ DRM_RAS_ERROR_COUNTER_ATTR_MAX = (__DRM_RAS_ERROR_COUNTER_ATTR_MAX - 1)
+};
+
+enum drm_genl_error_cmds {
+ /**
+ * @DRM_RAS_CMD_LIST_NODES: Command to Retrieve the full list of currently registered
+ * DRM RAS nodes.Each node includes its dynamically assigned ID, name, and type.
+ * Obtain the Node IDs by calling this command and use it in the subsequent operations
+ * on the nodes.
+ */
+ DRM_RAS_CMD_LIST_NODES = 1,
+
+ /**
+ * @DRM_RAS_CMD_GET_ERROR_COUNTERS: Retrieve the full list of error counters for a given
+ * node. The response include id, name, and current value of each counter.
+ */
+ DRM_RAS_CMD_GET_ERROR_COUNTERS,
+
+ /**
+ * @DRM_RAS_CMD_QUERY_ERROR_COUNTER: Query the information of a specific error counter
+ * for a given node. Response contains id, name, and current value of the counter.
+ */
+ DRM_RAS_CMD_QUERY_ERROR_COUNTER,
+
+ __DRM_RAS_CMD_MAX,
+ DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1)
+};
+
+#endif /* _LINUX_DRM_RAS_H */
diff --git a/meson.build b/meson.build
index db6e09a94..f7807660e 100644
--- a/meson.build
+++ b/meson.build
@@ -165,10 +165,13 @@ cairo = dependency('cairo', version : '>1.12.0', required : true)
libudev = dependency('libudev', required : true)
glib = dependency('glib-2.0', required : true)
+libnl = dependency('libnl-3.0', required: false)
+libnl_genl = dependency('libnl-genl-3.0', required: false)
+libnl_cli = dependency('libnl-cli-3.0', required:false)
+
xmlrpc = dependency('xmlrpc', required : false)
xmlrpc_util = dependency('xmlrpc_util', required : false)
xmlrpc_client = dependency('xmlrpc_client', required : false)
-
xmlrpc_cmd = find_program('xmlrpc-c-config', required : false)
if not xmlrpc.found() and xmlrpc_cmd.found()
libs_cmd = run_command(xmlrpc_cmd, 'client', '--libs', check: false)
diff --git a/tools/drm_ras.c b/tools/drm_ras.c
new file mode 100644
index 000000000..bb7d0dfa0
--- /dev/null
+++ b/tools/drm_ras.c
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <getopt.h>
+#include <linux/genetlink.h>
+#include <netlink/netlink.h>
+#include <netlink/cache.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+#include <netlink/cli/utils.h>
+#include <netlink/cli/link.h>
+#include "../include/drm-uapi/drm_netlink.h"
+#include "igt_device_scan.h"
+
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+
+struct nl_sock *mcsock;
+
+enum opt_val {
+ OPT_UNKNOWN = '?',
+ OPT_END = -1,
+ OPT_NODEID,
+ OPT_ERRORID,
+ OPT_HELP,
+};
+
+enum cmd_ids {
+ INVALID_CMD = -1,
+ LIST_NODES = 0,
+ GET_ERROR_COUNTERS,
+ QUERY_ERROR_COUNTER,
+
+ __MAX_CMDS,
+};
+
+static const char * const cmd_names[] = {
+ "list_nodes",
+ "get_error_counters",
+ "query_error_counter",
+};
+
+struct app_context {
+ enum drm_genl_error_cmds command;
+ struct nl_sock *sock;
+ struct nl_cb *cb;
+ uint32_t node_id;
+ uint32_t error_id;
+ int error_id_set;
+ int node_id_set;
+ int error;
+ int family_id;
+};
+
+static void help(char **argv)
+{
+ int i;
+
+ printf("Usage: %s command [<command options>]\n", argv[0]);
+ printf("commands:\n");
+
+ for (i = 0; i < __MAX_CMDS; i++) {
+ switch (i) {
+ case LIST_NODES:
+ printf("%s %s\n",
+ argv[0],
+ cmd_names[i]);
+ break;
+ case GET_ERROR_COUNTERS:
+ printf("%s %s "
+ "--node-id=<node-id>\n",
+ argv[0],
+ cmd_names[i]);
+ break;
+ case QUERY_ERROR_COUNTER:
+ printf("%s %s "
+ "--node-id=<node-id> "
+ "--error-id=<error-id>\n",
+ argv[0],
+ cmd_names[i]);
+ break;
+ default:
+ printf("%s is Unknown Command\n",
+ (i < __MAX_CMDS && cmd_names[i]) ? cmd_names[i] : "Unknown");
+ }
+ }
+}
+
+static int list_nodes_handler(struct nl_msg *msg, void *arg)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ struct nlattr *nla;
+ int len, remain;
+
+ len = GENL_HDRLEN;
+ nlmsg_for_each_attr(nla, nlh, len, remain) {
+ /* Validate whether the attribute is with in the range or not*/
+ if (nla_type(nla) > DRM_RAS_NODE_ATTR_MAX) {
+ printf("Unknown Node attribute type: %d\n", nla_type(nla));
+ return NL_SKIP;
+ }
+
+ switch (nla_type(nla)) {
+ case DRM_RAS_NODE_ATTR_NODE_ID:
+ printf("%-18u\t", nla_get_u32(nla));
+ break;
+ case DRM_RAS_NODE_ATTR_DEVICE_NAME:
+ printf("%-30s\t", nla_get_string(nla));
+ break;
+ case DRM_RAS_NODE_ATTR_NODE_NAME:
+ printf("%-30s\t", nla_get_string(nla));
+ break;
+ case DRM_RAS_NODE_ATTR_NODE_TYPE:
+ printf("%-18u\n", nla_get_u32(nla));
+ break;
+ default:
+ printf("Unknown attribute type: %d\n", nla_type(nla));
+ break;
+ }
+ }
+ return NL_OK;
+}
+
+static int query_error_counter(struct nl_msg *msg, void *arg)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ struct nlattr *attrs[256];
+ int ret;
+
+ /* Parse the attributes */
+ ret = genlmsg_parse(nlh, 0, attrs, 256, NULL);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to parse attributes: %s\n", nl_geterror(ret));
+ return NL_SKIP;
+ }
+
+ if (!attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]) {
+ nl_cli_fatal(NLE_FAILURE, "DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE attribute is missing");
+ return NL_SKIP;
+ }
+
+ printf("counter value %u\n", nla_get_u32(attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]));
+
+ return NL_OK;
+}
+
+static int get_error_counters(struct nl_msg *msg, void *arg)
+{
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ struct nlattr *nla;
+ int len, remain;
+
+ len = GENL_HDRLEN;
+
+ nlmsg_for_each_attr(nla, nlh, len, remain) {
+ /* Validate whether the attribute is with in the range or not*/
+ if (nla_type(nla) > DRM_RAS_ERROR_COUNTER_ATTR_MAX) {
+ printf("Unknown error counter attribute type: %d\n", nla_type(nla));
+ return NL_SKIP;
+ }
+
+ switch (nla_type(nla)) {
+ case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID:
+ printf("%-18u\t", nla_get_u32(nla));
+ break;
+ case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME:
+ printf("%-30s\t", nla_get_string(nla));
+ break;
+ case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE:
+ printf("%-18u\n", nla_get_u32(nla));
+ break;
+ default:
+ printf("Unknown attribute type: %d\n", nla_type(nla));
+ break;
+ }
+ }
+ return NL_OK;
+}
+
+static int drm_genl_handle_msg(struct nl_msg *msg, void *arg)
+{
+ struct app_context *ctx = (struct app_context *)arg;
+ struct nlmsghdr *nlh = nlmsg_hdr(msg);
+ struct genlmsghdr *gnlh = genlmsg_hdr(nlh);
+
+ /* Verify aginst the expected command response */
+ if (gnlh->cmd != ctx->command) {
+ fprintf(stderr,
+ "Unexpected command response: got %d, expected %d\n",
+ gnlh->cmd,
+ ctx->command);
+ return NL_SKIP;
+ }
+
+ /* Route to respective Command handling function */
+ switch (ctx->command) {
+ case DRM_RAS_CMD_LIST_NODES:
+ return list_nodes_handler(msg, arg);
+ case DRM_RAS_CMD_GET_ERROR_COUNTERS:
+ return get_error_counters(msg, arg);
+ case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
+ return query_error_counter(msg, arg);
+ default:
+ fprintf(stderr, "Unknown command: %d\n", ctx->command);
+ ctx->error = -EOPNOTSUPP;
+ return NL_SKIP;
+ }
+}
+
+static void send_cmd(int cmd, void *arg)
+{
+ struct app_context *ctx = (struct app_context *)arg;
+ struct nl_msg *msg;
+ void *msg_head;
+ int ret;
+
+ msg = nlmsg_alloc();
+ if (!msg)
+ nl_cli_fatal(NLE_INVAL, "nlmsg_alloc failed\n");
+
+ switch (cmd) {
+ case DRM_RAS_CMD_LIST_NODES:
+ msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
+ ctx->family_id, 0,
+ NLM_F_REQUEST | NLM_F_ACK | NLM_F_ROOT | NLM_F_MATCH,
+ cmd, 1);
+ if (!msg_head)
+ nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
+
+ printf("%-18s\t%-30s\t%-30s\t%-18s\n",
+ "node-id", "device-name", "node-name", "node-type");
+ break;
+ case DRM_RAS_CMD_GET_ERROR_COUNTERS:
+ if (!ctx->node_id_set) {
+ fprintf(stderr, "Error: --node-id is required for %s command\n",
+ cmd_names[ctx->command - 1]);
+ exit(EXIT_FAILURE);
+ }
+ msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
+ ctx->family_id, 0,
+ NLM_F_REQUEST | NLM_F_ACK | NLM_F_ROOT | NLM_F_MATCH,
+ cmd, 1);
+
+ if (!msg_head)
+ nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
+
+ nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, ctx->node_id);
+ printf("%-18s\t%-30s\t%-18s\n",
+ "error-id", "error-name", "error-value");
+ break;
+ case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
+ if (!ctx->node_id_set || !ctx->error_id_set) {
+ fprintf(stderr,
+ "Error: --node-id and --error-id are required "
+ "for %s command\n",
+ cmd_names[ctx->command - 1]);
+ exit(EXIT_FAILURE);
+ }
+ msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
+ ctx->family_id, 0,
+ NLM_F_REQUEST | NLM_F_ACK,
+ cmd, 1);
+
+ if (!msg_head)
+ nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
+
+ nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, ctx->node_id);
+ nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID, ctx->error_id);
+ break;
+ default:
+ break;
+ }
+
+ ret = nl_send_auto(ctx->sock, msg);
+ if (ret < 0)
+ nl_cli_fatal(ret, "Unable to send message: %s", nl_geterror(ret));
+
+ ret = nl_recvmsgs_default(ctx->sock);
+ if (ret < 0)
+ nl_cli_fatal(ret, "Unable to receive message: %s", nl_geterror(ret));
+
+ nlmsg_free(msg);
+}
+
+static int get_cmd(char *cmd_name)
+{
+ int i;
+
+ if (!cmd_name)
+ return -1;
+
+ for (i = 0; i < __DRM_RAS_CMD_MAX; i++) {
+ if (strcasecmp(cmd_name, cmd_names[i]) == 0)
+ return i + 1;
+ }
+ return -1;
+}
+
+static int check_for_help(int argc, char **argv)
+{
+ for (int i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ char *endptr;
+ enum opt_val val;
+ int ret, opt, option_index = 0;
+ struct app_context ctx = {0};
+
+ // Check for help option before command parsing
+ if (check_for_help(argc, argv)) {
+ help(argv);
+ exit(EXIT_SUCCESS);
+ }
+
+ //Parse the input command
+ ctx.command = get_cmd(argv[1]);
+ if (ctx.command < 0) {
+ fprintf(stderr, "invalid command\n");
+ help(argv);
+ exit(EXIT_FAILURE);
+ }
+
+ static struct option options[] = {
+ {"error-id", optional_argument, NULL, OPT_ERRORID},
+ {"node-id", optional_argument, NULL, OPT_NODEID},
+ {"help", no_argument, NULL, OPT_HELP},
+ {0, 0, 0, 0}
+ };
+
+ optind = 2;
+ while ((opt = getopt_long(argc, argv, "h", options, &option_index)) != -1) {
+ switch (opt) {
+ case OPT_ERRORID:
+ if (optarg) {
+ printf("Error ID: %s\n", optarg);
+ //Assuming input is in Decimal Representation
+ ctx.error_id = strtoul(optarg, &endptr, 10);
+ if (*endptr != '\0') {
+ fprintf(stderr, "invalid error id %s\n", optarg);
+ exit(EXIT_FAILURE);
+ }
+ ctx.error_id_set = 1;
+ } else {
+ printf("Error ID not specified\n");
+ ctx.error_id_set = 0;
+ }
+ break;
+ case OPT_NODEID:
+ if (optarg) {
+ printf("Node ID: %s\n", optarg);
+ //Assuming input is in Decimal Representation
+ ctx.node_id = strtoul(optarg, &endptr, 10);
+ if (*endptr != '\0') {
+ fprintf(stderr, "invalid node id %s\n", optarg);
+ exit(EXIT_FAILURE);
+ }
+ ctx.node_id_set = 1;
+ } else {
+ printf("Node ID not specified\n");
+ ctx.node_id_set = 0;
+ }
+ break;
+ case OPT_HELP:
+ case 'h':
+ help(argv);
+ exit(EXIT_SUCCESS);
+ break;
+ case '?':
+ fprintf(stderr, "Unknown option\n");
+ exit(EXIT_FAILURE);
+ break;
+ default:
+ fprintf(stderr, "Unexpected option: %c\n", opt);
+ exit(EXIT_FAILURE);
+ break;
+ }
+ }
+
+ /* Create a Netlink Socket object*/
+ ctx.sock = nl_cli_alloc_socket();
+ if (!ctx.sock)
+ nl_cli_fatal(NLE_NOMEM, "Cannot allocate nl_sock");
+
+ /* Connect the allocated socket to NETLINK_GENERIC protocol*/
+ ret = nl_cli_connect(ctx.sock, NETLINK_GENERIC);
+ if (ret < 0)
+ nl_cli_fatal(ret, "Cannot connect handle");
+
+ /**
+ * Resolves the Generic Netlink family name to the corresponding
+ * numeric family identifier. This function queries the kernel directly
+ */
+ ctx.family_id = genl_ctrl_resolve(ctx.sock, DRM_RAS_GENL_NAME);
+ if (ctx.family_id < 0)
+ nl_cli_fatal(NLE_INVAL, "Resolving of \"%s\" failed", DRM_RAS_GENL_NAME);
+
+ /* Modify the callback handler associated with the socket */
+ ret = nl_socket_modify_cb(ctx.sock, NL_CB_VALID, NL_CB_CUSTOM, drm_genl_handle_msg, &ctx);
+ if (ret < 0)
+ nl_cli_fatal(ret, "Unable to modify valid message callback");
+
+ send_cmd(ctx.command, &ctx);
+
+ nl_close(ctx.sock);
+ nl_socket_free(ctx.sock);
+
+ return 0;
+}
diff --git a/tools/meson.build b/tools/meson.build
index 8185ba160..74ff97713 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -70,6 +70,11 @@ if libudev.found()
install : true)
endif
+executable('drm_ras', 'drm_ras.c',
+ dependencies : [tool_deps, libnl, libnl_cli, libnl_genl],
+ install_rpath : bindir_rpathdir,
+ install : true)
+
executable('gputop', 'gputop.c',
install : true,
install_rpath : bindir_rpathdir,
--
2.34.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH v2] tools/drm_ras: tool to communicate with DRM Netlink Subsystem
2025-11-14 10:07 [PATCH v2] tools/drm_ras: tool to communicate with DRM Netlink Subsystem Ravi Kishore Koppuravuri
@ 2025-11-14 16:40 ` Rodrigo Vivi
2025-11-19 8:43 ` Koppuravuri, Ravi Kishore
0 siblings, 1 reply; 3+ messages in thread
From: Rodrigo Vivi @ 2025-11-14 16:40 UTC (permalink / raw)
To: Ravi Kishore Koppuravuri
Cc: igt-dev, Tauro Riana, Iddamsetty Aravind, Gupta Anshuman
On Fri, Nov 14, 2025 at 03:37:29PM +0530, Ravi Kishore Koppuravuri wrote:
> User space tool for querying GPU health monitoring RAS events via
> Generic Netlink Socket interface from Kernel's DRM Netlink Subsystem.
> Available Commands are
> - List Nodes
> - Get Error Counters
> - Query Error Counter
>
> Signed-off-by: Ravi Kishore Koppuravuri <ravi.kishore.koppuravuri@intel.com>
> Cc: Tauro Riana <riana.tauro@intel.com>
> Cc: Iddamsetty Aravind <aravind.iddamsetty@intel.com>
> Cc: Gupta Anshuman <anshuman.gupta@intel.com>
> Cc: Vivi Rodrigo <rodrigo.vivi@intel.com>
>
> ---
> V1 -> V2:
> - Removed device_id from the input parameters
> - Updated help() function
> - Incorporated error handling logic
> ---
> ---
> include/drm-uapi/drm_netlink.h | 79 +++++++
> meson.build | 5 +-
> tools/drm_ras.c | 421 +++++++++++++++++++++++++++++++++
> tools/meson.build | 5 +
> 4 files changed, 509 insertions(+), 1 deletion(-)
> create mode 100644 include/drm-uapi/drm_netlink.h
> create mode 100644 tools/drm_ras.c
>
> diff --git a/include/drm-uapi/drm_netlink.h b/include/drm-uapi/drm_netlink.h
> new file mode 100644
> index 000000000..af893aa36
> --- /dev/null
> +++ b/include/drm-uapi/drm_netlink.h
This confused me. Please don't change the filename.
It needs to be a straight copy from the kernel name.
in this case drm_ras.h
This is likely what also confused me in the v1 where I thought
your code was based on the old implementation of the netlink.
> @@ -0,0 +1,79 @@
> +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
> +/* Do not edit directly, auto-generated from: */
> +/* Documentation/netlink/specs/drm_ras.yaml */
> +/* YNL-GEN uapi header */
> +
> +#ifndef _LINUX_DRM_RAS_H
> +#define _LINUX_DRM_RAS_H
> +
> +#define DRM_RAS_GENL_NAME "drm-ras"
> +#define DRM_RAS_FAMILY_VERSION 1
> +
> +/*
> + * Type of the node. Currently, only error-counter nodes are supported, which
> + * expose reliability counters for a hardware/software component.
> + */
> +enum drm_ras_node_type {
> + DRM_RAS_NODE_TYPE_ERROR_COUNTER = 1,
> +};
> +
> +enum {
> + /* Unique identifier for the node*/
> + DRM_RAS_NODE_ATTR_NODE_ID = 1,
> +
> + /* Device name chosen by the driver at the time of registration */
> + DRM_RAS_NODE_ATTR_DEVICE_NAME,
> +
> + /* Node name chosen by the driver at registration to identify RAS node inside the device */
> + DRM_RAS_NODE_ATTR_NODE_NAME,
> +
> + /* Type of the node, identifying its function */
> + DRM_RAS_NODE_ATTR_NODE_TYPE,
> +
> + __DRM_RAS_NODE_ATTR_MAX,
> + DRM_RAS_NODE_ATTR_MAX = (__DRM_RAS_NODE_ATTR_MAX - 1)
> +};
> +
> +enum {
> + /* Node ID targeted by this error counter operation */
> + DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID = 1,
> +
> + /* Unique identifier for a specific error counter within an node */
> + DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID,
> +
> + /* Name of the requested error counter */
> + DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME,
> +
> + /* Current value of the requested error counter */
> + DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE,
> +
> + __DRM_RAS_ERROR_COUNTER_ATTR_MAX,
> + DRM_RAS_ERROR_COUNTER_ATTR_MAX = (__DRM_RAS_ERROR_COUNTER_ATTR_MAX - 1)
> +};
> +
> +enum drm_genl_error_cmds {
> + /**
> + * @DRM_RAS_CMD_LIST_NODES: Command to Retrieve the full list of currently registered
> + * DRM RAS nodes.Each node includes its dynamically assigned ID, name, and type.
> + * Obtain the Node IDs by calling this command and use it in the subsequent operations
> + * on the nodes.
> + */
> + DRM_RAS_CMD_LIST_NODES = 1,
> +
> + /**
> + * @DRM_RAS_CMD_GET_ERROR_COUNTERS: Retrieve the full list of error counters for a given
> + * node. The response include id, name, and current value of each counter.
> + */
> + DRM_RAS_CMD_GET_ERROR_COUNTERS,
> +
> + /**
> + * @DRM_RAS_CMD_QUERY_ERROR_COUNTER: Query the information of a specific error counter
> + * for a given node. Response contains id, name, and current value of the counter.
> + */
> + DRM_RAS_CMD_QUERY_ERROR_COUNTER,
> +
> + __DRM_RAS_CMD_MAX,
> + DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1)
> +};
> +
> +#endif /* _LINUX_DRM_RAS_H */
> diff --git a/meson.build b/meson.build
> index db6e09a94..f7807660e 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -165,10 +165,13 @@ cairo = dependency('cairo', version : '>1.12.0', required : true)
> libudev = dependency('libudev', required : true)
> glib = dependency('glib-2.0', required : true)
>
> +libnl = dependency('libnl-3.0', required: false)
> +libnl_genl = dependency('libnl-genl-3.0', required: false)
> +libnl_cli = dependency('libnl-cli-3.0', required:false)
> +
> xmlrpc = dependency('xmlrpc', required : false)
> xmlrpc_util = dependency('xmlrpc_util', required : false)
> xmlrpc_client = dependency('xmlrpc_client', required : false)
> -
> xmlrpc_cmd = find_program('xmlrpc-c-config', required : false)
> if not xmlrpc.found() and xmlrpc_cmd.found()
> libs_cmd = run_command(xmlrpc_cmd, 'client', '--libs', check: false)
> diff --git a/tools/drm_ras.c b/tools/drm_ras.c
> new file mode 100644
> index 000000000..bb7d0dfa0
> --- /dev/null
> +++ b/tools/drm_ras.c
> @@ -0,0 +1,421 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2025 Intel Corporation
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +#include <ctype.h>
> +#include <getopt.h>
> +#include <linux/genetlink.h>
> +#include <netlink/netlink.h>
> +#include <netlink/cache.h>
> +#include <netlink/genl/genl.h>
> +#include <netlink/genl/ctrl.h>
> +#include <netlink/cli/utils.h>
> +#include <netlink/cli/link.h>
> +#include "../include/drm-uapi/drm_netlink.h"
> +#include "igt_device_scan.h"
> +
> +#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
> +
> +struct nl_sock *mcsock;
> +
> +enum opt_val {
> + OPT_UNKNOWN = '?',
> + OPT_END = -1,
> + OPT_NODEID,
> + OPT_ERRORID,
> + OPT_HELP,
> +};
> +
> +enum cmd_ids {
> + INVALID_CMD = -1,
> + LIST_NODES = 0,
> + GET_ERROR_COUNTERS,
> + QUERY_ERROR_COUNTER,
> +
> + __MAX_CMDS,
> +};
> +
> +static const char * const cmd_names[] = {
> + "list_nodes",
> + "get_error_counters",
> + "query_error_counter",
> +};
> +
> +struct app_context {
> + enum drm_genl_error_cmds command;
> + struct nl_sock *sock;
> + struct nl_cb *cb;
> + uint32_t node_id;
> + uint32_t error_id;
> + int error_id_set;
> + int node_id_set;
> + int error;
> + int family_id;
> +};
> +
> +static void help(char **argv)
> +{
> + int i;
> +
> + printf("Usage: %s command [<command options>]\n", argv[0]);
> + printf("commands:\n");
> +
> + for (i = 0; i < __MAX_CMDS; i++) {
> + switch (i) {
> + case LIST_NODES:
> + printf("%s %s\n",
> + argv[0],
> + cmd_names[i]);
> + break;
> + case GET_ERROR_COUNTERS:
> + printf("%s %s "
> + "--node-id=<node-id>\n",
> + argv[0],
> + cmd_names[i]);
> + break;
> + case QUERY_ERROR_COUNTER:
> + printf("%s %s "
> + "--node-id=<node-id> "
> + "--error-id=<error-id>\n",
> + argv[0],
> + cmd_names[i]);
> + break;
> + default:
> + printf("%s is Unknown Command\n",
> + (i < __MAX_CMDS && cmd_names[i]) ? cmd_names[i] : "Unknown");
> + }
> + }
> +}
> +
> +static int list_nodes_handler(struct nl_msg *msg, void *arg)
> +{
> + struct nlmsghdr *nlh = nlmsg_hdr(msg);
> + struct nlattr *nla;
> + int len, remain;
> +
> + len = GENL_HDRLEN;
> + nlmsg_for_each_attr(nla, nlh, len, remain) {
> + /* Validate whether the attribute is with in the range or not*/
I will randomly choose this point to make an overall complaint about the
comments throughout this patch.
There are way too many redundant comments. A developer can read the code.
Also, most of them are in different formats, with missing spaces at the
beginning or at the end.
Please only use comments when they tell something that the code itself
is not already telling, and use the standard format throughout.
> + if (nla_type(nla) > DRM_RAS_NODE_ATTR_MAX) {
> + printf("Unknown Node attribute type: %d\n", nla_type(nla));
> + return NL_SKIP;
> + }
> +
> + switch (nla_type(nla)) {
> + case DRM_RAS_NODE_ATTR_NODE_ID:
> + printf("%-18u\t", nla_get_u32(nla));
> + break;
> + case DRM_RAS_NODE_ATTR_DEVICE_NAME:
> + printf("%-30s\t", nla_get_string(nla));
> + break;
> + case DRM_RAS_NODE_ATTR_NODE_NAME:
> + printf("%-30s\t", nla_get_string(nla));
> + break;
> + case DRM_RAS_NODE_ATTR_NODE_TYPE:
> + printf("%-18u\n", nla_get_u32(nla));
> + break;
> + default:
> + printf("Unknown attribute type: %d\n", nla_type(nla));
> + break;
> + }
> + }
> + return NL_OK;
> +}
> +
> +static int query_error_counter(struct nl_msg *msg, void *arg)
> +{
> + struct nlmsghdr *nlh = nlmsg_hdr(msg);
> + struct nlattr *attrs[256];
> + int ret;
> +
> + /* Parse the attributes */
> + ret = genlmsg_parse(nlh, 0, attrs, 256, NULL);
> + if (ret < 0) {
> + fprintf(stderr, "Failed to parse attributes: %s\n", nl_geterror(ret));
> + return NL_SKIP;
> + }
> +
> + if (!attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]) {
> + nl_cli_fatal(NLE_FAILURE, "DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE attribute is missing");
> + return NL_SKIP;
> + }
> +
> + printf("counter value %u\n", nla_get_u32(attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]));
> +
> + return NL_OK;
> +}
> +
> +static int get_error_counters(struct nl_msg *msg, void *arg)
> +{
> + struct nlmsghdr *nlh = nlmsg_hdr(msg);
> + struct nlattr *nla;
> + int len, remain;
> +
> + len = GENL_HDRLEN;
> +
> + nlmsg_for_each_attr(nla, nlh, len, remain) {
> + /* Validate whether the attribute is with in the range or not*/
> + if (nla_type(nla) > DRM_RAS_ERROR_COUNTER_ATTR_MAX) {
> + printf("Unknown error counter attribute type: %d\n", nla_type(nla));
> + return NL_SKIP;
> + }
> +
> + switch (nla_type(nla)) {
> + case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID:
> + printf("%-18u\t", nla_get_u32(nla));
> + break;
> + case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME:
> + printf("%-30s\t", nla_get_string(nla));
> + break;
> + case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE:
> + printf("%-18u\n", nla_get_u32(nla));
> + break;
> + default:
> + printf("Unknown attribute type: %d\n", nla_type(nla));
> + break;
> + }
> + }
> + return NL_OK;
> +}
> +
> +static int drm_genl_handle_msg(struct nl_msg *msg, void *arg)
> +{
> + struct app_context *ctx = (struct app_context *)arg;
> + struct nlmsghdr *nlh = nlmsg_hdr(msg);
> + struct genlmsghdr *gnlh = genlmsg_hdr(nlh);
> +
> + /* Verify aginst the expected command response */
> + if (gnlh->cmd != ctx->command) {
> + fprintf(stderr,
> + "Unexpected command response: got %d, expected %d\n",
> + gnlh->cmd,
> + ctx->command);
> + return NL_SKIP;
> + }
> +
> + /* Route to respective Command handling function */
> + switch (ctx->command) {
> + case DRM_RAS_CMD_LIST_NODES:
> + return list_nodes_handler(msg, arg);
> + case DRM_RAS_CMD_GET_ERROR_COUNTERS:
> + return get_error_counters(msg, arg);
> + case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
> + return query_error_counter(msg, arg);
> + default:
> + fprintf(stderr, "Unknown command: %d\n", ctx->command);
> + ctx->error = -EOPNOTSUPP;
> + return NL_SKIP;
> + }
> +}
> +
> +static void send_cmd(int cmd, void *arg)
> +{
> + struct app_context *ctx = (struct app_context *)arg;
> + struct nl_msg *msg;
> + void *msg_head;
> + int ret;
> +
> + msg = nlmsg_alloc();
> + if (!msg)
> + nl_cli_fatal(NLE_INVAL, "nlmsg_alloc failed\n");
> +
> + switch (cmd) {
> + case DRM_RAS_CMD_LIST_NODES:
> + msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
> + ctx->family_id, 0,
> + NLM_F_REQUEST | NLM_F_ACK | NLM_F_ROOT | NLM_F_MATCH,
> + cmd, 1);
> + if (!msg_head)
> + nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
> +
> + printf("%-18s\t%-30s\t%-30s\t%-18s\n",
> + "node-id", "device-name", "node-name", "node-type");
> + break;
> + case DRM_RAS_CMD_GET_ERROR_COUNTERS:
> + if (!ctx->node_id_set) {
> + fprintf(stderr, "Error: --node-id is required for %s command\n",
> + cmd_names[ctx->command - 1]);
> + exit(EXIT_FAILURE);
> + }
> + msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
> + ctx->family_id, 0,
> + NLM_F_REQUEST | NLM_F_ACK | NLM_F_ROOT | NLM_F_MATCH,
> + cmd, 1);
> +
> + if (!msg_head)
> + nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
> +
> + nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, ctx->node_id);
> + printf("%-18s\t%-30s\t%-18s\n",
> + "error-id", "error-name", "error-value");
> + break;
> + case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
> + if (!ctx->node_id_set || !ctx->error_id_set) {
> + fprintf(stderr,
> + "Error: --node-id and --error-id are required "
> + "for %s command\n",
> + cmd_names[ctx->command - 1]);
> + exit(EXIT_FAILURE);
> + }
> + msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
> + ctx->family_id, 0,
> + NLM_F_REQUEST | NLM_F_ACK,
> + cmd, 1);
> +
> + if (!msg_head)
> + nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
> +
> + nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, ctx->node_id);
> + nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID, ctx->error_id);
> + break;
> + default:
> + break;
> + }
> +
> + ret = nl_send_auto(ctx->sock, msg);
> + if (ret < 0)
> + nl_cli_fatal(ret, "Unable to send message: %s", nl_geterror(ret));
> +
> + ret = nl_recvmsgs_default(ctx->sock);
> + if (ret < 0)
> + nl_cli_fatal(ret, "Unable to receive message: %s", nl_geterror(ret));
> +
> + nlmsg_free(msg);
> +}
> +
> +static int get_cmd(char *cmd_name)
> +{
> + int i;
> +
> + if (!cmd_name)
> + return -1;
> +
> + for (i = 0; i < __DRM_RAS_CMD_MAX; i++) {
> + if (strcasecmp(cmd_name, cmd_names[i]) == 0)
> + return i + 1;
> + }
> + return -1;
> +}
> +
> +static int check_for_help(int argc, char **argv)
> +{
> + for (int i = 1; i < argc; i++) {
> + if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
> + return 1;
> + }
> + return 0;
> +}
> +
> +int main(int argc, char **argv)
> +{
> + char *endptr;
> + enum opt_val val;
> + int ret, opt, option_index = 0;
> + struct app_context ctx = {0};
> +
> + // Check for help option before command parsing
> + if (check_for_help(argc, argv)) {
> + help(argv);
> + exit(EXIT_SUCCESS);
> + }
> +
> + //Parse the input command
> + ctx.command = get_cmd(argv[1]);
> + if (ctx.command < 0) {
> + fprintf(stderr, "invalid command\n");
> + help(argv);
> + exit(EXIT_FAILURE);
> + }
> +
> + static struct option options[] = {
> + {"error-id", optional_argument, NULL, OPT_ERRORID},
> + {"node-id", optional_argument, NULL, OPT_NODEID},
> + {"help", no_argument, NULL, OPT_HELP},
> + {0, 0, 0, 0}
> + };
> +
> + optind = 2;
> + while ((opt = getopt_long(argc, argv, "h", options, &option_index)) != -1) {
> + switch (opt) {
> + case OPT_ERRORID:
> + if (optarg) {
> + printf("Error ID: %s\n", optarg);
> + //Assuming input is in Decimal Representation
> + ctx.error_id = strtoul(optarg, &endptr, 10);
> + if (*endptr != '\0') {
> + fprintf(stderr, "invalid error id %s\n", optarg);
> + exit(EXIT_FAILURE);
> + }
> + ctx.error_id_set = 1;
> + } else {
> + printf("Error ID not specified\n");
> + ctx.error_id_set = 0;
> + }
> + break;
> + case OPT_NODEID:
> + if (optarg) {
> + printf("Node ID: %s\n", optarg);
no need to echo back
> + //Assuming input is in Decimal Representation
besides the remark about comments that I made above,
we don't assume, we check...
> + ctx.node_id = strtoul(optarg, &endptr, 10);
> + if (*endptr != '\0') {
...but we check before the conversion, not after.
> + fprintf(stderr, "invalid node id %s\n", optarg);
> + exit(EXIT_FAILURE);
> + }
> + ctx.node_id_set = 1;
> + } else {
> + printf("Node ID not specified\n");
Print to stderr and exit?
Or, if this is not an error flow, you don't need to be that verbose...
> + ctx.node_id_set = 0;
init the node_id to -1 and you can avoid this extra variable.
> + }
> + break;
> + case OPT_HELP:
> + case 'h':
> + help(argv);
> + exit(EXIT_SUCCESS);
> + break;
> + case '?':
> + fprintf(stderr, "Unknown option\n");
> + exit(EXIT_FAILURE);
> + break;
> + default:
> + fprintf(stderr, "Unexpected option: %c\n", opt);
> + exit(EXIT_FAILURE);
> + break;
> + }
> + }
> +
> + /* Create a Netlink Socket object*/
> + ctx.sock = nl_cli_alloc_socket();
> + if (!ctx.sock)
> + nl_cli_fatal(NLE_NOMEM, "Cannot allocate nl_sock");
why do we use cli_fatal? and when using it, why don't we exit?
> +
> + /* Connect the allocated socket to NETLINK_GENERIC protocol*/
> + ret = nl_cli_connect(ctx.sock, NETLINK_GENERIC);
> + if (ret < 0)
> + nl_cli_fatal(ret, "Cannot connect handle");
> +
> + /**
> + * Resolves the Generic Netlink family name to the corresponding
> + * numeric family identifier. This function queries the kernel directly
> + */
> + ctx.family_id = genl_ctrl_resolve(ctx.sock, DRM_RAS_GENL_NAME);
> + if (ctx.family_id < 0)
> + nl_cli_fatal(NLE_INVAL, "Resolving of \"%s\" failed", DRM_RAS_GENL_NAME);
> +
> + /* Modify the callback handler associated with the socket */
> + ret = nl_socket_modify_cb(ctx.sock, NL_CB_VALID, NL_CB_CUSTOM, drm_genl_handle_msg, &ctx);
> + if (ret < 0)
> + nl_cli_fatal(ret, "Unable to modify valid message callback");
> +
> + send_cmd(ctx.command, &ctx);
> +
> + nl_close(ctx.sock);
> + nl_socket_free(ctx.sock);
> +
> + return 0;
> +}
> diff --git a/tools/meson.build b/tools/meson.build
> index 8185ba160..74ff97713 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -70,6 +70,11 @@ if libudev.found()
> install : true)
> endif
>
> +executable('drm_ras', 'drm_ras.c',
> + dependencies : [tool_deps, libnl, libnl_cli, libnl_genl],
> + install_rpath : bindir_rpathdir,
> + install : true)
> +
> executable('gputop', 'gputop.c',
> install : true,
> install_rpath : bindir_rpathdir,
> --
> 2.34.1
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH v2] tools/drm_ras: tool to communicate with DRM Netlink Subsystem
2025-11-14 16:40 ` Rodrigo Vivi
@ 2025-11-19 8:43 ` Koppuravuri, Ravi Kishore
0 siblings, 0 replies; 3+ messages in thread
From: Koppuravuri, Ravi Kishore @ 2025-11-19 8:43 UTC (permalink / raw)
To: Rodrigo Vivi; +Cc: igt-dev, Tauro Riana, Iddamsetty Aravind, Gupta Anshuman
On 14-11-2025 22:10, Rodrigo Vivi wrote:
> On Fri, Nov 14, 2025 at 03:37:29PM +0530, Ravi Kishore Koppuravuri wrote:
>> User space tool for querying GPU health monitoring RAS events via
>> Generic Netlink Socket interface from Kernel's DRM Netlink Subsystem.
>> Available Commands are
>> - List Nodes
>> - Get Error Counters
>> - Query Error Counter
>>
>> Signed-off-by: Ravi Kishore Koppuravuri <ravi.kishore.koppuravuri@intel.com>
>> Cc: Tauro Riana <riana.tauro@intel.com>
>> Cc: Iddamsetty Aravind <aravind.iddamsetty@intel.com>
>> Cc: Gupta Anshuman <anshuman.gupta@intel.com>
>> Cc: Vivi Rodrigo <rodrigo.vivi@intel.com>
>>
>> ---
>> V1 -> V2:
>> - Removed device_id from the input parameters
>> - Updated help() function
>> - Incorporated error handling logic
>> ---
>> ---
>> include/drm-uapi/drm_netlink.h | 79 +++++++
>> meson.build | 5 +-
>> tools/drm_ras.c | 421 +++++++++++++++++++++++++++++++++
>> tools/meson.build | 5 +
>> 4 files changed, 509 insertions(+), 1 deletion(-)
>> create mode 100644 include/drm-uapi/drm_netlink.h
>> create mode 100644 tools/drm_ras.c
>>
>> diff --git a/include/drm-uapi/drm_netlink.h b/include/drm-uapi/drm_netlink.h
>> new file mode 100644
>> index 000000000..af893aa36
>> --- /dev/null
>> +++ b/include/drm-uapi/drm_netlink.h
> This confused me. Please don't change the filename.
> It needs to be a straight copy from the kernel name.
> in this case drm_ras.h
>
> This is likely what also confused me in the v1 where I thought
> your code was based on the old implementation of the netlink.
Sure. I will update the header file name as suggested.
>> @@ -0,0 +1,79 @@
>> +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
>> +/* Do not edit directly, auto-generated from: */
>> +/* Documentation/netlink/specs/drm_ras.yaml */
>> +/* YNL-GEN uapi header */
>> +
>> +#ifndef _LINUX_DRM_RAS_H
>> +#define _LINUX_DRM_RAS_H
>> +
>> +#define DRM_RAS_GENL_NAME "drm-ras"
>> +#define DRM_RAS_FAMILY_VERSION 1
>> +
>> +/*
>> + * Type of the node. Currently, only error-counter nodes are supported, which
>> + * expose reliability counters for a hardware/software component.
>> + */
>> +enum drm_ras_node_type {
>> + DRM_RAS_NODE_TYPE_ERROR_COUNTER = 1,
>> +};
>> +
>> +enum {
>> + /* Unique identifier for the node*/
>> + DRM_RAS_NODE_ATTR_NODE_ID = 1,
>> +
>> + /* Device name chosen by the driver at the time of registration */
>> + DRM_RAS_NODE_ATTR_DEVICE_NAME,
>> +
>> + /* Node name chosen by the driver at registration to identify RAS node inside the device */
>> + DRM_RAS_NODE_ATTR_NODE_NAME,
>> +
>> + /* Type of the node, identifying its function */
>> + DRM_RAS_NODE_ATTR_NODE_TYPE,
>> +
>> + __DRM_RAS_NODE_ATTR_MAX,
>> + DRM_RAS_NODE_ATTR_MAX = (__DRM_RAS_NODE_ATTR_MAX - 1)
>> +};
>> +
>> +enum {
>> + /* Node ID targeted by this error counter operation */
>> + DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID = 1,
>> +
>> + /* Unique identifier for a specific error counter within an node */
>> + DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID,
>> +
>> + /* Name of the requested error counter */
>> + DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME,
>> +
>> + /* Current value of the requested error counter */
>> + DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE,
>> +
>> + __DRM_RAS_ERROR_COUNTER_ATTR_MAX,
>> + DRM_RAS_ERROR_COUNTER_ATTR_MAX = (__DRM_RAS_ERROR_COUNTER_ATTR_MAX - 1)
>> +};
>> +
>> +enum drm_genl_error_cmds {
>> + /**
>> + * @DRM_RAS_CMD_LIST_NODES: Command to Retrieve the full list of currently registered
>> + * DRM RAS nodes.Each node includes its dynamically assigned ID, name, and type.
>> + * Obtain the Node IDs by calling this command and use it in the subsequent operations
>> + * on the nodes.
>> + */
>> + DRM_RAS_CMD_LIST_NODES = 1,
>> +
>> + /**
>> + * @DRM_RAS_CMD_GET_ERROR_COUNTERS: Retrieve the full list of error counters for a given
>> + * node. The response include id, name, and current value of each counter.
>> + */
>> + DRM_RAS_CMD_GET_ERROR_COUNTERS,
>> +
>> + /**
>> + * @DRM_RAS_CMD_QUERY_ERROR_COUNTER: Query the information of a specific error counter
>> + * for a given node. Response contains id, name, and current value of the counter.
>> + */
>> + DRM_RAS_CMD_QUERY_ERROR_COUNTER,
>> +
>> + __DRM_RAS_CMD_MAX,
>> + DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1)
>> +};
>> +
>> +#endif /* _LINUX_DRM_RAS_H */
>> diff --git a/meson.build b/meson.build
>> index db6e09a94..f7807660e 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -165,10 +165,13 @@ cairo = dependency('cairo', version : '>1.12.0', required : true)
>> libudev = dependency('libudev', required : true)
>> glib = dependency('glib-2.0', required : true)
>>
>> +libnl = dependency('libnl-3.0', required: false)
>> +libnl_genl = dependency('libnl-genl-3.0', required: false)
>> +libnl_cli = dependency('libnl-cli-3.0', required:false)
>> +
>> xmlrpc = dependency('xmlrpc', required : false)
>> xmlrpc_util = dependency('xmlrpc_util', required : false)
>> xmlrpc_client = dependency('xmlrpc_client', required : false)
>> -
>> xmlrpc_cmd = find_program('xmlrpc-c-config', required : false)
>> if not xmlrpc.found() and xmlrpc_cmd.found()
>> libs_cmd = run_command(xmlrpc_cmd, 'client', '--libs', check: false)
>> diff --git a/tools/drm_ras.c b/tools/drm_ras.c
>> new file mode 100644
>> index 000000000..bb7d0dfa0
>> --- /dev/null
>> +++ b/tools/drm_ras.c
>> @@ -0,0 +1,421 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <sys/types.h>
>> +#include <unistd.h>
>> +#include <ctype.h>
>> +#include <getopt.h>
>> +#include <linux/genetlink.h>
>> +#include <netlink/netlink.h>
>> +#include <netlink/cache.h>
>> +#include <netlink/genl/genl.h>
>> +#include <netlink/genl/ctrl.h>
>> +#include <netlink/cli/utils.h>
>> +#include <netlink/cli/link.h>
>> +#include "../include/drm-uapi/drm_netlink.h"
>> +#include "igt_device_scan.h"
>> +
>> +#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
>> +
>> +struct nl_sock *mcsock;
>> +
>> +enum opt_val {
>> + OPT_UNKNOWN = '?',
>> + OPT_END = -1,
>> + OPT_NODEID,
>> + OPT_ERRORID,
>> + OPT_HELP,
>> +};
>> +
>> +enum cmd_ids {
>> + INVALID_CMD = -1,
>> + LIST_NODES = 0,
>> + GET_ERROR_COUNTERS,
>> + QUERY_ERROR_COUNTER,
>> +
>> + __MAX_CMDS,
>> +};
>> +
>> +static const char * const cmd_names[] = {
>> + "list_nodes",
>> + "get_error_counters",
>> + "query_error_counter",
>> +};
>> +
>> +struct app_context {
>> + enum drm_genl_error_cmds command;
>> + struct nl_sock *sock;
>> + struct nl_cb *cb;
>> + uint32_t node_id;
>> + uint32_t error_id;
>> + int error_id_set;
>> + int node_id_set;
>> + int error;
>> + int family_id;
>> +};
>> +
>> +static void help(char **argv)
>> +{
>> + int i;
>> +
>> + printf("Usage: %s command [<command options>]\n", argv[0]);
>> + printf("commands:\n");
>> +
>> + for (i = 0; i < __MAX_CMDS; i++) {
>> + switch (i) {
>> + case LIST_NODES:
>> + printf("%s %s\n",
>> + argv[0],
>> + cmd_names[i]);
>> + break;
>> + case GET_ERROR_COUNTERS:
>> + printf("%s %s "
>> + "--node-id=<node-id>\n",
>> + argv[0],
>> + cmd_names[i]);
>> + break;
>> + case QUERY_ERROR_COUNTER:
>> + printf("%s %s "
>> + "--node-id=<node-id> "
>> + "--error-id=<error-id>\n",
>> + argv[0],
>> + cmd_names[i]);
>> + break;
>> + default:
>> + printf("%s is Unknown Command\n",
>> + (i < __MAX_CMDS && cmd_names[i]) ? cmd_names[i] : "Unknown");
>> + }
>> + }
>> +}
>> +
>> +static int list_nodes_handler(struct nl_msg *msg, void *arg)
>> +{
>> + struct nlmsghdr *nlh = nlmsg_hdr(msg);
>> + struct nlattr *nla;
>> + int len, remain;
>> +
>> + len = GENL_HDRLEN;
>> + nlmsg_for_each_attr(nla, nlh, len, remain) {
>> + /* Validate whether the attribute is with in the range or not*/
> I will randomly choose this point to make an overall complaint about all the
> comments in the entire patch.
>
> Way too many redundant comments. A developer can read the code.
>
> Also, most of them are in different formats, with missing spaces at the beginning
> or at the end.
>
> Please only use comments when they tell something that the code itself
> is not already telling, and use the standard format throughout.
Thanks Rodrigo. I will update the comments as per the guidelines suggested.
>> + if (nla_type(nla) > DRM_RAS_NODE_ATTR_MAX) {
>> + printf("Unknown Node attribute type: %d\n", nla_type(nla));
>> + return NL_SKIP;
>> + }
>> +
>> + switch (nla_type(nla)) {
>> + case DRM_RAS_NODE_ATTR_NODE_ID:
>> + printf("%-18u\t", nla_get_u32(nla));
>> + break;
>> + case DRM_RAS_NODE_ATTR_DEVICE_NAME:
>> + printf("%-30s\t", nla_get_string(nla));
>> + break;
>> + case DRM_RAS_NODE_ATTR_NODE_NAME:
>> + printf("%-30s\t", nla_get_string(nla));
>> + break;
>> + case DRM_RAS_NODE_ATTR_NODE_TYPE:
>> + printf("%-18u\n", nla_get_u32(nla));
>> + break;
>> + default:
>> + printf("Unknown attribute type: %d\n", nla_type(nla));
>> + break;
>> + }
>> + }
>> + return NL_OK;
>> +}
>> +
>> +static int query_error_counter(struct nl_msg *msg, void *arg)
>> +{
>> + struct nlmsghdr *nlh = nlmsg_hdr(msg);
>> + struct nlattr *attrs[256];
>> + int ret;
>> +
>> + /* Parse the attributes */
>> + ret = genlmsg_parse(nlh, 0, attrs, 256, NULL);
>> + if (ret < 0) {
>> + fprintf(stderr, "Failed to parse attributes: %s\n", nl_geterror(ret));
>> + return NL_SKIP;
>> + }
>> +
>> + if (!attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]) {
>> + nl_cli_fatal(NLE_FAILURE, "DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE attribute is missing");
>> + return NL_SKIP;
>> + }
>> +
>> + printf("counter value %u\n", nla_get_u32(attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]));
>> +
>> + return NL_OK;
>> +}
>> +
>> +static int get_error_counters(struct nl_msg *msg, void *arg)
>> +{
>> + struct nlmsghdr *nlh = nlmsg_hdr(msg);
>> + struct nlattr *nla;
>> + int len, remain;
>> +
>> + len = GENL_HDRLEN;
>> +
>> + nlmsg_for_each_attr(nla, nlh, len, remain) {
>> + /* Validate whether the attribute is with in the range or not*/
>> + if (nla_type(nla) > DRM_RAS_ERROR_COUNTER_ATTR_MAX) {
>> + printf("Unknown error counter attribute type: %d\n", nla_type(nla));
>> + return NL_SKIP;
>> + }
>> +
>> + switch (nla_type(nla)) {
>> + case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID:
>> + printf("%-18u\t", nla_get_u32(nla));
>> + break;
>> + case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME:
>> + printf("%-30s\t", nla_get_string(nla));
>> + break;
>> + case DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE:
>> + printf("%-18u\n", nla_get_u32(nla));
>> + break;
>> + default:
>> + printf("Unknown attribute type: %d\n", nla_type(nla));
>> + break;
>> + }
>> + }
>> + return NL_OK;
>> +}
>> +
>> +static int drm_genl_handle_msg(struct nl_msg *msg, void *arg)
>> +{
>> + struct app_context *ctx = (struct app_context *)arg;
>> + struct nlmsghdr *nlh = nlmsg_hdr(msg);
>> + struct genlmsghdr *gnlh = genlmsg_hdr(nlh);
>> +
>> + /* Verify aginst the expected command response */
>> + if (gnlh->cmd != ctx->command) {
>> + fprintf(stderr,
>> + "Unexpected command response: got %d, expected %d\n",
>> + gnlh->cmd,
>> + ctx->command);
>> + return NL_SKIP;
>> + }
>> +
>> + /* Route to respective Command handling function */
>> + switch (ctx->command) {
>> + case DRM_RAS_CMD_LIST_NODES:
>> + return list_nodes_handler(msg, arg);
>> + case DRM_RAS_CMD_GET_ERROR_COUNTERS:
>> + return get_error_counters(msg, arg);
>> + case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
>> + return query_error_counter(msg, arg);
>> + default:
>> + fprintf(stderr, "Unknown command: %d\n", ctx->command);
>> + ctx->error = -EOPNOTSUPP;
>> + return NL_SKIP;
>> + }
>> +}
>> +
>> +static void send_cmd(int cmd, void *arg)
>> +{
>> + struct app_context *ctx = (struct app_context *)arg;
>> + struct nl_msg *msg;
>> + void *msg_head;
>> + int ret;
>> +
>> + msg = nlmsg_alloc();
>> + if (!msg)
>> + nl_cli_fatal(NLE_INVAL, "nlmsg_alloc failed\n");
>> +
>> + switch (cmd) {
>> + case DRM_RAS_CMD_LIST_NODES:
>> + msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
>> + ctx->family_id, 0,
>> + NLM_F_REQUEST | NLM_F_ACK | NLM_F_ROOT | NLM_F_MATCH,
>> + cmd, 1);
>> + if (!msg_head)
>> + nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
>> +
>> + printf("%-18s\t%-30s\t%-30s\t%-18s\n",
>> + "node-id", "device-name", "node-name", "node-type");
>> + break;
>> + case DRM_RAS_CMD_GET_ERROR_COUNTERS:
>> + if (!ctx->node_id_set) {
>> + fprintf(stderr, "Error: --node-id is required for %s command\n",
>> + cmd_names[ctx->command - 1]);
>> + exit(EXIT_FAILURE);
>> + }
>> + msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
>> + ctx->family_id, 0,
>> + NLM_F_REQUEST | NLM_F_ACK | NLM_F_ROOT | NLM_F_MATCH,
>> + cmd, 1);
>> +
>> + if (!msg_head)
>> + nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
>> +
>> + nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, ctx->node_id);
>> + printf("%-18s\t%-30s\t%-18s\n",
>> + "error-id", "error-name", "error-value");
>> + break;
>> + case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
>> + if (!ctx->node_id_set || !ctx->error_id_set) {
>> + fprintf(stderr,
>> + "Error: --node-id and --error-id are required "
>> + "for %s command\n",
>> + cmd_names[ctx->command - 1]);
>> + exit(EXIT_FAILURE);
>> + }
>> + msg_head = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ,
>> + ctx->family_id, 0,
>> + NLM_F_REQUEST | NLM_F_ACK,
>> + cmd, 1);
>> +
>> + if (!msg_head)
>> + nl_cli_fatal(ENOMEM, "genlmsg_put failed\n");
>> +
>> + nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, ctx->node_id);
>> + nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID, ctx->error_id);
>> + break;
>> + default:
>> + break;
>> + }
>> +
>> + ret = nl_send_auto(ctx->sock, msg);
>> + if (ret < 0)
>> + nl_cli_fatal(ret, "Unable to send message: %s", nl_geterror(ret));
>> +
>> + ret = nl_recvmsgs_default(ctx->sock);
>> + if (ret < 0)
>> + nl_cli_fatal(ret, "Unable to receive message: %s", nl_geterror(ret));
>> +
>> + nlmsg_free(msg);
>> +}
>> +
>> +static int get_cmd(char *cmd_name)
>> +{
>> + int i;
>> +
>> + if (!cmd_name)
>> + return -1;
>> +
>> + for (i = 0; i < __DRM_RAS_CMD_MAX; i++) {
>> + if (strcasecmp(cmd_name, cmd_names[i]) == 0)
>> + return i + 1;
>> + }
>> + return -1;
>> +}
>> +
>> +static int check_for_help(int argc, char **argv)
>> +{
>> + for (int i = 1; i < argc; i++) {
>> + if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0)
>> + return 1;
>> + }
>> + return 0;
>> +}
>> +
>> +int main(int argc, char **argv)
>> +{
>> + char *endptr;
>> + enum opt_val val;
>> + int ret, opt, option_index = 0;
>> + struct app_context ctx = {0};
>> +
>> + // Check for help option before command parsing
>> + if (check_for_help(argc, argv)) {
>> + help(argv);
>> + exit(EXIT_SUCCESS);
>> + }
>> +
>> + //Parse the input command
>> + ctx.command = get_cmd(argv[1]);
>> + if (ctx.command < 0) {
>> + fprintf(stderr, "invalid command\n");
>> + help(argv);
>> + exit(EXIT_FAILURE);
>> + }
>> +
>> + static struct option options[] = {
>> + {"error-id", optional_argument, NULL, OPT_ERRORID},
>> + {"node-id", optional_argument, NULL, OPT_NODEID},
>> + {"help", no_argument, NULL, OPT_HELP},
>> + {0, 0, 0, 0}
>> + };
>> +
>> + optind = 2;
>> + while ((opt = getopt_long(argc, argv, "h", options, &option_index)) != -1) {
>> + switch (opt) {
>> + case OPT_ERRORID:
>> + if (optarg) {
>> + printf("Error ID: %s\n", optarg);
>> + //Assuming input is in Decimal Representation
>> + ctx.error_id = strtoul(optarg, &endptr, 10);
>> + if (*endptr != '\0') {
>> + fprintf(stderr, "invalid error id %s\n", optarg);
>> + exit(EXIT_FAILURE);
>> + }
>> + ctx.error_id_set = 1;
>> + } else {
>> + printf("Error ID not specified\n");
>> + ctx.error_id_set = 0;
>> + }
>> + break;
>> + case OPT_NODEID:
>> + if (optarg) {
>> + printf("Node ID: %s\n", optarg);
> no need to echo back
Ok.
>> + //Assuming input is in Decimal Representation
> besides the comment about comments I made above,
>
> we don't assume, we check...
Ok.
>> + ctx.node_id = strtoul(optarg, &endptr, 10);
>> + if (*endptr != '\0') {
> ...but we check before the conversion, not after.
>
>> + fprintf(stderr, "invalid node id %s\n", optarg);
>> + exit(EXIT_FAILURE);
>> + }
>> + ctx.node_id_set = 1;
>> + } else {
>> + printf("Node ID not specified\n");
> stderr print and exit?
> or if it is not an error flow you don't need to be that verbose...
Ok. I will remove the verbose prints here.
>> + ctx.node_id_set = 0;
> init the node_id to -1 and you can avoid this extra variable.
Ok. I will update the logic accordingly
>> + }
>> + break;
>> + case OPT_HELP:
>> + case 'h':
>> + help(argv);
>> + exit(EXIT_SUCCESS);
>> + break;
>> + case '?':
>> + fprintf(stderr, "Unknown option\n");
>> + exit(EXIT_FAILURE);
>> + break;
>> + default:
>> + fprintf(stderr, "Unexpected option: %c\n", opt);
>> + exit(EXIT_FAILURE);
>> + break;
>> + }
>> + }
>> +
>> + /* Create a Netlink Socket object*/
>> + ctx.sock = nl_cli_alloc_socket();
>> + if (!ctx.sock)
>> + nl_cli_fatal(NLE_NOMEM, "Cannot allocate nl_sock");
> why do we use cli_fatal? and when using it, why don't we exit?
nl_cli_fatal() prints the error message and terminates the program
immediately, so I am not calling exit() explicitly. I am also working on
updating the error handling logic in my next revision.
>> +
>> + /* Connect the allocated socket to NETLINK_GENERIC protocol*/
>> + ret = nl_cli_connect(ctx.sock, NETLINK_GENERIC);
>> + if (ret < 0)
>> + nl_cli_fatal(ret, "Cannot connect handle");
>> +
>> + /**
>> + * Resolves the Generic Netlink family name to the corresponding
>> + * numeric family identifier. This function queries the kernel directly
>> + */
>> + ctx.family_id = genl_ctrl_resolve(ctx.sock, DRM_RAS_GENL_NAME);
>> + if (ctx.family_id < 0)
>> + nl_cli_fatal(NLE_INVAL, "Resolving of \"%s\" failed", DRM_RAS_GENL_NAME);
>> +
>> + /* Modify the callback handler associated with the socket */
>> + ret = nl_socket_modify_cb(ctx.sock, NL_CB_VALID, NL_CB_CUSTOM, drm_genl_handle_msg, &ctx);
>> + if (ret < 0)
>> + nl_cli_fatal(ret, "Unable to modify valid message callback");
>> +
>> + send_cmd(ctx.command, &ctx);
>> +
>> + nl_close(ctx.sock);
>> + nl_socket_free(ctx.sock);
>> +
>> + return 0;
>> +}
>> diff --git a/tools/meson.build b/tools/meson.build
>> index 8185ba160..74ff97713 100644
>> --- a/tools/meson.build
>> +++ b/tools/meson.build
>> @@ -70,6 +70,11 @@ if libudev.found()
>> install : true)
>> endif
>>
>> +executable('drm_ras', 'drm_ras.c',
>> + dependencies : [tool_deps, libnl, libnl_cli, libnl_genl],
>> + install_rpath : bindir_rpathdir,
>> + install : true)
>> +
>> executable('gputop', 'gputop.c',
>> install : true,
>> install_rpath : bindir_rpathdir,
>> --
>> 2.34.1
>>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2025-11-19 8:44 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-14 10:07 [PATCH v2] tools/drm_ras: tool to communicate with DRM Netlink Subsystem Ravi Kishore Koppuravuri
2025-11-14 16:40 ` Rodrigo Vivi
2025-11-19 8:43 ` Koppuravuri, Ravi Kishore
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox