Re: [PATCH 3/3] drm/xe/xe_ras: Add RAS support for GPU health indicator

Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: "Purkait, Soham" <soham.purkait@intel.com>
To: "Nilawar, Badal" <badal.nilawar@intel.com>,
	<intel-xe@lists.freedesktop.org>, <riana.tauro@intel.com>,
	<anshuman.gupta@intel.com>, <aravind.iddamsetty@linux.intel.com>,
	<raag.jadav@intel.com>, <ravi.kishore.koppuravuri@intel.com>,
	<mallesh.koujalagi@intel.com>
Cc: <anoop.c.vijay@intel.com>
Subject: Re: [PATCH 3/3] drm/xe/xe_ras: Add RAS support for GPU health indicator
Date: Tue, 14 Apr 2026 16:46:37 +0530	[thread overview]
Message-ID: <60d27d8b-5b32-4631-a28f-a8a1e9343176@intel.com> (raw)
In-Reply-To: <6e821825-872f-4246-93da-03f1f8c42998@intel.com>

Hi Badal,

On 08-04-2026 17:19, Nilawar, Badal wrote:
>
> On 09-03-2026 10:47, Soham Purkait wrote:
>> GPU health indicator exposes a single sysfs interface (gpu_health),
>> placed in the device level that allows administrators and user-space
>> tools to both query and modify the GPU health status.
>>
>> Signed-off-by: Soham Purkait <soham.purkait@intel.com>
>> ---
>>   drivers/gpu/drm/xe/Makefile    |   1 +
>>   drivers/gpu/drm/xe/xe_device.c |   3 +
>>   drivers/gpu/drm/xe/xe_ras.c    | 166 +++++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/xe/xe_ras.h    |  13 +++
>>   4 files changed, 183 insertions(+)
>>   create mode 100644 drivers/gpu/drm/xe/xe_ras.c
>>   create mode 100644 drivers/gpu/drm/xe/xe_ras.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 1890bbd1b28d..ee18638f73c3 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -110,6 +110,7 @@ xe-y += xe_bb.o \
>>       xe_pxp_debugfs.o \
>>       xe_pxp_submit.o \
>>       xe_query.o \
>> +    xe_ras.o \
>>       xe_range_fence.o \
>>       xe_reg_sr.o \
>>       xe_reg_whitelist.o \
>> diff --git a/drivers/gpu/drm/xe/xe_device.c 
>> b/drivers/gpu/drm/xe/xe_device.c
>> index 1d61bb504e9b..2283a18e1034 100644
>> --- a/drivers/gpu/drm/xe/xe_device.c
>> +++ b/drivers/gpu/drm/xe/xe_device.c
>> @@ -60,6 +60,7 @@
>>   #include "xe_psmi.h"
>>   #include "xe_pxp.h"
>>   #include "xe_query.h"
>> +#include "xe_ras.h"
>>   #include "xe_shrinker.h"
>>   #include "xe_soc_remapper.h"
>>   #include "xe_survivability_mode.h"
>> @@ -1009,6 +1010,8 @@ int xe_device_probe(struct xe_device *xe)
>>         xe_vsec_init(xe);
>>   +    xe_ras_init(xe);
>> +
>>       err = xe_sriov_init_late(xe);
>>       if (err)
>>           goto err_unregister_display;
>> diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
>> new file mode 100644
>> index 000000000000..44324fe3273b
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_ras.c
>> @@ -0,0 +1,166 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2026 Intel Corporation
>> + */
>> +
>> +#include "xe_device.h"
>> +#include "xe_device_types.h"
>> +#include "xe_printk.h"
>> +#include "xe_ras.h"
>> +#include "xe_ras_types.h"
>> +#include "xe_sysctrl_mailbox.h"
>> +#include "xe_sysctrl_mailbox_types.h"
>> +
>> +static const char * const gpu_health_states[] = { "ok", "warning", 
>> "critical" };
>> +static const char * const gpu_health_fmt[] = {
>> +    "[%s] %s %s\n",
>> +    "%s [%s] %s\n",
>> +    "%s %s [%s]\n",
>> +};
>> +
>> +static void prepare_sysctrl_command(struct 
>> xe_sysctrl_mailbox_command *command,
>> +                    u32 cmd_mask, void *request, size_t request_len,
>> +                    void *response, size_t response_len)
>> +{
>> +    struct xe_sysctrl_app_msg_hdr hdr = {0};
>> +    u32 req_hdr;
>> +
>> +    req_hdr = FIELD_PREP(APP_HDR_GROUP_ID_MASK, 
>> XE_SYSCTRL_GROUP_GFSP) |
>> +          FIELD_PREP(APP_HDR_COMMAND_MASK, cmd_mask);
>> +
>> +    hdr.data = req_hdr;
>> +    command->header = hdr;
>> +    command->data_in = request;
>> +    command->data_in_len = request_len;
>> +    command->data_out = response;
>> +    command->data_out_len = response_len;
>> +}
>> +
>> +static ssize_t gpu_health_show(struct device *dev, struct 
>> device_attribute *attr, char *buf)
>> +{
>> +    struct xe_device *xe = kdev_to_xe_device(dev);
>> +    struct xe_sysctrl_mailbox_command command = {0};
>> +    struct xe_ras_health_get_response response = {0};
>> +    struct xe_ras_health_get_input request = {0};
>> +    u8 health;
>> +    int ret;
>> +    size_t rlen = 0;
>> +
>> +    prepare_sysctrl_command(&command, XE_SYSCTRL_CMD_GET_HEALTH, 
>> &request,
>> +                sizeof(request), &response, sizeof(response));
>> +    ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen);
>> +    if (ret) {
>> +        xe_err(xe, "[RAS]: Sysctrl error ret %d\n", ret);
>> +        return -EIO;
>> +    }
>> +    if (rlen != sizeof(response)) {
>> +        xe_err(xe,
>> +               "[RAS]: invalid Sysctrl response length %zu (expected 
>> %zu)\n",
>> +               rlen, sizeof(response));
>> +        return -EIO;
>> +    }
>> +    if (response.current_health >= ARRAY_SIZE(gpu_health_states)) {
>> +        xe_err(xe, "[RAS]: invalid health state %u from Sysctrl\n",
>> +               response.current_health);
>> +        return -EIO;
>> +    }
>> +
>> +    health = response.current_health;
>> +
>> +    xe_dbg(xe, "[RAS]: %s state = %d (%s)\n",
>> +           __func__, health, gpu_health_states[health]);
>> +
>> +    return sysfs_emit(buf, gpu_health_fmt[health],
>> +              gpu_health_states[0],
>> +              gpu_health_states[1],
>> +              gpu_health_states[2]);
>> +}
>> +
>> +static ssize_t gpu_health_store(struct device *dev, struct 
>> device_attribute *attr,
>> +                const char *buf, size_t count)
>> +{
>> +    struct xe_device *xe = kdev_to_xe_device(dev);
>> +    struct xe_sysctrl_mailbox_command command = {0};
>> +    struct xe_ras_health_set_input request = {0};
>> +    struct xe_ras_health_set_response response = {0};
>> +    u8 health;
>> +    int ret;
>> +    size_t rlen = 0;
>> +    int state;
>> +
>> +    state = __sysfs_match_string(gpu_health_states,
>> +                     ARRAY_SIZE(gpu_health_states),
>> +                     buf);
>> +    if (state < 0)
>> +        return -EINVAL;
>> +
>> +    request.new_health = (xe_ras_health_status_t)state;
>> +
>> +    prepare_sysctrl_command(&command, XE_SYSCTRL_CMD_SET_HEALTH, 
>> &request,
>> +                sizeof(request), &response, sizeof(response));
>> +    ret = xe_sysctrl_send_command(&xe->sc, &command, &rlen);
>> +    if (ret) {
>> +        xe_err(xe, "[RAS]: Sysctrl error ret %d\n", ret);
>> +        return -EIO;
>> +    }
>> +    if (rlen != sizeof(response)) {
>> +        xe_err(xe,
>> +               "[RAS]: invalid Sysctrl response length %zu (expected 
>> %zu)\n",
>> +               rlen, sizeof(response));
>> +        return -EIO;
>> +    }
>> +    if (response.current_health >= ARRAY_SIZE(gpu_health_states)) {
>> +        xe_err(xe, "[RAS]: invalid health state %u from Sysctrl\n",
>> +               response.current_health);
>> +        return -EIO;
>> +    }
>> +
>> +    health = response.current_health;
>> +
>> +    xe_dbg(xe, "[RAS]: %s state=%d (%s)\n",
>> +           __func__, health, gpu_health_states[health]);
>> +
>> +    return count;
>> +}
>
> The function sets the health status, but its purpose is unclear to me. 
> What happens if the health status is set to critical? How does the 
> device behave in that case, and why and under what scenario would a 
> user need to set this status?
Setting the health status to "critical" is a way for the system (or an 
admin) to flag that the device has a serious issue and should not be 
used for new workloads. In this patch the driver only forwards the 
requested state to the system controller (XE_SYSCTRL_CMD_SET_HEALTH); 
it does not change how the driver itself operates. While the device is 
in this state, management tools and orchestration software will 
typically stop scheduling work on it and alert operators to 
investigate. The status is set when hardware faults, persistent errors, 
or other critical problems are detected, or when an admin wants to 
proactively take the device out of service for maintenance or 
troubleshooting.

Thanks,
Soham

>
> Thanks,
> Badal
>
>> +
>> +static DEVICE_ATTR_ADMIN_RW(gpu_health);
>> +
>> +static void gpu_health_sysfs_fini(void *arg)
>> +{
>> +    struct device *dev = arg;
>> +
>> +    device_remove_file(dev, &dev_attr_gpu_health);
>> +}
>> +
>> +static void gpu_health_indicator_sysfs_init(struct xe_device *xe)
>> +{
>> +    struct device *dev = xe->drm.dev;
>> +    int err;
>> +
>> +    err = device_create_file(dev, &dev_attr_gpu_health);
>> +    if (err)
>> +        goto err;
>> +
>> +    err = devm_add_action_or_reset(dev, gpu_health_sysfs_fini, dev);
>> +    if (err)
>> +        goto err;
>> +
>> +    return;
>> +
>> +err:
>> +    xe_err(xe, "[RAS]: failed to initialize GPU health sysfs, 
>> err=%d\n", err);
>> +}
>> +
>> +/**
>> + * xe_ras_init - Initialize Xe RAS
>> + * @xe: xe device instance
>> + *
>> + * Initialize Xe RAS
>> + */
>> +void xe_ras_init(struct xe_device *xe)
>> +{
>> +    if (!xe->info.has_sysctrl)
>> +        return;
>> +
>> +    gpu_health_indicator_sysfs_init(xe);
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
>> new file mode 100644
>> index 000000000000..14cb973603e7
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_ras.h
>> @@ -0,0 +1,13 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2026 Intel Corporation
>> + */
>> +
>> +#ifndef _XE_RAS_H_
>> +#define _XE_RAS_H_
>> +
>> +struct xe_device;
>> +
>> +void xe_ras_init(struct xe_device *xe);
>> +
>> +#endif

next prev parent reply	other threads:[~2026-04-14 11:16 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-09  5:17 [PATCH 0/3] drm/xe: Add support for GPU health indicator Soham Purkait
2026-03-09  5:17 ` [PATCH 1/3] From: Anoop Vijay <anoop.c.vijay@intel.com> Soham Purkait
2026-03-09  5:17 ` [PATCH 2/3] drm/xe/xe_ras: Add structures and commands for RAS GPU health indicator Soham Purkait
2026-03-09  5:17 ` [PATCH 3/3] drm/xe/xe_ras: Add RAS support for " Soham Purkait
2026-04-08 11:49   ` Nilawar, Badal
2026-04-14 11:16     ` Purkait, Soham [this message]
2026-03-09  5:28 ` ✗ CI.checkpatch: warning for drm/xe: Add " Patchwork
2026-03-09  5:30 ` ✓ CI.KUnit: success " Patchwork
2026-03-09  6:36 ` ✓ Xe.CI.BAT: " Patchwork
2026-03-09  8:31 ` ✗ Xe.CI.FULL: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=60d27d8b-5b32-4631-a28f-a8a1e9343176@intel.com \
    --to=soham.purkait@intel.com \
    --cc=anoop.c.vijay@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=aravind.iddamsetty@linux.intel.com \
    --cc=badal.nilawar@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=mallesh.koujalagi@intel.com \
    --cc=raag.jadav@intel.com \
    --cc=ravi.kishore.koppuravuri@intel.com \
    --cc=riana.tauro@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox