* [PATCH v1 0/3] DRM RAS Fixes
@ 2026-05-14 20:28 Raag Jadav
2026-05-14 20:28 ` [PATCH v1 1/3] drm/ras: Cancel and free message on get counter failure Raag Jadav
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Raag Jadav @ 2026-05-14 20:28 UTC (permalink / raw)
To: intel-xe, dri-devel, netdev
Cc: rodrigo.vivi, riana.tauro, maarten, airlied, simona, kuba,
Raag Jadav
Detailed descriptions are in the individual commit messages.
Raag Jadav (3):
drm/ras: Cancel and free message on get counter failure
drm/xe/drm_ras: Make counter allocation drm managed
drm/xe/drm_ras: Add per node cleanup action
drivers/gpu/drm/drm_ras.c | 5 +++-
drivers/gpu/drm/xe/xe_drm_ras.c | 47 +++++++++++++--------------------
2 files changed, 22 insertions(+), 30 deletions(-)
--
2.43.0
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH v1 1/3] drm/ras: Cancel and free message on get counter failure
2026-05-14 20:28 [PATCH v1 0/3] DRM RAS Fixes Raag Jadav
@ 2026-05-14 20:28 ` Raag Jadav
2026-05-14 20:28 ` [PATCH v1 2/3] drm/xe/drm_ras: Make counter allocation drm managed Raag Jadav
2026-05-14 20:28 ` [PATCH v1 3/3] drm/xe/drm_ras: Add per node cleanup action Raag Jadav
2 siblings, 0 replies; 4+ messages in thread
From: Raag Jadav @ 2026-05-14 20:28 UTC (permalink / raw)
To: intel-xe, dri-devel, netdev
Cc: rodrigo.vivi, riana.tauro, maarten, airlied, simona, kuba,
Raag Jadav
doit_reply_value() returns directly when get_node_error_counter() fails,
which leaks the sk_buff and the genetlink header already added to it.
Cancel the genetlink message and free the sk_buff before returning.
Fixes: c36218dc49f5 ("drm/ras: Introduce the DRM RAS infrastructure over generic netlink")
Signed-off-by: Raag Jadav <raag.jadav@intel.com>
---
drivers/gpu/drm/drm_ras.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/drm_ras.c b/drivers/gpu/drm/drm_ras.c
index d6eab29a1394..262a75f0c493 100644
--- a/drivers/gpu/drm/drm_ras.c
+++ b/drivers/gpu/drm/drm_ras.c
@@ -207,8 +207,11 @@ static int doit_reply_value(struct genl_info *info, u32 node_id,
ret = get_node_error_counter(node_id, error_id,
&error_name, &value);
- if (ret)
+ if (ret) {
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
return ret;
+ }
ret = msg_reply_value(msg, error_id, error_name, value);
if (ret) {
--
2.43.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH v1 2/3] drm/xe/drm_ras: Make counter allocation drm managed
2026-05-14 20:28 [PATCH v1 0/3] DRM RAS Fixes Raag Jadav
2026-05-14 20:28 ` [PATCH v1 1/3] drm/ras: Cancel and free message on get counter failure Raag Jadav
@ 2026-05-14 20:28 ` Raag Jadav
2026-05-14 20:28 ` [PATCH v1 3/3] drm/xe/drm_ras: Add per node cleanup action Raag Jadav
2 siblings, 0 replies; 4+ messages in thread
From: Raag Jadav @ 2026-05-14 20:28 UTC (permalink / raw)
To: intel-xe, dri-devel, netdev
Cc: rodrigo.vivi, riana.tauro, maarten, airlied, simona, kuba,
Raag Jadav
The cleanup action that calls cleanup_node_param() is not registered if
counter allocation fails, so the memory of previously set up nodes is not
cleaned up on unwind. Fix this by using a drm managed allocation, which is
guaranteed to be cleaned up on unwind.
Fixes: b40db12b542f ("drm/xe/xe_drm_ras: Add support for XE DRM RAS")
Signed-off-by: Raag Jadav <raag.jadav@intel.com>
---
drivers/gpu/drm/xe/xe_drm_ras.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index c21c8b428de6..89640ffb1c33 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -80,7 +80,7 @@ static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *x
struct xe_drm_ras_counter *counter;
int i;
- counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
+ counter = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
if (!counter)
return ERR_PTR(-ENOMEM);
@@ -135,9 +135,6 @@ static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_err
{
struct drm_ras_node *node = &ras->node[severity];
- kfree(ras->info[severity]);
- ras->info[severity] = NULL;
-
kfree(node->device_name);
node->device_name = NULL;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH v1 3/3] drm/xe/drm_ras: Add per node cleanup action
2026-05-14 20:28 [PATCH v1 0/3] DRM RAS Fixes Raag Jadav
2026-05-14 20:28 ` [PATCH v1 1/3] drm/ras: Cancel and free message on get counter failure Raag Jadav
2026-05-14 20:28 ` [PATCH v1 2/3] drm/xe/drm_ras: Make counter allocation drm managed Raag Jadav
@ 2026-05-14 20:28 ` Raag Jadav
2 siblings, 0 replies; 4+ messages in thread
From: Raag Jadav @ 2026-05-14 20:28 UTC (permalink / raw)
To: intel-xe, dri-devel, netdev
Cc: rodrigo.vivi, riana.tauro, maarten, airlied, simona, kuba,
Raag Jadav
xe_drm_ras_unregister_nodes() is registered only after all nodes have been
set up, so if setting up a later node fails, the previously registered
nodes are neither unregistered nor cleaned up on unwind. Add a per node
cleanup action, which guarantees cleanup on unwind and also simplifies the
cleanup logic.
Fixes: b40db12b542f ("drm/xe/xe_drm_ras: Add support for XE DRM RAS")
Signed-off-by: Raag Jadav <raag.jadav@intel.com>
---
drivers/gpu/drm/xe/xe_drm_ras.c | 42 +++++++++++++--------------------
1 file changed, 17 insertions(+), 25 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index 89640ffb1c33..40abde29a26f 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -131,14 +131,20 @@ static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
return 0;
}
-static void cleanup_node_param(struct xe_drm_ras *ras, const enum drm_xe_ras_error_severity severity)
+static void cleanup_node_param(struct drm_ras_node *node)
{
- struct drm_ras_node *node = &ras->node[severity];
-
kfree(node->device_name);
node->device_name = NULL;
}
+static void cleanup_node(struct drm_device *drm, void *arg)
+{
+ struct drm_ras_node *node = arg;
+
+ drm_ras_node_unregister(node);
+ cleanup_node_param(node);
+}
+
static int register_nodes(struct xe_device *xe)
{
struct xe_drm_ras *ras = &xe->ras;
@@ -150,13 +156,19 @@ static int register_nodes(struct xe_device *xe)
ret = assign_node_params(xe, node, i);
if (ret) {
- cleanup_node_param(ras, i);
+ cleanup_node_param(node);
return ret;
}
ret = drm_ras_node_register(node);
if (ret) {
- cleanup_node_param(ras, i);
+ cleanup_node_param(node);
+ return ret;
+ }
+
+ ret = drmm_add_action_or_reset(&xe->drm, cleanup_node, node);
+ if (ret) {
+ cleanup_node(&xe->drm, node);
return ret;
}
}
@@ -164,20 +176,6 @@ static int register_nodes(struct xe_device *xe)
return 0;
}
-static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
-{
- struct xe_device *xe = arg;
- struct xe_drm_ras *ras = &xe->ras;
- int i;
-
- for_each_error_severity(i) {
- struct drm_ras_node *node = &ras->node[i];
-
- drm_ras_node_unregister(node);
- cleanup_node_param(ras, i);
- }
-}
-
/**
* xe_drm_ras_init() - Initialize DRM RAS
* @xe: xe device instance
@@ -204,11 +202,5 @@ int xe_drm_ras_init(struct xe_device *xe)
return err;
}
- err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe);
- if (err) {
- drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err));
- return err;
- }
-
return 0;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-05-14 20:32 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-05-14 20:28 [PATCH v1 0/3] DRM RAS Fixes Raag Jadav
2026-05-14 20:28 ` [PATCH v1 1/3] drm/ras: Cancel and free message on get counter failure Raag Jadav
2026-05-14 20:28 ` [PATCH v1 2/3] drm/xe/drm_ras: Make counter allocation drm managed Raag Jadav
2026-05-14 20:28 ` [PATCH v1 3/3] drm/xe/drm_ras: Add per node cleanup action Raag Jadav
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.