From: Tejas Upadhyay <tejas.upadhyay@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: matthew.auld@intel.com, matthew.brost@intel.com,
thomas.hellstrom@linux.intel.com,
himal.prasad.ghimiray@intel.com,
Tejas Upadhyay <tejas.upadhyay@intel.com>
Subject: [RFC PATCH V7 10/10] drm/xe/cri: Add sysfs interface for bad gpu vram pages
Date: Thu, 16 Apr 2026 13:19:59 +0530
Message-ID: <20260416074958.3722666-22-tejas.upadhyay@intel.com>
In-Reply-To: <20260416074958.3722666-12-tejas.upadhyay@intel.com>
Starting with CRI, add a sysfs interface designed to expose information
about bad VRAM pages, i.e. pages identified as having hardware faults
(e.g. ECC errors). This interface allows userspace tools and
administrators to monitor the health of the GPU's local memory and to
track the status of page retirement. Details on bad GPU VRAM pages can
be found under /sys/bus/pci/devices/<bdf>/vram_bad_pages, where each
entry has the format: pfn : gpu page size : flags
flags:
R: reserved, this gpu page is reserved.
P: pending reserve, this gpu page is marked as bad and will be reserved
in the next page_reserve window.
F: unable to reserve, this gpu page can't be reserved for some reason.
For example, reading the file with cat /sys/bus/pci/devices/<bdf>/vram_bad_pages
might show:
max_pages: 10000
0x00000000 : 0x00001000 : R
0x00001234 : 0x00001000 : P
v3:
- Move FW communication into RAS code
v2:
- Add max_pages info as per updated design doc
- Rebase
Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
---
drivers/gpu/drm/xe/xe_device_sysfs.c | 7 ++
drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 79 ++++++++++++++++++++++
drivers/gpu/drm/xe/xe_ttm_vram_mgr.h | 1 +
drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h | 2 +
4 files changed, 89 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c
index a73e0e957cb0..47c5be4180fe 100644
--- a/drivers/gpu/drm/xe/xe_device_sysfs.c
+++ b/drivers/gpu/drm/xe/xe_device_sysfs.c
@@ -8,12 +8,14 @@
#include <linux/pci.h>
#include <linux/sysfs.h>
+#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_device_sysfs.h"
#include "xe_mmio.h"
#include "xe_pcode_api.h"
#include "xe_pcode.h"
#include "xe_pm.h"
+#include "xe_ttm_vram_mgr.h"
/**
* DOC: Xe device sysfs
@@ -267,6 +269,7 @@ static const struct attribute_group auto_link_downgrade_attr_group = {
int xe_device_sysfs_init(struct xe_device *xe)
{
struct device *dev = xe->drm.dev;
+ bool policy;
int ret;
if (xe->d3cold.capable) {
@@ -285,5 +288,9 @@ int xe_device_sysfs_init(struct xe_device *xe)
return ret;
}
+ policy = xe_configfs_get_bad_page_reservation(to_pci_dev(dev));
+ if (xe->info.platform == XE_CRESCENTISLAND && policy)
+ xe_ttm_vram_sysfs_init(xe);
+
return 0;
}
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index 7f58e7e8c3e1..611d945c9eb4 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -760,3 +760,82 @@ int xe_ttm_vram_handle_addr_fault(struct xe_device *xe, unsigned long addr)
return ret;
}
EXPORT_SYMBOL(xe_ttm_vram_handle_addr_fault);
+
+static ssize_t xe_ttm_vram_dump_bad_pages_info(char *buf, struct xe_ttm_vram_mgr *mgr)
+{
+	struct xe_ttm_vram_offline_resource *pos;
+	struct gpu_buddy_block *block;
+	ssize_t s = 0;
+
+	mutex_lock(&mgr->lock);
+	s += scnprintf(buf + s, PAGE_SIZE - s, "max_pages: %u\n", mgr->max_pages);
+	list_for_each_entry(pos, &mgr->offlined_pages, offlined_link) {
+		block = list_first_entry(&pos->blocks,
+					 struct gpu_buddy_block,
+					 link);
+		s += scnprintf(buf + s, PAGE_SIZE - s,
+			       "0x%08llx : 0x%08llx : R\n",
+			       gpu_buddy_block_offset(block) >> PAGE_SHIFT,
+			       gpu_buddy_block_size(&mgr->mm, block));
+	}
+	list_for_each_entry(pos, &mgr->queued_pages, queued_link) {
+		block = list_first_entry(&pos->blocks,
+					 struct gpu_buddy_block,
+					 link);
+		s += scnprintf(buf + s, PAGE_SIZE - s,
+			       "0x%08llx : 0x%08llx : %c\n",
+			       gpu_buddy_block_offset(block) >> PAGE_SHIFT,
+			       gpu_buddy_block_size(&mgr->mm, block),
+			       pos->status ? 'P' : 'F');
+	}
+	mutex_unlock(&mgr->lock);
+
+	return s;
+}
+
+static ssize_t vram_bad_pages_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+	struct ttm_resource_manager *man;
+
+	man = ttm_manager_type(&xe->ttm, XE_PL_VRAM0);
+	if (!man)
+		return 0;
+
+	return xe_ttm_vram_dump_bad_pages_info(buf, to_xe_ttm_vram_mgr(man));
+}
+static DEVICE_ATTR_RO(vram_bad_pages);
+
+static void xe_ttm_vram_sysfs_fini(void *arg)
+{
+ struct xe_device *xe = arg;
+
+ device_remove_file(xe->drm.dev, &dev_attr_vram_bad_pages);
+}
+
+/**
+ * xe_ttm_vram_sysfs_init - Initialize vram sysfs component
+ * @xe: Xe device object
+ *
+ * It needs to be called after the main device sysfs component is ready
+ *
+ * Returns: 0 on success, negative error code on error.
+ */
+int xe_ttm_vram_sysfs_init(struct xe_device *xe)
+{
+ int err;
+
+ err = device_create_file(xe->drm.dev, &dev_attr_vram_bad_pages);
+ if (err) {
+ dev_err(xe->drm.dev, "Failed to create vram_bad_pages sysfs file: %d\n", err);
+		return err;
+ }
+
+ return devm_add_action_or_reset(xe->drm.dev, xe_ttm_vram_sysfs_fini, xe);
+}
+EXPORT_SYMBOL(xe_ttm_vram_sysfs_init);
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
index 8ef06d9d44f7..c33e1a8d9217 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
@@ -32,6 +32,7 @@ void xe_ttm_vram_get_used(struct ttm_resource_manager *man,
u64 *used, u64 *used_visible);
int xe_ttm_vram_handle_addr_fault(struct xe_device *xe, unsigned long addr);
+int xe_ttm_vram_sysfs_init(struct xe_device *xe);
static inline struct xe_ttm_vram_mgr_resource *
to_xe_ttm_vram_mgr_resource(struct ttm_resource *res)
{
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
index 07ed88b47e04..b23796066a1a 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
@@ -39,6 +39,8 @@ struct xe_ttm_vram_mgr {
u32 mem_type;
/** @offline_mode: debugfs hook for setting page offline mode */
u64 offline_mode;
+ /** @max_pages: max pages that can be in offline queue retrieved from FW */
+ u16 max_pages;
};
/**
--
2.52.0