From: Tejas Upadhyay <tejas.upadhyay@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: matthew.auld@intel.com, matthew.brost@intel.com,
himal.prasad.ghimiray@intel.com,
Tejas Upadhyay <tejas.upadhyay@intel.com>
Subject: [RFC PATCH 5/5] [DO NOT REVIEW] drm/xe/cri: Add sysfs interface for bad GPU VRAM pages
Date: Wed, 11 Feb 2026 10:31:38 +0530 [thread overview]
Message-ID: <20260211050132.1332599-12-tejas.upadhyay@intel.com> (raw)
In-Reply-To: <20260211050132.1332599-7-tejas.upadhyay@intel.com>
As a start for CRI, add a sysfs interface that exposes information
about bad VRAM pages, i.e. pages identified as having hardware faults
(e.g. ECC errors). This interface allows userspace tools and
administrators to monitor the health of the GPU's local memory and
track the status of page retirement. Details on bad GPU VRAM pages
can be found under /sys/bus/pci/devices/<bdf>/vram_bad_pages.
Each line has the format:
    pfn : gpu page size : flags
flags:
    R: reserved, this GPU page has been reserved.
    P: pending reserve, this GPU page is marked as bad and will be
       reserved in the next page_reserve window.
    F: failed to reserve, this GPU page could not be reserved.
For example, reading the file with
cat /sys/bus/pci/devices/<bdf>/vram_bad_pages may show:
    0x00000000 : 0x00001000 : R
    0x00001234 : 0x00001000 : P
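For illustration only, not part of this patch: a minimal userspace
sketch that parses this format could look as follows. The <bdf> in
the sysfs path is a placeholder for the actual device address.

	/* Hypothetical sketch: walk vram_bad_pages and print each entry. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long pfn, size;
		char flag;
		FILE *f = fopen("/sys/bus/pci/devices/<bdf>/vram_bad_pages", "r");

		if (!f)
			return 1;
		/* Each line: "0x%08llx : 0x%08llx : <R|P|F>" */
		while (fscanf(f, " 0x%llx : 0x%llx : %c", &pfn, &size, &flag) == 3)
			printf("pfn=0x%llx size=0x%llx state=%c\n", pfn, size, flag);
		fclose(f);
		return 0;
	}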
Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
---
drivers/gpu/drm/xe/xe_device_sysfs.c | 2 +
drivers/gpu/drm/xe/xe_tile_sysfs.c | 1 +
drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 78 ++++++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_ttm_vram_mgr.h | 1 +
4 files changed, 82 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c
index a73e0e957cb0..e6a017601428 100644
--- a/drivers/gpu/drm/xe/xe_device_sysfs.c
+++ b/drivers/gpu/drm/xe/xe_device_sysfs.c
@@ -14,6 +14,7 @@
#include "xe_pcode_api.h"
#include "xe_pcode.h"
#include "xe_pm.h"
+#include "xe_ttm_vram_mgr.h"
/**
* DOC: Xe device sysfs
@@ -284,6 +285,7 @@ int xe_device_sysfs_init(struct xe_device *xe)
if (ret)
return ret;
}
+ xe_ttm_vram_sysfs_init(xe);
return 0;
}
diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.c b/drivers/gpu/drm/xe/xe_tile_sysfs.c
index 9e1236a9ec67..beaf20ecddb4 100644
--- a/drivers/gpu/drm/xe/xe_tile_sysfs.c
+++ b/drivers/gpu/drm/xe/xe_tile_sysfs.c
@@ -10,6 +10,7 @@
#include "xe_pm.h"
#include "xe_tile.h"
#include "xe_tile_sysfs.h"
+#include "xe_ttm_vram_mgr.h"
#include "xe_vram_freq.h"
static void xe_tile_sysfs_kobj_release(struct kobject *kobj)
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index 1ad98ef8b54e..f06416b92962 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -296,6 +296,37 @@ static void xe_ttm_vram_free_bad_pages(struct drm_device *dev, struct xe_ttm_vra
mutex_unlock(&mgr->lock);
}
+static void xe_ttm_vram_dump_bad_pages_info(char *buf, struct xe_ttm_vram_mgr *mgr)
+{
+	struct xe_ttm_offline_resource *pos, *n;
+	struct drm_buddy_block *block;
+	/* Append after any output already emitted for another manager. */
+	ssize_t s = strlen(buf);
+
+	mutex_lock(&mgr->lock);
+	/* Pages already reserved (offlined) are reported with flag R. */
+	list_for_each_entry_safe(pos, n, &mgr->offlined_pages, offlined_link) {
+		block = list_first_entry(&pos->blocks,
+					 struct drm_buddy_block,
+					 link);
+		s += scnprintf(buf + s, PAGE_SIZE - s,
+			       "0x%08llx : 0x%08llx : R\n",
+			       drm_buddy_block_offset(block) >> PAGE_SHIFT,
+			       drm_buddy_block_size(&mgr->mm, block));
+	}
+	/* Pages queued for the next reserve window are reported with flag P. */
+	list_for_each_entry_safe(pos, n, &mgr->queued_pages, queued_link) {
+		block = list_first_entry(&pos->blocks,
+					 struct drm_buddy_block,
+					 link);
+		s += scnprintf(buf + s, PAGE_SIZE - s,
+			       "0x%08llx : 0x%08llx : P\n",
+			       drm_buddy_block_offset(block) >> PAGE_SHIFT,
+			       drm_buddy_block_size(&mgr->mm, block));
+	}
+	mutex_unlock(&mgr->lock);
+}
+
static void xe_ttm_vram_mgr_fini(struct drm_device *dev, void *arg)
{
struct xe_device *xe = to_xe_device(dev);
@@ -755,3 +786,50 @@ int xe_ttm_tbo_handle_addr_fault(struct xe_tile *tile, unsigned long addr)
return ret;
}
EXPORT_SYMBOL(xe_ttm_tbo_handle_addr_fault);
+
+static ssize_t vram_bad_pages_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct xe_device *xe = pdev_to_xe_device(pdev);
+	struct ttm_resource_manager *man;
+	u8 mem_type = XE_PL_VRAM1;
+
+	buf[0] = '\0';
+	do {
+		man = ttm_manager_type(&xe->ttm, mem_type);
+		/* Only dereference the manager once we know it exists. */
+		if (man)
+			xe_ttm_vram_dump_bad_pages_info(buf, to_xe_ttm_vram_mgr(man));
+		--mem_type;
+	} while (mem_type >= XE_PL_VRAM0);
+
+	return strlen(buf);
+}
+static DEVICE_ATTR_RO(vram_bad_pages);
+
+static void xe_ttm_vram_sysfs_fini(void *arg)
+{
+ struct xe_device *xe = arg;
+
+ device_remove_file(xe->drm.dev, &dev_attr_vram_bad_pages);
+}
+
+/**
+ * xe_ttm_vram_sysfs_init - Initialize the VRAM sysfs component
+ * @xe: Xe device object
+ *
+ * Needs to be called after the main device sysfs component is ready.
+ *
+ * Returns: 0 on success, negative error code on error.
+ */
+int xe_ttm_vram_sysfs_init(struct xe_device *xe)
+{
+ int err;
+
+ err = device_create_file(xe->drm.dev, &dev_attr_vram_bad_pages);
+ if (err)
+		return err;
+
+ return devm_add_action_or_reset(xe->drm.dev, xe_ttm_vram_sysfs_fini, xe);
+}
+EXPORT_SYMBOL(xe_ttm_vram_sysfs_init);
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
index 5872e8b48779..6e69140c0be8 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h
@@ -34,6 +34,7 @@ void xe_ttm_vram_get_used(struct ttm_resource_manager *man,
int xe_ttm_tbo_handle_addr_fault(struct xe_tile *tile, unsigned long addr);
void xe_ttm_vram_dump_allocated_blocks(struct drm_device *dev, struct drm_buddy *mm,
struct drm_printer *p);
+int xe_ttm_vram_sysfs_init(struct xe_device *xe);
static inline struct xe_ttm_vram_mgr_resource *
to_xe_ttm_vram_mgr_resource(struct ttm_resource *res)
{
--
2.52.0
Thread overview: 8+ messages
2026-02-11 5:01 [RFC PATCH 0/5] Add memory page offlining support Tejas Upadhyay
2026-02-11 5:01 ` [RFC PATCH 1/5] drm/xe: Implement VRAM object tracking ability using physical address Tejas Upadhyay
2026-02-11 6:26 ` Matthew Brost
2026-02-12 4:49 ` Upadhyay, Tejas
2026-02-11 5:01 ` [RFC PATCH 2/5] drm/xe: Handle physical memory address error Tejas Upadhyay
2026-02-11 5:01 ` [RFC PATCH 3/5] [DO_NOT_REVIEW]drm/xe/cri: Add debugfs to inject faulty vram address Tejas Upadhyay
2026-02-11 5:01 ` [RFC PATCH 4/5] drm/xe: Add routine to dump allocated VRAM blocks Tejas Upadhyay
2026-02-11 5:01 ` Tejas Upadhyay [this message]