From: Tejas Upadhyay <tejas.upadhyay@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: matthew.auld@intel.com, matthew.brost@intel.com,
thomas.hellstrom@linux.intel.com,
himal.prasad.ghimiray@intel.com,
Tejas Upadhyay <tejas.upadhyay@intel.com>
Subject: [RFC PATCH V7 09/10] drm/xe/configfs: Add vram bad page reservation policy
Date: Thu, 16 Apr 2026 13:19:58 +0530 [thread overview]
Message-ID: <20260416074958.3722666-21-tejas.upadhyay@intel.com> (raw)
In-Reply-To: <20260416074958.3722666-12-tejas.upadhyay@intel.com>
The interface enables setting the policy for how bad pages are
handled in VRAM. This is crucial for maintaining system
stability in scenarios where VRAM degradation occurs.
By default, the policy is "reserve"; it can only be changed to
"logging".
v3:
- All FW communication moved under RAS
v2:
- Add CRI check and rebase
Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
---
drivers/gpu/drm/xe/xe_configfs.c | 64 +++++++++++++++++++++++++++-
drivers/gpu/drm/xe/xe_configfs.h | 2 +
drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 10 +++++
3 files changed, 75 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
index 32102600a148..e07a6a74896b 100644
--- a/drivers/gpu/drm/xe/xe_configfs.c
+++ b/drivers/gpu/drm/xe/xe_configfs.c
@@ -61,7 +61,8 @@
* ├── survivability_mode
* ├── gt_types_allowed
* ├── engines_allowed
- * └── enable_psmi
+ * ├── enable_psmi
+ * └── bad_page_reservation
*
* After configuring the attributes as per next section, the device can be
* probed with::
@@ -159,6 +160,16 @@
*
* This attribute can only be set before binding to the device.
*
+ * Bad pages reservation:
+ * ---------------------
+ *
* Disable VRAM bad page reservation; instead, bad pages are only reported in dmesg.
+ * Example to disable it::
+ *
+ * # echo 0 > /sys/kernel/config/xe/0000:03:00.0/bad_page_reservation
+ *
+ * This attribute can only be set before binding to the device.
+ *
* Context restore BB
* ------------------
*
@@ -262,6 +273,7 @@ struct xe_config_group_device {
struct wa_bb ctx_restore_mid_bb[XE_ENGINE_CLASS_MAX];
bool survivability_mode;
bool enable_psmi;
+ bool bad_page_reservation;
struct {
unsigned int max_vfs;
bool admin_only_pf;
@@ -281,6 +293,7 @@ static const struct xe_config_device device_defaults = {
.engines_allowed = U64_MAX,
.survivability_mode = false,
.enable_psmi = false,
+ .bad_page_reservation = true,
.sriov = {
.max_vfs = XE_DEFAULT_MAX_VFS,
.admin_only_pf = XE_DEFAULT_ADMIN_ONLY_PF,
@@ -575,6 +588,32 @@ static ssize_t enable_psmi_store(struct config_item *item, const char *page, siz
return len;
}
+static ssize_t bad_page_reservation_show(struct config_item *item, char *page)
+{
+ struct xe_config_device *dev = to_xe_config_device(item);
+
+ return sprintf(page, "%d\n", dev->bad_page_reservation);
+}
+
+static ssize_t bad_page_reservation_store(struct config_item *item, const char *page, size_t len)
+{
+ struct xe_config_group_device *dev = to_xe_config_group_device(item);
+ bool val;
+ int ret;
+
+ ret = kstrtobool(page, &val);
+ if (ret)
+ return ret;
+
+ guard(mutex)(&dev->lock);
+ if (is_bound(dev))
+ return -EBUSY;
+
+ dev->config.bad_page_reservation = val;
+
+ return len;
+}
+
static bool wa_bb_read_advance(bool dereference, char **p,
const char *append, size_t len,
size_t *max_size)
@@ -813,6 +852,7 @@ static ssize_t ctx_restore_post_bb_store(struct config_item *item,
CONFIGFS_ATTR(, ctx_restore_mid_bb);
CONFIGFS_ATTR(, ctx_restore_post_bb);
CONFIGFS_ATTR(, enable_psmi);
+CONFIGFS_ATTR(, bad_page_reservation);
CONFIGFS_ATTR(, engines_allowed);
CONFIGFS_ATTR(, gt_types_allowed);
CONFIGFS_ATTR(, survivability_mode);
@@ -821,6 +861,7 @@ static struct configfs_attribute *xe_config_device_attrs[] = {
&attr_ctx_restore_mid_bb,
&attr_ctx_restore_post_bb,
&attr_enable_psmi,
+ &attr_bad_page_reservation,
&attr_engines_allowed,
&attr_gt_types_allowed,
&attr_survivability_mode,
@@ -1098,6 +1139,7 @@ static void dump_custom_dev_config(struct pci_dev *pdev,
PRI_CUSTOM_ATTR("%llx", gt_types_allowed);
PRI_CUSTOM_ATTR("%llx", engines_allowed);
PRI_CUSTOM_ATTR("%d", enable_psmi);
+ PRI_CUSTOM_ATTR("%d", bad_page_reservation);
PRI_CUSTOM_ATTR("%d", survivability_mode);
PRI_CUSTOM_ATTR("%u", sriov.admin_only_pf);
@@ -1225,6 +1267,26 @@ bool xe_configfs_get_psmi_enabled(struct pci_dev *pdev)
return ret;
}
+/**
+ * xe_configfs_get_bad_page_reservation - get configfs bad_page_reservation setting
+ * @pdev: pci device
+ *
+ * Return: bad_page_reservation setting in configfs
+ */
+bool xe_configfs_get_bad_page_reservation(struct pci_dev *pdev)
+{
+ struct xe_config_group_device *dev = find_xe_config_group_device(pdev);
+ bool ret;
+
+ if (!dev)
+ return device_defaults.bad_page_reservation;
+
+ ret = dev->config.bad_page_reservation;
+ config_group_put(&dev->group);
+
+ return ret;
+}
+
/**
* xe_configfs_get_ctx_restore_mid_bb - get configfs ctx_restore_mid_bb setting
* @pdev: pci device
diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
index 07d62bf0c152..c107d84b2c62 100644
--- a/drivers/gpu/drm/xe/xe_configfs.h
+++ b/drivers/gpu/drm/xe/xe_configfs.h
@@ -23,6 +23,7 @@ bool xe_configfs_primary_gt_allowed(struct pci_dev *pdev);
bool xe_configfs_media_gt_allowed(struct pci_dev *pdev);
u64 xe_configfs_get_engines_allowed(struct pci_dev *pdev);
bool xe_configfs_get_psmi_enabled(struct pci_dev *pdev);
+bool xe_configfs_get_bad_page_reservation(struct pci_dev *pdev);
u32 xe_configfs_get_ctx_restore_mid_bb(struct pci_dev *pdev,
enum xe_engine_class class,
const u32 **cs);
@@ -42,6 +43,7 @@ static inline bool xe_configfs_primary_gt_allowed(struct pci_dev *pdev) { return
static inline bool xe_configfs_media_gt_allowed(struct pci_dev *pdev) { return true; }
static inline u64 xe_configfs_get_engines_allowed(struct pci_dev *pdev) { return U64_MAX; }
static inline bool xe_configfs_get_psmi_enabled(struct pci_dev *pdev) { return false; }
+static inline bool xe_configfs_get_bad_page_reservation(struct pci_dev *pdev) { return true; }
static inline u32 xe_configfs_get_ctx_restore_mid_bb(struct pci_dev *pdev,
enum xe_engine_class class,
const u32 **cs) { return 0; }
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index fcf32360f240..7f58e7e8c3e1 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -12,6 +12,7 @@
#include <drm/ttm/ttm_range_manager.h>
#include "xe_bo.h"
+#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_exec_queue.h"
#include "xe_lrc.h"
@@ -731,6 +732,7 @@ int xe_ttm_vram_handle_addr_fault(struct xe_device *xe, unsigned long addr)
struct xe_ttm_vram_mgr *vram_mgr;
struct xe_vram_region *vr;
struct gpu_buddy *mm;
+ bool policy;
int ret;
vr = xe_ttm_vram_addr_to_region(xe, addr);
@@ -745,6 +747,14 @@ int xe_ttm_vram_handle_addr_fault(struct xe_device *xe, unsigned long addr)
/* TODO: Check if we already processed faulted address, and if yes return -EEXIST */
+ policy = xe_configfs_get_bad_page_reservation(to_pci_dev(xe->drm.dev));
+ if (!policy) {
+ drm_err(&xe->drm, "0x%lx is reported as corrupted address by HW\n",
+ addr);
+ /* Let RAS report to FW to drop addr from SRAM queue */
+ return -EOPNOTSUPP;
+ }
+
/* Reserve page at address */
ret = xe_ttm_vram_reserve_page_at_addr(xe, addr, vram_mgr, mm);
return ret;
--
2.52.0
next prev parent reply other threads:[~2026-04-16 7:50 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-16 7:49 [RFC PATCH V7 00/10] Add memory page offlining support Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 01/10] drm/xe: Link VRAM object with gpu buddy Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 02/10] gpu/buddy: Integrate lockdep for gpu buddy manager Tejas Upadhyay
2026-04-16 8:55 ` Matthew Auld
2026-04-16 9:43 ` Upadhyay, Tejas
2026-04-16 9:56 ` Matthew Auld
2026-04-16 10:04 ` Upadhyay, Tejas
2026-04-16 10:15 ` Matthew Auld
2026-04-16 10:18 ` Upadhyay, Tejas
2026-04-16 7:49 ` [RFC PATCH V7 03/10] drm/gpu: Add gpu_buddy_allocated_addr_to_block helper Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 04/10] drm/xe: Link LRC BO and its execution Queue Tejas Upadhyay
2026-04-30 3:34 ` Matthew Brost
2026-05-04 9:11 ` Upadhyay, Tejas
2026-04-16 7:49 ` [RFC PATCH V7 05/10] drm/xe: Extend BO purge to handle vram pages as well Tejas Upadhyay
2026-04-30 3:44 ` Matthew Brost
2026-04-30 12:08 ` Upadhyay, Tejas
2026-05-05 8:15 ` Yadav, Arvind
2026-04-16 7:49 ` [RFC PATCH V7 06/10] drm/xe: Handle physical memory address error Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 07/10] drm/xe/cri: Add debugfs to inject faulty vram address Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 08/10] gpu/buddy: Add routine to dump allocated buddy blocks Tejas Upadhyay
2026-04-16 7:49 ` Tejas Upadhyay [this message]
2026-04-16 7:49 ` [RFC PATCH V7 10/10] drm/xe/cri: Add sysfs interface for bad gpu vram pages Tejas Upadhyay
2026-04-30 13:53 ` Matthew Auld
2026-05-04 9:02 ` Upadhyay, Tejas
2026-05-05 8:44 ` Matthew Auld
2026-05-06 5:18 ` Upadhyay, Tejas
2026-04-16 7:56 ` ✗ CI.checkpatch: warning for Add memory page offlining support (rev8) Patchwork
2026-04-16 7:57 ` ✗ CI.KUnit: failure " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260416074958.3722666-21-tejas.upadhyay@intel.com \
--to=tejas.upadhyay@intel.com \
--cc=himal.prasad.ghimiray@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.auld@intel.com \
--cc=matthew.brost@intel.com \
--cc=thomas.hellstrom@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox