From: Tejas Upadhyay <tejas.upadhyay@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: matthew.auld@intel.com, matthew.brost@intel.com,
thomas.hellstrom@linux.intel.com,
himal.prasad.ghimiray@intel.com,
Tejas Upadhyay <tejas.upadhyay@intel.com>
Subject: [RFC PATCH V7 07/10] drm/xe/cri: Add debugfs to inject faulty vram address
Date: Thu, 16 Apr 2026 13:19:56 +0530 [thread overview]
Message-ID: <20260416074958.3722666-19-tejas.upadhyay@intel.com> (raw)
In-Reply-To: <20260416074958.3722666-12-tejas.upadhyay@intel.com>
Add a debugfs interface that helps test the feature through manual error
injection. The interface lets the drm/xe driver inject faulty VRAM
addresses, which makes it possible to test the CRI memory page offline
feature before it is fully functional. The implementation creates a
debugfs entry at
/sys/kernel/debug/dri/bdf/invalid_addr_vram0,
which accepts the type of faulty address to validate.
For example,
echo 0 > /sys/kernel/debug/dri/bdf/invalid_addr_vram0
where 0 selects one of the address types below to be tested,
enum mempage_offline_mode {
MEMPAGE_OFFLINE_UNALLOCATED = 0,
MEMPAGE_OFFLINE_USER_ALLOCATED = 1,
MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED = 2,
MEMPAGE_OFFLINE_KERNEL_USER_PPGTT_ALLOCATED = 3,
MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED = 4,
MEMPAGE_OFFLINE_RESERVED = 5
};
v4:
- Use scope_guard around lock, adapt bo->q and enhance warn messages
- %s/gpu_buddy_addr_to_block/gpu_buddy_allocated_addr_to_block
v3:
- Add more specific noncritical bo tests
v2:
- Add mode based automated test vs manual address feed
Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
---
drivers/gpu/drm/xe/xe_debugfs.c | 171 +++++++++++++++++++++
drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h | 2 +
2 files changed, 173 insertions(+)
diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index c9d4484821af..ce899aa363b1 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -14,6 +14,7 @@
#include "regs/xe_pmt.h"
#include "xe_bo.h"
#include "xe_device.h"
+#include "xe_exec_queue_types.h"
#include "xe_force_wake.h"
#include "xe_gt.h"
#include "xe_gt_debugfs.h"
@@ -21,6 +22,7 @@
#include "xe_guc_ads.h"
#include "xe_hw_engine.h"
#include "xe_mmio.h"
+#include "xe_migrate.h"
#include "xe_pm.h"
#include "xe_psmi.h"
#include "xe_pxp_debugfs.h"
@@ -29,6 +31,8 @@
#include "xe_sriov_vf.h"
#include "xe_step.h"
#include "xe_tile_debugfs.h"
+#include "xe_ttm_stolen_mgr.h"
+#include "xe_ttm_vram_mgr.h"
#include "xe_vsec.h"
#include "xe_wa.h"
@@ -40,6 +44,14 @@
DECLARE_FAULT_ATTR(gt_reset_failure);
DECLARE_FAULT_ATTR(inject_csc_hw_error);
+enum mempage_offline_mode { /* test modes accepted by the debugfs write handler */
+ MEMPAGE_OFFLINE_UNALLOCATED = 0, /* address with no allocated buddy block */
+ MEMPAGE_OFFLINE_USER_ALLOCATED = 1, /* device-type BO with XE_BO_FLAG_USER in VRAM */
+ MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED = 2, /* BO with an exec queue outside the migrate VM */
+ MEMPAGE_OFFLINE_KERNEL_USER_PPGTT_ALLOCATED = 3, /* kernel page-table BO with FORCE_USER_VRAM */
+ MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED = 4, /* kernel BO without FORCE_USER_VRAM */
+ MEMPAGE_OFFLINE_RESERVED = 5, /* fault injected at the stolen-memory GPU offset */
+};
static void read_residency_counter(struct xe_device *xe, struct xe_mmio *mmio,
u32 offset, const char *name, struct drm_printer *p)
@@ -544,6 +556,154 @@ static const struct file_operations disable_late_binding_fops = {
.write = disable_late_binding_set,
};
+/*
+ * debugfs read handler: prints the value held in the VRAM manager's
+ * offline_mode field as a decimal number followed by a newline.
+ *
+ * Returns the number of bytes copied to @ubuf, or a negative errno from
+ * simple_read_from_buffer().
+ */
+static ssize_t addr_fault_reporting_show(struct file *f, char __user *ubuf,
+					 size_t size, loff_t *pos)
+{
+	struct xe_device *xe = file_inode(f)->i_private;
+	char buf[32];
+	int len;
+
+	/* offline_mode is a u64, so it must be formatted as unsigned. */
+	len = scnprintf(buf, sizeof(buf), "%llu\n", xe->mem.vram->ttm.offline_mode);
+
+	return simple_read_from_buffer(ubuf, size, pos, buf, len);
+}
+
+/*
+ * Tell whether @bo is the kind of object that @mode asks to offline.
+ *
+ * For the GGTT mode a migrate VM reference is taken and dropped again while
+ * comparing it against the VM of the BO's exec queue.
+ */
+static bool mempage_bo_matches_mode(struct xe_tile *tile, struct xe_bo *bo, u64 mode)
+{
+	bool kernel_bo = bo->ttm.type == ttm_bo_type_kernel;
+	struct xe_vm *migrate_vm;
+	bool match;
+
+	switch (mode) {
+	case MEMPAGE_OFFLINE_USER_ALLOCATED:
+		return bo->ttm.type == ttm_bo_type_device &&
+		       (bo->flags & XE_BO_FLAG_USER) &&
+		       (bo->flags & XE_BO_FLAG_VRAM_MASK);
+	case MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED:
+		/* lrc */
+		if (!bo->q)
+			return false;
+		migrate_vm = xe_migrate_get_vm(tile->migrate);
+		match = migrate_vm != bo->q->vm;
+		xe_vm_put(migrate_vm);
+		return match;
+	case MEMPAGE_OFFLINE_KERNEL_USER_PPGTT_ALLOCATED:
+		/* ppgtt */
+		return kernel_bo &&
+		       (bo->flags & XE_BO_FLAG_FORCE_USER_VRAM) &&
+		       (bo->flags & XE_BO_FLAG_PAGETABLE);
+	case MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED:
+		return kernel_bo && !(bo->flags & XE_BO_FLAG_FORCE_USER_VRAM);
+	default:
+		return false;
+	}
+}
+
+/*
+ * Walk the root tile's VRAM in 4K steps, pick the first page whose state
+ * matches @mode, report a fault on it and check the result.
+ *
+ * Returns 0 on success, the error from xe_ttm_vram_handle_addr_fault() if
+ * reporting failed, or -EBUSY if the faulted page is afterwards not covered
+ * by an allocated block without an owner. In the GGTT mode an -EIO from the
+ * fault handler is not fatal: the scan moves on to the next page.
+ *
+ * NOTE(review): when no page matches, a warning is logged but 0 is returned,
+ * so the caller reports the mode as passed - confirm this is intended.
+ */
+static int mempage_exec_offline(struct xe_device *xe, u64 mode)
+{
+	struct xe_tile *tile = xe_device_get_root_tile(xe);
+	struct xe_vram_region *vr = tile->mem.vram;
+	struct xe_ttm_vram_mgr *vram_mgr = &vr->ttm;
+	struct gpu_buddy *mm = &vram_mgr->mm;
+	/* One past the last byte: the region is [dpa_base, end). */
+	u64 end = vr->dpa_base + vr->actual_physical_size;
+	struct gpu_buddy_block *block;
+	bool do_offline = false;
+	int ret = 0;
+	u64 addr;
+
+	for (addr = vr->dpa_base; addr < end; addr += SZ_4K) {
+		do_offline = false;
+
+		/*
+		 * No continue/break in here: scoped_guard() is itself a loop,
+		 * so those would only leave the guard, not this for loop.
+		 */
+		scoped_guard(mutex, &vram_mgr->lock) {
+			block = gpu_buddy_allocated_addr_to_block(mm, addr);
+			if (!block)
+				do_offline = mode == MEMPAGE_OFFLINE_UNALLOCATED;
+			else if (!IS_ERR(block) && block->private)
+				do_offline = mempage_bo_matches_mode(tile,
+								     ttm_to_xe_bo(block->private),
+								     mode);
+		}
+
+		if (!do_offline)
+			continue;
+
+		/* Report fault */
+		ret = xe_ttm_vram_handle_addr_fault(xe, addr);
+		if (ret == -EIO &&
+		    mode == MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED) {
+			/* Try the next candidate page instead of giving up. */
+			do_offline = false;
+			continue;
+		}
+		if (ret)
+			break;
+
+		/*
+		 * Verify the faulted page is now covered by an allocated
+		 * block that has no owning BO.
+		 */
+		scoped_guard(mutex, &vram_mgr->lock) {
+			block = gpu_buddy_allocated_addr_to_block(mm, addr);
+			if (IS_ERR_OR_NULL(block) || block->private)
+				ret = -EBUSY;
+		}
+		break;
+	}
+
+	if (!do_offline)
+		drm_warn(&xe->drm, "no such object, ret:%d\n", ret);
+
+	return ret;
+}
+
+/*
+ * debugfs write handler: parses a mempage_offline_mode value from @ubuf and
+ * runs the matching fault-injection test.
+ *
+ * Modes 0-4 are handled by mempage_exec_offline(); MEMPAGE_OFFLINE_RESERVED
+ * reports a fault at the stolen-memory GPU offset. An -EIO result counts as
+ * a pass for the KERNEL_CRITICAL and RESERVED modes.
+ *
+ * Returns @size when the test passed, -EINVAL for an unknown mode, or the
+ * negative errno of the failed step.
+ */
+static ssize_t addr_fault_reporting_set(struct file *f, const char __user *ubuf,
+					size_t size, loff_t *pos)
+{
+	struct xe_device *xe = file_inode(f)->i_private;
+	u64 mode;
+	int ret;
+
+	ret = kstrtou64_from_user(ubuf, size, 0, &mode);
+	if (ret)
+		return ret;
+
+	switch (mode) {
+	case MEMPAGE_OFFLINE_UNALLOCATED:
+	case MEMPAGE_OFFLINE_USER_ALLOCATED:
+	case MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED:
+	case MEMPAGE_OFFLINE_KERNEL_USER_PPGTT_ALLOCATED:
+	case MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED:
+		ret = mempage_exec_offline(xe, mode);
+		break;
+	case MEMPAGE_OFFLINE_RESERVED:
+		/*
+		 * No local declaration here: a declaration directly after a
+		 * case label is not valid C before C23.
+		 */
+		ret = xe_ttm_vram_handle_addr_fault(xe, xe_ttm_stolen_gpu_offset(xe));
+		break;
+	default:
+		/* Reject unknown modes without recording them. */
+		drm_warn(&xe->drm, "offline mode %llu failed, ret:%d\n", mode, -EINVAL);
+		return -EINVAL;
+	}
+
+	xe->mem.vram->ttm.offline_mode = mode;
+	if (!ret || (ret == -EIO &&
+		     (mode == MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED ||
+		      mode == MEMPAGE_OFFLINE_RESERVED))) {
+		drm_info(&xe->drm, "offline mode %llu passed ret:%d\n", mode, ret);
+	} else {
+		drm_warn(&xe->drm, "offline mode %llu failed, ret:%d\n", mode, ret);
+		return ret;
+	}
+
+	return size;
+}
+
+static const struct file_operations addr_fault_reporting_fops = {
+ .owner = THIS_MODULE,
+ .read = addr_fault_reporting_show, /* prints the stored offline_mode */
+ .write = addr_fault_reporting_set, /* parses a mode and runs the offline test */
+};
+
void xe_debugfs_register(struct xe_device *xe)
{
struct ttm_device *bdev = &xe->ttm;
@@ -600,6 +760,17 @@ void xe_debugfs_register(struct xe_device *xe)
if (man)
ttm_resource_manager_create_debugfs(man, root, "stolen_mm");
+ if (xe->info.platform == XE_CRESCENTISLAND) {
+ man = ttm_manager_type(bdev, XE_PL_VRAM0);
+ if (man) {
+ char name[20];
+
+ snprintf(name, sizeof(name), "invalid_addr_vram%d", 0);
+ debugfs_create_file(name, 0600, root, xe,
+ &addr_fault_reporting_fops);
+ }
+ }
+
for_each_tile(tile, xe, tile_id)
xe_tile_debugfs_register(tile);
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
index 3ad7966798eb..07ed88b47e04 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
@@ -37,6 +37,8 @@ struct xe_ttm_vram_mgr {
struct mutex lock;
/** @mem_type: The TTM memory type */
u32 mem_type;
+ /** @offline_mode: debugfs hook for setting page offline mode */
+ u64 offline_mode;
};
/**
--
2.52.0
next prev parent reply other threads:[~2026-04-16 7:50 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-16 7:49 [RFC PATCH V7 00/10] Add memory page offlining support Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 01/10] drm/xe: Link VRAM object with gpu buddy Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 02/10] gpu/buddy: Integrate lockdep for gpu buddy manager Tejas Upadhyay
2026-04-16 8:55 ` Matthew Auld
2026-04-16 9:43 ` Upadhyay, Tejas
2026-04-16 9:56 ` Matthew Auld
2026-04-16 10:04 ` Upadhyay, Tejas
2026-04-16 10:15 ` Matthew Auld
2026-04-16 10:18 ` Upadhyay, Tejas
2026-04-16 7:49 ` [RFC PATCH V7 03/10] drm/gpu: Add gpu_buddy_allocated_addr_to_block helper Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 04/10] drm/xe: Link LRC BO and its execution Queue Tejas Upadhyay
2026-04-30 3:34 ` Matthew Brost
2026-05-04 9:11 ` Upadhyay, Tejas
2026-04-16 7:49 ` [RFC PATCH V7 05/10] drm/xe: Extend BO purge to handle vram pages as well Tejas Upadhyay
2026-04-30 3:44 ` Matthew Brost
2026-04-30 12:08 ` Upadhyay, Tejas
2026-05-05 8:15 ` Yadav, Arvind
2026-04-16 7:49 ` [RFC PATCH V7 06/10] drm/xe: Handle physical memory address error Tejas Upadhyay
2026-04-16 7:49 ` Tejas Upadhyay [this message]
2026-04-16 7:49 ` [RFC PATCH V7 08/10] gpu/buddy: Add routine to dump allocated buddy blocks Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 09/10] drm/xe/configfs: Add vram bad page reservation policy Tejas Upadhyay
2026-04-16 7:49 ` [RFC PATCH V7 10/10] drm/xe/cri: Add sysfs interface for bad gpu vram pages Tejas Upadhyay
2026-04-30 13:53 ` Matthew Auld
2026-05-04 9:02 ` Upadhyay, Tejas
2026-05-05 8:44 ` Matthew Auld
2026-05-06 5:18 ` Upadhyay, Tejas
2026-04-16 7:56 ` ✗ CI.checkpatch: warning for Add memory page offlining support (rev8) Patchwork
2026-04-16 7:57 ` ✗ CI.KUnit: failure " Patchwork
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260416074958.3722666-19-tejas.upadhyay@intel.com \
--to=tejas.upadhyay@intel.com \
--cc=himal.prasad.ghimiray@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=matthew.auld@intel.com \
--cc=matthew.brost@intel.com \
--cc=thomas.hellstrom@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox