[RFC PATCH V7 07/10] drm/xe/cri: Add debugfs to inject faulty vram address

public inbox for intel-xe@lists.freedesktop.org
 help / color / mirror / Atom feed

From: Tejas Upadhyay <tejas.upadhyay@intel.com>
To: intel-xe@lists.freedesktop.org
Cc: matthew.auld@intel.com, matthew.brost@intel.com,
	thomas.hellstrom@linux.intel.com,
	himal.prasad.ghimiray@intel.com,
	Tejas Upadhyay <tejas.upadhyay@intel.com>
Subject: [RFC PATCH V7 07/10] drm/xe/cri: Add debugfs to inject faulty vram address
Date: Thu, 16 Apr 2026 13:19:56 +0530	[thread overview]
Message-ID: <20260416074958.3722666-19-tejas.upadhyay@intel.com> (raw)
In-Reply-To: <20260416074958.3722666-12-tejas.upadhyay@intel.com>

Add a debugfs entry that helps test the CRI memory page offline feature
through manual error injection, before the feature is fully functional.
The entry is created at
/sys/kernel/debug/dri/bdf/invalid_addr_vram0.
Writing a test mode to it makes the driver pick a VRAM address of the
matching type and report that address as faulty; reading it returns the
last mode written.

For example,
echo 0 > /sys/kernel/debug/dri/bdf/invalid_addr_vram0
where 0 is one of the address types below to be tested:
enum mempage_offline_mode {
        MEMPAGE_OFFLINE_UNALLOCATED = 0,
        MEMPAGE_OFFLINE_USER_ALLOCATED = 1,
        MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED = 2,
        MEMPAGE_OFFLINE_KERNEL_USER_PPGTT_ALLOCATED = 3,
        MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED = 4,
        MEMPAGE_OFFLINE_RESERVED = 5
};

v4:
- Use scoped_guard around lock, adapt bo->q and enhance warn messages
- %s/gpu_buddy_addr_to_block/gpu_buddy_allocated_addr_to_block
v3:
- Add more specific noncritical bo tests
v2:
- Add mode based automated test vs manual address feed

Signed-off-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
---
 drivers/gpu/drm/xe/xe_debugfs.c            | 171 +++++++++++++++++++++
 drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h |   2 +
 2 files changed, 173 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index c9d4484821af..ce899aa363b1 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -14,6 +14,7 @@
 #include "regs/xe_pmt.h"
 #include "xe_bo.h"
 #include "xe_device.h"
+#include "xe_exec_queue_types.h"
 #include "xe_force_wake.h"
 #include "xe_gt.h"
 #include "xe_gt_debugfs.h"
@@ -21,6 +22,7 @@
 #include "xe_guc_ads.h"
 #include "xe_hw_engine.h"
 #include "xe_mmio.h"
+#include "xe_migrate.h"
 #include "xe_pm.h"
 #include "xe_psmi.h"
 #include "xe_pxp_debugfs.h"
@@ -29,6 +31,8 @@
 #include "xe_sriov_vf.h"
 #include "xe_step.h"
 #include "xe_tile_debugfs.h"
+#include "xe_ttm_stolen_mgr.h"
+#include "xe_ttm_vram_mgr.h"
 #include "xe_vsec.h"
 #include "xe_wa.h"
 
@@ -40,6 +44,14 @@
 
 DECLARE_FAULT_ATTR(gt_reset_failure);
 DECLARE_FAULT_ATTR(inject_csc_hw_error);
+enum mempage_offline_mode {	/* test modes written to the debugfs file */
+	MEMPAGE_OFFLINE_UNALLOCATED = 0,	/* address with no allocated buddy block */
+	MEMPAGE_OFFLINE_USER_ALLOCATED = 1,	/* device-type BO with USER and VRAM flags */
+	MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED = 2,	/* BO with a non-migrate exec queue */
+	MEMPAGE_OFFLINE_KERNEL_USER_PPGTT_ALLOCATED = 3,	/* kernel page-table BO */
+	MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED = 4,	/* kernel BO without FORCE_USER_VRAM */
+	MEMPAGE_OFFLINE_RESERVED = 5,	/* address from xe_ttm_stolen_gpu_offset() */
+};
 
 static void read_residency_counter(struct xe_device *xe, struct xe_mmio *mmio,
 				   u32 offset, const char *name, struct drm_printer *p)
@@ -544,6 +556,154 @@ static const struct file_operations disable_late_binding_fops = {
 	.write = disable_late_binding_set,
 };
 
+/* Read handler: print the offline mode last stored by the write handler (a u64). */
+static ssize_t addr_fault_reporting_show(struct file *f, char __user *ubuf,
+					 size_t size, loff_t *pos)
+{
+	struct xe_device *xe = file_inode(f)->i_private;
+	char buf[32];
+	int len;
+
+	len = scnprintf(buf, sizeof(buf), "%llu\n", xe->mem.vram->ttm.offline_mode);
+	return simple_read_from_buffer(ubuf, size, pos, buf, len);
+}
+
+/* True when the BO behind @tbo is the kind of object that test @mode targets. */
+static bool mempage_bo_matches(struct xe_tile *tile, struct ttm_buffer_object *tbo, u64 mode)
+{
+	struct xe_bo *bo = ttm_to_xe_bo(tbo);
+	struct xe_vm *migrate_vm;
+	bool match;
+
+	switch (mode) {
+	case MEMPAGE_OFFLINE_USER_ALLOCATED:
+		return bo->ttm.type == ttm_bo_type_device &&
+		       (bo->flags & XE_BO_FLAG_USER) &&
+		       (bo->flags & XE_BO_FLAG_VRAM_MASK);
+	case MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED:
+		/* lrc: BO linked to an exec queue whose VM is not the migrate VM */
+		if (!bo->q)
+			return false;
+		migrate_vm = xe_migrate_get_vm(tile->migrate);
+		match = migrate_vm != bo->q->vm;
+		xe_vm_put(migrate_vm);
+		return match;
+	case MEMPAGE_OFFLINE_KERNEL_USER_PPGTT_ALLOCATED:
+		/* ppgtt */
+		return bo->ttm.type == ttm_bo_type_kernel &&
+		       (bo->flags & XE_BO_FLAG_FORCE_USER_VRAM) &&
+		       (bo->flags & XE_BO_FLAG_PAGETABLE);
+	case MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED:
+		return bo->ttm.type == ttm_bo_type_kernel &&
+		       !(bo->flags & XE_BO_FLAG_FORCE_USER_VRAM);
+	default:
+		return false;
+	}
+}
+
+/*
+ * Walk root-tile VRAM in 4K steps and report the first address matching @mode
+ * as faulty. Returns the xe_ttm_vram_handle_addr_fault() result, or -EBUSY if
+ * the address is not in an allocated block without a BO afterwards. Finding
+ * no matching address only logs a warning.
+ */
+static int mempage_exec_offline(struct xe_device *xe, u64 mode)
+{
+	struct xe_tile *tile = xe_device_get_root_tile(xe);
+	struct xe_vram_region *vr = tile->mem.vram;
+	struct xe_ttm_vram_mgr *vram_mgr = &vr->ttm;
+	struct gpu_buddy *mm = &vram_mgr->mm;
+	u64 end = vr->dpa_base + vr->actual_physical_size;	/* exclusive */
+	struct gpu_buddy_block *block;
+	bool do_offline = false;
+	int ret = 0;
+	u64 addr;
+
+	for (addr = vr->dpa_base; addr < end; addr += SZ_4K) {
+		do_offline = false;
+		/* scoped_guard() is a loop; a 'continue' inside only leaves the guard */
+		scoped_guard(mutex, &vram_mgr->lock) {
+			block = gpu_buddy_allocated_addr_to_block(mm, addr);
+			if (!block)
+				do_offline = (mode == MEMPAGE_OFFLINE_UNALLOCATED);
+			else if (!IS_ERR(block) && block->private)
+				do_offline = mempage_bo_matches(tile, block->private, mode);
+		}
+		if (!do_offline)
+			continue;
+
+		/* Report fault */
+		ret = xe_ttm_vram_handle_addr_fault(xe, addr);
+		if (ret == -EIO && mode == MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED) {
+			/* -EIO in this mode: skip this page and keep searching */
+			do_offline = false;
+			continue;
+		}
+		if (!ret) {
+			/* Verify addr is now in an allocated block with no BO attached */
+			scoped_guard(mutex, &vram_mgr->lock) {
+				block = gpu_buddy_allocated_addr_to_block(mm, addr);
+				if (IS_ERR_OR_NULL(block) || block->private)
+					ret = -EBUSY;
+			}
+		}
+		break;
+	}
+	if (!do_offline)
+		drm_warn(&xe->drm, "no such object, ret:%d\n", ret);
+
+	return ret;
+}
+
+/* Write handler: parse an offline mode and run the matching fault-injection test. */
+static ssize_t addr_fault_reporting_set(struct file *f, const char __user *ubuf,
+					size_t size, loff_t *pos)
+{
+	struct xe_device *xe = file_inode(f)->i_private;
+	int ret = 0;
+	u64 mode;
+
+	ret = kstrtou64_from_user(ubuf, size, 0, &mode);
+	if (ret)
+		return ret;
+
+	switch (mode) {
+	case MEMPAGE_OFFLINE_UNALLOCATED:
+	case MEMPAGE_OFFLINE_USER_ALLOCATED:
+	case MEMPAGE_OFFLINE_KERNEL_USER_GGTT_ALLOCATED:
+	case MEMPAGE_OFFLINE_KERNEL_USER_PPGTT_ALLOCATED:
+	case MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED:
+		ret = mempage_exec_offline(xe, mode);
+		break;
+	case MEMPAGE_OFFLINE_RESERVED:
+		/* Inject at the offset returned by xe_ttm_stolen_gpu_offset() */
+		ret = xe_ttm_vram_handle_addr_fault(xe, xe_ttm_stolen_gpu_offset(xe));
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	xe->mem.vram->ttm.offline_mode = mode;
+	/* -EIO from the critical and reserved modes is logged as a pass */
+	if (!ret || (ret == -EIO &&
+		     (mode == MEMPAGE_OFFLINE_KERNEL_CRITICAL_ALLOCATED ||
+		      mode == MEMPAGE_OFFLINE_RESERVED))) {
+		drm_info(&xe->drm, "offline mode %llu passed ret:%d\n", mode, ret);
+	} else {
+		drm_warn(&xe->drm, "offline mode %llu failed, ret:%d\n", mode, ret);
+		return ret;
+	}
+
+	return size;
+}
+
+static const struct file_operations addr_fault_reporting_fops = {
+	.owner = THIS_MODULE,
+	.read = addr_fault_reporting_show,	/* reports the last written mode */
+	.write = addr_fault_reporting_set,	/* runs the requested offline test */
+};
+
 void xe_debugfs_register(struct xe_device *xe)
 {
 	struct ttm_device *bdev = &xe->ttm;
@@ -600,6 +760,17 @@ void xe_debugfs_register(struct xe_device *xe)
 	if (man)
 		ttm_resource_manager_create_debugfs(man, root, "stolen_mm");
 
+	if (xe->info.platform == XE_CRESCENTISLAND) {
+		man = ttm_manager_type(bdev, XE_PL_VRAM0);
+		if (man) {
+			char name[20];
+
+			snprintf(name, sizeof(name), "invalid_addr_vram%d", 0);
+			debugfs_create_file(name, 0600, root, xe,
+					    &addr_fault_reporting_fops);
+		}
+	}
+
 	for_each_tile(tile, xe, tile_id)
 		xe_tile_debugfs_register(tile);
 
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
index 3ad7966798eb..07ed88b47e04 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h
@@ -37,6 +37,8 @@ struct xe_ttm_vram_mgr {
 	struct mutex lock;
 	/** @mem_type: The TTM memory type */
 	u32 mem_type;
+	/** @offline_mode: debugfs hook for setting page offline mode */
+	u64 offline_mode;
 };
 
 /**
-- 
2.52.0

next prev parent reply	other threads:[~2026-04-16  7:50 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-16  7:49 [RFC PATCH V7 00/10] Add memory page offlining support Tejas Upadhyay
2026-04-16  7:49 ` [RFC PATCH V7 01/10] drm/xe: Link VRAM object with gpu buddy Tejas Upadhyay
2026-04-16  7:49 ` [RFC PATCH V7 02/10] gpu/buddy: Integrate lockdep for gpu buddy manager Tejas Upadhyay
2026-04-16  8:55   ` Matthew Auld
2026-04-16  9:43     ` Upadhyay, Tejas
2026-04-16  9:56       ` Matthew Auld
2026-04-16 10:04         ` Upadhyay, Tejas
2026-04-16 10:15           ` Matthew Auld
2026-04-16 10:18             ` Upadhyay, Tejas
2026-04-16  7:49 ` [RFC PATCH V7 03/10] drm/gpu: Add gpu_buddy_allocated_addr_to_block helper Tejas Upadhyay
2026-04-16  7:49 ` [RFC PATCH V7 04/10] drm/xe: Link LRC BO and its execution Queue Tejas Upadhyay
2026-04-30  3:34   ` Matthew Brost
2026-05-04  9:11     ` Upadhyay, Tejas
2026-04-16  7:49 ` [RFC PATCH V7 05/10] drm/xe: Extend BO purge to handle vram pages as well Tejas Upadhyay
2026-04-30  3:44   ` Matthew Brost
2026-04-30 12:08     ` Upadhyay, Tejas
2026-05-05  8:15     ` Yadav, Arvind
2026-04-16  7:49 ` [RFC PATCH V7 06/10] drm/xe: Handle physical memory address error Tejas Upadhyay
2026-04-16  7:49 ` Tejas Upadhyay [this message]
2026-04-16  7:49 ` [RFC PATCH V7 08/10] gpu/buddy: Add routine to dump allocated buddy blocks Tejas Upadhyay
2026-04-16  7:49 ` [RFC PATCH V7 09/10] drm/xe/configfs: Add vram bad page reservation policy Tejas Upadhyay
2026-04-16  7:49 ` [RFC PATCH V7 10/10] drm/xe/cri: Add sysfs interface for bad gpu vram pages Tejas Upadhyay
2026-04-30 13:53   ` Matthew Auld
2026-05-04  9:02     ` Upadhyay, Tejas
2026-05-05  8:44       ` Matthew Auld
2026-05-06  5:18         ` Upadhyay, Tejas
2026-04-16  7:56 ` ✗ CI.checkpatch: warning for Add memory page offlining support (rev8) Patchwork
2026-04-16  7:57 ` ✗ CI.KUnit: failure " Patchwork

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c9d4484821a dfblob:ce899aa363b dfblob:3ad7966798e
dfblob:07ed88b47e0 )
 OR (
bs:"[RFC PATCH V7 07/10] drm/xe/cri: Add debugfs to inject faulty vram address" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260416074958.3722666-19-tejas.upadhyay@intel.com \
    --to=tejas.upadhyay@intel.com \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=matthew.auld@intel.com \
    --cc=matthew.brost@intel.com \
    --cc=thomas.hellstrom@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox