* [PATCH 0/2] Add configfs support for survivability mode
@ 2025-03-07 14:24 Riana Tauro
2025-03-07 14:24 ` [PATCH 1/2] RFC drm/xe: Add configfs to enable " Riana Tauro
` (4 more replies)
0 siblings, 5 replies; 23+ messages in thread
From: Riana Tauro @ 2025-03-07 14:24 UTC (permalink / raw)
To: intel-xe
Cc: riana.tauro, anshuman.gupta, rodrigo.vivi, matthew.d.roper,
lucas.demarchi
This RFC series proposes to expose attributes via xe configfs
subsystem. survivability mode attribute is added here that allows
the user to configure the device to enter survivability mode.
This is done by
echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
This is an alternative to introducing module param that causes
all the connected and supported cards to enter survivability mode.
Manually entering survivability mode is useful when pcode does not
report failure and for validation
Riana Tauro (2):
drm/xe: Add configfs to enable survivability mode
drm/xe: Enable configfs support for survivability mode
drivers/gpu/drm/xe/Makefile | 1 +
drivers/gpu/drm/xe/xe_configfs.c | 117 +++++++++++++++++++++
drivers/gpu/drm/xe/xe_configfs.h | 16 +++
drivers/gpu/drm/xe/xe_module.c | 5 +
drivers/gpu/drm/xe/xe_module.h | 1 +
drivers/gpu/drm/xe/xe_pci.c | 8 +-
drivers/gpu/drm/xe/xe_survivability_mode.c | 21 +++-
7 files changed, 163 insertions(+), 6 deletions(-)
create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
--
2.47.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-07 14:24 [PATCH 0/2] Add configfs support for survivability mode Riana Tauro
@ 2025-03-07 14:24 ` Riana Tauro
2025-03-07 14:45 ` Rodrigo Vivi
` (2 more replies)
2025-03-07 14:24 ` [PATCH 2/2] RFC drm/xe: Enable configfs support for " Riana Tauro
` (3 subsequent siblings)
4 siblings, 3 replies; 23+ messages in thread
From: Riana Tauro @ 2025-03-07 14:24 UTC (permalink / raw)
To: intel-xe
Cc: riana.tauro, anshuman.gupta, rodrigo.vivi, matthew.d.roper,
lucas.demarchi
Registers a configfs subsystem called 'xe' to userspace. The user can
use this to modify exposed attributes.
Add survivability mode attribute (config/xe/survivability_mode) to the
subsystem to allow the user to specify the card that should enter
survivability mode.
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
---
drivers/gpu/drm/xe/Makefile | 1 +
drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
drivers/gpu/drm/xe/xe_module.c | 5 ++
drivers/gpu/drm/xe/xe_module.h | 1 +
5 files changed, 114 insertions(+)
create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 9699b08585f7..3f8c87292cee 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
xe-y += xe_bb.o \
xe_bo.o \
xe_bo_evict.o \
+ xe_configfs.o \
xe_devcoredump.o \
xe_device.o \
xe_device_sysfs.o \
diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
new file mode 100644
index 000000000000..8c5f248e466d
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_configfs.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <linux/configfs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include "xe_configfs.h"
+#include "xe_module.h"
+
+/**
+ * DOC: Xe Configfs
+ *
+ * XE KMD registers a configfs subsystem called 'xe'to userspace that allows users to modify
+ * the exposed attributes.
+ *
+ * Attributes:
+ *
+ * config/xe/survivability_mode : Write only attribute that allows user to specify the PCI address
+ * of the card that has to enter survivability mode
+ */
+
+void xe_configfs_clear_survivability_mode(void)
+{
+ kfree(xe_modparam.survivability_mode);
+ xe_modparam.survivability_mode = NULL;
+}
+
+static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
+{
+ char *survivability_mode;
+ int ret;
+ unsigned int domain, bus, slot, function;
+
+ ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot, &function);
+ if (ret != 4)
+ return -EINVAL;
+
+ survivability_mode = kstrdup(page, GFP_KERNEL);
+ if (!survivability_mode)
+ return -ENOMEM;
+
+ xe_configfs_clear_survivability_mode();
+ xe_modparam.survivability_mode = survivability_mode;
+
+ return len;
+}
+
+CONFIGFS_ATTR_WO(, survivability_mode);
+
+static struct configfs_attribute *xe_configfs_attrs[] = {
+ &attr_survivability_mode,
+ NULL,
+};
+
+static const struct config_item_type xe_config_type = {
+ .ct_attrs = xe_configfs_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem xe_config_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "xe",
+ .ci_type = &xe_config_type,
+ },
+ },
+};
+
+int __init xe_configfs_init(void)
+{
+ int ret;
+
+ config_group_init(&xe_config_subsys.su_group);
+ mutex_init(&xe_config_subsys.su_mutex);
+ ret = configfs_register_subsystem(&xe_config_subsys);
+ if (ret) {
+ pr_err("Error %d while registering subsystem %s\n",
+ ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
+ mutex_destroy(&xe_config_subsys.su_mutex);
+ return ret;
+ }
+
+ return 0;
+}
+
+void __exit xe_configfs_exit(void)
+{
+ xe_configfs_clear_survivability_mode();
+ configfs_unregister_subsystem(&xe_config_subsys);
+ mutex_destroy(&xe_config_subsys.su_mutex);
+}
+
diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
new file mode 100644
index 000000000000..491629a2ca53
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_configfs.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+#ifndef _XE_CONFIGFS_H_
+#define _XE_CONFIGFS_H_
+
+int xe_configfs_init(void);
+void xe_configfs_exit(void);
+void xe_configfs_clear_survivability_mode(void);
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
index 475acdba2b55..15b3cf22193c 100644
--- a/drivers/gpu/drm/xe/xe_module.c
+++ b/drivers/gpu/drm/xe/xe_module.c
@@ -11,6 +11,7 @@
#include <drm/drm_module.h>
#include "xe_drv.h"
+#include "xe_configfs.h"
#include "xe_hw_fence.h"
#include "xe_pci.h"
#include "xe_pm.h"
@@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
{
.init = xe_check_nomodeset,
},
+ {
+ .init = xe_configfs_init,
+ .exit = xe_configfs_exit,
+ },
{
.init = xe_hw_fence_module_init,
.exit = xe_hw_fence_module_exit,
diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
index 84339e509c80..c238dbee6bc7 100644
--- a/drivers/gpu/drm/xe/xe_module.h
+++ b/drivers/gpu/drm/xe/xe_module.h
@@ -24,6 +24,7 @@ struct xe_modparam {
#endif
int wedged_mode;
u32 svm_notifier_size;
+ char *survivability_mode;
};
extern struct xe_modparam xe_modparam;
--
2.47.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* [PATCH 2/2] RFC drm/xe: Enable configfs support for survivability mode
2025-03-07 14:24 [PATCH 0/2] Add configfs support for survivability mode Riana Tauro
2025-03-07 14:24 ` [PATCH 1/2] RFC drm/xe: Add configfs to enable " Riana Tauro
@ 2025-03-07 14:24 ` Riana Tauro
2025-03-07 15:01 ` Rodrigo Vivi
2025-03-07 14:36 ` ✓ CI.Patch_applied: success for Add " Patchwork
` (2 subsequent siblings)
4 siblings, 1 reply; 23+ messages in thread
From: Riana Tauro @ 2025-03-07 14:24 UTC (permalink / raw)
To: intel-xe
Cc: riana.tauro, anshuman.gupta, rodrigo.vivi, matthew.d.roper,
lucas.demarchi
Register a configfs subsystem called 'xe' to userspace that allows
users to modify the exposed attributes. Expose survivability mode as
an attribute that can be modified manually. This is useful if
pcode fails to detect survivability mode and for validation
To enable survivability mode for a card,
echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
---
drivers/gpu/drm/xe/xe_configfs.c | 22 ++++++++++++++++++++++
drivers/gpu/drm/xe/xe_configfs.h | 6 +++++-
drivers/gpu/drm/xe/xe_pci.c | 8 ++++----
drivers/gpu/drm/xe/xe_survivability_mode.c | 21 +++++++++++++++++++--
4 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
index 8c5f248e466d..ce9f3757100f 100644
--- a/drivers/gpu/drm/xe/xe_configfs.c
+++ b/drivers/gpu/drm/xe/xe_configfs.c
@@ -6,6 +6,7 @@
#include <linux/configfs.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/pci.h>
#include "xe_configfs.h"
#include "xe_module.h"
@@ -28,6 +29,27 @@ void xe_configfs_clear_survivability_mode(void)
xe_modparam.survivability_mode = NULL;
}
+bool xe_configfs_survivability_mode_enabled(struct pci_dev *pdev)
+{
+ unsigned int domain, bus, slot, function;
+ int ret = 0;
+
+ if (!xe_modparam.survivability_mode)
+ goto err;
+
+ ret = sscanf(xe_modparam.survivability_mode, "%04x:%02x:%02x.%x", &domain, &bus,
+ &slot, &function);
+ if (ret != 4)
+ goto err;
+
+ if ((pci_domain_nr(pdev->bus) == domain) && (pdev->bus->number == bus) &&
+ (PCI_SLOT(pdev->devfn) == slot) && (PCI_FUNC(pdev->devfn) == function))
+ return true;
+
+err:
+ return false;
+}
+
static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
{
char *survivability_mode;
diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
index 491629a2ca53..b03f4c7d0f54 100644
--- a/drivers/gpu/drm/xe/xe_configfs.h
+++ b/drivers/gpu/drm/xe/xe_configfs.h
@@ -5,8 +5,12 @@
#ifndef _XE_CONFIGFS_H_
#define _XE_CONFIGFS_H_
+#include <linux/types.h>
+
+struct pci_dev;
+
int xe_configfs_init(void);
void xe_configfs_exit(void);
void xe_configfs_clear_survivability_mode(void);
-
+bool xe_configfs_survivability_mode_enabled(struct pci_dev *pdev);
#endif
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 4d982a5a4ffd..d0f66cc08aa5 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -17,6 +17,7 @@
#include "display/xe_display.h"
#include "regs/xe_gt_regs.h"
+#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_drv.h"
#include "xe_gt.h"
@@ -815,10 +816,9 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
* mei. If early probe fails, check if survivability mode is flagged by
* HW to be enabled. In that case enable it and return success.
*/
- if (err) {
- if (xe_survivability_mode_required(xe) &&
- xe_survivability_mode_enable(xe))
- return 0;
+ if (xe_configfs_survivability_mode_enabled(pdev) || err) {
+ if (xe_survivability_mode_required(xe))
+ return xe_survivability_mode_enable(xe);
return err;
}
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
index d939ce70e6fa..5b60cbf8f7cb 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -10,6 +10,7 @@
#include <linux/pci.h>
#include <linux/sysfs.h>
+#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_gt.h"
#include "xe_heci_gsc.h"
@@ -28,8 +29,10 @@
* This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
* to be flashed through mei and collect telemetry. The driver's probe flow is modified
* such that it enters survivability mode when pcode initialization is incomplete and boot status
- * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating
- * survivability mode and provides additional information required for debug
+ * denotes a failure. Survivability mode can also be enabled manually by writing the pci address of
+ * the card to the xe configfs attribute. This is useful in cases where pcode does not detect
+ * failure and for validation. The driver then populates the survivability_mode PCI sysfs
+ * indicating survivability mode and provides additional information required for debug
*
* KMD exposes below admin-only readable sysfs in survivability mode
*
@@ -42,6 +45,15 @@
* Overflow Information - Provides history of previous failures
* Auxiliary Information - Certain failures may have information in
* addition to postcode information
+ * Enable survivability mode through configfs
+ *
+ * survivability mode is exposed as an attribute under the xe configfs subsystem. User can specify
+ * the card that needs to enter survivability mode.
+ *
+ * echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
+ * echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
+ * echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
+ *
*/
static u32 aux_history_offset(u32 reg_value)
@@ -133,6 +145,7 @@ static void xe_survivability_mode_fini(void *arg)
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
struct device *dev = &pdev->dev;
+ xe_configfs_clear_survivability_mode();
sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
}
@@ -190,11 +203,15 @@ bool xe_survivability_mode_required(struct xe_device *xe)
{
struct xe_survivability *survivability = &xe->survivability;
struct xe_mmio *mmio = xe_root_tile_mmio(xe);
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
u32 data;
if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
return false;
+ if (xe_configfs_survivability_mode_enabled(pdev))
+ return true;
+
data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
--
2.47.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* ✓ CI.Patch_applied: success for Add configfs support for survivability mode
2025-03-07 14:24 [PATCH 0/2] Add configfs support for survivability mode Riana Tauro
2025-03-07 14:24 ` [PATCH 1/2] RFC drm/xe: Add configfs to enable " Riana Tauro
2025-03-07 14:24 ` [PATCH 2/2] RFC drm/xe: Enable configfs support for " Riana Tauro
@ 2025-03-07 14:36 ` Patchwork
2025-03-07 14:36 ` ✗ CI.checkpatch: warning " Patchwork
2025-03-07 14:37 ` ✗ CI.KUnit: failure " Patchwork
4 siblings, 0 replies; 23+ messages in thread
From: Patchwork @ 2025-03-07 14:36 UTC (permalink / raw)
To: Riana Tauro; +Cc: intel-xe
== Series Details ==
Series: Add configfs support for survivability mode
URL : https://patchwork.freedesktop.org/series/145999/
State : success
== Summary ==
=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: 066c46f28642 drm-tip: 2025y-03m-07d-12h-35m-40s UTC integration manifest
=== git am output follows ===
.git/rebase-apply/patch:123: new blank line at EOF.
+
warning: 1 line adds whitespace errors.
Applying: RFC drm/xe: Add configfs to enable survivability mode
Applying: RFC drm/xe: Enable configfs support for survivability mode
^ permalink raw reply [flat|nested] 23+ messages in thread
* ✗ CI.checkpatch: warning for Add configfs support for survivability mode
2025-03-07 14:24 [PATCH 0/2] Add configfs support for survivability mode Riana Tauro
` (2 preceding siblings ...)
2025-03-07 14:36 ` ✓ CI.Patch_applied: success for Add " Patchwork
@ 2025-03-07 14:36 ` Patchwork
2025-03-07 14:37 ` ✗ CI.KUnit: failure " Patchwork
4 siblings, 0 replies; 23+ messages in thread
From: Patchwork @ 2025-03-07 14:36 UTC (permalink / raw)
To: Riana Tauro; +Cc: intel-xe
== Series Details ==
Series: Add configfs support for survivability mode
URL : https://patchwork.freedesktop.org/series/145999/
State : warning
== Summary ==
+ KERNEL=/kernel
+ git clone https://gitlab.freedesktop.org/drm/maintainer-tools mt
Cloning into 'mt'...
warning: redirecting to https://gitlab.freedesktop.org/drm/maintainer-tools.git/
+ git -C mt rev-list -n1 origin/master
cbb4e4a079d89106c2736adc3c7de6f9dc56da07
+ cd /kernel
+ git config --global --add safe.directory /kernel
+ git log -n1
commit 48de3888a0a153a62e1851dee907b3217f00905a
Author: Riana Tauro <riana.tauro@intel.com>
Date: Fri Mar 7 19:54:45 2025 +0530
RFC drm/xe: Enable configfs support for survivability mode
Register a configfs subsystem called 'xe' to userspace that allows
users to modify the exposed attributes. Expose survivability mode as
an attribute that can be modified manually. This is useful if
pcode fails to detect survivability mode and for validation
To enable survivability mode for a card,
echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
+ /mt/dim checkpatch 066c46f28642479869d2d1dd27cc6ae476de4abc drm-intel
1dde0ff4eddd RFC drm/xe: Add configfs to enable survivability mode
-:28: WARNING:FILE_PATH_CHANGES: added, moved or deleted file(s), does MAINTAINERS need updating?
#28:
new file mode 100644
total: 0 errors, 1 warnings, 0 checks, 138 lines checked
48de3888a0a1 RFC drm/xe: Enable configfs support for survivability mode
-:48: CHECK:UNNECESSARY_PARENTHESES: Unnecessary parentheses around 'pdev->bus->number == bus'
#48: FILE: drivers/gpu/drm/xe/xe_configfs.c:45:
+ if ((pci_domain_nr(pdev->bus) == domain) && (pdev->bus->number == bus) &&
+ (PCI_SLOT(pdev->devfn) == slot) && (PCI_FUNC(pdev->devfn) == function))
total: 0 errors, 0 warnings, 1 checks, 123 lines checked
^ permalink raw reply [flat|nested] 23+ messages in thread
* ✗ CI.KUnit: failure for Add configfs support for survivability mode
2025-03-07 14:24 [PATCH 0/2] Add configfs support for survivability mode Riana Tauro
` (3 preceding siblings ...)
2025-03-07 14:36 ` ✗ CI.checkpatch: warning " Patchwork
@ 2025-03-07 14:37 ` Patchwork
4 siblings, 0 replies; 23+ messages in thread
From: Patchwork @ 2025-03-07 14:37 UTC (permalink / raw)
To: Riana Tauro; +Cc: intel-xe
== Series Details ==
Series: Add configfs support for survivability mode
URL : https://patchwork.freedesktop.org/series/145999/
State : failure
== Summary ==
+ trap cleanup EXIT
+ /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
ERROR:root:../lib/iomap.c:156:5: warning: no previous prototype for ‘ioread64_lo_hi’ [-Wmissing-prototypes]
156 | u64 ioread64_lo_hi(const void __iomem *addr)
| ^~~~~~~~~~~~~~
../lib/iomap.c:163:5: warning: no previous prototype for ‘ioread64_hi_lo’ [-Wmissing-prototypes]
163 | u64 ioread64_hi_lo(const void __iomem *addr)
| ^~~~~~~~~~~~~~
../lib/iomap.c:170:5: warning: no previous prototype for ‘ioread64be_lo_hi’ [-Wmissing-prototypes]
170 | u64 ioread64be_lo_hi(const void __iomem *addr)
| ^~~~~~~~~~~~~~~~
../lib/iomap.c:178:5: warning: no previous prototype for ‘ioread64be_hi_lo’ [-Wmissing-prototypes]
178 | u64 ioread64be_hi_lo(const void __iomem *addr)
| ^~~~~~~~~~~~~~~~
../lib/iomap.c:264:6: warning: no previous prototype for ‘iowrite64_lo_hi’ [-Wmissing-prototypes]
264 | void iowrite64_lo_hi(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~
../lib/iomap.c:272:6: warning: no previous prototype for ‘iowrite64_hi_lo’ [-Wmissing-prototypes]
272 | void iowrite64_hi_lo(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~
../lib/iomap.c:280:6: warning: no previous prototype for ‘iowrite64be_lo_hi’ [-Wmissing-prototypes]
280 | void iowrite64be_lo_hi(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~~~
../lib/iomap.c:288:6: warning: no previous prototype for ‘iowrite64be_hi_lo’ [-Wmissing-prototypes]
288 | void iowrite64be_hi_lo(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~~~
/usr/bin/ld: drivers/gpu/drm/xe/xe_configfs.o: in function `xe_configfs_init':
xe_configfs.c:(.init.text+0x6): undefined reference to `config_group_init'
/usr/bin/ld: xe_configfs.c:(.init.text+0x4f): undefined reference to `configfs_register_subsystem'
/usr/bin/ld: drivers/gpu/drm/xe/xe_configfs.o: in function `xe_configfs_exit':
xe_configfs.c:(.exit.text+0x31): undefined reference to `configfs_unregister_subsystem'
collect2: error: ld returned 1 exit status
make[3]: *** [../scripts/Makefile.vmlinux:77: vmlinux] Error 1
make[2]: *** [/kernel/Makefile:1226: vmlinux] Error 2
make[1]: *** [/kernel/Makefile:251: __sub-make] Error 2
make: *** [Makefile:251: __sub-make] Error 2
[14:36:30] Configuring KUnit Kernel ...
Generating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[14:36:34] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make all compile_commands.json ARCH=um O=.kunit --jobs=48
+ cleanup
++ stat -c %u:%g /kernel
+ chown -R 1003:1003 /kernel
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-07 14:24 ` [PATCH 1/2] RFC drm/xe: Add configfs to enable " Riana Tauro
@ 2025-03-07 14:45 ` Rodrigo Vivi
2025-03-07 15:16 ` Lucas De Marchi
2025-03-10 7:14 ` Aravind Iddamsetty
2 siblings, 0 replies; 23+ messages in thread
From: Rodrigo Vivi @ 2025-03-07 14:45 UTC (permalink / raw)
To: Riana Tauro; +Cc: intel-xe, anshuman.gupta, matthew.d.roper, lucas.demarchi
On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
> Registers a configfs subsystem called 'xe' to userspace. The user can
> use this to modify exposed attributes.
>
> Add survivability mode attribute (config/xe/survivability_mode) to the
> subsystem to allow the user to specify the card that should enter
> survivability mode.
>
> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> ---
> drivers/gpu/drm/xe/Makefile | 1 +
> drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
> drivers/gpu/drm/xe/xe_module.c | 5 ++
> drivers/gpu/drm/xe/xe_module.h | 1 +
> 5 files changed, 114 insertions(+)
> create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
> create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index 9699b08585f7..3f8c87292cee 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> xe-y += xe_bb.o \
> xe_bo.o \
> xe_bo_evict.o \
> + xe_configfs.o \
> xe_devcoredump.o \
> xe_device.o \
> xe_device_sysfs.o \
> diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
> new file mode 100644
> index 000000000000..8c5f248e466d
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_configfs.c
> @@ -0,0 +1,95 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2025 Intel Corporation
> + */
> +
> +#include <linux/configfs.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +
> +#include "xe_configfs.h"
> +#include "xe_module.h"
> +
> +/**
> + * DOC: Xe Configfs
> + *
> + * XE KMD registers a configfs subsystem called 'xe'to userspace that allows users to modify
-----> ^ missing space
> + * the exposed attributes.
> + *
> + * Attributes:
> + *
> + * config/xe/survivability_mode : Write only attribute that allows user to specify the PCI address
> + * of the card that has to enter survivability mode
> + */
> +
> +void xe_configfs_clear_survivability_mode(void)
> +{
> + kfree(xe_modparam.survivability_mode);
> + xe_modparam.survivability_mode = NULL;
let's avoid xe_modparam, but more about this below...
> +}
> +
> +static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
> +{
> + char *survivability_mode;
> + int ret;
> + unsigned int domain, bus, slot, function;
> +
> + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot, &function);
> + if (ret != 4)
> + return -EINVAL;
> +
> + survivability_mode = kstrdup(page, GFP_KERNEL);
> + if (!survivability_mode)
> + return -ENOMEM;
> +
> + xe_configfs_clear_survivability_mode();
> + xe_modparam.survivability_mode = survivability_mode;
> +
> + return len;
> +}
> +
> +CONFIGFS_ATTR_WO(, survivability_mode);
> +
> +static struct configfs_attribute *xe_configfs_attrs[] = {
> + &attr_survivability_mode,
> + NULL,
> +};
> +
> +static const struct config_item_type xe_config_type = {
> + .ct_attrs = xe_configfs_attrs,
> + .ct_owner = THIS_MODULE,
> +};
> +
> +static struct configfs_subsystem xe_config_subsys = {
> + .su_group = {
> + .cg_item = {
> + .ci_namebuf = "xe",
> + .ci_type = &xe_config_type,
> + },
> + },
> +};
> +
> +int __init xe_configfs_init(void)
> +{
> + int ret;
> +
> + config_group_init(&xe_config_subsys.su_group);
> + mutex_init(&xe_config_subsys.su_mutex);
That's strange... if we are not using the mutex here,
why do we need to initialize it ourselves?
Why isn't it done inside the config_group_init then?!
> + ret = configfs_register_subsystem(&xe_config_subsys);
> + if (ret) {
> + pr_err("Error %d while registering subsystem %s\n",
> + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
> + mutex_destroy(&xe_config_subsys.su_mutex);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +void __exit xe_configfs_exit(void)
> +{
> + xe_configfs_clear_survivability_mode();
> + configfs_unregister_subsystem(&xe_config_subsys);
> + mutex_destroy(&xe_config_subsys.su_mutex);
> +}
> +
> diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
> new file mode 100644
> index 000000000000..491629a2ca53
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_configfs.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2025 Intel Corporation
> + */
> +#ifndef _XE_CONFIGFS_H_
> +#define _XE_CONFIGFS_H_
> +
> +int xe_configfs_init(void);
> +void xe_configfs_exit(void);
> +void xe_configfs_clear_survivability_mode(void);
> +
> +#endif
> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> index 475acdba2b55..15b3cf22193c 100644
> --- a/drivers/gpu/drm/xe/xe_module.c
> +++ b/drivers/gpu/drm/xe/xe_module.c
> @@ -11,6 +11,7 @@
> #include <drm/drm_module.h>
>
> #include "xe_drv.h"
> +#include "xe_configfs.h"
> #include "xe_hw_fence.h"
> #include "xe_pci.h"
> #include "xe_pm.h"
> @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
> {
> .init = xe_check_nomodeset,
> },
> + {
> + .init = xe_configfs_init,
> + .exit = xe_configfs_exit,
> + },
> {
> .init = xe_hw_fence_module_init,
> .exit = xe_hw_fence_module_exit,
> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> index 84339e509c80..c238dbee6bc7 100644
> --- a/drivers/gpu/drm/xe/xe_module.h
> +++ b/drivers/gpu/drm/xe/xe_module.h
> @@ -24,6 +24,7 @@ struct xe_modparam {
> #endif
> int wedged_mode;
> u32 svm_notifier_size;
> + char *survivability_mode;
We should probably create another struct
struct xe_configfs {
char *survivability_mode;
}
it could be here or perhaps inside a xe_configfs_types.h
that is imported wherever it is needed.
> };
>
> extern struct xe_modparam xe_modparam;
> --
> 2.47.1
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 2/2] RFC drm/xe: Enable configfs support for survivability mode
2025-03-07 14:24 ` [PATCH 2/2] RFC drm/xe: Enable configfs support for " Riana Tauro
@ 2025-03-07 15:01 ` Rodrigo Vivi
2025-03-10 5:36 ` Riana Tauro
0 siblings, 1 reply; 23+ messages in thread
From: Rodrigo Vivi @ 2025-03-07 15:01 UTC (permalink / raw)
To: Riana Tauro; +Cc: intel-xe, anshuman.gupta, matthew.d.roper, lucas.demarchi
On Fri, Mar 07, 2025 at 07:54:45PM +0530, Riana Tauro wrote:
> Register a configfs subsystem called 'xe' to userspace that allows
> users to modify the exposed attributes. Expose survivability mode as
> an attribute that can be modified manually. This is useful if
> pcode fails to detect survivability mode and for validation
>
> To enable survivability mode for a card,
>
> echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
> echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
> echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
This is very great! Perhaps at some point we will still need
the module parameter in this specific survivability case.
But this configfs is more than needed and wanted for this and
many other cases.
>
> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> ---
> drivers/gpu/drm/xe/xe_configfs.c | 22 ++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_configfs.h | 6 +++++-
> drivers/gpu/drm/xe/xe_pci.c | 8 ++++----
> drivers/gpu/drm/xe/xe_survivability_mode.c | 21 +++++++++++++++++++--
> 4 files changed, 50 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
> index 8c5f248e466d..ce9f3757100f 100644
> --- a/drivers/gpu/drm/xe/xe_configfs.c
> +++ b/drivers/gpu/drm/xe/xe_configfs.c
> @@ -6,6 +6,7 @@
> #include <linux/configfs.h>
> #include <linux/init.h>
> #include <linux/module.h>
> +#include <linux/pci.h>
>
> #include "xe_configfs.h"
> #include "xe_module.h"
> @@ -28,6 +29,27 @@ void xe_configfs_clear_survivability_mode(void)
> xe_modparam.survivability_mode = NULL;
> }
>
> +bool xe_configfs_survivability_mode_enabled(struct pci_dev *pdev)
> +{
> + unsigned int domain, bus, slot, function;
> + int ret = 0;
> +
> + if (!xe_modparam.survivability_mode)
> + goto err;
> +
> + ret = sscanf(xe_modparam.survivability_mode, "%04x:%02x:%02x.%x", &domain, &bus,
> + &slot, &function);
> + if (ret != 4)
> + goto err;
> +
> + if ((pci_domain_nr(pdev->bus) == domain) && (pdev->bus->number == bus) &&
> + (PCI_SLOT(pdev->devfn) == slot) && (PCI_FUNC(pdev->devfn) == function))
> + return true;
> +
> +err:
if nothing else is here, please just return false everywhere instead of the goto
> + return false;
> +}
> +
> static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
> {
> char *survivability_mode;
> diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
> index 491629a2ca53..b03f4c7d0f54 100644
> --- a/drivers/gpu/drm/xe/xe_configfs.h
> +++ b/drivers/gpu/drm/xe/xe_configfs.h
> @@ -5,8 +5,12 @@
> #ifndef _XE_CONFIGFS_H_
> #define _XE_CONFIGFS_H_
>
> +#include <linux/types.h>
> +
> +struct pci_dev;
> +
> int xe_configfs_init(void);
> void xe_configfs_exit(void);
> void xe_configfs_clear_survivability_mode(void);
> -
> +bool xe_configfs_survivability_mode_enabled(struct pci_dev *pdev);
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
> index 4d982a5a4ffd..d0f66cc08aa5 100644
> --- a/drivers/gpu/drm/xe/xe_pci.c
> +++ b/drivers/gpu/drm/xe/xe_pci.c
> @@ -17,6 +17,7 @@
>
> #include "display/xe_display.h"
> #include "regs/xe_gt_regs.h"
> +#include "xe_configfs.h"
> #include "xe_device.h"
> #include "xe_drv.h"
> #include "xe_gt.h"
> @@ -815,10 +816,9 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> * mei. If early probe fails, check if survivability mode is flagged by
> * HW to be enabled. In that case enable it and return success.
> */
> - if (err) {
> - if (xe_survivability_mode_required(xe) &&
> - xe_survivability_mode_enable(xe))
> - return 0;
> + if (xe_configfs_survivability_mode_enabled(pdev) || err) {
no strong feelings here, but:
I believe it is likely better to keep the current code and add
the 2 new lines, instead of mixing err and new condition.
if (err) {
if (xe_survivability_mode_required(xe) &&
xe_survivability_mode_enable(xe))
return 0;
}
if (xe_configfs_survivability_mode_enabled(pdev) || err)
return 0;
or perhaps encapsulate all of that in a static function and then here
just:
if(check_for_survivability(err))
return 0;
the function can be better to extend to the module parameter
to be used in boot if/when needed.
but, really up to you... as I said, no strong feelings...
> + if (xe_survivability_mode_required(xe))
> + return xe_survivability_mode_enable(xe);
>
> return err;
> }
> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
> index d939ce70e6fa..5b60cbf8f7cb 100644
> --- a/drivers/gpu/drm/xe/xe_survivability_mode.c
> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
> @@ -10,6 +10,7 @@
> #include <linux/pci.h>
> #include <linux/sysfs.h>
>
> +#include "xe_configfs.h"
> #include "xe_device.h"
> #include "xe_gt.h"
> #include "xe_heci_gsc.h"
> @@ -28,8 +29,10 @@
> * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
> * to be flashed through mei and collect telemetry. The driver's probe flow is modified
> * such that it enters survivability mode when pcode initialization is incomplete and boot status
> - * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating
> - * survivability mode and provides additional information required for debug
> + * denotes a failure. Survivability mode can also be enabled manually by writing the pci address of
> + * the card to the xe configfs attribute. This is useful in cases where pcode does not detect
> + * failure and for validation.
or even for IFR (in-field-repair) use cases, where the repair or flash can be
performed in a single GPU card without impacting the usage of other GPUs
in the same node.
or something like that...
> The driver then populates the survivability_mode PCI sysfs
> + * indicating survivability mo
de and provides additional information required for debug
> *
> * KMD exposes below admin-only readable sysfs in survivability mode
> *
> @@ -42,6 +45,15 @@
> * Overflow Information - Provides history of previous failures
> * Auxiliary Information - Certain failures may have information in
> * addition to postcode information
> + * Enable survivability mode through configfs
> + *
> + * survivability mode is exposed as an attribute under the xe configfs subsystem. User can specify
> + * the card that needs to enter survivability mode.
> + *
> + * echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
> + * echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
> + * echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
should we add an example in the doc so folks don't get so confused
with the details of the terminology?
Example:
echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
echo "0000:03:00.0" > sys/kernel/config/xe/survivability_mode
echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
?
> + *
> */
>
> static u32 aux_history_offset(u32 reg_value)
> @@ -133,6 +145,7 @@ static void xe_survivability_mode_fini(void *arg)
> struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
> struct device *dev = &pdev->dev;
>
> + xe_configfs_clear_survivability_mode();
> sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
> }
>
> @@ -190,11 +203,15 @@ bool xe_survivability_mode_required(struct xe_device *xe)
> {
> struct xe_survivability *survivability = &xe->survivability;
> struct xe_mmio *mmio = xe_root_tile_mmio(xe);
> + struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
> u32 data;
>
> if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
> return false;
>
> + if (xe_configfs_survivability_mode_enabled(pdev))
> + return true;
this part is not needed right? as the check outside won't be evaluated
or bypassed...
> +
> data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
> survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
>
> --
> 2.47.1
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-07 14:24 ` [PATCH 1/2] RFC drm/xe: Add configfs to enable " Riana Tauro
2025-03-07 14:45 ` Rodrigo Vivi
@ 2025-03-07 15:16 ` Lucas De Marchi
2025-03-10 5:31 ` Riana Tauro
2025-03-10 7:14 ` Aravind Iddamsetty
2 siblings, 1 reply; 23+ messages in thread
From: Lucas De Marchi @ 2025-03-07 15:16 UTC (permalink / raw)
To: Riana Tauro; +Cc: intel-xe, anshuman.gupta, rodrigo.vivi, matthew.d.roper
On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
>Registers a configfs subsystem called 'xe' to userspace. The user can
>use this to modify exposed attributes.
>
>Add survivability mode attribute (config/xe/survivability_mode) to the
>subsystem to allow the user to specify the card that should enter
>survivability mode.
>
>Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>---
> drivers/gpu/drm/xe/Makefile | 1 +
> drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
> drivers/gpu/drm/xe/xe_module.c | 5 ++
> drivers/gpu/drm/xe/xe_module.h | 1 +
> 5 files changed, 114 insertions(+)
> create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
> create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>
>diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>index 9699b08585f7..3f8c87292cee 100644
>--- a/drivers/gpu/drm/xe/Makefile
>+++ b/drivers/gpu/drm/xe/Makefile
>@@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> xe-y += xe_bb.o \
> xe_bo.o \
> xe_bo_evict.o \
>+ xe_configfs.o \
> xe_devcoredump.o \
> xe_device.o \
> xe_device_sysfs.o \
>diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
>new file mode 100644
>index 000000000000..8c5f248e466d
>--- /dev/null
>+++ b/drivers/gpu/drm/xe/xe_configfs.c
>@@ -0,0 +1,95 @@
>+// SPDX-License-Identifier: MIT
>+/*
>+ * Copyright © 2025 Intel Corporation
>+ */
>+
>+#include <linux/configfs.h>
>+#include <linux/init.h>
>+#include <linux/module.h>
>+
>+#include "xe_configfs.h"
>+#include "xe_module.h"
>+
>+/**
>+ * DOC: Xe Configfs
>+ *
>+ * XE KMD registers a configfs subsystem called 'xe'to userspace that allows users to modify
>+ * the exposed attributes.
>+ *
>+ * Attributes:
>+ *
>+ * config/xe/survivability_mode : Write only attribute that allows user to specify the PCI address
>+ * of the card that has to enter survivability mode
I think the desired interface is actually that the user mkdir the bdf in
<configfs>/xe/. This populates the available config entries that the user
writes to.
>+ */
>+
>+void xe_configfs_clear_survivability_mode(void)
>+{
>+ kfree(xe_modparam.survivability_mode);
>+ xe_modparam.survivability_mode = NULL;
>+}
>+
>+static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
once you handle the mkdir mentioned above, this should probably be
a boolean attr like this:
drivers/nvme/target/configfs.c:CONFIGFS_ATTR(nvmet_, param_pi_enable);
>+{
>+ char *survivability_mode;
>+ int ret;
>+ unsigned int domain, bus, slot, function;
>+
>+ ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot, &function);
>+ if (ret != 4)
>+ return -EINVAL;
>+
>+ survivability_mode = kstrdup(page, GFP_KERNEL);
>+ if (!survivability_mode)
>+ return -ENOMEM;
>+
>+ xe_configfs_clear_survivability_mode();
>+ xe_modparam.survivability_mode = survivability_mode;
>+
>+ return len;
>+}
>+
>+CONFIGFS_ATTR_WO(, survivability_mode);
>+
>+static struct configfs_attribute *xe_configfs_attrs[] = {
>+ &attr_survivability_mode,
>+ NULL,
>+};
>+
>+static const struct config_item_type xe_config_type = {
>+ .ct_attrs = xe_configfs_attrs,
>+ .ct_owner = THIS_MODULE,
>+};
>+
>+static struct configfs_subsystem xe_config_subsys = {
>+ .su_group = {
>+ .cg_item = {
>+ .ci_namebuf = "xe",
>+ .ci_type = &xe_config_type,
>+ },
>+ },
>+};
>
so... it seems you are missing a configfs_group_operations.make_group.
>+int __init xe_configfs_init(void)
>+{
>+ int ret;
>+
>+ config_group_init(&xe_config_subsys.su_group);
>+ mutex_init(&xe_config_subsys.su_mutex);
this mutex_init seems odd, but inline with other drivers
>+ ret = configfs_register_subsystem(&xe_config_subsys);
>+ if (ret) {
>+ pr_err("Error %d while registering subsystem %s\n",
>+ ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
>+ mutex_destroy(&xe_config_subsys.su_mutex);
>+ return ret;
>+ }
>+
>+ return 0;
>+}
>+
>+void __exit xe_configfs_exit(void)
>+{
>+ xe_configfs_clear_survivability_mode();
>+ configfs_unregister_subsystem(&xe_config_subsys);
>+ mutex_destroy(&xe_config_subsys.su_mutex);
>+}
>+
>diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
>new file mode 100644
>index 000000000000..491629a2ca53
>--- /dev/null
>+++ b/drivers/gpu/drm/xe/xe_configfs.h
>@@ -0,0 +1,12 @@
>+/* SPDX-License-Identifier: MIT */
>+/*
>+ * Copyright © 2025 Intel Corporation
>+ */
>+#ifndef _XE_CONFIGFS_H_
>+#define _XE_CONFIGFS_H_
>+
>+int xe_configfs_init(void);
>+void xe_configfs_exit(void);
>+void xe_configfs_clear_survivability_mode(void);
>+
>+#endif
>diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
>index 475acdba2b55..15b3cf22193c 100644
>--- a/drivers/gpu/drm/xe/xe_module.c
>+++ b/drivers/gpu/drm/xe/xe_module.c
>@@ -11,6 +11,7 @@
> #include <drm/drm_module.h>
>
> #include "xe_drv.h"
>+#include "xe_configfs.h"
> #include "xe_hw_fence.h"
> #include "xe_pci.h"
> #include "xe_pm.h"
>@@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
> {
> .init = xe_check_nomodeset,
> },
>+ {
>+ .init = xe_configfs_init,
>+ .exit = xe_configfs_exit,
>+ },
> {
> .init = xe_hw_fence_module_init,
> .exit = xe_hw_fence_module_exit,
>diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
>index 84339e509c80..c238dbee6bc7 100644
>--- a/drivers/gpu/drm/xe/xe_module.h
>+++ b/drivers/gpu/drm/xe/xe_module.h
>@@ -24,6 +24,7 @@ struct xe_modparam {
> #endif
> int wedged_mode;
> u32 svm_notifier_size;
>+ char *survivability_mode;
drop this.. We want a config struct in the xe_device. It's actually
allocated by the mkdir in the configs and when we are probing, we should
find the config instace based on pdev and set the pointer xe->configfs
or something like that.
thanks
Lucas De Marchi
> };
>
> extern struct xe_modparam xe_modparam;
>--
>2.47.1
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-07 15:16 ` Lucas De Marchi
@ 2025-03-10 5:31 ` Riana Tauro
2025-03-10 13:31 ` Lucas De Marchi
0 siblings, 1 reply; 23+ messages in thread
From: Riana Tauro @ 2025-03-10 5:31 UTC (permalink / raw)
To: Lucas De Marchi; +Cc: intel-xe, anshuman.gupta, rodrigo.vivi, matthew.d.roper
Hi Lucas
On 3/7/2025 8:46 PM, Lucas De Marchi wrote:
> On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
>> Registers a configfs subsystem called 'xe' to userspace. The user can
>> use this to modify exposed attributes.
>>
>> Add survivability mode attribute (config/xe/survivability_mode) to the
>> subsystem to allow the user to specify the card that should enter
>> survivability mode.
>>
>> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> ---
>> drivers/gpu/drm/xe/Makefile | 1 +
>> drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
>> drivers/gpu/drm/xe/xe_module.c | 5 ++
>> drivers/gpu/drm/xe/xe_module.h | 1 +
>> 5 files changed, 114 insertions(+)
>> create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
>> create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 9699b08585f7..3f8c87292cee 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/
>> %_wa_oob.h: $(obj)/xe_gen_wa_oob \
>> xe-y += xe_bb.o \
>> xe_bo.o \
>> xe_bo_evict.o \
>> + xe_configfs.o \
>> xe_devcoredump.o \
>> xe_device.o \
>> xe_device_sysfs.o \
>> diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/
>> xe_configfs.c
>> new file mode 100644
>> index 000000000000..8c5f248e466d
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_configfs.c
>> @@ -0,0 +1,95 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#include <linux/configfs.h>
>> +#include <linux/init.h>
>> +#include <linux/module.h>
>> +
>> +#include "xe_configfs.h"
>> +#include "xe_module.h"
>> +
>> +/**
>> + * DOC: Xe Configfs
>> + *
>> + * XE KMD registers a configfs subsystem called 'xe'to userspace that
>> allows users to modify
>> + * the exposed attributes.
>> + *
>> + * Attributes:
>> + *
>> + * config/xe/survivability_mode : Write only attribute that allows
>> user to specify the PCI address
>> + * of the card that has to enter survivability mode
>
> I think the desired interface is actually that the user mkdir the bdf in
> <configfs>/xe/. This populates the available config entries that the user
> writes to.
Initial thought was mkdir bdf, but since it was a single attribute and
after a offline discussion with Rodrigo, did a simpler version to get
comments on the RFC patch and if configfs is okay to use
for survivability mode
For survivability mode, below procedure needs to be followed
echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
add bdf for survivability mode
echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
After the device is removed directory has to be created, the bdf has
to be checked if it belongs to xe driver and then attrs populated.
Even i think mkdir is better in case other attrs have to be added in
future, but for unbind case the check of the driver might be tricky .
The attr is WO and any value can be written, only if the correct bdf is
added in the attr then it'll be used on probe to enter survivability
mode.The current patch only checks the format and does not check if bdf
belongs to xe.
>
>> + */
>> +
>> +void xe_configfs_clear_survivability_mode(void)
>> +{
>> + kfree(xe_modparam.survivability_mode);
>> + xe_modparam.survivability_mode = NULL;
>> +}
>> +
>> +static ssize_t survivability_mode_store(struct config_item *item,
>> const char *page, size_t len)
>
> once you handle the mkdir mentioned above, this should probably be
> a boolean attr like this:
>
> drivers/nvme/target/configfs.c:CONFIGFS_ATTR(nvmet_, param_pi_enable);
>
>> +{
>> + char *survivability_mode;
>> + int ret;
>> + unsigned int domain, bus, slot, function;
>> +
>> + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot,
>> &function);
>> + if (ret != 4)
>> + return -EINVAL;
>> +
>> + survivability_mode = kstrdup(page, GFP_KERNEL);
>> + if (!survivability_mode)
>> + return -ENOMEM;
>> +
>> + xe_configfs_clear_survivability_mode();
>> + xe_modparam.survivability_mode = survivability_mode;
>> +
>> + return len;
>> +}
>> +
>> +CONFIGFS_ATTR_WO(, survivability_mode);
>> +
>> +static struct configfs_attribute *xe_configfs_attrs[] = {
>> + &attr_survivability_mode,
>> + NULL,
>> +};
>> +
>> +static const struct config_item_type xe_config_type = {
>> + .ct_attrs = xe_configfs_attrs,
>> + .ct_owner = THIS_MODULE,
>> +};
>> +
>> +static struct configfs_subsystem xe_config_subsys = {
>> + .su_group = {
>> + .cg_item = {
>> + .ci_namebuf = "xe",
>> + .ci_type = &xe_config_type,
>> + },
>> + },
>> +};
>>
>
> so... it seems you are missing a configfs_group_operations.make_group.
>
>> +int __init xe_configfs_init(void)
>> +{
>> + int ret;
>> +
>> + config_group_init(&xe_config_subsys.su_group);
>> + mutex_init(&xe_config_subsys.su_mutex);
>
> this mutex_init seems odd, but inline with other drivers
yeah it is not used anywhere but the sample and other drivers also
initialize it
>
>> + ret = configfs_register_subsystem(&xe_config_subsys);
>> + if (ret) {
>> + pr_err("Error %d while registering subsystem %s\n",
>> + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
>> + mutex_destroy(&xe_config_subsys.su_mutex);
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +void __exit xe_configfs_exit(void)
>> +{
>> + xe_configfs_clear_survivability_mode();
>> + configfs_unregister_subsystem(&xe_config_subsys);
>> + mutex_destroy(&xe_config_subsys.su_mutex);
>> +}
>> +
>> diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/
>> xe_configfs.h
>> new file mode 100644
>> index 000000000000..491629a2ca53
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_configfs.h
>> @@ -0,0 +1,12 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +#ifndef _XE_CONFIGFS_H_
>> +#define _XE_CONFIGFS_H_
>> +
>> +int xe_configfs_init(void);
>> +void xe_configfs_exit(void);
>> +void xe_configfs_clear_survivability_mode(void);
>> +
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/
>> xe_module.c
>> index 475acdba2b55..15b3cf22193c 100644
>> --- a/drivers/gpu/drm/xe/xe_module.c
>> +++ b/drivers/gpu/drm/xe/xe_module.c
>> @@ -11,6 +11,7 @@
>> #include <drm/drm_module.h>
>>
>> #include "xe_drv.h"
>> +#include "xe_configfs.h"
>> #include "xe_hw_fence.h"
>> #include "xe_pci.h"
>> #include "xe_pm.h"
>> @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
>> {
>> .init = xe_check_nomodeset,
>> },
>> + {
>> + .init = xe_configfs_init,
>> + .exit = xe_configfs_exit,
>> + },
>> {
>> .init = xe_hw_fence_module_init,
>> .exit = xe_hw_fence_module_exit,
>> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/
>> xe_module.h
>> index 84339e509c80..c238dbee6bc7 100644
>> --- a/drivers/gpu/drm/xe/xe_module.h
>> +++ b/drivers/gpu/drm/xe/xe_module.h
>> @@ -24,6 +24,7 @@ struct xe_modparam {
>> #endif
>> int wedged_mode;
>> u32 svm_notifier_size;
>> + char *survivability_mode;
>
> drop this.. We want a config struct in the xe_device. It's actually
> allocated by the mkdir in the configs and when we are probing, we should
> find the config instace based on pdev and set the pointer xe->configfs
> or something like that.
Will add a new struct
Thanks
Riana
>
> thanks
> Lucas De Marchi
>
>> };
>>
>> extern struct xe_modparam xe_modparam;
>> --
>> 2.47.1
>>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 2/2] RFC drm/xe: Enable configfs support for survivability mode
2025-03-07 15:01 ` Rodrigo Vivi
@ 2025-03-10 5:36 ` Riana Tauro
0 siblings, 0 replies; 23+ messages in thread
From: Riana Tauro @ 2025-03-10 5:36 UTC (permalink / raw)
To: Rodrigo Vivi; +Cc: intel-xe, anshuman.gupta, matthew.d.roper, lucas.demarchi
Hi Rodrigo
On 3/7/2025 8:31 PM, Rodrigo Vivi wrote:
> On Fri, Mar 07, 2025 at 07:54:45PM +0530, Riana Tauro wrote:
>> Register a configfs subsystem called 'xe' to userspace that allows
>> users to modify the exposed attributes. Expose survivability mode as
>> an attribute that can be modified manually. This is useful if
>> pcode fails to detect survivability mode and for validation
>>
>> To enable survivability mode for a card,
>>
>> echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
>> echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
>> echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
>
> This is very great! Perhaps at some point we will still need
> the module parameter in this specific survivability case.
> But this configfs is more than needed and wanted for this and
> many other cases.
>
>>
>> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_configfs.c | 22 ++++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_configfs.h | 6 +++++-
>> drivers/gpu/drm/xe/xe_pci.c | 8 ++++----
>> drivers/gpu/drm/xe/xe_survivability_mode.c | 21 +++++++++++++++++++--
>> 4 files changed, 50 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
>> index 8c5f248e466d..ce9f3757100f 100644
>> --- a/drivers/gpu/drm/xe/xe_configfs.c
>> +++ b/drivers/gpu/drm/xe/xe_configfs.c
>> @@ -6,6 +6,7 @@
>> #include <linux/configfs.h>
>> #include <linux/init.h>
>> #include <linux/module.h>
>> +#include <linux/pci.h>
>>
>> #include "xe_configfs.h"
>> #include "xe_module.h"
>> @@ -28,6 +29,27 @@ void xe_configfs_clear_survivability_mode(void)
>> xe_modparam.survivability_mode = NULL;
>> }
>>
>> +bool xe_configfs_survivability_mode_enabled(struct pci_dev *pdev)
>> +{
>> + unsigned int domain, bus, slot, function;
>> + int ret = 0;
>> +
>> + if (!xe_modparam.survivability_mode)
>> + goto err;
>> +
>> + ret = sscanf(xe_modparam.survivability_mode, "%04x:%02x:%02x.%x", &domain, &bus,
>> + &slot, &function);
>> + if (ret != 4)
>> + goto err;
>> +
>> + if ((pci_domain_nr(pdev->bus) == domain) && (pdev->bus->number == bus) &&
>> + (PCI_SLOT(pdev->devfn) == slot) && (PCI_FUNC(pdev->devfn) == function))
>> + return true;
>> +
>> +err:
>
> if nothing else is here, please just return false everywhere instead of the goto
sure will fix this
>
>> + return false;
>> +}
>> +
>> static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
>> {
>> char *survivability_mode;
>> diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
>> index 491629a2ca53..b03f4c7d0f54 100644
>> --- a/drivers/gpu/drm/xe/xe_configfs.h
>> +++ b/drivers/gpu/drm/xe/xe_configfs.h
>> @@ -5,8 +5,12 @@
>> #ifndef _XE_CONFIGFS_H_
>> #define _XE_CONFIGFS_H_
>>
>> +#include <linux/types.h>
>> +
>> +struct pci_dev;
>> +
>> int xe_configfs_init(void);
>> void xe_configfs_exit(void);
>> void xe_configfs_clear_survivability_mode(void);
>> -
>> +bool xe_configfs_survivability_mode_enabled(struct pci_dev *pdev);
>> #endif
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>> index 4d982a5a4ffd..d0f66cc08aa5 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -17,6 +17,7 @@
>>
>> #include "display/xe_display.h"
>> #include "regs/xe_gt_regs.h"
>> +#include "xe_configfs.h"
>> #include "xe_device.h"
>> #include "xe_drv.h"
>> #include "xe_gt.h"
>> @@ -815,10 +816,9 @@ static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>> * mei. If early probe fails, check if survivability mode is flagged by
>> * HW to be enabled. In that case enable it and return success.
>> */
>> - if (err) {
>> - if (xe_survivability_mode_required(xe) &&
>> - xe_survivability_mode_enable(xe))
>> - return 0;
>> + if (xe_configfs_survivability_mode_enabled(pdev) || err) {
>
> no strong feelings here, but:
>
> I believe it is likely better to keep the current code and add
> the 2 new lines, instead of mixing err and new condition.
>
>
> if (err) {
> if (xe_survivability_mode_required(xe) &&
> xe_survivability_mode_enable(xe))
> return 0;
> }
> if (xe_configfs_survivability_mode_enabled(pdev) || err)
> return 0;
>
>
> or perhaps encapsulate all of that in a static function and then here
> just:
>
> if(check_for_survivability(err))
> return 0;
Will try this.
>
> the function can be better to extend to the module parameter
> to be used in boot if/when needed.
>
> but, really up to you... as I said, no strong feelings...
>
>> + if (xe_survivability_mode_required(xe))
>> + return xe_survivability_mode_enable(xe);
>>
>> return err;
>> }
>> diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> index d939ce70e6fa..5b60cbf8f7cb 100644
>> --- a/drivers/gpu/drm/xe/xe_survivability_mode.c
>> +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
>> @@ -10,6 +10,7 @@
>> #include <linux/pci.h>
>> #include <linux/sysfs.h>
>>
>> +#include "xe_configfs.h"
>> #include "xe_device.h"
>> #include "xe_gt.h"
>> #include "xe_heci_gsc.h"
>> @@ -28,8 +29,10 @@
>> * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
>> * to be flashed through mei and collect telemetry. The driver's probe flow is modified
>> * such that it enters survivability mode when pcode initialization is incomplete and boot status
>> - * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating
>> - * survivability mode and provides additional information required for debug
>> + * denotes a failure. Survivability mode can also be enabled manually by writing the pci address of
>> + * the card to the xe configfs attribute. This is useful in cases where pcode does not detect
>> + * failure and for validation.
>
> or even for IFR (in-field-repair) use cases, where the repair or flash can be
> performed in a single GPU card without impacting the usage of other GPUs
> in the same node.
>
> or something like that...
Will add the IFR case
>
>> The driver then populates the survivability_mode PCI sysfs
>> + * indicating survivability mo
> de and provides additional information required for debug
>> *
>> * KMD exposes below admin-only readable sysfs in survivability mode
>> *
>> @@ -42,6 +45,15 @@
>> * Overflow Information - Provides history of previous failures
>> * Auxiliary Information - Certain failures may have information in
>> * addition to postcode information
>> + * Enable survivability mode through configfs
>> + *
>> + * survivability mode is exposed as an attribute under the xe configfs subsystem. User can specify
>> + * the card that needs to enter survivability mode.
>> + *
>> + * echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/unbind
>> + * echo "domain:bus:dev.fn" > sys/kernel/config/xe/survivability_mode
>> + * echo "domain:bus:dev.fn" > /sys/bus/pci/drivers/xe/bind
>
> should we add an example in the doc so folks don't get so confused
> with the details of the terminology?
>
> Example:
>
> echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
> echo "0000:03:00.0" > sys/kernel/config/xe/survivability_mode
> echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
>
> ?
Sure. will add an example instead
>
>> + *
>> */
>>
>> static u32 aux_history_offset(u32 reg_value)
>> @@ -133,6 +145,7 @@ static void xe_survivability_mode_fini(void *arg)
>> struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>> struct device *dev = &pdev->dev;
>>
>> + xe_configfs_clear_survivability_mode();
>> sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
>> }
>>
>> @@ -190,11 +203,15 @@ bool xe_survivability_mode_required(struct xe_device *xe)
>> {
>> struct xe_survivability *survivability = &xe->survivability;
>> struct xe_mmio *mmio = xe_root_tile_mmio(xe);
>> + struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>> u32 data;
>>
>> if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
>> return false;
>>
>> + if (xe_configfs_survivability_mode_enabled(pdev))
>> + return true;
>
> this part is not needed right? as the check outside won't be evaluated
> or bypassed...
If its changed to single check , i'll remove this.
Thanks
Riana
>
>> +
>> data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
>> survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
>>
>> --
>> 2.47.1
>>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-07 14:24 ` [PATCH 1/2] RFC drm/xe: Add configfs to enable " Riana Tauro
2025-03-07 14:45 ` Rodrigo Vivi
2025-03-07 15:16 ` Lucas De Marchi
@ 2025-03-10 7:14 ` Aravind Iddamsetty
2025-03-13 6:18 ` Riana Tauro
2 siblings, 1 reply; 23+ messages in thread
From: Aravind Iddamsetty @ 2025-03-10 7:14 UTC (permalink / raw)
To: Riana Tauro, intel-xe
Cc: anshuman.gupta, rodrigo.vivi, matthew.d.roper, lucas.demarchi
On 07-03-2025 19:54, Riana Tauro wrote:
Hi Riana,
I do think we can achieve the same functionality with module param and
we needn't reload the driver
if we are doing unbind, Since the driver will be loaded event after
unbind we can modify the module param
and once we bind the device back it can check if the BDF belongs to this
driver instance and configure the mode accordingly.
Thanks,
Aravind.
> Registers a configfs subsystem called 'xe' to userspace. The user can
> use this to modify exposed attributes.
>
> Add survivability mode attribute (config/xe/survivability_mode) to the
> subsystem to allow the user to specify the card that should enter
> survivability mode.
>
> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> ---
> drivers/gpu/drm/xe/Makefile | 1 +
> drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
> drivers/gpu/drm/xe/xe_module.c | 5 ++
> drivers/gpu/drm/xe/xe_module.h | 1 +
> 5 files changed, 114 insertions(+)
> create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
> create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>
> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> index 9699b08585f7..3f8c87292cee 100644
> --- a/drivers/gpu/drm/xe/Makefile
> +++ b/drivers/gpu/drm/xe/Makefile
> @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> xe-y += xe_bb.o \
> xe_bo.o \
> xe_bo_evict.o \
> + xe_configfs.o \
> xe_devcoredump.o \
> xe_device.o \
> xe_device_sysfs.o \
> diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
> new file mode 100644
> index 000000000000..8c5f248e466d
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_configfs.c
> @@ -0,0 +1,95 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2025 Intel Corporation
> + */
> +
> +#include <linux/configfs.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +
> +#include "xe_configfs.h"
> +#include "xe_module.h"
> +
> +/**
> + * DOC: Xe Configfs
> + *
> + * XE KMD registers a configfs subsystem called 'xe'to userspace that allows users to modify
> + * the exposed attributes.
> + *
> + * Attributes:
> + *
> + * config/xe/survivability_mode : Write only attribute that allows user to specify the PCI address
> + * of the card that has to enter survivability mode
> + */
> +
> +void xe_configfs_clear_survivability_mode(void)
> +{
> + kfree(xe_modparam.survivability_mode);
> + xe_modparam.survivability_mode = NULL;
> +}
> +
> +static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
> +{
> + char *survivability_mode;
> + int ret;
> + unsigned int domain, bus, slot, function;
> +
> + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot, &function);
> + if (ret != 4)
> + return -EINVAL;
> +
> + survivability_mode = kstrdup(page, GFP_KERNEL);
> + if (!survivability_mode)
> + return -ENOMEM;
> +
> + xe_configfs_clear_survivability_mode();
> + xe_modparam.survivability_mode = survivability_mode;
> +
> + return len;
> +}
> +
> +CONFIGFS_ATTR_WO(, survivability_mode);
> +
> +static struct configfs_attribute *xe_configfs_attrs[] = {
> + &attr_survivability_mode,
> + NULL,
> +};
> +
> +static const struct config_item_type xe_config_type = {
> + .ct_attrs = xe_configfs_attrs,
> + .ct_owner = THIS_MODULE,
> +};
> +
> +static struct configfs_subsystem xe_config_subsys = {
> + .su_group = {
> + .cg_item = {
> + .ci_namebuf = "xe",
> + .ci_type = &xe_config_type,
> + },
> + },
> +};
> +
> +int __init xe_configfs_init(void)
> +{
> + int ret;
> +
> + config_group_init(&xe_config_subsys.su_group);
> + mutex_init(&xe_config_subsys.su_mutex);
> + ret = configfs_register_subsystem(&xe_config_subsys);
> + if (ret) {
> + pr_err("Error %d while registering subsystem %s\n",
> + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
> + mutex_destroy(&xe_config_subsys.su_mutex);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +void __exit xe_configfs_exit(void)
> +{
> + xe_configfs_clear_survivability_mode();
> + configfs_unregister_subsystem(&xe_config_subsys);
> + mutex_destroy(&xe_config_subsys.su_mutex);
> +}
> +
> diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
> new file mode 100644
> index 000000000000..491629a2ca53
> --- /dev/null
> +++ b/drivers/gpu/drm/xe/xe_configfs.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2025 Intel Corporation
> + */
> +#ifndef _XE_CONFIGFS_H_
> +#define _XE_CONFIGFS_H_
> +
> +int xe_configfs_init(void);
> +void xe_configfs_exit(void);
> +void xe_configfs_clear_survivability_mode(void);
> +
> +#endif
> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> index 475acdba2b55..15b3cf22193c 100644
> --- a/drivers/gpu/drm/xe/xe_module.c
> +++ b/drivers/gpu/drm/xe/xe_module.c
> @@ -11,6 +11,7 @@
> #include <drm/drm_module.h>
>
> #include "xe_drv.h"
> +#include "xe_configfs.h"
> #include "xe_hw_fence.h"
> #include "xe_pci.h"
> #include "xe_pm.h"
> @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
> {
> .init = xe_check_nomodeset,
> },
> + {
> + .init = xe_configfs_init,
> + .exit = xe_configfs_exit,
> + },
> {
> .init = xe_hw_fence_module_init,
> .exit = xe_hw_fence_module_exit,
> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> index 84339e509c80..c238dbee6bc7 100644
> --- a/drivers/gpu/drm/xe/xe_module.h
> +++ b/drivers/gpu/drm/xe/xe_module.h
> @@ -24,6 +24,7 @@ struct xe_modparam {
> #endif
> int wedged_mode;
> u32 svm_notifier_size;
> + char *survivability_mode;
> };
>
> extern struct xe_modparam xe_modparam;
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-10 5:31 ` Riana Tauro
@ 2025-03-10 13:31 ` Lucas De Marchi
2025-03-10 14:23 ` Riana Tauro
0 siblings, 1 reply; 23+ messages in thread
From: Lucas De Marchi @ 2025-03-10 13:31 UTC (permalink / raw)
To: Riana Tauro; +Cc: intel-xe, anshuman.gupta, rodrigo.vivi, matthew.d.roper
On Mon, Mar 10, 2025 at 11:01:50AM +0530, Riana Tauro wrote:
>Hi Lucas
>
>On 3/7/2025 8:46 PM, Lucas De Marchi wrote:
>>On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
>>>Registers a configfs subsystem called 'xe' to userspace. The user can
>>>use this to modify exposed attributes.
>>>
>>>Add survivability mode attribute (config/xe/survivability_mode) to the
>>>subsystem to allow the user to specify the card that should enter
>>>survivability mode.
>>>
>>>Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>>>---
>>>drivers/gpu/drm/xe/Makefile | 1 +
>>>drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
>>>drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
>>>drivers/gpu/drm/xe/xe_module.c | 5 ++
>>>drivers/gpu/drm/xe/xe_module.h | 1 +
>>>5 files changed, 114 insertions(+)
>>>create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
>>>create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>>>
>>>diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>>>index 9699b08585f7..3f8c87292cee 100644
>>>--- a/drivers/gpu/drm/xe/Makefile
>>>+++ b/drivers/gpu/drm/xe/Makefile
>>>@@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/
>>>%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>>>xe-y += xe_bb.o \
>>> xe_bo.o \
>>> xe_bo_evict.o \
>>>+ xe_configfs.o \
>>> xe_devcoredump.o \
>>> xe_device.o \
>>> xe_device_sysfs.o \
>>>diff --git a/drivers/gpu/drm/xe/xe_configfs.c
>>>b/drivers/gpu/drm/xe/ xe_configfs.c
>>>new file mode 100644
>>>index 000000000000..8c5f248e466d
>>>--- /dev/null
>>>+++ b/drivers/gpu/drm/xe/xe_configfs.c
>>>@@ -0,0 +1,95 @@
>>>+// SPDX-License-Identifier: MIT
>>>+/*
>>>+ * Copyright © 2025 Intel Corporation
>>>+ */
>>>+
>>>+#include <linux/configfs.h>
>>>+#include <linux/init.h>
>>>+#include <linux/module.h>
>>>+
>>>+#include "xe_configfs.h"
>>>+#include "xe_module.h"
>>>+
>>>+/**
>>>+ * DOC: Xe Configfs
>>>+ *
>>>+ * XE KMD registers a configfs subsystem called 'xe'to userspace
>>>that allows users to modify
>>>+ * the exposed attributes.
>>>+ *
>>>+ * Attributes:
>>>+ *
>>>+ * config/xe/survivability_mode : Write only attribute that
>>>allows user to specify the PCI address
>>>+ * of the card that has to enter survivability mode
>>
>>I think the desired interface is actually that the user mkdir the bdf in
>><configfs>/xe/. This populates the available config entries that the user
>>writes to.
>
>Initial thought was mkdir bdf, but since it was a single attribute and
>after a offline discussion with Rodrigo, did a simpler version to get
>comments on the RFC patch and if configfs is okay to use
>for survivability mode
I disagree on the conclusion as then you are moving away from how
configfs is commonly used and also making it harder to add new
attributes in a uniform way.
>
>For survivability mode, below procedure needs to be followed
>
>echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
>add bdf for survivability mode
>echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
>
>After the device is removed directory has to be created, the bdf has
>to be checked if it belongs to xe driver and then attrs populated.
>Even i think mkdir is better in case other attrs have to be added in
>future, but for unbind case the check of the driver might be tricky .
I think you are talking about the mkdir in the wrong place. It isn't
related to unbind at all. The sequence you mentioned is just because we
lazy and let pci auto probe the driver. Under the hood you are doing:
1) load the module
2) bind the driver
3) unbind the driver
4) bind the driver in survivability mode
The sequence we should actually have is:
1) load the module
2) bind the driver in survivability mode
You shouldn't create any directory when unbinding. It's the user who
should create it. When **binding** the driver we should check what are
the extra configuration available and adapt the probe according to that.
For that we have to disable pci driver autoprobe and tweak the
configfs settings:
1) echo 0 > /sys/bus/pci/drivers_autoprobe
modprobe xe
2) mkdir /sys/kernel/config/xe/0000:03:00.0
echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
Lucas De Marchi
>
>The attr is WO and any value can be written, only if the correct bdf
>is added in the attr then it'll be used on probe to enter
>survivability mode.The current patch only checks the format and does
>not check if bdf belongs to xe.
>>
>>>+ */
>>>+
>>>+void xe_configfs_clear_survivability_mode(void)
>>>+{
>>>+ kfree(xe_modparam.survivability_mode);
>>>+ xe_modparam.survivability_mode = NULL;
>>>+}
>>>+
>>>+static ssize_t survivability_mode_store(struct config_item *item,
>>>const char *page, size_t len)
>>
>>once you handle the mkdir mentioned above, this should probably be
>>a boolean attr like this:
>>
>> drivers/nvme/target/configfs.c:CONFIGFS_ATTR(nvmet_, param_pi_enable);
>>
>>>+{
>>>+ char *survivability_mode;
>>>+ int ret;
>>>+ unsigned int domain, bus, slot, function;
>>>+
>>>+ ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot,
>>>&function);
>>>+ if (ret != 4)
>>>+ return -EINVAL;
>>>+
>>>+ survivability_mode = kstrdup(page, GFP_KERNEL);
>>>+ if (!survivability_mode)
>>>+ return -ENOMEM;
>>>+
>>>+ xe_configfs_clear_survivability_mode();
>>>+ xe_modparam.survivability_mode = survivability_mode;
>>>+
>>>+ return len;
>>>+}
>>>+
>>>+CONFIGFS_ATTR_WO(, survivability_mode);
>>>+
>>>+static struct configfs_attribute *xe_configfs_attrs[] = {
>>>+ &attr_survivability_mode,
>>>+ NULL,
>>>+};
>>>+
>>>+static const struct config_item_type xe_config_type = {
>>>+ .ct_attrs = xe_configfs_attrs,
>>>+ .ct_owner = THIS_MODULE,
>>>+};
>>>+
>>>+static struct configfs_subsystem xe_config_subsys = {
>>>+ .su_group = {
>>>+ .cg_item = {
>>>+ .ci_namebuf = "xe",
>>>+ .ci_type = &xe_config_type,
>>>+ },
>>>+ },
>>>+};
>>>
>>
>>so... it seems you are missing a configfs_group_operations.make_group.
>>
>>>+int __init xe_configfs_init(void)
>>>+{
>>>+ int ret;
>>>+
>>>+ config_group_init(&xe_config_subsys.su_group);
>>>+ mutex_init(&xe_config_subsys.su_mutex);
>>
>>this mutex_init seems odd, but inline with other drivers
>yeah it is not used anywhere but the sample and other drivers also
>initialize it
>>
>>>+ ret = configfs_register_subsystem(&xe_config_subsys);
>>>+ if (ret) {
>>>+ pr_err("Error %d while registering subsystem %s\n",
>>>+ ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
>>>+ mutex_destroy(&xe_config_subsys.su_mutex);
>>>+ return ret;
>>>+ }
>>>+
>>>+ return 0;
>>>+}
>>>+
>>>+void __exit xe_configfs_exit(void)
>>>+{
>>>+ xe_configfs_clear_survivability_mode();
>>>+ configfs_unregister_subsystem(&xe_config_subsys);
>>>+ mutex_destroy(&xe_config_subsys.su_mutex);
>>>+}
>>>+
>>>diff --git a/drivers/gpu/drm/xe/xe_configfs.h
>>>b/drivers/gpu/drm/xe/ xe_configfs.h
>>>new file mode 100644
>>>index 000000000000..491629a2ca53
>>>--- /dev/null
>>>+++ b/drivers/gpu/drm/xe/xe_configfs.h
>>>@@ -0,0 +1,12 @@
>>>+/* SPDX-License-Identifier: MIT */
>>>+/*
>>>+ * Copyright © 2025 Intel Corporation
>>>+ */
>>>+#ifndef _XE_CONFIGFS_H_
>>>+#define _XE_CONFIGFS_H_
>>>+
>>>+int xe_configfs_init(void);
>>>+void xe_configfs_exit(void);
>>>+void xe_configfs_clear_survivability_mode(void);
>>>+
>>>+#endif
>>>diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/
>>>xe_module.c
>>>index 475acdba2b55..15b3cf22193c 100644
>>>--- a/drivers/gpu/drm/xe/xe_module.c
>>>+++ b/drivers/gpu/drm/xe/xe_module.c
>>>@@ -11,6 +11,7 @@
>>>#include <drm/drm_module.h>
>>>
>>>#include "xe_drv.h"
>>>+#include "xe_configfs.h"
>>>#include "xe_hw_fence.h"
>>>#include "xe_pci.h"
>>>#include "xe_pm.h"
>>>@@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
>>> {
>>> .init = xe_check_nomodeset,
>>> },
>>>+ {
>>>+ .init = xe_configfs_init,
>>>+ .exit = xe_configfs_exit,
>>>+ },
>>> {
>>> .init = xe_hw_fence_module_init,
>>> .exit = xe_hw_fence_module_exit,
>>>diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/
>>>xe_module.h
>>>index 84339e509c80..c238dbee6bc7 100644
>>>--- a/drivers/gpu/drm/xe/xe_module.h
>>>+++ b/drivers/gpu/drm/xe/xe_module.h
>>>@@ -24,6 +24,7 @@ struct xe_modparam {
>>>#endif
>>> int wedged_mode;
>>> u32 svm_notifier_size;
>>>+ char *survivability_mode;
>>
>>drop this.. We want a config struct in the xe_device. It's actually
>>allocated by the mkdir in the configs and when we are probing, we should
>>find the config instace based on pdev and set the pointer xe->configfs
>>or something like that.
>
>Will add a new struct
>
>Thanks
>Riana
>>
>>thanks
>>Lucas De Marchi
>>
>>>};
>>>
>>>extern struct xe_modparam xe_modparam;
>>>--
>>>2.47.1
>>>
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-10 13:31 ` Lucas De Marchi
@ 2025-03-10 14:23 ` Riana Tauro
2025-03-10 16:40 ` Rodrigo Vivi
0 siblings, 1 reply; 23+ messages in thread
From: Riana Tauro @ 2025-03-10 14:23 UTC (permalink / raw)
To: Lucas De Marchi; +Cc: intel-xe, anshuman.gupta, rodrigo.vivi, matthew.d.roper
Hi Lucas
On 3/10/2025 7:01 PM, Lucas De Marchi wrote:
> On Mon, Mar 10, 2025 at 11:01:50AM +0530, Riana Tauro wrote:
>> Hi Lucas
>>
>> On 3/7/2025 8:46 PM, Lucas De Marchi wrote:
>>> On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
>>>> Registers a configfs subsystem called 'xe' to userspace. The user can
>>>> use this to modify exposed attributes.
>>>>
>>>> Add survivability mode attribute (config/xe/survivability_mode) to the
>>>> subsystem to allow the user to specify the card that should enter
>>>> survivability mode.
>>>>
>>>> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>>>> ---
>>>> drivers/gpu/drm/xe/Makefile | 1 +
>>>> drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
>>>> drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
>>>> drivers/gpu/drm/xe/xe_module.c | 5 ++
>>>> drivers/gpu/drm/xe/xe_module.h | 1 +
>>>> 5 files changed, 114 insertions(+)
>>>> create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
>>>> create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>>>> index 9699b08585f7..3f8c87292cee 100644
>>>> --- a/drivers/gpu/drm/xe/Makefile
>>>> +++ b/drivers/gpu/drm/xe/Makefile
>>>> @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/
>>>> %_wa_oob.h: $(obj)/xe_gen_wa_oob \
>>>> xe-y += xe_bb.o \
>>>> xe_bo.o \
>>>> xe_bo_evict.o \
>>>> + xe_configfs.o \
>>>> xe_devcoredump.o \
>>>> xe_device.o \
>>>> xe_device_sysfs.o \
>>>> diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/
>>>> xe_configfs.c
>>>> new file mode 100644
>>>> index 000000000000..8c5f248e466d
>>>> --- /dev/null
>>>> +++ b/drivers/gpu/drm/xe/xe_configfs.c
>>>> @@ -0,0 +1,95 @@
>>>> +// SPDX-License-Identifier: MIT
>>>> +/*
>>>> + * Copyright © 2025 Intel Corporation
>>>> + */
>>>> +
>>>> +#include <linux/configfs.h>
>>>> +#include <linux/init.h>
>>>> +#include <linux/module.h>
>>>> +
>>>> +#include "xe_configfs.h"
>>>> +#include "xe_module.h"
>>>> +
>>>> +/**
>>>> + * DOC: Xe Configfs
>>>> + *
>>>> + * XE KMD registers a configfs subsystem called 'xe'to userspace
>>>> that allows users to modify
>>>> + * the exposed attributes.
>>>> + *
>>>> + * Attributes:
>>>> + *
>>>> + * config/xe/survivability_mode : Write only attribute that allows
>>>> user to specify the PCI address
>>>> + * of the card that has to enter survivability mode
>>>
>>> I think the desired interface is actually that the user mkdir the bdf in
>>> <configfs>/xe/. This populates the available config entries that the
>>> user
>>> writes to.
>>
>> Initial thought was mkdir bdf, but since it was a single attribute and
>> after a offline discussion with Rodrigo, did a simpler version to get
>> comments on the RFC patch and if configfs is okay to use
>> for survivability mode
>
> I disagree on the conclusion as then you are moving away from how
> configfs is commonly used and also making it harder to add new
> attributes in a uniform way.
>
>>
>> For survivability mode, below procedure needs to be followed
>>
>> echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
>> add bdf for survivability mode
>> echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
>>
>> After the device is removed directory has to be created, the bdf has
>> to be checked if it belongs to xe driver and then attrs populated.
>> Even i think mkdir is better in case other attrs have to be added in
>> future, but for unbind case the check of the driver might be tricky .
>
> I think you are talking about the mkdir in the wrong place. It isn't
> related to unbind at all. The sequence you mentioned is just because we
> lazy and let pci auto probe the driver. Under the hood you are doing:
>
> 1) load the module
> 2) bind the driver
> 3) unbind the driver
> 4) bind the driver in survivability mode
>
> The sequence we should actually have is:
>
> 1) load the module
> 2) bind the driver in survivability mode
This is what i meant.
1) load the module
2) bind the driver
3) unbind the driver
4) mkdir /sys/kernel/config/xe/0000:03:00.0
echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
5) bind the driver in survivability mode
At step 4, when creating directory. The bdf needs to be validated, ie
(it should be pci_dev that xe will probe). i checked the struct pci_dev
but didn't find anything.
Will have to match the device id against supported pci device ids.
Otherwise, there might be wrong non functional bdf directories present
based on what user provides.
Thanks
Riana
>
> You shouldn't create any directory when unbinding. It's the user who
> should create it. When **binding** the driver we should check what are
> the extra configuration available and adapt the probe according to that.
>
> For that we have to disable pci driver autoprobe and tweak the
> configfs settings:
>
> 1) echo 0 > /sys/bus/pci/drivers_autoprobe
> modprobe xe
>
> 2) mkdir /sys/kernel/config/xe/0000:03:00.0
> echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
> echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
>
>
> Lucas De Marchi
>
>>
>> The attr is WO and any value can be written, only if the correct bdf
>> is added in the attr then it'll be used on probe to enter
>> survivability mode.The current patch only checks the format and does
>> not check if bdf belongs to xe.
>>>
>>>> + */
>>>> +
>>>> +void xe_configfs_clear_survivability_mode(void)
>>>> +{
>>>> + kfree(xe_modparam.survivability_mode);
>>>> + xe_modparam.survivability_mode = NULL;
>>>> +}
>>>> +
>>>> +static ssize_t survivability_mode_store(struct config_item *item,
>>>> const char *page, size_t len)
>>>
>>> once you handle the mkdir mentioned above, this should probably be
>>> a boolean attr like this:
>>>
>>> drivers/nvme/target/configfs.c:CONFIGFS_ATTR(nvmet_,
>>> param_pi_enable);
>>>
>>>> +{
>>>> + char *survivability_mode;
>>>> + int ret;
>>>> + unsigned int domain, bus, slot, function;
>>>> +
>>>> + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot,
>>>> &function);
>>>> + if (ret != 4)
>>>> + return -EINVAL;
>>>> +
>>>> + survivability_mode = kstrdup(page, GFP_KERNEL);
>>>> + if (!survivability_mode)
>>>> + return -ENOMEM;
>>>> +
>>>> + xe_configfs_clear_survivability_mode();
>>>> + xe_modparam.survivability_mode = survivability_mode;
>>>> +
>>>> + return len;
>>>> +}
>>>> +
>>>> +CONFIGFS_ATTR_WO(, survivability_mode);
>>>> +
>>>> +static struct configfs_attribute *xe_configfs_attrs[] = {
>>>> + &attr_survivability_mode,
>>>> + NULL,
>>>> +};
>>>> +
>>>> +static const struct config_item_type xe_config_type = {
>>>> + .ct_attrs = xe_configfs_attrs,
>>>> + .ct_owner = THIS_MODULE,
>>>> +};
>>>> +
>>>> +static struct configfs_subsystem xe_config_subsys = {
>>>> + .su_group = {
>>>> + .cg_item = {
>>>> + .ci_namebuf = "xe",
>>>> + .ci_type = &xe_config_type,
>>>> + },
>>>> + },
>>>> +};
>>>>
>>>
>>> so... it seems you are missing a configfs_group_operations.make_group.
>>>
>>>> +int __init xe_configfs_init(void)
>>>> +{
>>>> + int ret;
>>>> +
>>>> + config_group_init(&xe_config_subsys.su_group);
>>>> + mutex_init(&xe_config_subsys.su_mutex);
>>>
>>> this mutex_init seems odd, but inline with other drivers
>> yeah it is not used anywhere but the sample and other drivers also
>> initialize it
>>>
>>>> + ret = configfs_register_subsystem(&xe_config_subsys);
>>>> + if (ret) {
>>>> + pr_err("Error %d while registering subsystem %s\n",
>>>> + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
>>>> + mutex_destroy(&xe_config_subsys.su_mutex);
>>>> + return ret;
>>>> + }
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +void __exit xe_configfs_exit(void)
>>>> +{
>>>> + xe_configfs_clear_survivability_mode();
>>>> + configfs_unregister_subsystem(&xe_config_subsys);
>>>> + mutex_destroy(&xe_config_subsys.su_mutex);
>>>> +}
>>>> +
>>>> diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/
>>>> xe_configfs.h
>>>> new file mode 100644
>>>> index 000000000000..491629a2ca53
>>>> --- /dev/null
>>>> +++ b/drivers/gpu/drm/xe/xe_configfs.h
>>>> @@ -0,0 +1,12 @@
>>>> +/* SPDX-License-Identifier: MIT */
>>>> +/*
>>>> + * Copyright © 2025 Intel Corporation
>>>> + */
>>>> +#ifndef _XE_CONFIGFS_H_
>>>> +#define _XE_CONFIGFS_H_
>>>> +
>>>> +int xe_configfs_init(void);
>>>> +void xe_configfs_exit(void);
>>>> +void xe_configfs_clear_survivability_mode(void);
>>>> +
>>>> +#endif
>>>> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/
>>>> xe_module.c
>>>> index 475acdba2b55..15b3cf22193c 100644
>>>> --- a/drivers/gpu/drm/xe/xe_module.c
>>>> +++ b/drivers/gpu/drm/xe/xe_module.c
>>>> @@ -11,6 +11,7 @@
>>>> #include <drm/drm_module.h>
>>>>
>>>> #include "xe_drv.h"
>>>> +#include "xe_configfs.h"
>>>> #include "xe_hw_fence.h"
>>>> #include "xe_pci.h"
>>>> #include "xe_pm.h"
>>>> @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
>>>> {
>>>> .init = xe_check_nomodeset,
>>>> },
>>>> + {
>>>> + .init = xe_configfs_init,
>>>> + .exit = xe_configfs_exit,
>>>> + },
>>>> {
>>>> .init = xe_hw_fence_module_init,
>>>> .exit = xe_hw_fence_module_exit,
>>>> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/
>>>> xe_module.h
>>>> index 84339e509c80..c238dbee6bc7 100644
>>>> --- a/drivers/gpu/drm/xe/xe_module.h
>>>> +++ b/drivers/gpu/drm/xe/xe_module.h
>>>> @@ -24,6 +24,7 @@ struct xe_modparam {
>>>> #endif
>>>> int wedged_mode;
>>>> u32 svm_notifier_size;
>>>> + char *survivability_mode;
>>>
>>> drop this.. We want a config struct in the xe_device. It's actually
>>> allocated by the mkdir in the configs and when we are probing, we should
>>> find the config instace based on pdev and set the pointer xe->configfs
>>> or something like that.
>>
>> Will add a new struct
>>
>> Thanks
>> Riana
>>>
>>> thanks
>>> Lucas De Marchi
>>>
>>>> };
>>>>
>>>> extern struct xe_modparam xe_modparam;
>>>> --
>>>> 2.47.1
>>>>
>>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-10 14:23 ` Riana Tauro
@ 2025-03-10 16:40 ` Rodrigo Vivi
2025-03-10 17:01 ` Lucas De Marchi
0 siblings, 1 reply; 23+ messages in thread
From: Rodrigo Vivi @ 2025-03-10 16:40 UTC (permalink / raw)
To: Riana Tauro; +Cc: Lucas De Marchi, intel-xe, anshuman.gupta, matthew.d.roper
On Mon, Mar 10, 2025 at 07:53:27PM +0530, Riana Tauro wrote:
> Hi Lucas
>
> On 3/10/2025 7:01 PM, Lucas De Marchi wrote:
> > On Mon, Mar 10, 2025 at 11:01:50AM +0530, Riana Tauro wrote:
> > > Hi Lucas
> > >
> > > On 3/7/2025 8:46 PM, Lucas De Marchi wrote:
> > > > On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
> > > > > Registers a configfs subsystem called 'xe' to userspace. The user can
> > > > > use this to modify exposed attributes.
> > > > >
> > > > > Add survivability mode attribute (config/xe/survivability_mode) to the
> > > > > subsystem to allow the user to specify the card that should enter
> > > > > survivability mode.
> > > > >
> > > > > Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> > > > > ---
> > > > > drivers/gpu/drm/xe/Makefile | 1 +
> > > > > drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
> > > > > drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
> > > > > drivers/gpu/drm/xe/xe_module.c | 5 ++
> > > > > drivers/gpu/drm/xe/xe_module.h | 1 +
> > > > > 5 files changed, 114 insertions(+)
> > > > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
> > > > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
> > > > >
> > > > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> > > > > index 9699b08585f7..3f8c87292cee 100644
> > > > > --- a/drivers/gpu/drm/xe/Makefile
> > > > > +++ b/drivers/gpu/drm/xe/Makefile
> > > > > @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c
> > > > > $(obj)/generated/ %_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > > > > xe-y += xe_bb.o \
> > > > > xe_bo.o \
> > > > > xe_bo_evict.o \
> > > > > + xe_configfs.o \
> > > > > xe_devcoredump.o \
> > > > > xe_device.o \
> > > > > xe_device_sysfs.o \
> > > > > diff --git a/drivers/gpu/drm/xe/xe_configfs.c
> > > > > b/drivers/gpu/drm/xe/ xe_configfs.c
> > > > > new file mode 100644
> > > > > index 000000000000..8c5f248e466d
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/xe/xe_configfs.c
> > > > > @@ -0,0 +1,95 @@
> > > > > +// SPDX-License-Identifier: MIT
> > > > > +/*
> > > > > + * Copyright © 2025 Intel Corporation
> > > > > + */
> > > > > +
> > > > > +#include <linux/configfs.h>
> > > > > +#include <linux/init.h>
> > > > > +#include <linux/module.h>
> > > > > +
> > > > > +#include "xe_configfs.h"
> > > > > +#include "xe_module.h"
> > > > > +
> > > > > +/**
> > > > > + * DOC: Xe Configfs
> > > > > + *
> > > > > + * XE KMD registers a configfs subsystem called 'xe'to
> > > > > userspace that allows users to modify
> > > > > + * the exposed attributes.
> > > > > + *
> > > > > + * Attributes:
> > > > > + *
> > > > > + * config/xe/survivability_mode : Write only attribute that
> > > > > allows user to specify the PCI address
> > > > > + * of the card that has to enter survivability mode
> > > >
> > > > I think the desired interface is actually that the user mkdir the bdf in
> > > > <configfs>/xe/. This populates the available config entries that
> > > > the user
> > > > writes to.
> > >
> > > Initial thought was mkdir bdf, but since it was a single attribute
> > > and after a offline discussion with Rodrigo, did a simpler version
> > > to get comments on the RFC patch and if configfs is okay to use
> > > for survivability mode
> >
> > I disagree on the conclusion as then you are moving away from how
> > configfs is commonly used and also making it harder to add new
> > attributes in a uniform way.
My bad, I'm sorry.
I might have created the confusion here.
> >
> > >
> > > For survivability mode, below procedure needs to be followed
> > >
> > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
> > > add bdf for survivability mode
> > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
> > >
> > > After the device is removed directory has to be created, the bdf has
> > > to be checked if it belongs to xe driver and then attrs populated.
> > > Even i think mkdir is better in case other attrs have to be added in
> > > future, but for unbind case the check of the driver might be tricky
> > > .
> >
> > I think you are talking about the mkdir in the wrong place. It isn't
> > related to unbind at all. The sequence you mentioned is just because we
> > lazy and let pci auto probe the driver. Under the hood you are doing:
> >
> > 1) load the module
> > 2) bind the driver
> > 3) unbind the driver
> > 4) bind the driver in survivability mode
> >
> > The sequence we should actually have is:
> >
> > 1) load the module
> > 2) bind the driver in survivability mode
No, this doesn't work for the use case we want it.
The goal is to be able to only move one single device to the survivability
mode at runtime without disrupting the work that is going on on other devices.
So we cannot unload the module and remove the autoprobe.
>
> This is what i meant.
>
> 1) load the module
> 2) bind the driver
> 3) unbind the driver
>
> 4) mkdir /sys/kernel/config/xe/0000:03:00.0
> echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
> 5) bind the driver in survivability mode
>
>
> At step 4, when creating directory. The bdf needs to be validated, ie (it
> should be pci_dev that xe will probe). i checked the struct pci_dev but
> didn't find anything.
> Will have to match the device id against supported pci device ids.
I don't believe we need to run any validation. If admin creates a file/directory
with invalid address it will be just ignored, no?!
On the probe we only get the directory, file for the pci device we are probing.
>
> Otherwise, there might be wrong non functional bdf directories present based
> on what user provides.
>
> Thanks
> Riana
> >
> > You shouldn't create any directory when unbinding. It's the user who
> > should create it. When **binding** the driver we should check what are
> > the extra configuration available and adapt the probe according to that.
> >
> > For that we have to disable pci driver autoprobe and tweak the
> > configfs settings:
> >
> > 1) echo 0 > /sys/bus/pci/drivers_autoprobe
> > modprobe xe
> >
> > 2) mkdir /sys/kernel/config/xe/0000:03:00.0
> > echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
> > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
> >
> >
> > Lucas De Marchi
> >
> > >
> > > The attr is WO and any value can be written, only if the correct bdf
> > > is added in the attr then it'll be used on probe to enter
> > > survivability mode.The current patch only checks the format and does
> > > not check if bdf belongs to xe.
> > > >
> > > > > + */
> > > > > +
> > > > > +void xe_configfs_clear_survivability_mode(void)
> > > > > +{
> > > > > + kfree(xe_modparam.survivability_mode);
> > > > > + xe_modparam.survivability_mode = NULL;
> > > > > +}
> > > > > +
> > > > > +static ssize_t survivability_mode_store(struct config_item
> > > > > *item, const char *page, size_t len)
> > > >
> > > > once you handle the mkdir mentioned above, this should probably be
> > > > a boolean attr like this:
> > > >
> > > > drivers/nvme/target/configfs.c:CONFIGFS_ATTR(nvmet_,
> > > > param_pi_enable);
> > > >
> > > > > +{
> > > > > + char *survivability_mode;
> > > > > + int ret;
> > > > > + unsigned int domain, bus, slot, function;
> > > > > +
> > > > > + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus,
> > > > > &slot, &function);
> > > > > + if (ret != 4)
> > > > > + return -EINVAL;
> > > > > +
> > > > > + survivability_mode = kstrdup(page, GFP_KERNEL);
> > > > > + if (!survivability_mode)
> > > > > + return -ENOMEM;
> > > > > +
> > > > > + xe_configfs_clear_survivability_mode();
> > > > > + xe_modparam.survivability_mode = survivability_mode;
> > > > > +
> > > > > + return len;
> > > > > +}
> > > > > +
> > > > > +CONFIGFS_ATTR_WO(, survivability_mode);
> > > > > +
> > > > > +static struct configfs_attribute *xe_configfs_attrs[] = {
> > > > > + &attr_survivability_mode,
> > > > > + NULL,
> > > > > +};
> > > > > +
> > > > > +static const struct config_item_type xe_config_type = {
> > > > > + .ct_attrs = xe_configfs_attrs,
> > > > > + .ct_owner = THIS_MODULE,
> > > > > +};
> > > > > +
> > > > > +static struct configfs_subsystem xe_config_subsys = {
> > > > > + .su_group = {
> > > > > + .cg_item = {
> > > > > + .ci_namebuf = "xe",
> > > > > + .ci_type = &xe_config_type,
> > > > > + },
> > > > > + },
> > > > > +};
> > > > >
> > > >
> > > > so... it seems you are missing a configfs_group_operations.make_group.
> > > >
> > > > > +int __init xe_configfs_init(void)
> > > > > +{
> > > > > + int ret;
> > > > > +
> > > > > + config_group_init(&xe_config_subsys.su_group);
> > > > > + mutex_init(&xe_config_subsys.su_mutex);
> > > >
> > > > this mutex_init seems odd, but inline with other drivers
> > > yeah it is not used anywhere but the sample and other drivers also
> > > initialize it
> > > >
> > > > > + ret = configfs_register_subsystem(&xe_config_subsys);
> > > > > + if (ret) {
> > > > > + pr_err("Error %d while registering subsystem %s\n",
> > > > > + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
> > > > > + mutex_destroy(&xe_config_subsys.su_mutex);
> > > > > + return ret;
> > > > > + }
> > > > > +
> > > > > + return 0;
> > > > > +}
> > > > > +
> > > > > +void __exit xe_configfs_exit(void)
> > > > > +{
> > > > > + xe_configfs_clear_survivability_mode();
> > > > > + configfs_unregister_subsystem(&xe_config_subsys);
> > > > > + mutex_destroy(&xe_config_subsys.su_mutex);
> > > > > +}
> > > > > +
> > > > > diff --git a/drivers/gpu/drm/xe/xe_configfs.h
> > > > > b/drivers/gpu/drm/xe/ xe_configfs.h
> > > > > new file mode 100644
> > > > > index 000000000000..491629a2ca53
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/xe/xe_configfs.h
> > > > > @@ -0,0 +1,12 @@
> > > > > +/* SPDX-License-Identifier: MIT */
> > > > > +/*
> > > > > + * Copyright © 2025 Intel Corporation
> > > > > + */
> > > > > +#ifndef _XE_CONFIGFS_H_
> > > > > +#define _XE_CONFIGFS_H_
> > > > > +
> > > > > +int xe_configfs_init(void);
> > > > > +void xe_configfs_exit(void);
> > > > > +void xe_configfs_clear_survivability_mode(void);
> > > > > +
> > > > > +#endif
> > > > > diff --git a/drivers/gpu/drm/xe/xe_module.c
> > > > > b/drivers/gpu/drm/xe/ xe_module.c
> > > > > index 475acdba2b55..15b3cf22193c 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_module.c
> > > > > +++ b/drivers/gpu/drm/xe/xe_module.c
> > > > > @@ -11,6 +11,7 @@
> > > > > #include <drm/drm_module.h>
> > > > >
> > > > > #include "xe_drv.h"
> > > > > +#include "xe_configfs.h"
> > > > > #include "xe_hw_fence.h"
> > > > > #include "xe_pci.h"
> > > > > #include "xe_pm.h"
> > > > > @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
> > > > > {
> > > > > .init = xe_check_nomodeset,
> > > > > },
> > > > > + {
> > > > > + .init = xe_configfs_init,
> > > > > + .exit = xe_configfs_exit,
> > > > > + },
> > > > > {
> > > > > .init = xe_hw_fence_module_init,
> > > > > .exit = xe_hw_fence_module_exit,
> > > > > diff --git a/drivers/gpu/drm/xe/xe_module.h
> > > > > b/drivers/gpu/drm/xe/ xe_module.h
> > > > > index 84339e509c80..c238dbee6bc7 100644
> > > > > --- a/drivers/gpu/drm/xe/xe_module.h
> > > > > +++ b/drivers/gpu/drm/xe/xe_module.h
> > > > > @@ -24,6 +24,7 @@ struct xe_modparam {
> > > > > #endif
> > > > > int wedged_mode;
> > > > > u32 svm_notifier_size;
> > > > > + char *survivability_mode;
> > > >
> > > > drop this.. We want a config struct in the xe_device. It's actually
> > > > allocated by the mkdir in the configs and when we are probing, we should
> > > > find the config instace based on pdev and set the pointer xe->configfs
> > > > or something like that.
> > >
> > > Will add a new struct
> > >
> > > Thanks
> > > Riana
> > > >
> > > > thanks
> > > > Lucas De Marchi
> > > >
> > > > > };
> > > > >
> > > > > extern struct xe_modparam xe_modparam;
> > > > > --
> > > > > 2.47.1
> > > > >
> > >
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-10 16:40 ` Rodrigo Vivi
@ 2025-03-10 17:01 ` Lucas De Marchi
2025-03-10 17:14 ` Rodrigo Vivi
0 siblings, 1 reply; 23+ messages in thread
From: Lucas De Marchi @ 2025-03-10 17:01 UTC (permalink / raw)
To: Rodrigo Vivi; +Cc: Riana Tauro, intel-xe, anshuman.gupta, matthew.d.roper
On Mon, Mar 10, 2025 at 12:40:39PM -0400, Rodrigo Vivi wrote:
>On Mon, Mar 10, 2025 at 07:53:27PM +0530, Riana Tauro wrote:
>> Hi Lucas
>>
>> On 3/10/2025 7:01 PM, Lucas De Marchi wrote:
>> > On Mon, Mar 10, 2025 at 11:01:50AM +0530, Riana Tauro wrote:
>> > > Hi Lucas
>> > >
>> > > On 3/7/2025 8:46 PM, Lucas De Marchi wrote:
>> > > > On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
>> > > > > Registers a configfs subsystem called 'xe' to userspace. The user can
>> > > > > use this to modify exposed attributes.
>> > > > >
>> > > > > Add survivability mode attribute (config/xe/survivability_mode) to the
>> > > > > subsystem to allow the user to specify the card that should enter
>> > > > > survivability mode.
>> > > > >
>> > > > > Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> > > > > ---
>> > > > > drivers/gpu/drm/xe/Makefile | 1 +
>> > > > > drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
>> > > > > drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
>> > > > > drivers/gpu/drm/xe/xe_module.c | 5 ++
>> > > > > drivers/gpu/drm/xe/xe_module.h | 1 +
>> > > > > 5 files changed, 114 insertions(+)
>> > > > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
>> > > > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>> > > > >
>> > > > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> > > > > index 9699b08585f7..3f8c87292cee 100644
>> > > > > --- a/drivers/gpu/drm/xe/Makefile
>> > > > > +++ b/drivers/gpu/drm/xe/Makefile
>> > > > > @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c
>> > > > > $(obj)/generated/ %_wa_oob.h: $(obj)/xe_gen_wa_oob \
>> > > > > xe-y += xe_bb.o \
>> > > > > xe_bo.o \
>> > > > > xe_bo_evict.o \
>> > > > > + xe_configfs.o \
>> > > > > xe_devcoredump.o \
>> > > > > xe_device.o \
>> > > > > xe_device_sysfs.o \
>> > > > > diff --git a/drivers/gpu/drm/xe/xe_configfs.c
>> > > > > b/drivers/gpu/drm/xe/ xe_configfs.c
>> > > > > new file mode 100644
>> > > > > index 000000000000..8c5f248e466d
>> > > > > --- /dev/null
>> > > > > +++ b/drivers/gpu/drm/xe/xe_configfs.c
>> > > > > @@ -0,0 +1,95 @@
>> > > > > +// SPDX-License-Identifier: MIT
>> > > > > +/*
>> > > > > + * Copyright © 2025 Intel Corporation
>> > > > > + */
>> > > > > +
>> > > > > +#include <linux/configfs.h>
>> > > > > +#include <linux/init.h>
>> > > > > +#include <linux/module.h>
>> > > > > +
>> > > > > +#include "xe_configfs.h"
>> > > > > +#include "xe_module.h"
>> > > > > +
>> > > > > +/**
>> > > > > + * DOC: Xe Configfs
>> > > > > + *
>> > > > > + * XE KMD registers a configfs subsystem called 'xe'to
>> > > > > userspace that allows users to modify
>> > > > > + * the exposed attributes.
>> > > > > + *
>> > > > > + * Attributes:
>> > > > > + *
>> > > > > + * config/xe/survivability_mode : Write only attribute that
>> > > > > allows user to specify the PCI address
>> > > > > + * of the card that has to enter survivability mode
>> > > >
>> > > > I think the desired interface is actually that the user mkdir the bdf in
>> > > > <configfs>/xe/. This populates the available config entries that
>> > > > the user
>> > > > writes to.
>> > >
>> > > Initial thought was mkdir bdf, but since it was a single attribute
>> > > and after a offline discussion with Rodrigo, did a simpler version
>> > > to get comments on the RFC patch and if configfs is okay to use
>> > > for survivability mode
>> >
>> > I disagree on the conclusion as then you are moving away from how
>> > configfs is commonly used and also making it harder to add new
>> > attributes in a uniform way.
>
>My bad, I'm sorry.
>I might have created the confusion here.
>
>> >
>> > >
>> > > For survivability mode, below procedure needs to be followed
>> > >
>> > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
>> > > add bdf for survivability mode
>> > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
>> > >
>> > > After the device is removed directory has to be created, the bdf has
>> > > to be checked if it belongs to xe driver and then attrs populated.
>> > > Even i think mkdir is better in case other attrs have to be added in
>> > > future, but for unbind case the check of the driver might be tricky
>> > > .
>> >
>> > I think you are talking about the mkdir in the wrong place. It isn't
>> > related to unbind at all. The sequence you mentioned is just because we
>> > lazy and let pci auto probe the driver. Under the hood you are doing:
>> >
>> > 1) load the module
>> > 2) bind the driver
>> > 3) unbind the driver
>> > 4) bind the driver in survivability mode
>> >
>> > The sequence we should actually have is:
>> >
>> > 1) load the module
>> > 2) bind the driver in survivability mode
>
>No, this doesn't work for the use case we want it.
>
>The goal is to be able to only move one single device to the survivability
>mode at runtime without disrupting the work that is going on on other devices.
>So we cannot unload the module and remove the autoprobe.
where are you seeing the need to remove the module in the above
sequence?
See that step 1 is about loading the module. *If* the module
was already loaded, then unbinding it would make sense, but it's not a
required step.
Lucas De Marchi
>
>>
>> This is what i meant.
>>
>> 1) load the module
>> 2) bind the driver
>> 3) unbind the driver
>>
>> 4) mkdir /sys/kernel/config/xe/0000:03:00.0
>> echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
>> 5) bind the driver in survivability mode
>>
>>
>> At step 4, when creating directory. The bdf needs to be validated, ie (it
>> should be pci_dev that xe will probe). i checked the struct pci_dev but
>> didn't find anything.
>> Will have to match the device id against supported pci device ids.
>
>I don't believe we need to run any validation. If admin creates a file/directory
>with invalid address it will be just ignored, no?!
>
>On the probe we only get the directory, file for the pci device we are probing.
>
>>
>> Otherwise, there might be wrong non functional bdf directories present based
>> on what user provides.
>>
>> Thanks
>> Riana
>> >
>> > You shouldn't create any directory when unbinding. It's the user who
>> > should create it. When **binding** the driver we should check what are
>> > the extra configuration available and adapt the probe according to that.
>> >
>> > For that we have to disable pci driver autoprobe and tweak the
>> > configfs settings:
>> >
>> > 1) echo 0 > /sys/bus/pci/drivers_autoprobe
>> > modprobe xe
>> >
>> > 2) mkdir /sys/kernel/config/xe/0000:03:00.0
>> > echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
>> > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
>> >
>> >
>> > Lucas De Marchi
>> >
>> > >
>> > > The attr is WO and any value can be written, only if the correct bdf
>> > > is added in the attr then it'll be used on probe to enter
>> > > survivability mode.The current patch only checks the format and does
>> > > not check if bdf belongs to xe.
>> > > >
>> > > > > + */
>> > > > > +
>> > > > > +void xe_configfs_clear_survivability_mode(void)
>> > > > > +{
>> > > > > + kfree(xe_modparam.survivability_mode);
>> > > > > + xe_modparam.survivability_mode = NULL;
>> > > > > +}
>> > > > > +
>> > > > > +static ssize_t survivability_mode_store(struct config_item
>> > > > > *item, const char *page, size_t len)
>> > > >
>> > > > once you handle the mkdir mentioned above, this should probably be
>> > > > a boolean attr like this:
>> > > >
>> > > > drivers/nvme/target/configfs.c:CONFIGFS_ATTR(nvmet_,
>> > > > param_pi_enable);
>> > > >
>> > > > > +{
>> > > > > + char *survivability_mode;
>> > > > > + int ret;
>> > > > > + unsigned int domain, bus, slot, function;
>> > > > > +
>> > > > > + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus,
>> > > > > &slot, &function);
>> > > > > + if (ret != 4)
>> > > > > + return -EINVAL;
>> > > > > +
>> > > > > + survivability_mode = kstrdup(page, GFP_KERNEL);
>> > > > > + if (!survivability_mode)
>> > > > > + return -ENOMEM;
>> > > > > +
>> > > > > + xe_configfs_clear_survivability_mode();
>> > > > > + xe_modparam.survivability_mode = survivability_mode;
>> > > > > +
>> > > > > + return len;
>> > > > > +}
>> > > > > +
>> > > > > +CONFIGFS_ATTR_WO(, survivability_mode);
>> > > > > +
>> > > > > +static struct configfs_attribute *xe_configfs_attrs[] = {
>> > > > > + &attr_survivability_mode,
>> > > > > + NULL,
>> > > > > +};
>> > > > > +
>> > > > > +static const struct config_item_type xe_config_type = {
>> > > > > + .ct_attrs = xe_configfs_attrs,
>> > > > > + .ct_owner = THIS_MODULE,
>> > > > > +};
>> > > > > +
>> > > > > +static struct configfs_subsystem xe_config_subsys = {
>> > > > > + .su_group = {
>> > > > > + .cg_item = {
>> > > > > + .ci_namebuf = "xe",
>> > > > > + .ci_type = &xe_config_type,
>> > > > > + },
>> > > > > + },
>> > > > > +};
>> > > > >
>> > > >
>> > > > so... it seems you are missing a configfs_group_operations.make_group.
>> > > >
>> > > > > +int __init xe_configfs_init(void)
>> > > > > +{
>> > > > > + int ret;
>> > > > > +
>> > > > > + config_group_init(&xe_config_subsys.su_group);
>> > > > > + mutex_init(&xe_config_subsys.su_mutex);
>> > > >
>> > > > this mutex_init seems odd, but inline with other drivers
>> > > yeah it is not used anywhere but the sample and other drivers also
>> > > initialize it
>> > > >
>> > > > > + ret = configfs_register_subsystem(&xe_config_subsys);
>> > > > > + if (ret) {
>> > > > > + pr_err("Error %d while registering subsystem %s\n",
>> > > > > + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
>> > > > > + mutex_destroy(&xe_config_subsys.su_mutex);
>> > > > > + return ret;
>> > > > > + }
>> > > > > +
>> > > > > + return 0;
>> > > > > +}
>> > > > > +
>> > > > > +void __exit xe_configfs_exit(void)
>> > > > > +{
>> > > > > + xe_configfs_clear_survivability_mode();
>> > > > > + configfs_unregister_subsystem(&xe_config_subsys);
>> > > > > + mutex_destroy(&xe_config_subsys.su_mutex);
>> > > > > +}
>> > > > > +
>> > > > > diff --git a/drivers/gpu/drm/xe/xe_configfs.h
>> > > > > b/drivers/gpu/drm/xe/ xe_configfs.h
>> > > > > new file mode 100644
>> > > > > index 000000000000..491629a2ca53
>> > > > > --- /dev/null
>> > > > > +++ b/drivers/gpu/drm/xe/xe_configfs.h
>> > > > > @@ -0,0 +1,12 @@
>> > > > > +/* SPDX-License-Identifier: MIT */
>> > > > > +/*
>> > > > > + * Copyright © 2025 Intel Corporation
>> > > > > + */
>> > > > > +#ifndef _XE_CONFIGFS_H_
>> > > > > +#define _XE_CONFIGFS_H_
>> > > > > +
>> > > > > +int xe_configfs_init(void);
>> > > > > +void xe_configfs_exit(void);
>> > > > > +void xe_configfs_clear_survivability_mode(void);
>> > > > > +
>> > > > > +#endif
>> > > > > diff --git a/drivers/gpu/drm/xe/xe_module.c
>> > > > > b/drivers/gpu/drm/xe/ xe_module.c
>> > > > > index 475acdba2b55..15b3cf22193c 100644
>> > > > > --- a/drivers/gpu/drm/xe/xe_module.c
>> > > > > +++ b/drivers/gpu/drm/xe/xe_module.c
>> > > > > @@ -11,6 +11,7 @@
>> > > > > #include <drm/drm_module.h>
>> > > > >
>> > > > > #include "xe_drv.h"
>> > > > > +#include "xe_configfs.h"
>> > > > > #include "xe_hw_fence.h"
>> > > > > #include "xe_pci.h"
>> > > > > #include "xe_pm.h"
>> > > > > @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
>> > > > > {
>> > > > > .init = xe_check_nomodeset,
>> > > > > },
>> > > > > + {
>> > > > > + .init = xe_configfs_init,
>> > > > > + .exit = xe_configfs_exit,
>> > > > > + },
>> > > > > {
>> > > > > .init = xe_hw_fence_module_init,
>> > > > > .exit = xe_hw_fence_module_exit,
>> > > > > diff --git a/drivers/gpu/drm/xe/xe_module.h
>> > > > > b/drivers/gpu/drm/xe/ xe_module.h
>> > > > > index 84339e509c80..c238dbee6bc7 100644
>> > > > > --- a/drivers/gpu/drm/xe/xe_module.h
>> > > > > +++ b/drivers/gpu/drm/xe/xe_module.h
>> > > > > @@ -24,6 +24,7 @@ struct xe_modparam {
>> > > > > #endif
>> > > > > int wedged_mode;
>> > > > > u32 svm_notifier_size;
>> > > > > + char *survivability_mode;
>> > > >
>> > > > drop this.. We want a config struct in the xe_device. It's actually
>> > > > allocated by the mkdir in the configs and when we are probing, we should
>> > > > find the config instace based on pdev and set the pointer xe->configfs
>> > > > or something like that.
>> > >
>> > > Will add a new struct
>> > >
>> > > Thanks
>> > > Riana
>> > > >
>> > > > thanks
>> > > > Lucas De Marchi
>> > > >
>> > > > > };
>> > > > >
>> > > > > extern struct xe_modparam xe_modparam;
>> > > > > --
>> > > > > 2.47.1
>> > > > >
>> > >
>>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-10 17:01 ` Lucas De Marchi
@ 2025-03-10 17:14 ` Rodrigo Vivi
2025-03-10 21:40 ` Matt Roper
0 siblings, 1 reply; 23+ messages in thread
From: Rodrigo Vivi @ 2025-03-10 17:14 UTC (permalink / raw)
To: Lucas De Marchi; +Cc: Riana Tauro, intel-xe, anshuman.gupta, matthew.d.roper
On Mon, Mar 10, 2025 at 12:01:55PM -0500, Lucas De Marchi wrote:
> On Mon, Mar 10, 2025 at 12:40:39PM -0400, Rodrigo Vivi wrote:
> > On Mon, Mar 10, 2025 at 07:53:27PM +0530, Riana Tauro wrote:
> > > Hi Lucas
> > >
> > > On 3/10/2025 7:01 PM, Lucas De Marchi wrote:
> > > > On Mon, Mar 10, 2025 at 11:01:50AM +0530, Riana Tauro wrote:
> > > > > Hi Lucas
> > > > >
> > > > > On 3/7/2025 8:46 PM, Lucas De Marchi wrote:
> > > > > > On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
> > > > > > > Registers a configfs subsystem called 'xe' to userspace. The user can
> > > > > > > use this to modify exposed attributes.
> > > > > > >
> > > > > > > Add survivability mode attribute (config/xe/survivability_mode) to the
> > > > > > > subsystem to allow the user to specify the card that should enter
> > > > > > > survivability mode.
> > > > > > >
> > > > > > > Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> > > > > > > ---
> > > > > > > drivers/gpu/drm/xe/Makefile | 1 +
> > > > > > > drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
> > > > > > > drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
> > > > > > > drivers/gpu/drm/xe/xe_module.c | 5 ++
> > > > > > > drivers/gpu/drm/xe/xe_module.h | 1 +
> > > > > > > 5 files changed, 114 insertions(+)
> > > > > > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
> > > > > > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
> > > > > > >
> > > > > > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> > > > > > > index 9699b08585f7..3f8c87292cee 100644
> > > > > > > --- a/drivers/gpu/drm/xe/Makefile
> > > > > > > +++ b/drivers/gpu/drm/xe/Makefile
> > > > > > > @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c
> > > > > > > $(obj)/generated/ %_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > > > > > > xe-y += xe_bb.o \
> > > > > > > xe_bo.o \
> > > > > > > xe_bo_evict.o \
> > > > > > > + xe_configfs.o \
> > > > > > > xe_devcoredump.o \
> > > > > > > xe_device.o \
> > > > > > > xe_device_sysfs.o \
> > > > > > > diff --git a/drivers/gpu/drm/xe/xe_configfs.c
> > > > > > > b/drivers/gpu/drm/xe/ xe_configfs.c
> > > > > > > new file mode 100644
> > > > > > > index 000000000000..8c5f248e466d
> > > > > > > --- /dev/null
> > > > > > > +++ b/drivers/gpu/drm/xe/xe_configfs.c
> > > > > > > @@ -0,0 +1,95 @@
> > > > > > > +// SPDX-License-Identifier: MIT
> > > > > > > +/*
> > > > > > > + * Copyright © 2025 Intel Corporation
> > > > > > > + */
> > > > > > > +
> > > > > > > +#include <linux/configfs.h>
> > > > > > > +#include <linux/init.h>
> > > > > > > +#include <linux/module.h>
> > > > > > > +
> > > > > > > +#include "xe_configfs.h"
> > > > > > > +#include "xe_module.h"
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * DOC: Xe Configfs
> > > > > > > + *
> > > > > > > + * XE KMD registers a configfs subsystem called 'xe'to
> > > > > > > userspace that allows users to modify
> > > > > > > + * the exposed attributes.
> > > > > > > + *
> > > > > > > + * Attributes:
> > > > > > > + *
> > > > > > > + * config/xe/survivability_mode : Write only attribute that
> > > > > > > allows user to specify the PCI address
> > > > > > > + * of the card that has to enter survivability mode
> > > > > >
> > > > > > I think the desired interface is actually that the user mkdir the bdf in
> > > > > > <configfs>/xe/. This populates the available config entries that
> > > > > > the user
> > > > > > writes to.
> > > > >
> > > > > Initial thought was mkdir bdf, but since it was a single attribute
> > > > > and after a offline discussion with Rodrigo, did a simpler version
> > > > > to get comments on the RFC patch and if configfs is okay to use
> > > > > for survivability mode
> > > >
> > > > I disagree on the conclusion as then you are moving away from how
> > > > configfs is commonly used and also making it harder to add new
> > > > attributes in a uniform way.
> >
> > My bad, I'm sorry.
> > I might have created the confusion here.
> >
> > > >
> > > > >
> > > > > For survivability mode, below procedure needs to be followed
> > > > >
> > > > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
> > > > > add bdf for survivability mode
> > > > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
> > > > >
> > > > > After the device is removed directory has to be created, the bdf has
> > > > > to be checked if it belongs to xe driver and then attrs populated.
> > > > > Even i think mkdir is better in case other attrs have to be added in
> > > > > future, but for unbind case the check of the driver might be tricky
> > > > > .
> > > >
> > > > I think you are talking about the mkdir in the wrong place. It isn't
> > > > related to unbind at all. The sequence you mentioned is just because we
> > > > lazy and let pci auto probe the driver. Under the hood you are doing:
> > > >
> > > > 1) load the module
> > > > 2) bind the driver
> > > > 3) unbind the driver
> > > > 4) bind the driver in survivability mode
> > > >
> > > > The sequence we should actually have is:
> > > >
> > > > 1) load the module
> > > > 2) bind the driver in survivability mode
> >
> > No, this doesn't work for the use case we want it.
> >
> > The goal is to be able to only move one single device to the survivability
> > mode at runtime without disrupting the work that is going on on other devices.
> > So we cannot unload the module and remove the autoprobe.
>
> where are you seeing the need to remove the module in the above
> sequence?
>
> See that step 1 is about loading the module. *If* the module
> was already loaded, then unbinding it would make sense, but it's not a
> required step.
Ah okay. Yeap, in the use case that we have the module is already loaded
and we don't want to disrupt other running GPUs. So let's considered
that module is already loaded at boot and we need to unbind. So we are
in the same page and avoid some confusion here.
so,
1. unbind specific device
2. configfs (create dirs/files needed with the content of the desired pci bus address.)
3. bind specific device (during the bind we check the config against the device it is
getting probed and move to survivability mode. ignore otherwise and proceed with the probe)
>
> Lucas De Marchi
>
> >
> > >
> > > This is what i meant.
> > >
> > > 1) load the module
> > > 2) bind the driver
> > > 3) unbind the driver
> > >
> > > 4) mkdir /sys/kernel/config/xe/0000:03:00.0
> > > echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
> > > 5) bind the driver in survivability mode
> > >
> > >
> > > At step 4, when creating directory. The bdf needs to be validated, ie (it
> > > should be pci_dev that xe will probe). i checked the struct pci_dev but
> > > didn't find anything.
> > > Will have to match the device id against supported pci device ids.
> >
> > I don't believe we need to run any validation. If admin creates a file/directory
> > with invalid address it will be just ignored, no?!
> >
> > On the probe we only get the directory, file for the pci device we are probing.
> >
> > >
> > > Otherwise, there might be wrong non functional bdf directories present based
> > > on what user provides.
> > >
> > > Thanks
> > > Riana
> > > >
> > > > You shouldn't create any directory when unbinding. It's the user who
> > > > should create it. When **binding** the driver we should check what are
> > > > the extra configuration available and adapt the probe according to that.
> > > >
> > > > For that we have to disable pci driver autoprobe and tweak the
> > > > configfs settings:
> > > >
> > > > 1) echo 0 > /sys/bus/pci/drivers_autoprobe
> > > > modprobe xe
> > > >
> > > > 2) mkdir /sys/kernel/config/xe/0000:03:00.0
> > > > echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
> > > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
> > > >
> > > >
> > > > Lucas De Marchi
> > > >
> > > > >
> > > > > The attr is WO and any value can be written, only if the correct bdf
> > > > > is added in the attr then it'll be used on probe to enter
> > > > > survivability mode.The current patch only checks the format and does
> > > > > not check if bdf belongs to xe.
> > > > > >
> > > > > > > + */
> > > > > > > +
> > > > > > > +void xe_configfs_clear_survivability_mode(void)
> > > > > > > +{
> > > > > > > + kfree(xe_modparam.survivability_mode);
> > > > > > > + xe_modparam.survivability_mode = NULL;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static ssize_t survivability_mode_store(struct config_item
> > > > > > > *item, const char *page, size_t len)
> > > > > >
> > > > > > once you handle the mkdir mentioned above, this should probably be
> > > > > > a boolean attr like this:
> > > > > >
> > > > > > drivers/nvme/target/configfs.c:CONFIGFS_ATTR(nvmet_,
> > > > > > param_pi_enable);
> > > > > >
> > > > > > > +{
> > > > > > > + char *survivability_mode;
> > > > > > > + int ret;
> > > > > > > + unsigned int domain, bus, slot, function;
> > > > > > > +
> > > > > > > + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus,
> > > > > > > &slot, &function);
> > > > > > > + if (ret != 4)
> > > > > > > + return -EINVAL;
> > > > > > > +
> > > > > > > + survivability_mode = kstrdup(page, GFP_KERNEL);
> > > > > > > + if (!survivability_mode)
> > > > > > > + return -ENOMEM;
> > > > > > > +
> > > > > > > + xe_configfs_clear_survivability_mode();
> > > > > > > + xe_modparam.survivability_mode = survivability_mode;
> > > > > > > +
> > > > > > > + return len;
> > > > > > > +}
> > > > > > > +
> > > > > > > +CONFIGFS_ATTR_WO(, survivability_mode);
> > > > > > > +
> > > > > > > +static struct configfs_attribute *xe_configfs_attrs[] = {
> > > > > > > + &attr_survivability_mode,
> > > > > > > + NULL,
> > > > > > > +};
> > > > > > > +
> > > > > > > +static const struct config_item_type xe_config_type = {
> > > > > > > + .ct_attrs = xe_configfs_attrs,
> > > > > > > + .ct_owner = THIS_MODULE,
> > > > > > > +};
> > > > > > > +
> > > > > > > +static struct configfs_subsystem xe_config_subsys = {
> > > > > > > + .su_group = {
> > > > > > > + .cg_item = {
> > > > > > > + .ci_namebuf = "xe",
> > > > > > > + .ci_type = &xe_config_type,
> > > > > > > + },
> > > > > > > + },
> > > > > > > +};
> > > > > > >
> > > > > >
> > > > > > so... it seems you are missing a configfs_group_operations.make_group.
> > > > > >
> > > > > > > +int __init xe_configfs_init(void)
> > > > > > > +{
> > > > > > > + int ret;
> > > > > > > +
> > > > > > > + config_group_init(&xe_config_subsys.su_group);
> > > > > > > + mutex_init(&xe_config_subsys.su_mutex);
> > > > > >
> > > > > > this mutex_init seems odd, but inline with other drivers
> > > > > yeah it is not used anywhere but the sample and other drivers also
> > > > > initialize it
> > > > > >
> > > > > > > + ret = configfs_register_subsystem(&xe_config_subsys);
> > > > > > > + if (ret) {
> > > > > > > + pr_err("Error %d while registering subsystem %s\n",
> > > > > > > + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
> > > > > > > + mutex_destroy(&xe_config_subsys.su_mutex);
> > > > > > > + return ret;
> > > > > > > + }
> > > > > > > +
> > > > > > > + return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void __exit xe_configfs_exit(void)
> > > > > > > +{
> > > > > > > + xe_configfs_clear_survivability_mode();
> > > > > > > + configfs_unregister_subsystem(&xe_config_subsys);
> > > > > > > + mutex_destroy(&xe_config_subsys.su_mutex);
> > > > > > > +}
> > > > > > > +
> > > > > > > diff --git a/drivers/gpu/drm/xe/xe_configfs.h
> > > > > > > b/drivers/gpu/drm/xe/ xe_configfs.h
> > > > > > > new file mode 100644
> > > > > > > index 000000000000..491629a2ca53
> > > > > > > --- /dev/null
> > > > > > > +++ b/drivers/gpu/drm/xe/xe_configfs.h
> > > > > > > @@ -0,0 +1,12 @@
> > > > > > > +/* SPDX-License-Identifier: MIT */
> > > > > > > +/*
> > > > > > > + * Copyright © 2025 Intel Corporation
> > > > > > > + */
> > > > > > > +#ifndef _XE_CONFIGFS_H_
> > > > > > > +#define _XE_CONFIGFS_H_
> > > > > > > +
> > > > > > > +int xe_configfs_init(void);
> > > > > > > +void xe_configfs_exit(void);
> > > > > > > +void xe_configfs_clear_survivability_mode(void);
> > > > > > > +
> > > > > > > +#endif
> > > > > > > diff --git a/drivers/gpu/drm/xe/xe_module.c
> > > > > > > b/drivers/gpu/drm/xe/ xe_module.c
> > > > > > > index 475acdba2b55..15b3cf22193c 100644
> > > > > > > --- a/drivers/gpu/drm/xe/xe_module.c
> > > > > > > +++ b/drivers/gpu/drm/xe/xe_module.c
> > > > > > > @@ -11,6 +11,7 @@
> > > > > > > #include <drm/drm_module.h>
> > > > > > >
> > > > > > > #include "xe_drv.h"
> > > > > > > +#include "xe_configfs.h"
> > > > > > > #include "xe_hw_fence.h"
> > > > > > > #include "xe_pci.h"
> > > > > > > #include "xe_pm.h"
> > > > > > > @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
> > > > > > > {
> > > > > > > .init = xe_check_nomodeset,
> > > > > > > },
> > > > > > > + {
> > > > > > > + .init = xe_configfs_init,
> > > > > > > + .exit = xe_configfs_exit,
> > > > > > > + },
> > > > > > > {
> > > > > > > .init = xe_hw_fence_module_init,
> > > > > > > .exit = xe_hw_fence_module_exit,
> > > > > > > diff --git a/drivers/gpu/drm/xe/xe_module.h
> > > > > > > b/drivers/gpu/drm/xe/ xe_module.h
> > > > > > > index 84339e509c80..c238dbee6bc7 100644
> > > > > > > --- a/drivers/gpu/drm/xe/xe_module.h
> > > > > > > +++ b/drivers/gpu/drm/xe/xe_module.h
> > > > > > > @@ -24,6 +24,7 @@ struct xe_modparam {
> > > > > > > #endif
> > > > > > > int wedged_mode;
> > > > > > > u32 svm_notifier_size;
> > > > > > > + char *survivability_mode;
> > > > > >
> > > > > > drop this.. We want a config struct in the xe_device. It's actually
> > > > > > allocated by the mkdir in the configs and when we are probing, we should
> > > > > > find the config instace based on pdev and set the pointer xe->configfs
> > > > > > or something like that.
> > > > >
> > > > > Will add a new struct
> > > > >
> > > > > Thanks
> > > > > Riana
> > > > > >
> > > > > > thanks
> > > > > > Lucas De Marchi
> > > > > >
> > > > > > > };
> > > > > > >
> > > > > > > extern struct xe_modparam xe_modparam;
> > > > > > > --
> > > > > > > 2.47.1
> > > > > > >
> > > > >
> > >
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-10 17:14 ` Rodrigo Vivi
@ 2025-03-10 21:40 ` Matt Roper
0 siblings, 0 replies; 23+ messages in thread
From: Matt Roper @ 2025-03-10 21:40 UTC (permalink / raw)
To: Rodrigo Vivi; +Cc: Lucas De Marchi, Riana Tauro, intel-xe, anshuman.gupta
On Mon, Mar 10, 2025 at 01:14:02PM -0400, Rodrigo Vivi wrote:
> On Mon, Mar 10, 2025 at 12:01:55PM -0500, Lucas De Marchi wrote:
> > On Mon, Mar 10, 2025 at 12:40:39PM -0400, Rodrigo Vivi wrote:
> > > On Mon, Mar 10, 2025 at 07:53:27PM +0530, Riana Tauro wrote:
> > > > Hi Lucas
> > > >
> > > > On 3/10/2025 7:01 PM, Lucas De Marchi wrote:
> > > > > On Mon, Mar 10, 2025 at 11:01:50AM +0530, Riana Tauro wrote:
> > > > > > Hi Lucas
> > > > > >
> > > > > > On 3/7/2025 8:46 PM, Lucas De Marchi wrote:
> > > > > > > On Fri, Mar 07, 2025 at 07:54:44PM +0530, Riana Tauro wrote:
> > > > > > > > Registers a configfs subsystem called 'xe' to userspace. The user can
> > > > > > > > use this to modify exposed attributes.
> > > > > > > >
> > > > > > > > Add survivability mode attribute (config/xe/survivability_mode) to the
> > > > > > > > subsystem to allow the user to specify the card that should enter
> > > > > > > > survivability mode.
> > > > > > > >
> > > > > > > > Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> > > > > > > > ---
> > > > > > > > drivers/gpu/drm/xe/Makefile | 1 +
> > > > > > > > drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
> > > > > > > > drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
> > > > > > > > drivers/gpu/drm/xe/xe_module.c | 5 ++
> > > > > > > > drivers/gpu/drm/xe/xe_module.h | 1 +
> > > > > > > > 5 files changed, 114 insertions(+)
> > > > > > > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
> > > > > > > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
> > > > > > > >
> > > > > > > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> > > > > > > > index 9699b08585f7..3f8c87292cee 100644
> > > > > > > > --- a/drivers/gpu/drm/xe/Makefile
> > > > > > > > +++ b/drivers/gpu/drm/xe/Makefile
> > > > > > > > @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c
> > > > > > > > $(obj)/generated/ %_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > > > > > > > xe-y += xe_bb.o \
> > > > > > > > xe_bo.o \
> > > > > > > > xe_bo_evict.o \
> > > > > > > > + xe_configfs.o \
> > > > > > > > xe_devcoredump.o \
> > > > > > > > xe_device.o \
> > > > > > > > xe_device_sysfs.o \
> > > > > > > > diff --git a/drivers/gpu/drm/xe/xe_configfs.c
> > > > > > > > b/drivers/gpu/drm/xe/ xe_configfs.c
> > > > > > > > new file mode 100644
> > > > > > > > index 000000000000..8c5f248e466d
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/drivers/gpu/drm/xe/xe_configfs.c
> > > > > > > > @@ -0,0 +1,95 @@
> > > > > > > > +// SPDX-License-Identifier: MIT
> > > > > > > > +/*
> > > > > > > > + * Copyright © 2025 Intel Corporation
> > > > > > > > + */
> > > > > > > > +
> > > > > > > > +#include <linux/configfs.h>
> > > > > > > > +#include <linux/init.h>
> > > > > > > > +#include <linux/module.h>
> > > > > > > > +
> > > > > > > > +#include "xe_configfs.h"
> > > > > > > > +#include "xe_module.h"
> > > > > > > > +
> > > > > > > > +/**
> > > > > > > > + * DOC: Xe Configfs
> > > > > > > > + *
> > > > > > > > + * XE KMD registers a configfs subsystem called 'xe'to
> > > > > > > > userspace that allows users to modify
> > > > > > > > + * the exposed attributes.
> > > > > > > > + *
> > > > > > > > + * Attributes:
> > > > > > > > + *
> > > > > > > > + * config/xe/survivability_mode : Write only attribute that
> > > > > > > > allows user to specify the PCI address
> > > > > > > > + * of the card that has to enter survivability mode
> > > > > > >
> > > > > > > I think the desired interface is actually that the user mkdir the bdf in
> > > > > > > <configfs>/xe/. This populates the available config entries that
> > > > > > > the user
> > > > > > > writes to.
> > > > > >
> > > > > > Initial thought was mkdir bdf, but since it was a single attribute
> > > > > > and after a offline discussion with Rodrigo, did a simpler version
> > > > > > to get comments on the RFC patch and if configfs is okay to use
> > > > > > for survivability mode
> > > > >
> > > > > I disagree on the conclusion as then you are moving away from how
> > > > > configfs is commonly used and also making it harder to add new
> > > > > attributes in a uniform way.
> > >
> > > My bad, I'm sorry.
> > > I might have created the confusion here.
> > >
> > > > >
> > > > > >
> > > > > > For survivability mode, below procedure needs to be followed
> > > > > >
> > > > > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/unbind
> > > > > > add bdf for survivability mode
> > > > > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
> > > > > >
> > > > > > After the device is removed directory has to be created, the bdf has
> > > > > > to be checked if it belongs to xe driver and then attrs populated.
> > > > > > Even i think mkdir is better in case other attrs have to be added in
> > > > > > future, but for unbind case the check of the driver might be tricky
> > > > > > .
> > > > >
> > > > > I think you are talking about the mkdir in the wrong place. It isn't
> > > > > related to unbind at all. The sequence you mentioned is just because we
> > > > > lazy and let pci auto probe the driver. Under the hood you are doing:
> > > > >
> > > > > 1) load the module
> > > > > 2) bind the driver
> > > > > 3) unbind the driver
> > > > > 4) bind the driver in survivability mode
> > > > >
> > > > > The sequence we should actually have is:
> > > > >
> > > > > 1) load the module
> > > > > 2) bind the driver in survivability mode
> > >
> > > No, this doesn't work for the use case we want it.
> > >
> > > The goal is to be able to only move one single device to the survivability
> > > mode at runtime without disrupting the work that is going on on other devices.
> > > So we cannot unload the module and remove the autoprobe.
> >
> > where are you seeing the need to remove the module in the above
> > sequence?
> >
> > See that step 1 is about loading the module. *If* the module
> > was already loaded, then unbinding it would make sense, but it's not a
> > required step.
>
> Ah okay. Yeap, in the use case that we have the module is already loaded
> and we don't want to disrupt other running GPUs. So let's considered
> that module is already loaded at boot and we need to unbind. So we are
> in the same page and avoid some confusion here.
>
> so,
> 1. unbind specific device
> 2. configfs (create dirs/files needed with the content of the desired pci bus address.)
> 3. bind specific device (during the bind we check the config against the device it is
> getting probed and move to survivability mode. ignore otherwise and proceed with the probe)
>
> >
> > Lucas De Marchi
> >
> > >
> > > >
> > > > This is what i meant.
> > > >
> > > > 1) load the module
> > > > 2) bind the driver
> > > > 3) unbind the driver
> > > >
> > > > 4) mkdir /sys/kernel/config/xe/0000:03:00.0
> > > > echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
> > > > 5) bind the driver in survivability mode
> > > >
> > > >
> > > > At step 4, when creating directory. The bdf needs to be validated, ie (it
> > > > should be pci_dev that xe will probe). i checked the struct pci_dev but
> > > > didn't find anything.
> > > > Will have to match the device id against supported pci device ids.
I don't think we need to validate the BDF here. If the user does a
mkdir with some non-existent BDF, it doesn't really matter or harm
anything (aside from wasting a trivial amount of memory to store the
structure). Each mkdir is basically a notification that "*if* we ever
wind up probing the driver on this BDF, we'll have some extra knobs that
can be configured." If no probe ever happens on that BDF, that's fine
and there's no harm in having setup the config knobs for it.
In theory it would even be possible for devices to be physically
hotplugged and show up later, so doing BDF verification at the time
mkdir is called probably wouldn't even be desirable. I don't know if
it's truly possible to hotplug any of our GPUs like that or not today,
but in general devices are hotpluggable, and we assume the sysadmin
using this configfs knows what devices they do/don't expect to need to
configure.
Matt
> > >
> > > I don't believe we need to run any validation. If admin creates a file/directory
> > > with invalid address it will be just ignored, no?!
> > >
> > > On the probe we only get the directory, file for the pci device we are probing.
> > >
> > > >
> > > > Otherwise, there might be wrong non functional bdf directories present based
> > > > on what user provides.
> > > >
> > > > Thanks
> > > > Riana
> > > > >
> > > > > You shouldn't create any directory when unbinding. It's the user who
> > > > > should create it. When **binding** the driver we should check what are
> > > > > the extra configuration available and adapt the probe according to that.
> > > > >
> > > > > For that we have to disable pci driver autoprobe and tweak the
> > > > > configfs settings:
> > > > >
> > > > > 1) echo 0 > /sys/bus/pci/drivers_autoprobe
> > > > > modprobe xe
> > > > >
> > > > > 2) mkdir /sys/kernel/config/xe/0000:03:00.0
> > > > > echo .... > /sys/kernel/kernel/config/xe/0000:03:00.0/...
> > > > > echo "0000:03:00.0" > /sys/bus/pci/drivers/xe/bind
> > > > >
> > > > >
> > > > > Lucas De Marchi
> > > > >
> > > > > >
> > > > > > The attr is WO and any value can be written, only if the correct bdf
> > > > > > is added in the attr then it'll be used on probe to enter
> > > > > > survivability mode.The current patch only checks the format and does
> > > > > > not check if bdf belongs to xe.
> > > > > > >
> > > > > > > > + */
> > > > > > > > +
> > > > > > > > +void xe_configfs_clear_survivability_mode(void)
> > > > > > > > +{
> > > > > > > > + kfree(xe_modparam.survivability_mode);
> > > > > > > > + xe_modparam.survivability_mode = NULL;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static ssize_t survivability_mode_store(struct config_item
> > > > > > > > *item, const char *page, size_t len)
> > > > > > >
> > > > > > > once you handle the mkdir mentioned above, this should probably be
> > > > > > > a boolean attr like this:
> > > > > > >
> > > > > > > drivers/nvme/target/configfs.c:CONFIGFS_ATTR(nvmet_,
> > > > > > > param_pi_enable);
> > > > > > >
> > > > > > > > +{
> > > > > > > > + char *survivability_mode;
> > > > > > > > + int ret;
> > > > > > > > + unsigned int domain, bus, slot, function;
> > > > > > > > +
> > > > > > > > + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus,
> > > > > > > > &slot, &function);
> > > > > > > > + if (ret != 4)
> > > > > > > > + return -EINVAL;
> > > > > > > > +
> > > > > > > > + survivability_mode = kstrdup(page, GFP_KERNEL);
> > > > > > > > + if (!survivability_mode)
> > > > > > > > + return -ENOMEM;
> > > > > > > > +
> > > > > > > > + xe_configfs_clear_survivability_mode();
> > > > > > > > + xe_modparam.survivability_mode = survivability_mode;
> > > > > > > > +
> > > > > > > > + return len;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +CONFIGFS_ATTR_WO(, survivability_mode);
> > > > > > > > +
> > > > > > > > +static struct configfs_attribute *xe_configfs_attrs[] = {
> > > > > > > > + &attr_survivability_mode,
> > > > > > > > + NULL,
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +static const struct config_item_type xe_config_type = {
> > > > > > > > + .ct_attrs = xe_configfs_attrs,
> > > > > > > > + .ct_owner = THIS_MODULE,
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +static struct configfs_subsystem xe_config_subsys = {
> > > > > > > > + .su_group = {
> > > > > > > > + .cg_item = {
> > > > > > > > + .ci_namebuf = "xe",
> > > > > > > > + .ci_type = &xe_config_type,
> > > > > > > > + },
> > > > > > > > + },
> > > > > > > > +};
> > > > > > > >
> > > > > > >
> > > > > > > so... it seems you are missing a configfs_group_operations.make_group.
> > > > > > >
> > > > > > > > +int __init xe_configfs_init(void)
> > > > > > > > +{
> > > > > > > > + int ret;
> > > > > > > > +
> > > > > > > > + config_group_init(&xe_config_subsys.su_group);
> > > > > > > > + mutex_init(&xe_config_subsys.su_mutex);
> > > > > > >
> > > > > > > this mutex_init seems odd, but inline with other drivers
> > > > > > yeah it is not used anywhere but the sample and other drivers also
> > > > > > initialize it
> > > > > > >
> > > > > > > > + ret = configfs_register_subsystem(&xe_config_subsys);
> > > > > > > > + if (ret) {
> > > > > > > > + pr_err("Error %d while registering subsystem %s\n",
> > > > > > > > + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
> > > > > > > > + mutex_destroy(&xe_config_subsys.su_mutex);
> > > > > > > > + return ret;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + return 0;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +void __exit xe_configfs_exit(void)
> > > > > > > > +{
> > > > > > > > + xe_configfs_clear_survivability_mode();
> > > > > > > > + configfs_unregister_subsystem(&xe_config_subsys);
> > > > > > > > + mutex_destroy(&xe_config_subsys.su_mutex);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > diff --git a/drivers/gpu/drm/xe/xe_configfs.h
> > > > > > > > b/drivers/gpu/drm/xe/ xe_configfs.h
> > > > > > > > new file mode 100644
> > > > > > > > index 000000000000..491629a2ca53
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/drivers/gpu/drm/xe/xe_configfs.h
> > > > > > > > @@ -0,0 +1,12 @@
> > > > > > > > +/* SPDX-License-Identifier: MIT */
> > > > > > > > +/*
> > > > > > > > + * Copyright © 2025 Intel Corporation
> > > > > > > > + */
> > > > > > > > +#ifndef _XE_CONFIGFS_H_
> > > > > > > > +#define _XE_CONFIGFS_H_
> > > > > > > > +
> > > > > > > > +int xe_configfs_init(void);
> > > > > > > > +void xe_configfs_exit(void);
> > > > > > > > +void xe_configfs_clear_survivability_mode(void);
> > > > > > > > +
> > > > > > > > +#endif
> > > > > > > > diff --git a/drivers/gpu/drm/xe/xe_module.c
> > > > > > > > b/drivers/gpu/drm/xe/ xe_module.c
> > > > > > > > index 475acdba2b55..15b3cf22193c 100644
> > > > > > > > --- a/drivers/gpu/drm/xe/xe_module.c
> > > > > > > > +++ b/drivers/gpu/drm/xe/xe_module.c
> > > > > > > > @@ -11,6 +11,7 @@
> > > > > > > > #include <drm/drm_module.h>
> > > > > > > >
> > > > > > > > #include "xe_drv.h"
> > > > > > > > +#include "xe_configfs.h"
> > > > > > > > #include "xe_hw_fence.h"
> > > > > > > > #include "xe_pci.h"
> > > > > > > > #include "xe_pm.h"
> > > > > > > > @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
> > > > > > > > {
> > > > > > > > .init = xe_check_nomodeset,
> > > > > > > > },
> > > > > > > > + {
> > > > > > > > + .init = xe_configfs_init,
> > > > > > > > + .exit = xe_configfs_exit,
> > > > > > > > + },
> > > > > > > > {
> > > > > > > > .init = xe_hw_fence_module_init,
> > > > > > > > .exit = xe_hw_fence_module_exit,
> > > > > > > > diff --git a/drivers/gpu/drm/xe/xe_module.h
> > > > > > > > b/drivers/gpu/drm/xe/ xe_module.h
> > > > > > > > index 84339e509c80..c238dbee6bc7 100644
> > > > > > > > --- a/drivers/gpu/drm/xe/xe_module.h
> > > > > > > > +++ b/drivers/gpu/drm/xe/xe_module.h
> > > > > > > > @@ -24,6 +24,7 @@ struct xe_modparam {
> > > > > > > > #endif
> > > > > > > > int wedged_mode;
> > > > > > > > u32 svm_notifier_size;
> > > > > > > > + char *survivability_mode;
> > > > > > >
> > > > > > > drop this.. We want a config struct in the xe_device. It's actually
> > > > > > > allocated by the mkdir in the configs and when we are probing, we should
> > > > > > > find the config instace based on pdev and set the pointer xe->configfs
> > > > > > > or something like that.
> > > > > >
> > > > > > Will add a new struct
> > > > > >
> > > > > > Thanks
> > > > > > Riana
> > > > > > >
> > > > > > > thanks
> > > > > > > Lucas De Marchi
> > > > > > >
> > > > > > > > };
> > > > > > > >
> > > > > > > > extern struct xe_modparam xe_modparam;
> > > > > > > > --
> > > > > > > > 2.47.1
> > > > > > > >
> > > > > >
> > > >
--
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-10 7:14 ` Aravind Iddamsetty
@ 2025-03-13 6:18 ` Riana Tauro
2025-03-13 14:52 ` Rodrigo Vivi
0 siblings, 1 reply; 23+ messages in thread
From: Riana Tauro @ 2025-03-13 6:18 UTC (permalink / raw)
To: Aravind Iddamsetty, intel-xe, rodrigo.vivi
Cc: anshuman.gupta, matthew.d.roper, lucas.demarchi
Hi Aravind
On 3/10/2025 12:44 PM, Aravind Iddamsetty wrote:
>
> On 07-03-2025 19:54, Riana Tauro wrote:
>
> Hi Riana,
>
> I do think we can achieve the same functionality with module param and
> we needn't reload the driver
> if we are doing unbind, Since the driver will be loaded event after
> unbind we can modify the module param
tried this. We can modify the module param using sysfs and bind similar
to configfs.
If we want to configure any other attributes or move existing module
params to device specific then adding configfs seems better.
@Rodrigo thoughts?
Thanks
Riana
> and once we bind the device back it can check if the BDF belongs to this
> driver instance and configure the mode accordingly.
>
> Thanks,
> Aravind.
>> Registers a configfs subsystem called 'xe' to userspace. The user can
>> use this to modify exposed attributes.
>>
>> Add survivability mode attribute (config/xe/survivability_mode) to the
>> subsystem to allow the user to specify the card that should enter
>> survivability mode.
>>
>> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> ---
>> drivers/gpu/drm/xe/Makefile | 1 +
>> drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
>> drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
>> drivers/gpu/drm/xe/xe_module.c | 5 ++
>> drivers/gpu/drm/xe/xe_module.h | 1 +
>> 5 files changed, 114 insertions(+)
>> create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
>> create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 9699b08585f7..3f8c87292cee 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>> xe-y += xe_bb.o \
>> xe_bo.o \
>> xe_bo_evict.o \
>> + xe_configfs.o \
>> xe_devcoredump.o \
>> xe_device.o \
>> xe_device_sysfs.o \
>> diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
>> new file mode 100644
>> index 000000000000..8c5f248e466d
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_configfs.c
>> @@ -0,0 +1,95 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +
>> +#include <linux/configfs.h>
>> +#include <linux/init.h>
>> +#include <linux/module.h>
>> +
>> +#include "xe_configfs.h"
>> +#include "xe_module.h"
>> +
>> +/**
>> + * DOC: Xe Configfs
>> + *
>> + * XE KMD registers a configfs subsystem called 'xe'to userspace that allows users to modify
>> + * the exposed attributes.
>> + *
>> + * Attributes:
>> + *
>> + * config/xe/survivability_mode : Write only attribute that allows user to specify the PCI address
>> + * of the card that has to enter survivability mode
>> + */
>> +
>> +void xe_configfs_clear_survivability_mode(void)
>> +{
>> + kfree(xe_modparam.survivability_mode);
>> + xe_modparam.survivability_mode = NULL;
>> +}
>> +
>> +static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
>> +{
>> + char *survivability_mode;
>> + int ret;
>> + unsigned int domain, bus, slot, function;
>> +
>> + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot, &function);
>> + if (ret != 4)
>> + return -EINVAL;
>> +
>> + survivability_mode = kstrdup(page, GFP_KERNEL);
>> + if (!survivability_mode)
>> + return -ENOMEM;
>> +
>> + xe_configfs_clear_survivability_mode();
>> + xe_modparam.survivability_mode = survivability_mode;
>> +
>> + return len;
>> +}
>> +
>> +CONFIGFS_ATTR_WO(, survivability_mode);
>> +
>> +static struct configfs_attribute *xe_configfs_attrs[] = {
>> + &attr_survivability_mode,
>> + NULL,
>> +};
>> +
>> +static const struct config_item_type xe_config_type = {
>> + .ct_attrs = xe_configfs_attrs,
>> + .ct_owner = THIS_MODULE,
>> +};
>> +
>> +static struct configfs_subsystem xe_config_subsys = {
>> + .su_group = {
>> + .cg_item = {
>> + .ci_namebuf = "xe",
>> + .ci_type = &xe_config_type,
>> + },
>> + },
>> +};
>> +
>> +int __init xe_configfs_init(void)
>> +{
>> + int ret;
>> +
>> + config_group_init(&xe_config_subsys.su_group);
>> + mutex_init(&xe_config_subsys.su_mutex);
>> + ret = configfs_register_subsystem(&xe_config_subsys);
>> + if (ret) {
>> + pr_err("Error %d while registering subsystem %s\n",
>> + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
>> + mutex_destroy(&xe_config_subsys.su_mutex);
>> + return ret;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +void __exit xe_configfs_exit(void)
>> +{
>> + xe_configfs_clear_survivability_mode();
>> + configfs_unregister_subsystem(&xe_config_subsys);
>> + mutex_destroy(&xe_config_subsys.su_mutex);
>> +}
>> +
>> diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
>> new file mode 100644
>> index 000000000000..491629a2ca53
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_configfs.h
>> @@ -0,0 +1,12 @@
>> +/* SPDX-License-Identifier: MIT */
>> +/*
>> + * Copyright © 2025 Intel Corporation
>> + */
>> +#ifndef _XE_CONFIGFS_H_
>> +#define _XE_CONFIGFS_H_
>> +
>> +int xe_configfs_init(void);
>> +void xe_configfs_exit(void);
>> +void xe_configfs_clear_survivability_mode(void);
>> +
>> +#endif
>> diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
>> index 475acdba2b55..15b3cf22193c 100644
>> --- a/drivers/gpu/drm/xe/xe_module.c
>> +++ b/drivers/gpu/drm/xe/xe_module.c
>> @@ -11,6 +11,7 @@
>> #include <drm/drm_module.h>
>>
>> #include "xe_drv.h"
>> +#include "xe_configfs.h"
>> #include "xe_hw_fence.h"
>> #include "xe_pci.h"
>> #include "xe_pm.h"
>> @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
>> {
>> .init = xe_check_nomodeset,
>> },
>> + {
>> + .init = xe_configfs_init,
>> + .exit = xe_configfs_exit,
>> + },
>> {
>> .init = xe_hw_fence_module_init,
>> .exit = xe_hw_fence_module_exit,
>> diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
>> index 84339e509c80..c238dbee6bc7 100644
>> --- a/drivers/gpu/drm/xe/xe_module.h
>> +++ b/drivers/gpu/drm/xe/xe_module.h
>> @@ -24,6 +24,7 @@ struct xe_modparam {
>> #endif
>> int wedged_mode;
>> u32 svm_notifier_size;
>> + char *survivability_mode;
>> };
>>
>> extern struct xe_modparam xe_modparam;
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-13 6:18 ` Riana Tauro
@ 2025-03-13 14:52 ` Rodrigo Vivi
2025-03-13 19:52 ` Lucas De Marchi
0 siblings, 1 reply; 23+ messages in thread
From: Rodrigo Vivi @ 2025-03-13 14:52 UTC (permalink / raw)
To: Riana Tauro, Lucas De Marchi, Thomas Hellström
Cc: Aravind Iddamsetty, intel-xe, anshuman.gupta, matthew.d.roper,
lucas.demarchi
On Thu, Mar 13, 2025 at 11:48:41AM +0530, Riana Tauro wrote:
> Hi Aravind
>
> On 3/10/2025 12:44 PM, Aravind Iddamsetty wrote:
> >
> > On 07-03-2025 19:54, Riana Tauro wrote:
> >
> > Hi Riana,
> >
> > I do think we can achieve the same functionality with module param and
> > we needn't reload the driver
> > if we are doing unbind, Since the driver will be loaded event after
> > unbind we can modify the module param
>
> tried this. We can modify the module param using sysfs and bind similar to
> configfs.
>
> If we want to configure any other attributes or move existing module params
> to device specific then adding configfs seems better.
>
> @Rodrigo thoughts?
Yeap, if we make the param write-able and document that it is only checked
at probe we could indeed use the flow
echo <bdf> > /sys/../xe/unbind
echo <bdf> > /sys/../xe/param/survivability_mode
echo <bdf> > /sys/../xe/bind
and accomplish the same.
on the good side:
So, this is the future prof case. Because if we start seeing cases where
the device fails at boot without the FW request for the survivability mode
we might be forced to have a parameter anyway. :/
on the bad side:
But we were actually aiming to reduce the parameters that we have
because that was getting indiscriminately used and even recommended by some
distros' doc and wikis.
Another problem with the write-able param is that people might expect to
echo <bdf> > /sys/../xe/param/survivability_mode
and immediately get the device in the survivability_mode for that device.
Then we are going to get questions and bug reports that this is not working.
But well, perhaps documenting the flow properly in the param description
might solve this concern.
Lucas, Thomas, thoughts?
>
> Thanks
> Riana
> > and once we bind the device back it can check if the BDF belongs to this
> > driver instance and configure the mode accordingly.
> >
> > Thanks,
> > Aravind.
> > > Registers a configfs subsystem called 'xe' to userspace. The user can
> > > use this to modify exposed attributes.
> > >
> > > Add survivability mode attribute (config/xe/survivability_mode) to the
> > > subsystem to allow the user to specify the card that should enter
> > > survivability mode.
> > >
> > > Signed-off-by: Riana Tauro <riana.tauro@intel.com>
> > > ---
> > > drivers/gpu/drm/xe/Makefile | 1 +
> > > drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
> > > drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
> > > drivers/gpu/drm/xe/xe_module.c | 5 ++
> > > drivers/gpu/drm/xe/xe_module.h | 1 +
> > > 5 files changed, 114 insertions(+)
> > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
> > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
> > >
> > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
> > > index 9699b08585f7..3f8c87292cee 100644
> > > --- a/drivers/gpu/drm/xe/Makefile
> > > +++ b/drivers/gpu/drm/xe/Makefile
> > > @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
> > > xe-y += xe_bb.o \
> > > xe_bo.o \
> > > xe_bo_evict.o \
> > > + xe_configfs.o \
> > > xe_devcoredump.o \
> > > xe_device.o \
> > > xe_device_sysfs.o \
> > > diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
> > > new file mode 100644
> > > index 000000000000..8c5f248e466d
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/xe_configfs.c
> > > @@ -0,0 +1,95 @@
> > > +// SPDX-License-Identifier: MIT
> > > +/*
> > > + * Copyright © 2025 Intel Corporation
> > > + */
> > > +
> > > +#include <linux/configfs.h>
> > > +#include <linux/init.h>
> > > +#include <linux/module.h>
> > > +
> > > +#include "xe_configfs.h"
> > > +#include "xe_module.h"
> > > +
> > > +/**
> > > + * DOC: Xe Configfs
> > > + *
> > > + * XE KMD registers a configfs subsystem called 'xe'to userspace that allows users to modify
> > > + * the exposed attributes.
> > > + *
> > > + * Attributes:
> > > + *
> > > + * config/xe/survivability_mode : Write only attribute that allows user to specify the PCI address
> > > + * of the card that has to enter survivability mode
> > > + */
> > > +
> > > +void xe_configfs_clear_survivability_mode(void)
> > > +{
> > > + kfree(xe_modparam.survivability_mode);
> > > + xe_modparam.survivability_mode = NULL;
> > > +}
> > > +
> > > +static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
> > > +{
> > > + char *survivability_mode;
> > > + int ret;
> > > + unsigned int domain, bus, slot, function;
> > > +
> > > + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot, &function);
> > > + if (ret != 4)
> > > + return -EINVAL;
> > > +
> > > + survivability_mode = kstrdup(page, GFP_KERNEL);
> > > + if (!survivability_mode)
> > > + return -ENOMEM;
> > > +
> > > + xe_configfs_clear_survivability_mode();
> > > + xe_modparam.survivability_mode = survivability_mode;
> > > +
> > > + return len;
> > > +}
> > > +
> > > +CONFIGFS_ATTR_WO(, survivability_mode);
> > > +
> > > +static struct configfs_attribute *xe_configfs_attrs[] = {
> > > + &attr_survivability_mode,
> > > + NULL,
> > > +};
> > > +
> > > +static const struct config_item_type xe_config_type = {
> > > + .ct_attrs = xe_configfs_attrs,
> > > + .ct_owner = THIS_MODULE,
> > > +};
> > > +
> > > +static struct configfs_subsystem xe_config_subsys = {
> > > + .su_group = {
> > > + .cg_item = {
> > > + .ci_namebuf = "xe",
> > > + .ci_type = &xe_config_type,
> > > + },
> > > + },
> > > +};
> > > +
> > > +int __init xe_configfs_init(void)
> > > +{
> > > + int ret;
> > > +
> > > + config_group_init(&xe_config_subsys.su_group);
> > > + mutex_init(&xe_config_subsys.su_mutex);
> > > + ret = configfs_register_subsystem(&xe_config_subsys);
> > > + if (ret) {
> > > + pr_err("Error %d while registering subsystem %s\n",
> > > + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
> > > + mutex_destroy(&xe_config_subsys.su_mutex);
> > > + return ret;
> > > + }
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +void __exit xe_configfs_exit(void)
> > > +{
> > > + xe_configfs_clear_survivability_mode();
> > > + configfs_unregister_subsystem(&xe_config_subsys);
> > > + mutex_destroy(&xe_config_subsys.su_mutex);
> > > +}
> > > +
> > > diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
> > > new file mode 100644
> > > index 000000000000..491629a2ca53
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/xe/xe_configfs.h
> > > @@ -0,0 +1,12 @@
> > > +/* SPDX-License-Identifier: MIT */
> > > +/*
> > > + * Copyright © 2025 Intel Corporation
> > > + */
> > > +#ifndef _XE_CONFIGFS_H_
> > > +#define _XE_CONFIGFS_H_
> > > +
> > > +int xe_configfs_init(void);
> > > +void xe_configfs_exit(void);
> > > +void xe_configfs_clear_survivability_mode(void);
> > > +
> > > +#endif
> > > diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
> > > index 475acdba2b55..15b3cf22193c 100644
> > > --- a/drivers/gpu/drm/xe/xe_module.c
> > > +++ b/drivers/gpu/drm/xe/xe_module.c
> > > @@ -11,6 +11,7 @@
> > > #include <drm/drm_module.h>
> > > #include "xe_drv.h"
> > > +#include "xe_configfs.h"
> > > #include "xe_hw_fence.h"
> > > #include "xe_pci.h"
> > > #include "xe_pm.h"
> > > @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
> > > {
> > > .init = xe_check_nomodeset,
> > > },
> > > + {
> > > + .init = xe_configfs_init,
> > > + .exit = xe_configfs_exit,
> > > + },
> > > {
> > > .init = xe_hw_fence_module_init,
> > > .exit = xe_hw_fence_module_exit,
> > > diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
> > > index 84339e509c80..c238dbee6bc7 100644
> > > --- a/drivers/gpu/drm/xe/xe_module.h
> > > +++ b/drivers/gpu/drm/xe/xe_module.h
> > > @@ -24,6 +24,7 @@ struct xe_modparam {
> > > #endif
> > > int wedged_mode;
> > > u32 svm_notifier_size;
> > > + char *survivability_mode;
> > > };
> > > extern struct xe_modparam xe_modparam;
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-13 14:52 ` Rodrigo Vivi
@ 2025-03-13 19:52 ` Lucas De Marchi
2025-03-14 7:24 ` Riana Tauro
0 siblings, 1 reply; 23+ messages in thread
From: Lucas De Marchi @ 2025-03-13 19:52 UTC (permalink / raw)
To: Rodrigo Vivi
Cc: Riana Tauro, Thomas Hellström, Aravind Iddamsetty, intel-xe,
anshuman.gupta, matthew.d.roper
On Thu, Mar 13, 2025 at 10:52:36AM -0400, Rodrigo Vivi wrote:
>On Thu, Mar 13, 2025 at 11:48:41AM +0530, Riana Tauro wrote:
>> Hi Aravind
>>
>> On 3/10/2025 12:44 PM, Aravind Iddamsetty wrote:
>> >
>> > On 07-03-2025 19:54, Riana Tauro wrote:
>> >
>> > Hi Riana,
>> >
>> > I do think we can achieve the same functionality with module param and
>> > we needn't reload the driver
>> > if we are doing unbind, Since the driver will be loaded event after
>> > unbind we can modify the module param
>>
>> tried this. We can modify the module param using sysfs and bind similar to
>> configfs.
>>
>> If we want to configure any other attributes or move existing module params
>> to device specific then adding configfs seems better.
>>
>> @Rodrigo thoughts?
>
>Yeap, if we make the param write-able and document that it is only checked
>at probe we could indeed use the flow
>
>echo <bdf> > /sys/../xe/unbind
>echo <bdf> > /sys/../xe/param/survivability_mode
>echo <bdf> > /sys/../xe/bind
>
>and accomplish the same.
>
>on the good side:
>
>So, this is the future prof case. Because if we start seeing cases where
>the device fails at boot without the FW request for the survivability mode
>we might be forced to have a parameter anyway. :/
With parameters we have these possibilities:
1) driver is already loaded:
echo <bdf> > /sys/../xe/unbind
echo <bdf> > /sys/module/xe/parameters/survivability_mode
echo <bdf> > /sys/../xe/bind
2) driver is not loaded
# put all device in survivability mode
modprobe xe survivability_mode=
3) driver not loaded, and only one device in this mode
echo 0 > /sys/bus/pci/drivers_autoprobe
modprobe xe
echo <bdf> > /sys/module/xe/parameters/survivability_mode
echo <bdf> > /sys/../xe/bind
With configfs we have these possibilities:
a) driver is already loaded:
echo <bdf> > /sys/../xe/unbind
mkdir /sys/kernel/config/xe/0000:03:00.0
echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
echo <bdf> > /sys/../xe/bind
b) driver is not loaded
# no equivalent option for "all devices in survivability mode"
# other than repeating (c) for each since the option is per
# device not per module
c) driver not loaded, and only one device in this mode
echo 0 > /sys/bus/pci/drivers_autoprobe
modprobe xe
mkdir /sys/kernel/config/xe/0000:03:00.0
echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
echo <bdf> > /sys/../xe/bind
>
>on the bad side:
>
>But we were actually aiming to reduce the parameters that we have
>because that was getting indiscriminately used and even recommended by some
>distros' doc and wikis.
>
>Another problem with the write-able param is that people might expect to
>
>echo <bdf> > /sys/../xe/param/survivability_mode
>
>and immediately get the device in the survivability_mode for that device.
>Then we are going to get questions and bug reports that this is not working.
>
>But well, perhaps documenting the flow properly in the param description
>might solve this concern.
>
>Lucas, Thomas, thoughts?
Let me add here that we will have to extend configuration to more
things. The one I will work once this configfs lands (or even before it)
is to allow developers to add WAs (or we could say
register-save-restore) dynamically so it's much easier to
try/error/recover. For that my plan is something like this:
echo 0 > /sys/bus/pci/drivers_autoprobe
modprobe xe
mkdir /sys/kernel/config/xe/0000:03:00.0
cat gt-wa.txt > /sys/kernel/config/xe/0000:03:00.0/gt_wa
cat engine-wa.txt > /sys/kernel/config/xe/0000:03:00.0/engine_wa
echo <bdf> > /sys/../xe/bind
And cleanup my kmod "frontend" protoype to make it simpler[1]:
i) no additional configuration:
kmod bind --device 0000:03:00.0 xe
ii) with configuration stored somewhere:
kmod bind --device 0000:03:00.0 \
--config <path-to-config-dir|path-to-config-tarball> \
xe
iii) with configuration in the command line:
kmod bind --device 0000:03:00.0 \
--config-kv survivability_mode:1 \
xe
So, I think the configfs approach is more future proof. For a simple
switch/panic-button like survivability_mode I wouldn't oppose to have it
as a module parameter though. Maybe make the module param read-only
and if per-device support is desired, then handle only via configfs?
Lucas De Marchi
[1] Typing here without much thought on the actual interface - my
prototype hardcodes it for pci, but the kmod command would
probably have to consider other buses too.
>
>>
>> Thanks
>> Riana
>> > and once we bind the device back it can check if the BDF belongs to this
>> > driver instance and configure the mode accordingly.
>> >
>> > Thanks,
>> > Aravind.
>> > > Registers a configfs subsystem called 'xe' to userspace. The user can
>> > > use this to modify exposed attributes.
>> > >
>> > > Add survivability mode attribute (config/xe/survivability_mode) to the
>> > > subsystem to allow the user to specify the card that should enter
>> > > survivability mode.
>> > >
>> > > Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> > > ---
>> > > drivers/gpu/drm/xe/Makefile | 1 +
>> > > drivers/gpu/drm/xe/xe_configfs.c | 95 ++++++++++++++++++++++++++++++++
>> > > drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
>> > > drivers/gpu/drm/xe/xe_module.c | 5 ++
>> > > drivers/gpu/drm/xe/xe_module.h | 1 +
>> > > 5 files changed, 114 insertions(+)
>> > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
>> > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>> > >
>> > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> > > index 9699b08585f7..3f8c87292cee 100644
>> > > --- a/drivers/gpu/drm/xe/Makefile
>> > > +++ b/drivers/gpu/drm/xe/Makefile
>> > > @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \
>> > > xe-y += xe_bb.o \
>> > > xe_bo.o \
>> > > xe_bo_evict.o \
>> > > + xe_configfs.o \
>> > > xe_devcoredump.o \
>> > > xe_device.o \
>> > > xe_device_sysfs.o \
>> > > diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/xe/xe_configfs.c
>> > > new file mode 100644
>> > > index 000000000000..8c5f248e466d
>> > > --- /dev/null
>> > > +++ b/drivers/gpu/drm/xe/xe_configfs.c
>> > > @@ -0,0 +1,95 @@
>> > > +// SPDX-License-Identifier: MIT
>> > > +/*
>> > > + * Copyright © 2025 Intel Corporation
>> > > + */
>> > > +
>> > > +#include <linux/configfs.h>
>> > > +#include <linux/init.h>
>> > > +#include <linux/module.h>
>> > > +
>> > > +#include "xe_configfs.h"
>> > > +#include "xe_module.h"
>> > > +
>> > > +/**
>> > > + * DOC: Xe Configfs
>> > > + *
>> > > + * XE KMD registers a configfs subsystem called 'xe'to userspace that allows users to modify
>> > > + * the exposed attributes.
>> > > + *
>> > > + * Attributes:
>> > > + *
>> > > + * config/xe/survivability_mode : Write only attribute that allows user to specify the PCI address
>> > > + * of the card that has to enter survivability mode
>> > > + */
>> > > +
>> > > +void xe_configfs_clear_survivability_mode(void)
>> > > +{
>> > > + kfree(xe_modparam.survivability_mode);
>> > > + xe_modparam.survivability_mode = NULL;
>> > > +}
>> > > +
>> > > +static ssize_t survivability_mode_store(struct config_item *item, const char *page, size_t len)
>> > > +{
>> > > + char *survivability_mode;
>> > > + int ret;
>> > > + unsigned int domain, bus, slot, function;
>> > > +
>> > > + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus, &slot, &function);
>> > > + if (ret != 4)
>> > > + return -EINVAL;
>> > > +
>> > > + survivability_mode = kstrdup(page, GFP_KERNEL);
>> > > + if (!survivability_mode)
>> > > + return -ENOMEM;
>> > > +
>> > > + xe_configfs_clear_survivability_mode();
>> > > + xe_modparam.survivability_mode = survivability_mode;
>> > > +
>> > > + return len;
>> > > +}
>> > > +
>> > > +CONFIGFS_ATTR_WO(, survivability_mode);
>> > > +
>> > > +static struct configfs_attribute *xe_configfs_attrs[] = {
>> > > + &attr_survivability_mode,
>> > > + NULL,
>> > > +};
>> > > +
>> > > +static const struct config_item_type xe_config_type = {
>> > > + .ct_attrs = xe_configfs_attrs,
>> > > + .ct_owner = THIS_MODULE,
>> > > +};
>> > > +
>> > > +static struct configfs_subsystem xe_config_subsys = {
>> > > + .su_group = {
>> > > + .cg_item = {
>> > > + .ci_namebuf = "xe",
>> > > + .ci_type = &xe_config_type,
>> > > + },
>> > > + },
>> > > +};
>> > > +
>> > > +int __init xe_configfs_init(void)
>> > > +{
>> > > + int ret;
>> > > +
>> > > + config_group_init(&xe_config_subsys.su_group);
>> > > + mutex_init(&xe_config_subsys.su_mutex);
>> > > + ret = configfs_register_subsystem(&xe_config_subsys);
>> > > + if (ret) {
>> > > + pr_err("Error %d while registering subsystem %s\n",
>> > > + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
>> > > + mutex_destroy(&xe_config_subsys.su_mutex);
>> > > + return ret;
>> > > + }
>> > > +
>> > > + return 0;
>> > > +}
>> > > +
>> > > +void __exit xe_configfs_exit(void)
>> > > +{
>> > > + xe_configfs_clear_survivability_mode();
>> > > + configfs_unregister_subsystem(&xe_config_subsys);
>> > > + mutex_destroy(&xe_config_subsys.su_mutex);
>> > > +}
>> > > +
>> > > diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/xe/xe_configfs.h
>> > > new file mode 100644
>> > > index 000000000000..491629a2ca53
>> > > --- /dev/null
>> > > +++ b/drivers/gpu/drm/xe/xe_configfs.h
>> > > @@ -0,0 +1,12 @@
>> > > +/* SPDX-License-Identifier: MIT */
>> > > +/*
>> > > + * Copyright © 2025 Intel Corporation
>> > > + */
>> > > +#ifndef _XE_CONFIGFS_H_
>> > > +#define _XE_CONFIGFS_H_
>> > > +
>> > > +int xe_configfs_init(void);
>> > > +void xe_configfs_exit(void);
>> > > +void xe_configfs_clear_survivability_mode(void);
>> > > +
>> > > +#endif
>> > > diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
>> > > index 475acdba2b55..15b3cf22193c 100644
>> > > --- a/drivers/gpu/drm/xe/xe_module.c
>> > > +++ b/drivers/gpu/drm/xe/xe_module.c
>> > > @@ -11,6 +11,7 @@
>> > > #include <drm/drm_module.h>
>> > > #include "xe_drv.h"
>> > > +#include "xe_configfs.h"
>> > > #include "xe_hw_fence.h"
>> > > #include "xe_pci.h"
>> > > #include "xe_pm.h"
>> > > @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
>> > > {
>> > > .init = xe_check_nomodeset,
>> > > },
>> > > + {
>> > > + .init = xe_configfs_init,
>> > > + .exit = xe_configfs_exit,
>> > > + },
>> > > {
>> > > .init = xe_hw_fence_module_init,
>> > > .exit = xe_hw_fence_module_exit,
>> > > diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
>> > > index 84339e509c80..c238dbee6bc7 100644
>> > > --- a/drivers/gpu/drm/xe/xe_module.h
>> > > +++ b/drivers/gpu/drm/xe/xe_module.h
>> > > @@ -24,6 +24,7 @@ struct xe_modparam {
>> > > #endif
>> > > int wedged_mode;
>> > > u32 svm_notifier_size;
>> > > + char *survivability_mode;
>> > > };
>> > > extern struct xe_modparam xe_modparam;
>>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-13 19:52 ` Lucas De Marchi
@ 2025-03-14 7:24 ` Riana Tauro
2025-03-14 16:17 ` Aravind Iddamsetty
0 siblings, 1 reply; 23+ messages in thread
From: Riana Tauro @ 2025-03-14 7:24 UTC (permalink / raw)
To: Lucas De Marchi, Rodrigo Vivi
Cc: Thomas Hellström, Aravind Iddamsetty, intel-xe,
anshuman.gupta, matthew.d.roper
On 3/14/2025 1:22 AM, Lucas De Marchi wrote:
> On Thu, Mar 13, 2025 at 10:52:36AM -0400, Rodrigo Vivi wrote:
>> On Thu, Mar 13, 2025 at 11:48:41AM +0530, Riana Tauro wrote:
>>> Hi Aravind
>>>
>>> On 3/10/2025 12:44 PM, Aravind Iddamsetty wrote:
>>> >
>>> > On 07-03-2025 19:54, Riana Tauro wrote:
>>> >
>>> > Hi Riana,
>>> >
>>> > I do think we can achieve the same functionality with module param and
>>> > we needn't reload the driver
>>> > if we are doing unbind, Since the driver will be loaded event after
>>> > unbind we can modify the module param
>>>
>>> tried this. We can modify the module param using sysfs and bind
>>> similar to
>>> configfs.
>>>
>>> If we want to configure any other attributes or move existing module
>>> params
>>> to device specific then adding configfs seems better.
>>>
>>> @Rodrigo thoughts?
>>
>> Yeap, if we make the param write-able and document that it is only
>> checked
>> at probe we could indeed use the flow
>>
>> echo <bdf> > /sys/../xe/unbind
>> echo <bdf> > /sys/../xe/param/survivability_mode
>> echo <bdf> > /sys/../xe/bind
>>
>> and accomplish the same.
>>
>> on the good side:
>>
>> So, this is the future prof case. Because if we start seeing cases where
>> the device fails at boot without the FW request for the survivability
>> mode
>> we might be forced to have a parameter anyway. :/
>
> With parameters we have these possibilities:
>
> 1) driver is already loaded:
>
> echo <bdf> > /sys/../xe/unbind
> echo <bdf> > /sys/module/xe/parameters/survivability_mode
> echo <bdf> > /sys/../xe/bind
>
> 2) driver is not loaded
>
> # put all device in survivability mode
> modprobe xe survivability_mode=
>
> 3) driver not loaded, and only one device in this mode
>
> echo 0 > /sys/bus/pci/drivers_autoprobe
> modprobe xe
> echo <bdf> > /sys/module/xe/parameters/survivability_mode
> echo <bdf> > /sys/../xe/bind
>
> With configfs we have these possibilities:
>
> a) driver is already loaded:
>
> echo <bdf> > /sys/../xe/unbind
> mkdir /sys/kernel/config/xe/0000:03:00.0
> echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
> echo <bdf> > /sys/../xe/bind
>
> b) driver is not loaded
>
> # no equivalent option for "all devices in survivability mode"
> # other than repeating (c) for each since the option is per
> # device not per module
>
> c) driver not loaded, and only one device in this mode
>
> echo 0 > /sys/bus/pci/drivers_autoprobe
> modprobe xe
> mkdir /sys/kernel/config/xe/0000:03:00.0
> echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
> echo <bdf> > /sys/../xe/bind
>
>>
>> on the bad side:
>>
>> But we were actually aiming to reduce the parameters that we have
>> because that was getting indiscriminately used and even recommended by
>> some
>> distros' doc and wikis.
>>
>> Another problem with the write-able param is that people might expect to
>>
>> echo <bdf> > /sys/../xe/param/survivability_mode
>>
>> and immediately get the device in the survivability_mode for that device.
>> Then we are going to get questions and bug reports that this is not
>> working.
>>
>> But well, perhaps documenting the flow properly in the param description
>> might solve this concern.
>>
>> Lucas, Thomas, thoughts?
>
> Let me add here that we will have to extend configuration to more
> things. The one I will work once this configfs lands (or even before it)
> is to allow developers to add WAs (or we could say
> register-save-restore) dynamically so it's much easier to
> try/error/recover. For that my plan is something like this:
>
> echo 0 > /sys/bus/pci/drivers_autoprobe
> modprobe xe
> mkdir /sys/kernel/config/xe/0000:03:00.0
> cat gt-wa.txt > /sys/kernel/config/xe/0000:03:00.0/gt_wa
> cat engine-wa.txt > /sys/kernel/config/xe/0000:03:00.0/engine_wa
> echo <bdf> > /sys/../xe/bind
>
> And cleanup my kmod "frontend" protoype to make it simpler[1]:
>
> i) no additional configuration:
>
> kmod bind --device 0000:03:00.0 xe
>
> ii) with configuration stored somewhere:
>
> kmod bind --device 0000:03:00.0 \
> --config <path-to-config-dir|path-to-config-tarball> \
> xe
>
> iii) with configuration in the command line:
>
> kmod bind --device 0000:03:00.0 \
> --config-kv survivability_mode:1 \
> xe
>
> So, I think the configfs approach is more future proof. For a simple
> switch/panic-button like survivability_mode I wouldn't oppose to have it
> as a module parameter though. Maybe make the module param read-only
> and if per-device support is desired, then handle only via configfs?
yeah survivability mode is per device. Since there is a plan to extend
configfs for other attributes will go ahead with configfs.
Thank you all for the inputs.
Thanks
Riana
>
> Lucas De Marchi
>
> [1] Typing here without much thought on the actual interface - my
> prototype hardcodes it for pci, but the kmod command would
> probably have to consider other buses too.
>
>>
>>>
>>> Thanks
>>> Riana
>>> > and once we bind the device back it can check if the BDF belongs to
>>> this
>>> > driver instance and configure the mode accordingly.
>>> >
>>> > Thanks,
>>> > Aravind.
>>> > > Registers a configfs subsystem called 'xe' to userspace. The user
>>> can
>>> > > use this to modify exposed attributes.
>>> > >
>>> > > Add survivability mode attribute (config/xe/survivability_mode)
>>> to the
>>> > > subsystem to allow the user to specify the card that should enter
>>> > > survivability mode.
>>> > >
>>> > > Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>>> > > ---
>>> > > drivers/gpu/drm/xe/Makefile | 1 +
>>> > > drivers/gpu/drm/xe/xe_configfs.c | 95 +++++++++++++++++++++++++
>>> +++++++
>>> > > drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
>>> > > drivers/gpu/drm/xe/xe_module.c | 5 ++
>>> > > drivers/gpu/drm/xe/xe_module.h | 1 +
>>> > > 5 files changed, 114 insertions(+)
>>> > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
>>> > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>>> > >
>>> > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/
>>> Makefile
>>> > > index 9699b08585f7..3f8c87292cee 100644
>>> > > --- a/drivers/gpu/drm/xe/Makefile
>>> > > +++ b/drivers/gpu/drm/xe/Makefile
>>> > > @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/
>>> %_wa_oob.h: $(obj)/xe_gen_wa_oob \
>>> > > xe-y += xe_bb.o \
>>> > > xe_bo.o \
>>> > > xe_bo_evict.o \
>>> > > + xe_configfs.o \
>>> > > xe_devcoredump.o \
>>> > > xe_device.o \
>>> > > xe_device_sysfs.o \
>>> > > diff --git a/drivers/gpu/drm/xe/xe_configfs.c b/drivers/gpu/drm/
>>> xe/xe_configfs.c
>>> > > new file mode 100644
>>> > > index 000000000000..8c5f248e466d
>>> > > --- /dev/null
>>> > > +++ b/drivers/gpu/drm/xe/xe_configfs.c
>>> > > @@ -0,0 +1,95 @@
>>> > > +// SPDX-License-Identifier: MIT
>>> > > +/*
>>> > > + * Copyright © 2025 Intel Corporation
>>> > > + */
>>> > > +
>>> > > +#include <linux/configfs.h>
>>> > > +#include <linux/init.h>
>>> > > +#include <linux/module.h>
>>> > > +
>>> > > +#include "xe_configfs.h"
>>> > > +#include "xe_module.h"
>>> > > +
>>> > > +/**
>>> > > + * DOC: Xe Configfs
>>> > > + *
>>> > > + * XE KMD registers a configfs subsystem called 'xe'to userspace
>>> that allows users to modify
>>> > > + * the exposed attributes.
>>> > > + *
>>> > > + * Attributes:
>>> > > + *
>>> > > + * config/xe/survivability_mode : Write only attribute that
>>> allows user to specify the PCI address
>>> > > + * of the card that has to enter survivability
>>> mode
>>> > > + */
>>> > > +
>>> > > +void xe_configfs_clear_survivability_mode(void)
>>> > > +{
>>> > > + kfree(xe_modparam.survivability_mode);
>>> > > + xe_modparam.survivability_mode = NULL;
>>> > > +}
>>> > > +
>>> > > +static ssize_t survivability_mode_store(struct config_item
>>> *item, const char *page, size_t len)
>>> > > +{
>>> > > + char *survivability_mode;
>>> > > + int ret;
>>> > > + unsigned int domain, bus, slot, function;
>>> > > +
>>> > > + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus,
>>> &slot, &function);
>>> > > + if (ret != 4)
>>> > > + return -EINVAL;
>>> > > +
>>> > > + survivability_mode = kstrdup(page, GFP_KERNEL);
>>> > > + if (!survivability_mode)
>>> > > + return -ENOMEM;
>>> > > +
>>> > > + xe_configfs_clear_survivability_mode();
>>> > > + xe_modparam.survivability_mode = survivability_mode;
>>> > > +
>>> > > + return len;
>>> > > +}
>>> > > +
>>> > > +CONFIGFS_ATTR_WO(, survivability_mode);
>>> > > +
>>> > > +static struct configfs_attribute *xe_configfs_attrs[] = {
>>> > > + &attr_survivability_mode,
>>> > > + NULL,
>>> > > +};
>>> > > +
>>> > > +static const struct config_item_type xe_config_type = {
>>> > > + .ct_attrs = xe_configfs_attrs,
>>> > > + .ct_owner = THIS_MODULE,
>>> > > +};
>>> > > +
>>> > > +static struct configfs_subsystem xe_config_subsys = {
>>> > > + .su_group = {
>>> > > + .cg_item = {
>>> > > + .ci_namebuf = "xe",
>>> > > + .ci_type = &xe_config_type,
>>> > > + },
>>> > > + },
>>> > > +};
>>> > > +
>>> > > +int __init xe_configfs_init(void)
>>> > > +{
>>> > > + int ret;
>>> > > +
>>> > > + config_group_init(&xe_config_subsys.su_group);
>>> > > + mutex_init(&xe_config_subsys.su_mutex);
>>> > > + ret = configfs_register_subsystem(&xe_config_subsys);
>>> > > + if (ret) {
>>> > > + pr_err("Error %d while registering subsystem %s\n",
>>> > > + ret, xe_config_subsys.su_group.cg_item.ci_namebuf);
>>> > > + mutex_destroy(&xe_config_subsys.su_mutex);
>>> > > + return ret;
>>> > > + }
>>> > > +
>>> > > + return 0;
>>> > > +}
>>> > > +
>>> > > +void __exit xe_configfs_exit(void)
>>> > > +{
>>> > > + xe_configfs_clear_survivability_mode();
>>> > > + configfs_unregister_subsystem(&xe_config_subsys);
>>> > > + mutex_destroy(&xe_config_subsys.su_mutex);
>>> > > +}
>>> > > +
>>> > > diff --git a/drivers/gpu/drm/xe/xe_configfs.h b/drivers/gpu/drm/
>>> xe/xe_configfs.h
>>> > > new file mode 100644
>>> > > index 000000000000..491629a2ca53
>>> > > --- /dev/null
>>> > > +++ b/drivers/gpu/drm/xe/xe_configfs.h
>>> > > @@ -0,0 +1,12 @@
>>> > > +/* SPDX-License-Identifier: MIT */
>>> > > +/*
>>> > > + * Copyright © 2025 Intel Corporation
>>> > > + */
>>> > > +#ifndef _XE_CONFIGFS_H_
>>> > > +#define _XE_CONFIGFS_H_
>>> > > +
>>> > > +int xe_configfs_init(void);
>>> > > +void xe_configfs_exit(void);
>>> > > +void xe_configfs_clear_survivability_mode(void);
>>> > > +
>>> > > +#endif
>>> > > diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/
>>> xe_module.c
>>> > > index 475acdba2b55..15b3cf22193c 100644
>>> > > --- a/drivers/gpu/drm/xe/xe_module.c
>>> > > +++ b/drivers/gpu/drm/xe/xe_module.c
>>> > > @@ -11,6 +11,7 @@
>>> > > #include <drm/drm_module.h>
>>> > > #include "xe_drv.h"
>>> > > +#include "xe_configfs.h"
>>> > > #include "xe_hw_fence.h"
>>> > > #include "xe_pci.h"
>>> > > #include "xe_pm.h"
>>> > > @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
>>> > > {
>>> > > .init = xe_check_nomodeset,
>>> > > },
>>> > > + {
>>> > > + .init = xe_configfs_init,
>>> > > + .exit = xe_configfs_exit,
>>> > > + },
>>> > > {
>>> > > .init = xe_hw_fence_module_init,
>>> > > .exit = xe_hw_fence_module_exit,
>>> > > diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/
>>> xe_module.h
>>> > > index 84339e509c80..c238dbee6bc7 100644
>>> > > --- a/drivers/gpu/drm/xe/xe_module.h
>>> > > +++ b/drivers/gpu/drm/xe/xe_module.h
>>> > > @@ -24,6 +24,7 @@ struct xe_modparam {
>>> > > #endif
>>> > > int wedged_mode;
>>> > > u32 svm_notifier_size;
>>> > > + char *survivability_mode;
>>> > > };
>>> > > extern struct xe_modparam xe_modparam;
>>>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/2] RFC drm/xe: Add configfs to enable survivability mode
2025-03-14 7:24 ` Riana Tauro
@ 2025-03-14 16:17 ` Aravind Iddamsetty
0 siblings, 0 replies; 23+ messages in thread
From: Aravind Iddamsetty @ 2025-03-14 16:17 UTC (permalink / raw)
To: Riana Tauro, Lucas De Marchi, Rodrigo Vivi
Cc: Thomas Hellström, intel-xe, anshuman.gupta, matthew.d.roper
On 14-03-2025 12:54, Riana Tauro wrote:
>
>
> On 3/14/2025 1:22 AM, Lucas De Marchi wrote:
>> On Thu, Mar 13, 2025 at 10:52:36AM -0400, Rodrigo Vivi wrote:
>>> On Thu, Mar 13, 2025 at 11:48:41AM +0530, Riana Tauro wrote:
>>>> Hi Aravind
>>>>
>>>> On 3/10/2025 12:44 PM, Aravind Iddamsetty wrote:
>>>> >
>>>> > On 07-03-2025 19:54, Riana Tauro wrote:
>>>> >
>>>> > Hi Riana,
>>>> >
>>>> > I do think we can achieve the same functionality with module
>>>> param and
>>>> > we needn't reload the driver
>>>> > if we are doing unbind, Since the driver will be loaded event after
>>>> > unbind we can modify the module param
>>>>
>>>> tried this. We can modify the module param using sysfs and bind
>>>> similar to
>>>> configfs.
>>>>
>>>> If we want to configure any other attributes or move existing
>>>> module params
>>>> to device specific then adding configfs seems better.
>>>>
>>>> @Rodrigo thoughts?
>>>
>>> Yeap, if we make the param write-able and document that it is only
>>> checked
>>> at probe we could indeed use the flow
>>>
>>> echo <bdf> > /sys/../xe/unbind
>>> echo <bdf> > /sys/../xe/param/survivability_mode
>>> echo <bdf> > /sys/../xe/bind
>>>
>>> and accomplish the same.
>>>
>>> on the good side:
>>>
>>> So, this is the future prof case. Because if we start seeing cases
>>> where
>>> the device fails at boot without the FW request for the
>>> survivability mode
>>> we might be forced to have a parameter anyway. :/
>>
>> With parameters we have these possibilities:
>>
>> 1) driver is already loaded:
>>
>> echo <bdf> > /sys/../xe/unbind
>> echo <bdf> > /sys/module/xe/parameters/survivability_mode
>> echo <bdf> > /sys/../xe/bind
>>
>> 2) driver is not loaded
>>
>> # put all device in survivability mode
>> modprobe xe survivability_mode=
>>
>> 3) driver not loaded, and only one device in this mode
>>
>> echo 0 > /sys/bus/pci/drivers_autoprobe
>> modprobe xe
>> echo <bdf> > /sys/module/xe/parameters/survivability_mode
>> echo <bdf> > /sys/../xe/bind
>>
>> With configfs we have these possibilities:
>>
>> a) driver is already loaded:
>>
>> echo <bdf> > /sys/../xe/unbind
>> mkdir /sys/kernel/config/xe/0000:03:00.0
>> echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
>> echo <bdf> > /sys/../xe/bind
>>
>> b) driver is not loaded
>>
>> # no equivalent option for "all devices in survivability mode"
>> # other than repeating (c) for each since the option is per
>> # device not per module
>>
>> c) driver not loaded, and only one device in this mode
>>
>> echo 0 > /sys/bus/pci/drivers_autoprobe
>> modprobe xe
>> mkdir /sys/kernel/config/xe/0000:03:00.0
>> echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
>> echo <bdf> > /sys/../xe/bind
>>
>>>
>>> on the bad side:
>>>
>>> But we were actually aiming to reduce the parameters that we have
>>> because that was getting indiscriminately used and even recommended
>>> by some
>>> distros' doc and wikis.
>>>
>>> Another problem with the write-able param is that people might
>>> expect to
>>>
>>> echo <bdf> > /sys/../xe/param/survivability_mode
>>>
>>> and immediately get the device in the survivability_mode for that
>>> device.
>>> Then we are going to get questions and bug reports that this is not
>>> working.
>>>
>>> But well, perhaps documenting the flow properly in the param
>>> description
>>> might solve this concern.
>>>
>>> Lucas, Thomas, thoughts?
>>
>> Let me add here that we will have to extend configuration to more
>> things. The one I will work once this configfs lands (or even before it)
>> is to allow developers to add WAs (or we could say
>> register-save-restore) dynamically so it's much easier to
>> try/error/recover. For that my plan is something like this:
>>
>> echo 0 > /sys/bus/pci/drivers_autoprobe
>> modprobe xe
>> mkdir /sys/kernel/config/xe/0000:03:00.0
>> cat gt-wa.txt > /sys/kernel/config/xe/0000:03:00.0/gt_wa
>> cat engine-wa.txt > /sys/kernel/config/xe/0000:03:00.0/engine_wa
>> echo <bdf> > /sys/../xe/bind
>>
>> And cleanup my kmod "frontend" protoype to make it simpler[1]:
>>
>> i) no additional configuration:
>>
>> kmod bind --device 0000:03:00.0 xe
>>
>> ii) with configuration stored somewhere:
>>
>> kmod bind --device 0000:03:00.0 \
>> --config <path-to-config-dir|path-to-config-tarball> \
>> xe
>>
>> iii) with configuration in the command line:
>>
>> kmod bind --device 0000:03:00.0 \
>> --config-kv survivability_mode:1 \
>> xe
>>
>> So, I think the configfs approach is more future proof. For a simple
>> switch/panic-button like survivability_mode I wouldn't oppose to have it
>> as a module parameter though. Maybe make the module param read-only
>> and if per-device support is desired, then handle only via configfs?
> yeah survivability mode is per device. Since there is a plan to extend
> configfs for other attributes will go ahead with configfs.
>
> Thank you all for the inputs.
Even if we go ahead with configfs should we support per device, because
thinking from end user or production environment it doesn't
make sense to recover a device from a system while the system is active
in service. In my opinion the entire system will be taken out of
service for maintenance.
Thanks,
Aravind.
>
> Thanks
> Riana
>
>>
>> Lucas De Marchi
>>
>> [1] Typing here without much thought on the actual interface - my
>> prototype hardcodes it for pci, but the kmod command would
>> probably have to consider other buses too.
>>
>>>
>>>>
>>>> Thanks
>>>> Riana
>>>> > and once we bind the device back it can check if the BDF belongs
>>>> to this
>>>> > driver instance and configure the mode accordingly.
>>>> >
>>>> > Thanks,
>>>> > Aravind.
>>>> > > Registers a configfs subsystem called 'xe' to userspace. The
>>>> user can
>>>> > > use this to modify exposed attributes.
>>>> > >
>>>> > > Add survivability mode attribute (config/xe/survivability_mode)
>>>> to the
>>>> > > subsystem to allow the user to specify the card that should enter
>>>> > > survivability mode.
>>>> > >
>>>> > > Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>>>> > > ---
>>>> > > drivers/gpu/drm/xe/Makefile | 1 +
>>>> > > drivers/gpu/drm/xe/xe_configfs.c | 95
>>>> +++++++++++++++++++++++++ +++++++
>>>> > > drivers/gpu/drm/xe/xe_configfs.h | 12 ++++
>>>> > > drivers/gpu/drm/xe/xe_module.c | 5 ++
>>>> > > drivers/gpu/drm/xe/xe_module.h | 1 +
>>>> > > 5 files changed, 114 insertions(+)
>>>> > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.c
>>>> > > create mode 100644 drivers/gpu/drm/xe/xe_configfs.h
>>>> > >
>>>> > > diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/
>>>> Makefile
>>>> > > index 9699b08585f7..3f8c87292cee 100644
>>>> > > --- a/drivers/gpu/drm/xe/Makefile
>>>> > > +++ b/drivers/gpu/drm/xe/Makefile
>>>> > > @@ -28,6 +28,7 @@ $(obj)/generated/%_wa_oob.c $(obj)/generated/
>>>> %_wa_oob.h: $(obj)/xe_gen_wa_oob \
>>>> > > xe-y += xe_bb.o \
>>>> > > xe_bo.o \
>>>> > > xe_bo_evict.o \
>>>> > > + xe_configfs.o \
>>>> > > xe_devcoredump.o \
>>>> > > xe_device.o \
>>>> > > xe_device_sysfs.o \
>>>> > > diff --git a/drivers/gpu/drm/xe/xe_configfs.c
>>>> b/drivers/gpu/drm/ xe/xe_configfs.c
>>>> > > new file mode 100644
>>>> > > index 000000000000..8c5f248e466d
>>>> > > --- /dev/null
>>>> > > +++ b/drivers/gpu/drm/xe/xe_configfs.c
>>>> > > @@ -0,0 +1,95 @@
>>>> > > +// SPDX-License-Identifier: MIT
>>>> > > +/*
>>>> > > + * Copyright © 2025 Intel Corporation
>>>> > > + */
>>>> > > +
>>>> > > +#include <linux/configfs.h>
>>>> > > +#include <linux/init.h>
>>>> > > +#include <linux/module.h>
>>>> > > +
>>>> > > +#include "xe_configfs.h"
>>>> > > +#include "xe_module.h"
>>>> > > +
>>>> > > +/**
>>>> > > + * DOC: Xe Configfs
>>>> > > + *
>>>> > > + * XE KMD registers a configfs subsystem called 'xe'to
>>>> userspace that allows users to modify
>>>> > > + * the exposed attributes.
>>>> > > + *
>>>> > > + * Attributes:
>>>> > > + *
>>>> > > + * config/xe/survivability_mode : Write only attribute that
>>>> allows user to specify the PCI address
>>>> > > + * of the card that has to enter
>>>> survivability mode
>>>> > > + */
>>>> > > +
>>>> > > +void xe_configfs_clear_survivability_mode(void)
>>>> > > +{
>>>> > > + kfree(xe_modparam.survivability_mode);
>>>> > > + xe_modparam.survivability_mode = NULL;
>>>> > > +}
>>>> > > +
>>>> > > +static ssize_t survivability_mode_store(struct config_item
>>>> *item, const char *page, size_t len)
>>>> > > +{
>>>> > > + char *survivability_mode;
>>>> > > + int ret;
>>>> > > + unsigned int domain, bus, slot, function;
>>>> > > +
>>>> > > + ret = sscanf(page, "%04x:%02x:%02x.%x", &domain, &bus,
>>>> &slot, &function);
>>>> > > + if (ret != 4)
>>>> > > + return -EINVAL;
>>>> > > +
>>>> > > + survivability_mode = kstrdup(page, GFP_KERNEL);
>>>> > > + if (!survivability_mode)
>>>> > > + return -ENOMEM;
>>>> > > +
>>>> > > + xe_configfs_clear_survivability_mode();
>>>> > > + xe_modparam.survivability_mode = survivability_mode;
>>>> > > +
>>>> > > + return len;
>>>> > > +}
>>>> > > +
>>>> > > +CONFIGFS_ATTR_WO(, survivability_mode);
>>>> > > +
>>>> > > +static struct configfs_attribute *xe_configfs_attrs[] = {
>>>> > > + &attr_survivability_mode,
>>>> > > + NULL,
>>>> > > +};
>>>> > > +
>>>> > > +static const struct config_item_type xe_config_type = {
>>>> > > + .ct_attrs = xe_configfs_attrs,
>>>> > > + .ct_owner = THIS_MODULE,
>>>> > > +};
>>>> > > +
>>>> > > +static struct configfs_subsystem xe_config_subsys = {
>>>> > > + .su_group = {
>>>> > > + .cg_item = {
>>>> > > + .ci_namebuf = "xe",
>>>> > > + .ci_type = &xe_config_type,
>>>> > > + },
>>>> > > + },
>>>> > > +};
>>>> > > +
>>>> > > +int __init xe_configfs_init(void)
>>>> > > +{
>>>> > > + int ret;
>>>> > > +
>>>> > > + config_group_init(&xe_config_subsys.su_group);
>>>> > > + mutex_init(&xe_config_subsys.su_mutex);
>>>> > > + ret = configfs_register_subsystem(&xe_config_subsys);
>>>> > > + if (ret) {
>>>> > > + pr_err("Error %d while registering subsystem %s\n",
>>>> > > + ret,
>>>> xe_config_subsys.su_group.cg_item.ci_namebuf);
>>>> > > + mutex_destroy(&xe_config_subsys.su_mutex);
>>>> > > + return ret;
>>>> > > + }
>>>> > > +
>>>> > > + return 0;
>>>> > > +}
>>>> > > +
>>>> > > +void __exit xe_configfs_exit(void)
>>>> > > +{
>>>> > > + xe_configfs_clear_survivability_mode();
>>>> > > + configfs_unregister_subsystem(&xe_config_subsys);
>>>> > > + mutex_destroy(&xe_config_subsys.su_mutex);
>>>> > > +}
>>>> > > +
>>>> > > diff --git a/drivers/gpu/drm/xe/xe_configfs.h
>>>> b/drivers/gpu/drm/ xe/xe_configfs.h
>>>> > > new file mode 100644
>>>> > > index 000000000000..491629a2ca53
>>>> > > --- /dev/null
>>>> > > +++ b/drivers/gpu/drm/xe/xe_configfs.h
>>>> > > @@ -0,0 +1,12 @@
>>>> > > +/* SPDX-License-Identifier: MIT */
>>>> > > +/*
>>>> > > + * Copyright © 2025 Intel Corporation
>>>> > > + */
>>>> > > +#ifndef _XE_CONFIGFS_H_
>>>> > > +#define _XE_CONFIGFS_H_
>>>> > > +
>>>> > > +int xe_configfs_init(void);
>>>> > > +void xe_configfs_exit(void);
>>>> > > +void xe_configfs_clear_survivability_mode(void);
>>>> > > +
>>>> > > +#endif
>>>> > > diff --git a/drivers/gpu/drm/xe/xe_module.c
>>>> b/drivers/gpu/drm/xe/ xe_module.c
>>>> > > index 475acdba2b55..15b3cf22193c 100644
>>>> > > --- a/drivers/gpu/drm/xe/xe_module.c
>>>> > > +++ b/drivers/gpu/drm/xe/xe_module.c
>>>> > > @@ -11,6 +11,7 @@
>>>> > > #include <drm/drm_module.h>
>>>> > > #include "xe_drv.h"
>>>> > > +#include "xe_configfs.h"
>>>> > > #include "xe_hw_fence.h"
>>>> > > #include "xe_pci.h"
>>>> > > #include "xe_pm.h"
>>>> > > @@ -91,6 +92,10 @@ static const struct init_funcs init_funcs[] = {
>>>> > > {
>>>> > > .init = xe_check_nomodeset,
>>>> > > },
>>>> > > + {
>>>> > > + .init = xe_configfs_init,
>>>> > > + .exit = xe_configfs_exit,
>>>> > > + },
>>>> > > {
>>>> > > .init = xe_hw_fence_module_init,
>>>> > > .exit = xe_hw_fence_module_exit,
>>>> > > diff --git a/drivers/gpu/drm/xe/xe_module.h
>>>> b/drivers/gpu/drm/xe/ xe_module.h
>>>> > > index 84339e509c80..c238dbee6bc7 100644
>>>> > > --- a/drivers/gpu/drm/xe/xe_module.h
>>>> > > +++ b/drivers/gpu/drm/xe/xe_module.h
>>>> > > @@ -24,6 +24,7 @@ struct xe_modparam {
>>>> > > #endif
>>>> > > int wedged_mode;
>>>> > > u32 svm_notifier_size;
>>>> > > + char *survivability_mode;
>>>> > > };
>>>> > > extern struct xe_modparam xe_modparam;
>>>>
>
^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2025-03-14 16:17 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-07 14:24 [PATCH 0/2] Add configfs support for survivability mode Riana Tauro
2025-03-07 14:24 ` [PATCH 1/2] RFC drm/xe: Add configfs to enable " Riana Tauro
2025-03-07 14:45 ` Rodrigo Vivi
2025-03-07 15:16 ` Lucas De Marchi
2025-03-10 5:31 ` Riana Tauro
2025-03-10 13:31 ` Lucas De Marchi
2025-03-10 14:23 ` Riana Tauro
2025-03-10 16:40 ` Rodrigo Vivi
2025-03-10 17:01 ` Lucas De Marchi
2025-03-10 17:14 ` Rodrigo Vivi
2025-03-10 21:40 ` Matt Roper
2025-03-10 7:14 ` Aravind Iddamsetty
2025-03-13 6:18 ` Riana Tauro
2025-03-13 14:52 ` Rodrigo Vivi
2025-03-13 19:52 ` Lucas De Marchi
2025-03-14 7:24 ` Riana Tauro
2025-03-14 16:17 ` Aravind Iddamsetty
2025-03-07 14:24 ` [PATCH 2/2] RFC drm/xe: Enable configfs support for " Riana Tauro
2025-03-07 15:01 ` Rodrigo Vivi
2025-03-10 5:36 ` Riana Tauro
2025-03-07 14:36 ` ✓ CI.Patch_applied: success for Add " Patchwork
2025-03-07 14:36 ` ✗ CI.checkpatch: warning " Patchwork
2025-03-07 14:37 ` ✗ CI.KUnit: failure " Patchwork
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox