* [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time
@ 2024-09-20 13:18 Francois Dugast
2024-09-20 13:51 ` ✓ CI.Patch_applied: success for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2) Patchwork
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Francois Dugast @ 2024-09-20 13:18 UTC (permalink / raw)
To: intel-xe
Cc: Francois Dugast, Lucas De Marchi, Matthew Brost, Rodrigo Vivi,
Michal Wajdeczko
The kernel fault injection infrastructure is used to test proper error
handling during probe. In particular, ALLOW_ERROR_INJECTION() is added
directly to functions which fullfill the injectable functions
requirements:
fault-injection.html#requirements-for-the-error-injectable-functions
Otherwise a helper function is added and called in the beginning of the
function where the fault is to be injected.
The return code of the functions using ALLOW_ERROR_INJECTION() can be
conditionnally modified at runtime by tuning some debugfs entries. This
requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
One way to use fault injection at probe time by making each of those
functions fail one at a time is:
FAILTYPE=fail_function
DEVICE="0000:00:08.0" # depends on the system
ERRNO=-12 # -ENOMEM, can depend on the function
echo N > /sys/kernel/debug/$FAILTYPE/task-filter
echo 100 > /sys/kernel/debug/$FAILTYPE/probability
echo 0 > /sys/kernel/debug/$FAILTYPE/interval
echo -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
modprobe xe
echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
cut -d ' ' -f 1 | while read -r FUNCTION ; do
echo "Injecting fault in $FUNCTION"
echo "" > /sys/kernel/debug/$FAILTYPE/inject
echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
echo $DEVICE > /sys/bus/pci/drivers/xe/bind
done
rmmod xe
It will also be integrated into IGT for systematic execution by CI.
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
---
drivers/gpu/drm/xe/xe_device.c | 12 ++++++++++++
drivers/gpu/drm/xe/xe_ggtt.c | 2 ++
drivers/gpu/drm/xe/xe_guc_ads.c | 13 +++++++++++++
drivers/gpu/drm/xe/xe_guc_ct.c | 11 +++++++++++
drivers/gpu/drm/xe/xe_guc_log.c | 13 +++++++++++++
drivers/gpu/drm/xe/xe_guc_relay.c | 11 +++++++++++
drivers/gpu/drm/xe/xe_pm.c | 11 +++++++++++
drivers/gpu/drm/xe/xe_sriov.c | 15 ++++++++++++++-
drivers/gpu/drm/xe/xe_tile.c | 12 ++++++++++++
drivers/gpu/drm/xe/xe_uc_fw.c | 11 +++++++++++
drivers/gpu/drm/xe/xe_wa.c | 12 ++++++++++++
drivers/gpu/drm/xe/xe_wopcm.c | 3 +++
12 files changed, 125 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index cb5a9fd820cf..d26352ecf75e 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -6,6 +6,7 @@
#include "xe_device.h"
#include <linux/delay.h>
+#include <linux/fault-inject.h>
#include <linux/units.h>
#include <drm/drm_aperture.h>
@@ -300,12 +301,22 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
ttm_device_fini(&xe->ttm);
}
+static noinline int fault_inject_device_create(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_device_create, ERRNO);
+
struct xe_device *xe_device_create(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
struct xe_device *xe;
int err;
+ err = fault_inject_device_create();
+ if (err)
+ return ERR_PTR(err);
+
xe_display_driver_set_hooks(&driver);
err = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver);
@@ -548,6 +559,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
return 0;
}
+ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
static void update_device_info(struct xe_device *xe)
{
diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
index f68af56c3f86..4906e3f3150b 100644
--- a/drivers/gpu/drm/xe/xe_ggtt.c
+++ b/drivers/gpu/drm/xe/xe_ggtt.c
@@ -5,6 +5,7 @@
#include "xe_ggtt.h"
+#include <linux/fault-inject.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sizes.h>
@@ -264,6 +265,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
return 0;
}
+ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO);
static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index 66d4e5e95abd..e366043eb4b8 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -5,6 +5,8 @@
#include "xe_guc_ads.h"
+#include <linux/fault-inject.h>
+
#include <drm/drm_managed.h>
#include <generated/xe_wa_oob.h>
@@ -396,12 +398,23 @@ static int calculate_waklv_size(struct xe_guc_ads *ads)
#define MAX_GOLDEN_LRC_SIZE (SZ_4K * 64)
+static noinline int fault_inject_guc_ads_init(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_guc_ads_init, ERRNO);
+
int xe_guc_ads_init(struct xe_guc_ads *ads)
{
struct xe_device *xe = ads_to_xe(ads);
struct xe_gt *gt = ads_to_gt(ads);
struct xe_tile *tile = gt_to_tile(gt);
struct xe_bo *bo;
+ int ret;
+
+ ret = fault_inject_guc_ads_init();
+ if (ret)
+ return ret;
ads->golden_lrc_size = calculate_golden_lrc_size(ads);
ads->regset_size = calculate_regset_size(gt);
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 4b95f75b1546..61967ddd319f 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -8,6 +8,7 @@
#include <linux/bitfield.h>
#include <linux/circ_buf.h>
#include <linux/delay.h>
+#include <linux/fault-inject.h>
#include <kunit/static_stub.h>
@@ -165,6 +166,12 @@ static void primelockdep(struct xe_guc_ct *ct)
fs_reclaim_release(GFP_KERNEL);
}
+static noinline int fault_inject_guc_ct_init(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_guc_ct_init, ERRNO);
+
int xe_guc_ct_init(struct xe_guc_ct *ct)
{
struct xe_device *xe = ct_to_xe(ct);
@@ -173,6 +180,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
struct xe_bo *bo;
int err;
+ err = fault_inject_guc_ct_init();
+ if (err)
+ return err;
+
xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", 0);
diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
index a37ee3419428..a3e54f1bb0c3 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.c
+++ b/drivers/gpu/drm/xe/xe_guc_log.c
@@ -5,6 +5,8 @@
#include "xe_guc_log.h"
+#include <linux/fault-inject.h>
+
#include <drm/drm_managed.h>
#include "xe_bo.h"
@@ -77,11 +79,22 @@ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p)
}
}
+static noinline int fault_inject_guc_log_init(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_guc_log_init, ERRNO);
+
int xe_guc_log_init(struct xe_guc_log *log)
{
struct xe_device *xe = log_to_xe(log);
struct xe_tile *tile = gt_to_tile(log_to_gt(log));
struct xe_bo *bo;
+ int err;
+
+ err = fault_inject_guc_log_init();
+ if (err)
+ return err;
bo = xe_managed_bo_create_pin_map(xe, tile, guc_log_size(),
XE_BO_FLAG_SYSTEM |
diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
index ade6162dc259..ede7fd3e7785 100644
--- a/drivers/gpu/drm/xe/xe_guc_relay.c
+++ b/drivers/gpu/drm/xe/xe_guc_relay.c
@@ -5,6 +5,7 @@
#include <linux/bitfield.h>
#include <linux/delay.h>
+#include <linux/fault-inject.h>
#include <drm/drm_managed.h>
@@ -320,6 +321,12 @@ static void __fini_relay(struct drm_device *drm, void *arg)
mempool_exit(&relay->pool);
}
+static noinline int fault_inject_guc_relay_init(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_guc_relay_init, ERRNO);
+
/**
* xe_guc_relay_init - Initialize a &xe_guc_relay
* @relay: the &xe_guc_relay to initialize
@@ -335,6 +342,10 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
struct xe_device *xe = relay_to_xe(relay);
int err;
+ err = fault_inject_guc_relay_init();
+ if (err)
+ return err;
+
relay_assert(relay, !relay_is_ready(relay));
if (!IS_SRIOV(xe))
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index 33eb039053e4..87075aed885d 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -5,6 +5,7 @@
#include "xe_pm.h"
+#include <linux/fault-inject.h>
#include <linux/pm_runtime.h>
#include <drm/drm_managed.h>
@@ -247,10 +248,20 @@ static void xe_pm_runtime_init(struct xe_device *xe)
pm_runtime_put(dev);
}
+static noinline int fault_inject_pm_init_early(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_pm_init_early, ERRNO);
+
int xe_pm_init_early(struct xe_device *xe)
{
int err;
+ err = fault_inject_pm_init_early();
+ if (err)
+ return err;
+
INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
err = drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
index 69a066ef20c0..f1dafcfd4eae 100644
--- a/drivers/gpu/drm/xe/xe_sriov.c
+++ b/drivers/gpu/drm/xe/xe_sriov.c
@@ -3,6 +3,8 @@
* Copyright © 2023 Intel Corporation
*/
+#include <linux/fault-inject.h>
+
#include <drm/drm_managed.h>
#include "regs/xe_regs.h"
@@ -91,6 +93,12 @@ static void fini_sriov(struct drm_device *drm, void *arg)
xe->sriov.wq = NULL;
}
+static noinline int fault_inject_sriov_init(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_sriov_init, ERRNO);
+
/**
* xe_sriov_init - Initialize SR-IOV specific data.
* @xe: the &xe_device to initialize
@@ -102,11 +110,16 @@ static void fini_sriov(struct drm_device *drm, void *arg)
*/
int xe_sriov_init(struct xe_device *xe)
{
+ int err = fault_inject_sriov_init();
+
+ if (err)
+ return err;
+
if (!IS_SRIOV(xe))
return 0;
if (IS_SRIOV_PF(xe)) {
- int err = xe_sriov_pf_init_early(xe);
+ err = xe_sriov_pf_init_early(xe);
if (err)
return err;
diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
index dda5268507d8..c82b4278c03e 100644
--- a/drivers/gpu/drm/xe/xe_tile.c
+++ b/drivers/gpu/drm/xe/xe_tile.c
@@ -3,6 +3,8 @@
* Copyright © 2023 Intel Corporation
*/
+#include <linux/fault-inject.h>
+
#include <drm/drm_managed.h>
#include "xe_device.h"
@@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
return 0;
}
+static noinline int fault_inject_tile_init_early(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_tile_init_early, ERRNO);
+
/**
* xe_tile_init_early - Initialize the tile and primary GT
* @tile: Tile to initialize
@@ -114,6 +122,10 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
{
int err;
+ err = fault_inject_tile_init_early();
+ if (err)
+ return err;
+
tile->xe = xe;
tile->id = id;
diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
index eab9456e051f..8fff0fd7c675 100644
--- a/drivers/gpu/drm/xe/xe_uc_fw.c
+++ b/drivers/gpu/drm/xe/xe_uc_fw.c
@@ -4,6 +4,7 @@
*/
#include <linux/bitfield.h>
+#include <linux/fault-inject.h>
#include <linux/firmware.h>
#include <drm/drm_managed.h>
@@ -776,11 +777,21 @@ static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32
return err;
}
+static noinline int fault_inject_uc_fw_init(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_uc_fw_init, ERRNO);
+
int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
{
const struct firmware *fw = NULL;
int err;
+ err = fault_inject_uc_fw_init();
+ if (err)
+ return err;
+
err = uc_fw_request(uc_fw, &fw);
if (err)
return err;
diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index 22c148b1e996..121443b790bf 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -8,6 +8,7 @@
#include <drm/drm_managed.h>
#include <kunit/visibility.h>
#include <linux/compiler_types.h>
+#include <linux/fault-inject.h>
#include <generated/xe_wa_oob.h>
@@ -818,6 +819,12 @@ void xe_wa_process_lrc(struct xe_hw_engine *hwe)
xe_rtp_process_to_sr(&ctx, lrc_was, &hwe->reg_lrc);
}
+static noinline int fault_inject_wa_init(void)
+{
+ return 0;
+}
+ALLOW_ERROR_INJECTION(fault_inject_wa_init, ERRNO);
+
/**
* xe_wa_init - initialize gt with workaround bookkeeping
* @gt: GT instance to initialize
@@ -829,6 +836,11 @@ int xe_wa_init(struct xe_gt *gt)
struct xe_device *xe = gt_to_xe(gt);
size_t n_oob, n_lrc, n_engine, n_gt, total;
unsigned long *p;
+ int err;
+
+ err = fault_inject_wa_init();
+ if (err)
+ return err;
n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_was));
n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_was));
diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
index 93c82825d896..88a201122a22 100644
--- a/drivers/gpu/drm/xe/xe_wopcm.c
+++ b/drivers/gpu/drm/xe/xe_wopcm.c
@@ -5,6 +5,8 @@
#include "xe_wopcm.h"
+#include <linux/fault-inject.h>
+
#include "regs/xe_guc_regs.h"
#include "xe_device.h"
#include "xe_force_wake.h"
@@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
return ret;
}
+ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
--
2.43.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* ✓ CI.Patch_applied: success for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
2024-09-20 13:18 [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
@ 2024-09-20 13:51 ` Patchwork
2024-09-20 13:51 ` ✓ CI.checkpatch: " Patchwork
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2024-09-20 13:51 UTC (permalink / raw)
To: Francois Dugast; +Cc: intel-xe
== Series Details ==
Series: drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
URL : https://patchwork.freedesktop.org/series/138654/
State : success
== Summary ==
=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: 128ccc5f71d7 drm-tip: 2024y-09m-20d-07h-32m-33s UTC integration manifest
=== git am output follows ===
Applying: drm/xe: Use fault injection infrastructure to find issues at probe time
^ permalink raw reply [flat|nested] 9+ messages in thread
* ✓ CI.checkpatch: success for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
2024-09-20 13:18 [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
2024-09-20 13:51 ` ✓ CI.Patch_applied: success for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2) Patchwork
@ 2024-09-20 13:51 ` Patchwork
2024-09-20 13:52 ` ✗ CI.KUnit: failure " Patchwork
2024-09-23 18:05 ` [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Rodrigo Vivi
3 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2024-09-20 13:51 UTC (permalink / raw)
To: Francois Dugast; +Cc: intel-xe
== Series Details ==
Series: drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
URL : https://patchwork.freedesktop.org/series/138654/
State : success
== Summary ==
+ KERNEL=/kernel
+ git clone https://gitlab.freedesktop.org/drm/maintainer-tools mt
Cloning into 'mt'...
warning: redirecting to https://gitlab.freedesktop.org/drm/maintainer-tools.git/
+ git -C mt rev-list -n1 origin/master
30ab6715fc09baee6cc14cb3c89ad8858688d474
+ cd /kernel
+ git config --global --add safe.directory /kernel
+ git log -n1
commit 22f9fcf277ca94a694a29999d5ae7fff67f0406e
Author: Francois Dugast <francois.dugast@intel.com>
Date: Fri Sep 20 15:18:34 2024 +0200
drm/xe: Use fault injection infrastructure to find issues at probe time
The kernel fault injection infrastructure is used to test proper error
handling during probe. In particular, ALLOW_ERROR_INJECTION() is added
directly to functions which fullfill the injectable functions
requirements:
fault-injection.html#requirements-for-the-error-injectable-functions
Otherwise a helper function is added and called in the beginning of the
function where the fault is to be injected.
The return code of the functions using ALLOW_ERROR_INJECTION() can be
conditionnally modified at runtime by tuning some debugfs entries. This
requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
One way to use fault injection at probe time by making each of those
functions fail one at a time is:
FAILTYPE=fail_function
DEVICE="0000:00:08.0" # depends on the system
ERRNO=-12 # -ENOMEM, can depend on the function
echo N > /sys/kernel/debug/$FAILTYPE/task-filter
echo 100 > /sys/kernel/debug/$FAILTYPE/probability
echo 0 > /sys/kernel/debug/$FAILTYPE/interval
echo -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
modprobe xe
echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
cut -d ' ' -f 1 | while read -r FUNCTION ; do
echo "Injecting fault in $FUNCTION"
echo "" > /sys/kernel/debug/$FAILTYPE/inject
echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
echo $DEVICE > /sys/bus/pci/drivers/xe/bind
done
rmmod xe
It will also be integrated into IGT for systematic execution by CI.
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
+ /mt/dim checkpatch 128ccc5f71d7b9d84ce8f0651aa713ae490f4990 drm-intel
22f9fcf277ca drm/xe: Use fault injection infrastructure to find issues at probe time
^ permalink raw reply [flat|nested] 9+ messages in thread
* ✗ CI.KUnit: failure for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
2024-09-20 13:18 [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
2024-09-20 13:51 ` ✓ CI.Patch_applied: success for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2) Patchwork
2024-09-20 13:51 ` ✓ CI.checkpatch: " Patchwork
@ 2024-09-20 13:52 ` Patchwork
2024-09-23 10:34 ` Francois Dugast
2024-09-23 18:05 ` [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Rodrigo Vivi
3 siblings, 1 reply; 9+ messages in thread
From: Patchwork @ 2024-09-20 13:52 UTC (permalink / raw)
To: Francois Dugast; +Cc: intel-xe
== Series Details ==
Series: drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
URL : https://patchwork.freedesktop.org/series/138654/
State : failure
== Summary ==
+ trap cleanup EXIT
+ /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
ERROR:root:In file included from ../drivers/gpu/drm/xe/xe_guc_ads.c:8:
../include/linux/fault-inject.h:97:15: error: unknown type name ‘bool’
97 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
| ^~~~
../include/linux/fault-inject.h:97:43: error: unknown type name ‘gfp_t’
97 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
| ^~~~~
../include/linux/fault-inject.h:106:57: error: unknown type name ‘gfp_t’
106 | static inline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
| ^~~~~
make[7]: *** [../scripts/Makefile.build:244: drivers/gpu/drm/xe/xe_guc_ads.o] Error 1
make[7]: *** Waiting for unfinished jobs....
make[6]: *** [../scripts/Makefile.build:485: drivers/gpu/drm/xe] Error 2
make[6]: *** Waiting for unfinished jobs....
../lib/iomap.c:156:5: warning: no previous prototype for ‘ioread64_lo_hi’ [-Wmissing-prototypes]
156 | u64 ioread64_lo_hi(const void __iomem *addr)
| ^~~~~~~~~~~~~~
../lib/iomap.c:163:5: warning: no previous prototype for ‘ioread64_hi_lo’ [-Wmissing-prototypes]
163 | u64 ioread64_hi_lo(const void __iomem *addr)
| ^~~~~~~~~~~~~~
../lib/iomap.c:170:5: warning: no previous prototype for ‘ioread64be_lo_hi’ [-Wmissing-prototypes]
170 | u64 ioread64be_lo_hi(const void __iomem *addr)
| ^~~~~~~~~~~~~~~~
../lib/iomap.c:178:5: warning: no previous prototype for ‘ioread64be_hi_lo’ [-Wmissing-prototypes]
178 | u64 ioread64be_hi_lo(const void __iomem *addr)
| ^~~~~~~~~~~~~~~~
../lib/iomap.c:264:6: warning: no previous prototype for ‘iowrite64_lo_hi’ [-Wmissing-prototypes]
264 | void iowrite64_lo_hi(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~
../lib/iomap.c:272:6: warning: no previous prototype for ‘iowrite64_hi_lo’ [-Wmissing-prototypes]
272 | void iowrite64_hi_lo(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~
../lib/iomap.c:280:6: warning: no previous prototype for ‘iowrite64be_lo_hi’ [-Wmissing-prototypes]
280 | void iowrite64be_lo_hi(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~~~
../lib/iomap.c:288:6: warning: no previous prototype for ‘iowrite64be_hi_lo’ [-Wmissing-prototypes]
288 | void iowrite64be_hi_lo(u64 val, void __iomem *addr)
| ^~~~~~~~~~~~~~~~~
make[5]: *** [../scripts/Makefile.build:485: drivers/gpu/drm] Error 2
make[4]: *** [../scripts/Makefile.build:485: drivers/gpu] Error 2
make[3]: *** [../scripts/Makefile.build:485: drivers] Error 2
make[3]: *** Waiting for unfinished jobs....
make[2]: *** [/kernel/Makefile:1926: .] Error 2
make[1]: *** [/kernel/Makefile:224: __sub-make] Error 2
make: *** [Makefile:224: __sub-make] Error 2
[13:51:49] Configuring KUnit Kernel ...
Generating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[13:51:53] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make ARCH=um O=.kunit --jobs=48
+ cleanup
++ stat -c %u:%g /kernel
+ chown -R 1003:1003 /kernel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ✗ CI.KUnit: failure for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
2024-09-20 13:52 ` ✗ CI.KUnit: failure " Patchwork
@ 2024-09-23 10:34 ` Francois Dugast
2024-09-23 12:00 ` Jani Nikula
0 siblings, 1 reply; 9+ messages in thread
From: Francois Dugast @ 2024-09-23 10:34 UTC (permalink / raw)
To: intel-xe
On Fri, Sep 20, 2024 at 01:52:17PM +0000, Patchwork wrote:
> == Series Details ==
>
> Series: drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
> URL : https://patchwork.freedesktop.org/series/138654/
> State : failure
>
> == Summary ==
>
> + trap cleanup EXIT
> + /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
> ERROR:root:In file included from ../drivers/gpu/drm/xe/xe_guc_ads.c:8:
> ../include/linux/fault-inject.h:97:15: error: unknown type name ‘bool’
> 97 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
> | ^~~~
> ../include/linux/fault-inject.h:97:43: error: unknown type name ‘gfp_t’
> 97 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
> | ^~~~~
> ../include/linux/fault-inject.h:106:57: error: unknown type name ‘gfp_t’
> 106 | static inline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
> | ^~~~~
> make[7]: *** [../scripts/Makefile.build:244: drivers/gpu/drm/xe/xe_guc_ads.o] Error 1
This build error occurs when CONFIG_FAULT_INJECTION=n due to missing inclusion
of linux/types.h in linux/fault-inject.h. It is solved with Jani's series:
https://lore.kernel.org/intel-xe/20240813121237.2382534-1-jani.nikula@intel.com/
Francois
> make[7]: *** Waiting for unfinished jobs....
> make[6]: *** [../scripts/Makefile.build:485: drivers/gpu/drm/xe] Error 2
> make[6]: *** Waiting for unfinished jobs....
> ../lib/iomap.c:156:5: warning: no previous prototype for ‘ioread64_lo_hi’ [-Wmissing-prototypes]
> 156 | u64 ioread64_lo_hi(const void __iomem *addr)
> | ^~~~~~~~~~~~~~
> ../lib/iomap.c:163:5: warning: no previous prototype for ‘ioread64_hi_lo’ [-Wmissing-prototypes]
> 163 | u64 ioread64_hi_lo(const void __iomem *addr)
> | ^~~~~~~~~~~~~~
> ../lib/iomap.c:170:5: warning: no previous prototype for ‘ioread64be_lo_hi’ [-Wmissing-prototypes]
> 170 | u64 ioread64be_lo_hi(const void __iomem *addr)
> | ^~~~~~~~~~~~~~~~
> ../lib/iomap.c:178:5: warning: no previous prototype for ‘ioread64be_hi_lo’ [-Wmissing-prototypes]
> 178 | u64 ioread64be_hi_lo(const void __iomem *addr)
> | ^~~~~~~~~~~~~~~~
> ../lib/iomap.c:264:6: warning: no previous prototype for ‘iowrite64_lo_hi’ [-Wmissing-prototypes]
> 264 | void iowrite64_lo_hi(u64 val, void __iomem *addr)
> | ^~~~~~~~~~~~~~~
> ../lib/iomap.c:272:6: warning: no previous prototype for ‘iowrite64_hi_lo’ [-Wmissing-prototypes]
> 272 | void iowrite64_hi_lo(u64 val, void __iomem *addr)
> | ^~~~~~~~~~~~~~~
> ../lib/iomap.c:280:6: warning: no previous prototype for ‘iowrite64be_lo_hi’ [-Wmissing-prototypes]
> 280 | void iowrite64be_lo_hi(u64 val, void __iomem *addr)
> | ^~~~~~~~~~~~~~~~~
> ../lib/iomap.c:288:6: warning: no previous prototype for ‘iowrite64be_hi_lo’ [-Wmissing-prototypes]
> 288 | void iowrite64be_hi_lo(u64 val, void __iomem *addr)
> | ^~~~~~~~~~~~~~~~~
> make[5]: *** [../scripts/Makefile.build:485: drivers/gpu/drm] Error 2
> make[4]: *** [../scripts/Makefile.build:485: drivers/gpu] Error 2
> make[3]: *** [../scripts/Makefile.build:485: drivers] Error 2
> make[3]: *** Waiting for unfinished jobs....
> make[2]: *** [/kernel/Makefile:1926: .] Error 2
> make[1]: *** [/kernel/Makefile:224: __sub-make] Error 2
> make: *** [Makefile:224: __sub-make] Error 2
>
> [13:51:49] Configuring KUnit Kernel ...
> Generating .config ...
> Populating config with:
> $ make ARCH=um O=.kunit olddefconfig
> [13:51:53] Building KUnit Kernel ...
> Populating config with:
> $ make ARCH=um O=.kunit olddefconfig
> Building with:
> $ make ARCH=um O=.kunit --jobs=48
> + cleanup
> ++ stat -c %u:%g /kernel
> + chown -R 1003:1003 /kernel
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: ✗ CI.KUnit: failure for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
2024-09-23 10:34 ` Francois Dugast
@ 2024-09-23 12:00 ` Jani Nikula
0 siblings, 0 replies; 9+ messages in thread
From: Jani Nikula @ 2024-09-23 12:00 UTC (permalink / raw)
To: Francois Dugast, intel-xe
On Mon, 23 Sep 2024, Francois Dugast <francois.dugast@intel.com> wrote:
> On Fri, Sep 20, 2024 at 01:52:17PM +0000, Patchwork wrote:
>> == Series Details ==
>>
>> Series: drm/xe: Use fault injection infrastructure to find issues at probe time (rev2)
>> URL : https://patchwork.freedesktop.org/series/138654/
>> State : failure
>>
>> == Summary ==
>>
>> + trap cleanup EXIT
>> + /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
>> ERROR:root:In file included from ../drivers/gpu/drm/xe/xe_guc_ads.c:8:
>> ../include/linux/fault-inject.h:97:15: error: unknown type name ‘bool’
>> 97 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
>> | ^~~~
>> ../include/linux/fault-inject.h:97:43: error: unknown type name ‘gfp_t’
>> 97 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
>> | ^~~~~
>> ../include/linux/fault-inject.h:106:57: error: unknown type name ‘gfp_t’
>> 106 | static inline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
>> | ^~~~~
>> make[7]: *** [../scripts/Makefile.build:244: drivers/gpu/drm/xe/xe_guc_ads.o] Error 1
>
> This build error occurs when CONFIG_FAULT_INJECTION=n due to missing inclusion
> of linux/types.h in linux/fault-inject.h. It is solved with Jani's series:
>
> https://lore.kernel.org/intel-xe/20240813121237.2382534-1-jani.nikula@intel.com/
These two are in Linus' master now, we'll get them after -rc1 and some
backmerges:
6ce2082fd3a2 ("fault-inject: improve build for CONFIG_FAULT_INJECTION=n")
ccbfd2df3018 ("drm/xe: clean up fault injection usage")
BR,
Jani.
--
Jani Nikula, Intel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time
2024-09-20 13:18 [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
` (2 preceding siblings ...)
2024-09-20 13:52 ` ✗ CI.KUnit: failure " Patchwork
@ 2024-09-23 18:05 ` Rodrigo Vivi
2024-09-24 12:58 ` Francois Dugast
3 siblings, 1 reply; 9+ messages in thread
From: Rodrigo Vivi @ 2024-09-23 18:05 UTC (permalink / raw)
To: Francois Dugast
Cc: intel-xe, Lucas De Marchi, Matthew Brost, Michal Wajdeczko
On Fri, Sep 20, 2024 at 03:18:34PM +0200, Francois Dugast wrote:
> The kernel fault injection infrastructure is used to test proper error
> handling during probe. In particular, ALLOW_ERROR_INJECTION() is added
> directly to functions which fullfill the injectable functions
> requirements:
> fault-injection.html#requirements-for-the-error-injectable-functions
Okay, I read this. This was my biggest initial question here...
>
> Otherwise a helper function is added and called in the beginning of the
> function where the fault is to be injected.
>
> The return code of the functions using ALLOW_ERROR_INJECTION() can be
> conditionnally modified at runtime by tuning some debugfs entries. This
> requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
>
> One way to use fault injection at probe time by making each of those
> functions fail one at a time is:
>
> FAILTYPE=fail_function
> DEVICE="0000:00:08.0" # depends on the system
> ERRNO=-12 # -ENOMEM, can depend on the function
>
> echo N > /sys/kernel/debug/$FAILTYPE/task-filter
> echo 100 > /sys/kernel/debug/$FAILTYPE/probability
> echo 0 > /sys/kernel/debug/$FAILTYPE/interval
> echo -1 > /sys/kernel/debug/$FAILTYPE/times
> echo 0 > /sys/kernel/debug/$FAILTYPE/space
> echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
>
> modprobe xe
> echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
>
> grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
> cut -d ' ' -f 1 | while read -r FUNCTION ; do
> echo "Injecting fault in $FUNCTION"
> echo "" > /sys/kernel/debug/$FAILTYPE/inject
> echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
> printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
> echo $DEVICE > /sys/bus/pci/drivers/xe/bind
> done
>
> rmmod xe
>
> It will also be integrated into IGT for systematic execution by CI.
>
> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> ---
> drivers/gpu/drm/xe/xe_device.c | 12 ++++++++++++
> drivers/gpu/drm/xe/xe_ggtt.c | 2 ++
> drivers/gpu/drm/xe/xe_guc_ads.c | 13 +++++++++++++
> drivers/gpu/drm/xe/xe_guc_ct.c | 11 +++++++++++
> drivers/gpu/drm/xe/xe_guc_log.c | 13 +++++++++++++
> drivers/gpu/drm/xe/xe_guc_relay.c | 11 +++++++++++
> drivers/gpu/drm/xe/xe_pm.c | 11 +++++++++++
> drivers/gpu/drm/xe/xe_sriov.c | 15 ++++++++++++++-
> drivers/gpu/drm/xe/xe_tile.c | 12 ++++++++++++
> drivers/gpu/drm/xe/xe_uc_fw.c | 11 +++++++++++
> drivers/gpu/drm/xe/xe_wa.c | 12 ++++++++++++
> drivers/gpu/drm/xe/xe_wopcm.c | 3 +++
> 12 files changed, 125 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index cb5a9fd820cf..d26352ecf75e 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -6,6 +6,7 @@
> #include "xe_device.h"
>
> #include <linux/delay.h>
> +#include <linux/fault-inject.h>
> #include <linux/units.h>
>
> #include <drm/drm_aperture.h>
> @@ -300,12 +301,22 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
> ttm_device_fini(&xe->ttm);
> }
>
> +static noinline int fault_inject_device_create(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_device_create, ERRNO);
> +
> struct xe_device *xe_device_create(struct pci_dev *pdev,
> const struct pci_device_id *ent)
> {
> struct xe_device *xe;
> int err;
>
> + err = fault_inject_device_create();
But then I don't understand here. To me it looks like this function would
fulfill the criteria. It is not doing any clean up on the err goto,
nor changingn any state before the first error return.
Why can't we go directly?
Perhaps we should add a comment on each of this new fault_inject
functions to explain why that was chosen instead of the direct
macro assignment?
> + if (err)
> + return ERR_PTR(err);
> +
> xe_display_driver_set_hooks(&driver);
>
> err = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver);
> @@ -548,6 +559,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
>
> return 0;
> }
> +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
>
> static void update_device_info(struct xe_device *xe)
> {
> diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> index f68af56c3f86..4906e3f3150b 100644
> --- a/drivers/gpu/drm/xe/xe_ggtt.c
> +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> @@ -5,6 +5,7 @@
>
> #include "xe_ggtt.h"
>
> +#include <linux/fault-inject.h>
> #include <linux/io-64-nonatomic-lo-hi.h>
> #include <linux/sizes.h>
>
> @@ -264,6 +265,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
>
> return 0;
> }
> +ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO);
>
> static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> index 66d4e5e95abd..e366043eb4b8 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> @@ -5,6 +5,8 @@
>
> #include "xe_guc_ads.h"
>
> +#include <linux/fault-inject.h>
> +
> #include <drm/drm_managed.h>
>
> #include <generated/xe_wa_oob.h>
> @@ -396,12 +398,23 @@ static int calculate_waklv_size(struct xe_guc_ads *ads)
>
> #define MAX_GOLDEN_LRC_SIZE (SZ_4K * 64)
>
> +static noinline int fault_inject_guc_ads_init(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_guc_ads_init, ERRNO);
> +
> int xe_guc_ads_init(struct xe_guc_ads *ads)
> {
> struct xe_device *xe = ads_to_xe(ads);
> struct xe_gt *gt = ads_to_gt(ads);
> struct xe_tile *tile = gt_to_tile(gt);
> struct xe_bo *bo;
> + int ret;
> +
> + ret = fault_inject_guc_ads_init();
> + if (ret)
> + return ret;
>
> ads->golden_lrc_size = calculate_golden_lrc_size(ads);
> ads->regset_size = calculate_regset_size(gt);
> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> index 4b95f75b1546..61967ddd319f 100644
> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> @@ -8,6 +8,7 @@
> #include <linux/bitfield.h>
> #include <linux/circ_buf.h>
> #include <linux/delay.h>
> +#include <linux/fault-inject.h>
>
> #include <kunit/static_stub.h>
>
> @@ -165,6 +166,12 @@ static void primelockdep(struct xe_guc_ct *ct)
> fs_reclaim_release(GFP_KERNEL);
> }
>
> +static noinline int fault_inject_guc_ct_init(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_guc_ct_init, ERRNO);
> +
> int xe_guc_ct_init(struct xe_guc_ct *ct)
> {
> struct xe_device *xe = ct_to_xe(ct);
> @@ -173,6 +180,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
> struct xe_bo *bo;
> int err;
>
> + err = fault_inject_guc_ct_init();
> + if (err)
> + return err;
> +
> xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
>
> ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", 0);
> diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> index a37ee3419428..a3e54f1bb0c3 100644
> --- a/drivers/gpu/drm/xe/xe_guc_log.c
> +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> @@ -5,6 +5,8 @@
>
> #include "xe_guc_log.h"
>
> +#include <linux/fault-inject.h>
> +
> #include <drm/drm_managed.h>
>
> #include "xe_bo.h"
> @@ -77,11 +79,22 @@ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p)
> }
> }
>
> +static noinline int fault_inject_guc_log_init(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_guc_log_init, ERRNO);
> +
> int xe_guc_log_init(struct xe_guc_log *log)
> {
> struct xe_device *xe = log_to_xe(log);
> struct xe_tile *tile = gt_to_tile(log_to_gt(log));
> struct xe_bo *bo;
> + int err;
> +
> + err = fault_inject_guc_log_init();
> + if (err)
> + return err;
>
> bo = xe_managed_bo_create_pin_map(xe, tile, guc_log_size(),
> XE_BO_FLAG_SYSTEM |
> diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
> index ade6162dc259..ede7fd3e7785 100644
> --- a/drivers/gpu/drm/xe/xe_guc_relay.c
> +++ b/drivers/gpu/drm/xe/xe_guc_relay.c
> @@ -5,6 +5,7 @@
>
> #include <linux/bitfield.h>
> #include <linux/delay.h>
> +#include <linux/fault-inject.h>
>
> #include <drm/drm_managed.h>
>
> @@ -320,6 +321,12 @@ static void __fini_relay(struct drm_device *drm, void *arg)
> mempool_exit(&relay->pool);
> }
>
> +static noinline int fault_inject_guc_relay_init(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_guc_relay_init, ERRNO);
> +
> /**
> * xe_guc_relay_init - Initialize a &xe_guc_relay
> * @relay: the &xe_guc_relay to initialize
> @@ -335,6 +342,10 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
> struct xe_device *xe = relay_to_xe(relay);
> int err;
>
> + err = fault_inject_guc_relay_init();
> + if (err)
> + return err;
> +
> relay_assert(relay, !relay_is_ready(relay));
>
> if (!IS_SRIOV(xe))
> diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
> index 33eb039053e4..87075aed885d 100644
> --- a/drivers/gpu/drm/xe/xe_pm.c
> +++ b/drivers/gpu/drm/xe/xe_pm.c
> @@ -5,6 +5,7 @@
>
> #include "xe_pm.h"
>
> +#include <linux/fault-inject.h>
> #include <linux/pm_runtime.h>
>
> #include <drm/drm_managed.h>
> @@ -247,10 +248,20 @@ static void xe_pm_runtime_init(struct xe_device *xe)
> pm_runtime_put(dev);
> }
>
> +static noinline int fault_inject_pm_init_early(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_pm_init_early, ERRNO);
> +
> int xe_pm_init_early(struct xe_device *xe)
> {
> int err;
>
> + err = fault_inject_pm_init_early();
> + if (err)
> + return err;
> +
> INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
>
> err = drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
> diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
> index 69a066ef20c0..f1dafcfd4eae 100644
> --- a/drivers/gpu/drm/xe/xe_sriov.c
> +++ b/drivers/gpu/drm/xe/xe_sriov.c
> @@ -3,6 +3,8 @@
> * Copyright © 2023 Intel Corporation
> */
>
> +#include <linux/fault-inject.h>
> +
> #include <drm/drm_managed.h>
>
> #include "regs/xe_regs.h"
> @@ -91,6 +93,12 @@ static void fini_sriov(struct drm_device *drm, void *arg)
> xe->sriov.wq = NULL;
> }
>
> +static noinline int fault_inject_sriov_init(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_sriov_init, ERRNO);
> +
> /**
> * xe_sriov_init - Initialize SR-IOV specific data.
> * @xe: the &xe_device to initialize
> @@ -102,11 +110,16 @@ static void fini_sriov(struct drm_device *drm, void *arg)
> */
> int xe_sriov_init(struct xe_device *xe)
> {
> + int err = fault_inject_sriov_init();
> +
> + if (err)
> + return err;
> +
> if (!IS_SRIOV(xe))
> return 0;
>
> if (IS_SRIOV_PF(xe)) {
> - int err = xe_sriov_pf_init_early(xe);
> + err = xe_sriov_pf_init_early(xe);
>
> if (err)
> return err;
> diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> index dda5268507d8..c82b4278c03e 100644
> --- a/drivers/gpu/drm/xe/xe_tile.c
> +++ b/drivers/gpu/drm/xe/xe_tile.c
> @@ -3,6 +3,8 @@
> * Copyright © 2023 Intel Corporation
> */
>
> +#include <linux/fault-inject.h>
> +
> #include <drm/drm_managed.h>
>
> #include "xe_device.h"
> @@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
> return 0;
> }
>
> +static noinline int fault_inject_tile_init_early(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_tile_init_early, ERRNO);
> +
> /**
> * xe_tile_init_early - Initialize the tile and primary GT
> * @tile: Tile to initialize
> @@ -114,6 +122,10 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
> {
> int err;
>
> + err = fault_inject_tile_init_early();
> + if (err)
> + return err;
> +
> tile->xe = xe;
> tile->id = id;
>
> diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
> index eab9456e051f..8fff0fd7c675 100644
> --- a/drivers/gpu/drm/xe/xe_uc_fw.c
> +++ b/drivers/gpu/drm/xe/xe_uc_fw.c
> @@ -4,6 +4,7 @@
> */
>
> #include <linux/bitfield.h>
> +#include <linux/fault-inject.h>
> #include <linux/firmware.h>
>
> #include <drm/drm_managed.h>
> @@ -776,11 +777,21 @@ static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32
> return err;
> }
>
> +static noinline int fault_inject_uc_fw_init(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_uc_fw_init, ERRNO);
> +
> int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
> {
> const struct firmware *fw = NULL;
> int err;
>
> + err = fault_inject_uc_fw_init();
> + if (err)
> + return err;
> +
> err = uc_fw_request(uc_fw, &fw);
> if (err)
> return err;
> diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
> index 22c148b1e996..121443b790bf 100644
> --- a/drivers/gpu/drm/xe/xe_wa.c
> +++ b/drivers/gpu/drm/xe/xe_wa.c
> @@ -8,6 +8,7 @@
> #include <drm/drm_managed.h>
> #include <kunit/visibility.h>
> #include <linux/compiler_types.h>
> +#include <linux/fault-inject.h>
>
> #include <generated/xe_wa_oob.h>
>
> @@ -818,6 +819,12 @@ void xe_wa_process_lrc(struct xe_hw_engine *hwe)
> xe_rtp_process_to_sr(&ctx, lrc_was, &hwe->reg_lrc);
> }
>
> +static noinline int fault_inject_wa_init(void)
> +{
> + return 0;
> +}
> +ALLOW_ERROR_INJECTION(fault_inject_wa_init, ERRNO);
> +
> /**
> * xe_wa_init - initialize gt with workaround bookkeeping
> * @gt: GT instance to initialize
> @@ -829,6 +836,11 @@ int xe_wa_init(struct xe_gt *gt)
> struct xe_device *xe = gt_to_xe(gt);
> size_t n_oob, n_lrc, n_engine, n_gt, total;
> unsigned long *p;
> + int err;
> +
> + err = fault_inject_wa_init();
> + if (err)
> + return err;
>
> n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_was));
> n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_was));
> diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> index 93c82825d896..88a201122a22 100644
> --- a/drivers/gpu/drm/xe/xe_wopcm.c
> +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> @@ -5,6 +5,8 @@
>
> #include "xe_wopcm.h"
>
> +#include <linux/fault-inject.h>
> +
> #include "regs/xe_guc_regs.h"
> #include "xe_device.h"
> #include "xe_force_wake.h"
> @@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
>
> return ret;
> }
> +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time
2024-09-23 18:05 ` [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Rodrigo Vivi
@ 2024-09-24 12:58 ` Francois Dugast
2024-09-26 18:43 ` Rodrigo Vivi
0 siblings, 1 reply; 9+ messages in thread
From: Francois Dugast @ 2024-09-24 12:58 UTC (permalink / raw)
To: Rodrigo Vivi; +Cc: intel-xe, Lucas De Marchi, Matthew Brost, Michal Wajdeczko
On Mon, Sep 23, 2024 at 02:05:01PM -0400, Rodrigo Vivi wrote:
> On Fri, Sep 20, 2024 at 03:18:34PM +0200, Francois Dugast wrote:
> > The kernel fault injection infrastructure is used to test proper error
> > handling during probe. In particular, ALLOW_ERROR_INJECTION() is added
> > directly to functions which fullfill the injectable functions
> > requirements:
> > fault-injection.html#requirements-for-the-error-injectable-functions
>
> Okay, I read this. This was my biggest initial question here...
>
> >
> > Otherwise a helper function is added and called in the beginning of the
> > function where the fault is to be injected.
> >
> > The return code of the functions using ALLOW_ERROR_INJECTION() can be
> > conditionnally modified at runtime by tuning some debugfs entries. This
> > requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
> >
> > One way to use fault injection at probe time by making each of those
> > functions fail one at a time is:
> >
> > FAILTYPE=fail_function
> > DEVICE="0000:00:08.0" # depends on the system
> > ERRNO=-12 # -ENOMEM, can depend on the function
> >
> > echo N > /sys/kernel/debug/$FAILTYPE/task-filter
> > echo 100 > /sys/kernel/debug/$FAILTYPE/probability
> > echo 0 > /sys/kernel/debug/$FAILTYPE/interval
> > echo -1 > /sys/kernel/debug/$FAILTYPE/times
> > echo 0 > /sys/kernel/debug/$FAILTYPE/space
> > echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
> >
> > modprobe xe
> > echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
> >
> > grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
> > cut -d ' ' -f 1 | while read -r FUNCTION ; do
> > echo "Injecting fault in $FUNCTION"
> > echo "" > /sys/kernel/debug/$FAILTYPE/inject
> > echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
> > printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
> > echo $DEVICE > /sys/bus/pci/drivers/xe/bind
> > done
> >
> > rmmod xe
> >
> > It will also be integrated into IGT for systematic execution by CI.
> >
> > Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> > Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> > Cc: Matthew Brost <matthew.brost@intel.com>
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_device.c | 12 ++++++++++++
> > drivers/gpu/drm/xe/xe_ggtt.c | 2 ++
> > drivers/gpu/drm/xe/xe_guc_ads.c | 13 +++++++++++++
> > drivers/gpu/drm/xe/xe_guc_ct.c | 11 +++++++++++
> > drivers/gpu/drm/xe/xe_guc_log.c | 13 +++++++++++++
> > drivers/gpu/drm/xe/xe_guc_relay.c | 11 +++++++++++
> > drivers/gpu/drm/xe/xe_pm.c | 11 +++++++++++
> > drivers/gpu/drm/xe/xe_sriov.c | 15 ++++++++++++++-
> > drivers/gpu/drm/xe/xe_tile.c | 12 ++++++++++++
> > drivers/gpu/drm/xe/xe_uc_fw.c | 11 +++++++++++
> > drivers/gpu/drm/xe/xe_wa.c | 12 ++++++++++++
> > drivers/gpu/drm/xe/xe_wopcm.c | 3 +++
> > 12 files changed, 125 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > index cb5a9fd820cf..d26352ecf75e 100644
> > --- a/drivers/gpu/drm/xe/xe_device.c
> > +++ b/drivers/gpu/drm/xe/xe_device.c
> > @@ -6,6 +6,7 @@
> > #include "xe_device.h"
> >
> > #include <linux/delay.h>
> > +#include <linux/fault-inject.h>
> > #include <linux/units.h>
> >
> > #include <drm/drm_aperture.h>
> > @@ -300,12 +301,22 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
> > ttm_device_fini(&xe->ttm);
> > }
> >
> > +static noinline int fault_inject_device_create(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_device_create, ERRNO);
> > +
> > struct xe_device *xe_device_create(struct pci_dev *pdev,
> > const struct pci_device_id *ent)
> > {
> > struct xe_device *xe;
> > int err;
> >
> > + err = fault_inject_device_create();
>
> But then I don't understand here. To me it looks like this function would
> fulfill the criteria. It is not doing any clean up on the err goto,
> nor changingn any state before the first error return.
I believe the second requirement for error injectable functions [1] is
not fullfilled:
"The function does not execute any code which can change any state
before the first error return. [...] For example, [...] set a flag"
Before this code change, the first return statement comes after the call
to xe_display_driver_set_hooks() which sets some flags. After this, the
call to drm_aperture_remove_conflicting_pci_framebuffers() continues to
change the state.
But for the sake of keeping the code simple, maybe in such cases we do
not need to take this requirement so strictly and could go with using
ALLOW_ERROR_INJECTION() directly, without the wrapper, as long as the
caller function does not expect something to be done in the called
function. For example, this can be the case when in the caller function
the return code is propagated without the need for cleanup. It seems
to be the case here.
[1] https://docs.kernel.org/fault-injection/fault-injection.html#requirements-for-the-error-injectable-functions
Francois
>
> Why can't we go directly?
>
> Perhaps we should add a comment on each of this new fault_inject
> functions to explain why that was chosen instead of the direct
> macro assignment?
>
> > + if (err)
> > + return ERR_PTR(err);
> > +
> > xe_display_driver_set_hooks(&driver);
> >
> > err = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver);
> > @@ -548,6 +559,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
> >
> > return 0;
> > }
> > +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
> >
> > static void update_device_info(struct xe_device *xe)
> > {
> > diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> > index f68af56c3f86..4906e3f3150b 100644
> > --- a/drivers/gpu/drm/xe/xe_ggtt.c
> > +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> > @@ -5,6 +5,7 @@
> >
> > #include "xe_ggtt.h"
> >
> > +#include <linux/fault-inject.h>
> > #include <linux/io-64-nonatomic-lo-hi.h>
> > #include <linux/sizes.h>
> >
> > @@ -264,6 +265,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
> >
> > return 0;
> > }
> > +ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO);
> >
> > static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
> >
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> > index 66d4e5e95abd..e366043eb4b8 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> > @@ -5,6 +5,8 @@
> >
> > #include "xe_guc_ads.h"
> >
> > +#include <linux/fault-inject.h>
> > +
> > #include <drm/drm_managed.h>
> >
> > #include <generated/xe_wa_oob.h>
> > @@ -396,12 +398,23 @@ static int calculate_waklv_size(struct xe_guc_ads *ads)
> >
> > #define MAX_GOLDEN_LRC_SIZE (SZ_4K * 64)
> >
> > +static noinline int fault_inject_guc_ads_init(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_guc_ads_init, ERRNO);
> > +
> > int xe_guc_ads_init(struct xe_guc_ads *ads)
> > {
> > struct xe_device *xe = ads_to_xe(ads);
> > struct xe_gt *gt = ads_to_gt(ads);
> > struct xe_tile *tile = gt_to_tile(gt);
> > struct xe_bo *bo;
> > + int ret;
> > +
> > + ret = fault_inject_guc_ads_init();
> > + if (ret)
> > + return ret;
> >
> > ads->golden_lrc_size = calculate_golden_lrc_size(ads);
> > ads->regset_size = calculate_regset_size(gt);
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> > index 4b95f75b1546..61967ddd319f 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> > @@ -8,6 +8,7 @@
> > #include <linux/bitfield.h>
> > #include <linux/circ_buf.h>
> > #include <linux/delay.h>
> > +#include <linux/fault-inject.h>
> >
> > #include <kunit/static_stub.h>
> >
> > @@ -165,6 +166,12 @@ static void primelockdep(struct xe_guc_ct *ct)
> > fs_reclaim_release(GFP_KERNEL);
> > }
> >
> > +static noinline int fault_inject_guc_ct_init(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_guc_ct_init, ERRNO);
> > +
> > int xe_guc_ct_init(struct xe_guc_ct *ct)
> > {
> > struct xe_device *xe = ct_to_xe(ct);
> > @@ -173,6 +180,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
> > struct xe_bo *bo;
> > int err;
> >
> > + err = fault_inject_guc_ct_init();
> > + if (err)
> > + return err;
> > +
> > xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
> >
> > ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", 0);
> > diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> > index a37ee3419428..a3e54f1bb0c3 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_log.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> > @@ -5,6 +5,8 @@
> >
> > #include "xe_guc_log.h"
> >
> > +#include <linux/fault-inject.h>
> > +
> > #include <drm/drm_managed.h>
> >
> > #include "xe_bo.h"
> > @@ -77,11 +79,22 @@ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p)
> > }
> > }
> >
> > +static noinline int fault_inject_guc_log_init(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_guc_log_init, ERRNO);
> > +
> > int xe_guc_log_init(struct xe_guc_log *log)
> > {
> > struct xe_device *xe = log_to_xe(log);
> > struct xe_tile *tile = gt_to_tile(log_to_gt(log));
> > struct xe_bo *bo;
> > + int err;
> > +
> > + err = fault_inject_guc_log_init();
> > + if (err)
> > + return err;
> >
> > bo = xe_managed_bo_create_pin_map(xe, tile, guc_log_size(),
> > XE_BO_FLAG_SYSTEM |
> > diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
> > index ade6162dc259..ede7fd3e7785 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_relay.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_relay.c
> > @@ -5,6 +5,7 @@
> >
> > #include <linux/bitfield.h>
> > #include <linux/delay.h>
> > +#include <linux/fault-inject.h>
> >
> > #include <drm/drm_managed.h>
> >
> > @@ -320,6 +321,12 @@ static void __fini_relay(struct drm_device *drm, void *arg)
> > mempool_exit(&relay->pool);
> > }
> >
> > +static noinline int fault_inject_guc_relay_init(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_guc_relay_init, ERRNO);
> > +
> > /**
> > * xe_guc_relay_init - Initialize a &xe_guc_relay
> > * @relay: the &xe_guc_relay to initialize
> > @@ -335,6 +342,10 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
> > struct xe_device *xe = relay_to_xe(relay);
> > int err;
> >
> > + err = fault_inject_guc_relay_init();
> > + if (err)
> > + return err;
> > +
> > relay_assert(relay, !relay_is_ready(relay));
> >
> > if (!IS_SRIOV(xe))
> > diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
> > index 33eb039053e4..87075aed885d 100644
> > --- a/drivers/gpu/drm/xe/xe_pm.c
> > +++ b/drivers/gpu/drm/xe/xe_pm.c
> > @@ -5,6 +5,7 @@
> >
> > #include "xe_pm.h"
> >
> > +#include <linux/fault-inject.h>
> > #include <linux/pm_runtime.h>
> >
> > #include <drm/drm_managed.h>
> > @@ -247,10 +248,20 @@ static void xe_pm_runtime_init(struct xe_device *xe)
> > pm_runtime_put(dev);
> > }
> >
> > +static noinline int fault_inject_pm_init_early(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_pm_init_early, ERRNO);
> > +
> > int xe_pm_init_early(struct xe_device *xe)
> > {
> > int err;
> >
> > + err = fault_inject_pm_init_early();
> > + if (err)
> > + return err;
> > +
> > INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
> >
> > err = drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
> > diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
> > index 69a066ef20c0..f1dafcfd4eae 100644
> > --- a/drivers/gpu/drm/xe/xe_sriov.c
> > +++ b/drivers/gpu/drm/xe/xe_sriov.c
> > @@ -3,6 +3,8 @@
> > * Copyright © 2023 Intel Corporation
> > */
> >
> > +#include <linux/fault-inject.h>
> > +
> > #include <drm/drm_managed.h>
> >
> > #include "regs/xe_regs.h"
> > @@ -91,6 +93,12 @@ static void fini_sriov(struct drm_device *drm, void *arg)
> > xe->sriov.wq = NULL;
> > }
> >
> > +static noinline int fault_inject_sriov_init(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_sriov_init, ERRNO);
> > +
> > /**
> > * xe_sriov_init - Initialize SR-IOV specific data.
> > * @xe: the &xe_device to initialize
> > @@ -102,11 +110,16 @@ static void fini_sriov(struct drm_device *drm, void *arg)
> > */
> > int xe_sriov_init(struct xe_device *xe)
> > {
> > + int err = fault_inject_sriov_init();
> > +
> > + if (err)
> > + return err;
> > +
> > if (!IS_SRIOV(xe))
> > return 0;
> >
> > if (IS_SRIOV_PF(xe)) {
> > - int err = xe_sriov_pf_init_early(xe);
> > + err = xe_sriov_pf_init_early(xe);
> >
> > if (err)
> > return err;
> > diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> > index dda5268507d8..c82b4278c03e 100644
> > --- a/drivers/gpu/drm/xe/xe_tile.c
> > +++ b/drivers/gpu/drm/xe/xe_tile.c
> > @@ -3,6 +3,8 @@
> > * Copyright © 2023 Intel Corporation
> > */
> >
> > +#include <linux/fault-inject.h>
> > +
> > #include <drm/drm_managed.h>
> >
> > #include "xe_device.h"
> > @@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
> > return 0;
> > }
> >
> > +static noinline int fault_inject_tile_init_early(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_tile_init_early, ERRNO);
> > +
> > /**
> > * xe_tile_init_early - Initialize the tile and primary GT
> > * @tile: Tile to initialize
> > @@ -114,6 +122,10 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
> > {
> > int err;
> >
> > + err = fault_inject_tile_init_early();
> > + if (err)
> > + return err;
> > +
> > tile->xe = xe;
> > tile->id = id;
> >
> > diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
> > index eab9456e051f..8fff0fd7c675 100644
> > --- a/drivers/gpu/drm/xe/xe_uc_fw.c
> > +++ b/drivers/gpu/drm/xe/xe_uc_fw.c
> > @@ -4,6 +4,7 @@
> > */
> >
> > #include <linux/bitfield.h>
> > +#include <linux/fault-inject.h>
> > #include <linux/firmware.h>
> >
> > #include <drm/drm_managed.h>
> > @@ -776,11 +777,21 @@ static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32
> > return err;
> > }
> >
> > +static noinline int fault_inject_uc_fw_init(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_uc_fw_init, ERRNO);
> > +
> > int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
> > {
> > const struct firmware *fw = NULL;
> > int err;
> >
> > + err = fault_inject_uc_fw_init();
> > + if (err)
> > + return err;
> > +
> > err = uc_fw_request(uc_fw, &fw);
> > if (err)
> > return err;
> > diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
> > index 22c148b1e996..121443b790bf 100644
> > --- a/drivers/gpu/drm/xe/xe_wa.c
> > +++ b/drivers/gpu/drm/xe/xe_wa.c
> > @@ -8,6 +8,7 @@
> > #include <drm/drm_managed.h>
> > #include <kunit/visibility.h>
> > #include <linux/compiler_types.h>
> > +#include <linux/fault-inject.h>
> >
> > #include <generated/xe_wa_oob.h>
> >
> > @@ -818,6 +819,12 @@ void xe_wa_process_lrc(struct xe_hw_engine *hwe)
> > xe_rtp_process_to_sr(&ctx, lrc_was, &hwe->reg_lrc);
> > }
> >
> > +static noinline int fault_inject_wa_init(void)
> > +{
> > + return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(fault_inject_wa_init, ERRNO);
> > +
> > /**
> > * xe_wa_init - initialize gt with workaround bookkeeping
> > * @gt: GT instance to initialize
> > @@ -829,6 +836,11 @@ int xe_wa_init(struct xe_gt *gt)
> > struct xe_device *xe = gt_to_xe(gt);
> > size_t n_oob, n_lrc, n_engine, n_gt, total;
> > unsigned long *p;
> > + int err;
> > +
> > + err = fault_inject_wa_init();
> > + if (err)
> > + return err;
> >
> > n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_was));
> > n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_was));
> > diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> > index 93c82825d896..88a201122a22 100644
> > --- a/drivers/gpu/drm/xe/xe_wopcm.c
> > +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> > @@ -5,6 +5,8 @@
> >
> > #include "xe_wopcm.h"
> >
> > +#include <linux/fault-inject.h>
> > +
> > #include "regs/xe_guc_regs.h"
> > #include "xe_device.h"
> > #include "xe_force_wake.h"
> > @@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
> >
> > return ret;
> > }
> > +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
> > --
> > 2.43.0
> >
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time
2024-09-24 12:58 ` Francois Dugast
@ 2024-09-26 18:43 ` Rodrigo Vivi
0 siblings, 0 replies; 9+ messages in thread
From: Rodrigo Vivi @ 2024-09-26 18:43 UTC (permalink / raw)
To: Francois Dugast
Cc: intel-xe, Lucas De Marchi, Matthew Brost, Michal Wajdeczko
On Tue, Sep 24, 2024 at 02:58:43PM +0200, Francois Dugast wrote:
> On Mon, Sep 23, 2024 at 02:05:01PM -0400, Rodrigo Vivi wrote:
> > On Fri, Sep 20, 2024 at 03:18:34PM +0200, Francois Dugast wrote:
> > > The kernel fault injection infrastructure is used to test proper error
> > > handling during probe. In particular, ALLOW_ERROR_INJECTION() is added
> > > directly to functions which fullfill the injectable functions
> > > requirements:
> > > fault-injection.html#requirements-for-the-error-injectable-functions
> >
> > Okay, I read this. This was my biggest initial question here...
> >
> > >
> > > Otherwise a helper function is added and called in the beginning of the
> > > function where the fault is to be injected.
> > >
> > > The return code of the functions using ALLOW_ERROR_INJECTION() can be
> > > conditionnally modified at runtime by tuning some debugfs entries. This
> > > requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
> > >
> > > One way to use fault injection at probe time by making each of those
> > > functions fail one at a time is:
> > >
> > > FAILTYPE=fail_function
> > > DEVICE="0000:00:08.0" # depends on the system
> > > ERRNO=-12 # -ENOMEM, can depend on the function
> > >
> > > echo N > /sys/kernel/debug/$FAILTYPE/task-filter
> > > echo 100 > /sys/kernel/debug/$FAILTYPE/probability
> > > echo 0 > /sys/kernel/debug/$FAILTYPE/interval
> > > echo -1 > /sys/kernel/debug/$FAILTYPE/times
> > > echo 0 > /sys/kernel/debug/$FAILTYPE/space
> > > echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
> > >
> > > modprobe xe
> > > echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
> > >
> > > grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
> > > cut -d ' ' -f 1 | while read -r FUNCTION ; do
> > > echo "Injecting fault in $FUNCTION"
> > > echo "" > /sys/kernel/debug/$FAILTYPE/inject
> > > echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
> > > printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
> > > echo $DEVICE > /sys/bus/pci/drivers/xe/bind
> > > done
> > >
> > > rmmod xe
> > >
> > > It will also be integrated into IGT for systematic execution by CI.
> > >
> > > Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> > > Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> > > Cc: Matthew Brost <matthew.brost@intel.com>
> > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> > > ---
> > > drivers/gpu/drm/xe/xe_device.c | 12 ++++++++++++
> > > drivers/gpu/drm/xe/xe_ggtt.c | 2 ++
> > > drivers/gpu/drm/xe/xe_guc_ads.c | 13 +++++++++++++
> > > drivers/gpu/drm/xe/xe_guc_ct.c | 11 +++++++++++
> > > drivers/gpu/drm/xe/xe_guc_log.c | 13 +++++++++++++
> > > drivers/gpu/drm/xe/xe_guc_relay.c | 11 +++++++++++
> > > drivers/gpu/drm/xe/xe_pm.c | 11 +++++++++++
> > > drivers/gpu/drm/xe/xe_sriov.c | 15 ++++++++++++++-
> > > drivers/gpu/drm/xe/xe_tile.c | 12 ++++++++++++
> > > drivers/gpu/drm/xe/xe_uc_fw.c | 11 +++++++++++
> > > drivers/gpu/drm/xe/xe_wa.c | 12 ++++++++++++
> > > drivers/gpu/drm/xe/xe_wopcm.c | 3 +++
> > > 12 files changed, 125 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > > index cb5a9fd820cf..d26352ecf75e 100644
> > > --- a/drivers/gpu/drm/xe/xe_device.c
> > > +++ b/drivers/gpu/drm/xe/xe_device.c
> > > @@ -6,6 +6,7 @@
> > > #include "xe_device.h"
> > >
> > > #include <linux/delay.h>
> > > +#include <linux/fault-inject.h>
> > > #include <linux/units.h>
> > >
> > > #include <drm/drm_aperture.h>
> > > @@ -300,12 +301,22 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
> > > ttm_device_fini(&xe->ttm);
> > > }
> > >
> > > +static noinline int fault_inject_device_create(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_device_create, ERRNO);
> > > +
> > > struct xe_device *xe_device_create(struct pci_dev *pdev,
> > > const struct pci_device_id *ent)
> > > {
> > > struct xe_device *xe;
> > > int err;
> > >
> > > + err = fault_inject_device_create();
> >
> > But then I don't understand here. To me it looks like this function would
> > fulfill the criteria. It is not doing any clean up on the err goto,
> > nor changingn any state before the first error return.
>
> I believe the second requirement for error injectable functions [1] is
> not fullfilled:
>
> "The function does not execute any code which can change any state
> before the first error return. [...] For example, [...] set a flag"
ouch! even local flags that means nothing after the returned probe!
that is really so strict.
>
> Before this code change, the first return statement comes after the call
> to xe_display_driver_set_hooks() which sets some flags. After this, the
> call to drm_aperture_remove_conflicting_pci_framebuffers() continues to
> change the state.
hmm, in this case, this drm_apearture call can really change the memory.
But I had only looked to the xe_display_driver_set_hooks() that is
really inoffensive, because the rule tells 'before the first error'.
With this I was assuming that the first line to return the error is never
executed... only what is *before* that... but that would be too magic
right?!
>
> But for the sake of keeping the code simple, maybe in such cases we do
> not need to take this requirement so strictly and could go with using
> ALLOW_ERROR_INJECTION() directly, without the wrapper, as long as the
> caller function does not expect something to be done in the called
> function. For example, this can be the case when in the caller function
> the return code is propagated without the need for cleanup. It seems
> to be the case here.
>
> [1] https://docs.kernel.org/fault-injection/fault-injection.html#requirements-for-the-error-injectable-functions
>
> Francois
>
> >
> > Why can't we go directly?
> >
> > Perhaps we should add a comment on each of this new fault_inject
> > functions to explain why that was chosen instead of the direct
> > macro assignment?
> >
> > > + if (err)
> > > + return ERR_PTR(err);
> > > +
> > > xe_display_driver_set_hooks(&driver);
> > >
> > > err = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver);
> > > @@ -548,6 +559,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
> > >
> > > return 0;
> > > }
> > > +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
> > >
> > > static void update_device_info(struct xe_device *xe)
> > > {
> > > diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
> > > index f68af56c3f86..4906e3f3150b 100644
> > > --- a/drivers/gpu/drm/xe/xe_ggtt.c
> > > +++ b/drivers/gpu/drm/xe/xe_ggtt.c
> > > @@ -5,6 +5,7 @@
> > >
> > > #include "xe_ggtt.h"
> > >
> > > +#include <linux/fault-inject.h>
> > > #include <linux/io-64-nonatomic-lo-hi.h>
> > > #include <linux/sizes.h>
> > >
> > > @@ -264,6 +265,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
> > >
> > > return 0;
> > > }
> > > +ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO);
> > >
> > > static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> > > index 66d4e5e95abd..e366043eb4b8 100644
> > > --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> > > +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> > > @@ -5,6 +5,8 @@
> > >
> > > #include "xe_guc_ads.h"
> > >
> > > +#include <linux/fault-inject.h>
> > > +
> > > #include <drm/drm_managed.h>
> > >
> > > #include <generated/xe_wa_oob.h>
> > > @@ -396,12 +398,23 @@ static int calculate_waklv_size(struct xe_guc_ads *ads)
> > >
> > > #define MAX_GOLDEN_LRC_SIZE (SZ_4K * 64)
> > >
> > > +static noinline int fault_inject_guc_ads_init(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_guc_ads_init, ERRNO);
> > > +
> > > int xe_guc_ads_init(struct xe_guc_ads *ads)
> > > {
> > > struct xe_device *xe = ads_to_xe(ads);
> > > struct xe_gt *gt = ads_to_gt(ads);
> > > struct xe_tile *tile = gt_to_tile(gt);
> > > struct xe_bo *bo;
> > > + int ret;
> > > +
> > > + ret = fault_inject_guc_ads_init();
> > > + if (ret)
> > > + return ret;
> > >
> > > ads->golden_lrc_size = calculate_golden_lrc_size(ads);
> > > ads->regset_size = calculate_regset_size(gt);
> > > diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> > > index 4b95f75b1546..61967ddd319f 100644
> > > --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> > > +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> > > @@ -8,6 +8,7 @@
> > > #include <linux/bitfield.h>
> > > #include <linux/circ_buf.h>
> > > #include <linux/delay.h>
> > > +#include <linux/fault-inject.h>
> > >
> > > #include <kunit/static_stub.h>
> > >
> > > @@ -165,6 +166,12 @@ static void primelockdep(struct xe_guc_ct *ct)
> > > fs_reclaim_release(GFP_KERNEL);
> > > }
> > >
> > > +static noinline int fault_inject_guc_ct_init(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_guc_ct_init, ERRNO);
> > > +
> > > int xe_guc_ct_init(struct xe_guc_ct *ct)
> > > {
> > > struct xe_device *xe = ct_to_xe(ct);
> > > @@ -173,6 +180,10 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
> > > struct xe_bo *bo;
> > > int err;
> > >
> > > + err = fault_inject_guc_ct_init();
> > > + if (err)
> > > + return err;
> > > +
> > > xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
> > >
> > > ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", 0);
> > > diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
> > > index a37ee3419428..a3e54f1bb0c3 100644
> > > --- a/drivers/gpu/drm/xe/xe_guc_log.c
> > > +++ b/drivers/gpu/drm/xe/xe_guc_log.c
> > > @@ -5,6 +5,8 @@
> > >
> > > #include "xe_guc_log.h"
> > >
> > > +#include <linux/fault-inject.h>
> > > +
> > > #include <drm/drm_managed.h>
> > >
> > > #include "xe_bo.h"
> > > @@ -77,11 +79,22 @@ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p)
> > > }
> > > }
> > >
> > > +static noinline int fault_inject_guc_log_init(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_guc_log_init, ERRNO);
> > > +
> > > int xe_guc_log_init(struct xe_guc_log *log)
> > > {
> > > struct xe_device *xe = log_to_xe(log);
> > > struct xe_tile *tile = gt_to_tile(log_to_gt(log));
> > > struct xe_bo *bo;
> > > + int err;
> > > +
> > > + err = fault_inject_guc_log_init();
> > > + if (err)
> > > + return err;
> > >
> > > bo = xe_managed_bo_create_pin_map(xe, tile, guc_log_size(),
> > > XE_BO_FLAG_SYSTEM |
> > > diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
> > > index ade6162dc259..ede7fd3e7785 100644
> > > --- a/drivers/gpu/drm/xe/xe_guc_relay.c
> > > +++ b/drivers/gpu/drm/xe/xe_guc_relay.c
> > > @@ -5,6 +5,7 @@
> > >
> > > #include <linux/bitfield.h>
> > > #include <linux/delay.h>
> > > +#include <linux/fault-inject.h>
> > >
> > > #include <drm/drm_managed.h>
> > >
> > > @@ -320,6 +321,12 @@ static void __fini_relay(struct drm_device *drm, void *arg)
> > > mempool_exit(&relay->pool);
> > > }
> > >
> > > +static noinline int fault_inject_guc_relay_init(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_guc_relay_init, ERRNO);
> > > +
> > > /**
> > > * xe_guc_relay_init - Initialize a &xe_guc_relay
> > > * @relay: the &xe_guc_relay to initialize
> > > @@ -335,6 +342,10 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
> > > struct xe_device *xe = relay_to_xe(relay);
> > > int err;
> > >
> > > + err = fault_inject_guc_relay_init();
> > > + if (err)
> > > + return err;
> > > +
> > > relay_assert(relay, !relay_is_ready(relay));
> > >
> > > if (!IS_SRIOV(xe))
> > > diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
> > > index 33eb039053e4..87075aed885d 100644
> > > --- a/drivers/gpu/drm/xe/xe_pm.c
> > > +++ b/drivers/gpu/drm/xe/xe_pm.c
> > > @@ -5,6 +5,7 @@
> > >
> > > #include "xe_pm.h"
> > >
> > > +#include <linux/fault-inject.h>
> > > #include <linux/pm_runtime.h>
> > >
> > > #include <drm/drm_managed.h>
> > > @@ -247,10 +248,20 @@ static void xe_pm_runtime_init(struct xe_device *xe)
> > > pm_runtime_put(dev);
> > > }
> > >
> > > +static noinline int fault_inject_pm_init_early(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_pm_init_early, ERRNO);
> > > +
> > > int xe_pm_init_early(struct xe_device *xe)
> > > {
> > > int err;
> > >
> > > + err = fault_inject_pm_init_early();
> > > + if (err)
> > > + return err;
> > > +
> > > INIT_LIST_HEAD(&xe->mem_access.vram_userfault.list);
> > >
> > > err = drmm_mutex_init(&xe->drm, &xe->mem_access.vram_userfault.lock);
> > > diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
> > > index 69a066ef20c0..f1dafcfd4eae 100644
> > > --- a/drivers/gpu/drm/xe/xe_sriov.c
> > > +++ b/drivers/gpu/drm/xe/xe_sriov.c
> > > @@ -3,6 +3,8 @@
> > > * Copyright © 2023 Intel Corporation
> > > */
> > >
> > > +#include <linux/fault-inject.h>
> > > +
> > > #include <drm/drm_managed.h>
> > >
> > > #include "regs/xe_regs.h"
> > > @@ -91,6 +93,12 @@ static void fini_sriov(struct drm_device *drm, void *arg)
> > > xe->sriov.wq = NULL;
> > > }
> > >
> > > +static noinline int fault_inject_sriov_init(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_sriov_init, ERRNO);
> > > +
> > > /**
> > > * xe_sriov_init - Initialize SR-IOV specific data.
> > > * @xe: the &xe_device to initialize
> > > @@ -102,11 +110,16 @@ static void fini_sriov(struct drm_device *drm, void *arg)
> > > */
> > > int xe_sriov_init(struct xe_device *xe)
> > > {
> > > + int err = fault_inject_sriov_init();
> > > +
> > > + if (err)
> > > + return err;
> > > +
> > > if (!IS_SRIOV(xe))
> > > return 0;
> > >
> > > if (IS_SRIOV_PF(xe)) {
> > > - int err = xe_sriov_pf_init_early(xe);
> > > + err = xe_sriov_pf_init_early(xe);
> > >
> > > if (err)
> > > return err;
> > > diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> > > index dda5268507d8..c82b4278c03e 100644
> > > --- a/drivers/gpu/drm/xe/xe_tile.c
> > > +++ b/drivers/gpu/drm/xe/xe_tile.c
> > > @@ -3,6 +3,8 @@
> > > * Copyright © 2023 Intel Corporation
> > > */
> > >
> > > +#include <linux/fault-inject.h>
> > > +
> > > #include <drm/drm_managed.h>
> > >
> > > #include "xe_device.h"
> > > @@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
> > > return 0;
> > > }
> > >
> > > +static noinline int fault_inject_tile_init_early(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_tile_init_early, ERRNO);
> > > +
> > > /**
> > > * xe_tile_init_early - Initialize the tile and primary GT
> > > * @tile: Tile to initialize
> > > @@ -114,6 +122,10 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
> > > {
> > > int err;
> > >
> > > + err = fault_inject_tile_init_early();
> > > + if (err)
> > > + return err;
> > > +
> > > tile->xe = xe;
> > > tile->id = id;
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
> > > index eab9456e051f..8fff0fd7c675 100644
> > > --- a/drivers/gpu/drm/xe/xe_uc_fw.c
> > > +++ b/drivers/gpu/drm/xe/xe_uc_fw.c
> > > @@ -4,6 +4,7 @@
> > > */
> > >
> > > #include <linux/bitfield.h>
> > > +#include <linux/fault-inject.h>
> > > #include <linux/firmware.h>
> > >
> > > #include <drm/drm_managed.h>
> > > @@ -776,11 +777,21 @@ static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32
> > > return err;
> > > }
> > >
> > > +static noinline int fault_inject_uc_fw_init(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_uc_fw_init, ERRNO);
> > > +
> > > int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
> > > {
> > > const struct firmware *fw = NULL;
> > > int err;
> > >
> > > + err = fault_inject_uc_fw_init();
> > > + if (err)
> > > + return err;
> > > +
> > > err = uc_fw_request(uc_fw, &fw);
> > > if (err)
> > > return err;
> > > diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
> > > index 22c148b1e996..121443b790bf 100644
> > > --- a/drivers/gpu/drm/xe/xe_wa.c
> > > +++ b/drivers/gpu/drm/xe/xe_wa.c
> > > @@ -8,6 +8,7 @@
> > > #include <drm/drm_managed.h>
> > > #include <kunit/visibility.h>
> > > #include <linux/compiler_types.h>
> > > +#include <linux/fault-inject.h>
> > >
> > > #include <generated/xe_wa_oob.h>
> > >
> > > @@ -818,6 +819,12 @@ void xe_wa_process_lrc(struct xe_hw_engine *hwe)
> > > xe_rtp_process_to_sr(&ctx, lrc_was, &hwe->reg_lrc);
> > > }
> > >
> > > +static noinline int fault_inject_wa_init(void)
> > > +{
> > > + return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(fault_inject_wa_init, ERRNO);
> > > +
> > > /**
> > > * xe_wa_init - initialize gt with workaround bookkeeping
> > > * @gt: GT instance to initialize
> > > @@ -829,6 +836,11 @@ int xe_wa_init(struct xe_gt *gt)
> > > struct xe_device *xe = gt_to_xe(gt);
> > > size_t n_oob, n_lrc, n_engine, n_gt, total;
> > > unsigned long *p;
> > > + int err;
> > > +
> > > + err = fault_inject_wa_init();
> > > + if (err)
> > > + return err;
> > >
> > > n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_was));
> > > n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_was));
> > > diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> > > index 93c82825d896..88a201122a22 100644
> > > --- a/drivers/gpu/drm/xe/xe_wopcm.c
> > > +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> > > @@ -5,6 +5,8 @@
> > >
> > > #include "xe_wopcm.h"
> > >
> > > +#include <linux/fault-inject.h>
> > > +
> > > #include "regs/xe_guc_regs.h"
> > > #include "xe_device.h"
> > > #include "xe_force_wake.h"
> > > @@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
> > >
> > > return ret;
> > > }
> > > +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
> > > --
> > > 2.43.0
> > >
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2024-09-26 18:43 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-09-20 13:18 [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
2024-09-20 13:51 ` ✓ CI.Patch_applied: success for drm/xe: Use fault injection infrastructure to find issues at probe time (rev2) Patchwork
2024-09-20 13:51 ` ✓ CI.checkpatch: " Patchwork
2024-09-20 13:52 ` ✗ CI.KUnit: failure " Patchwork
2024-09-23 10:34 ` Francois Dugast
2024-09-23 12:00 ` Jani Nikula
2024-09-23 18:05 ` [PATCH v2] drm/xe: Use fault injection infrastructure to find issues at probe time Rodrigo Vivi
2024-09-24 12:58 ` Francois Dugast
2024-09-26 18:43 ` Rodrigo Vivi
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox