intel-xe.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] drm/xe: Use fault injection infrastructure to find issues at probe time
@ 2024-09-25 15:55 Francois Dugast
  2024-09-26  9:43 ` Jani Nikula
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Francois Dugast @ 2024-09-25 15:55 UTC (permalink / raw)
  To: intel-xe
  Cc: Francois Dugast, Lucas De Marchi, Matthew Brost, Rodrigo Vivi,
	Michal Wajdeczko

The kernel fault injection infrastructure is used to test proper error
handling during probe. The return code of the functions using
ALLOW_ERROR_INJECTION() can be conditionnally modified at runtime by
tuning some debugfs entries. This requires CONFIG_FUNCTION_ERROR_INJECTION
(among others).

One way to use fault injection at probe time by making each of those
functions fail one at a time is:

    FAILTYPE=fail_function
    DEVICE="0000:00:08.0" # depends on the system
    ERRNO=-12 # -ENOMEM, can depend on the function

    echo N > /sys/kernel/debug/$FAILTYPE/task-filter
    echo 100 > /sys/kernel/debug/$FAILTYPE/probability
    echo 0 > /sys/kernel/debug/$FAILTYPE/interval
    echo -1 > /sys/kernel/debug/$FAILTYPE/times
    echo 0 > /sys/kernel/debug/$FAILTYPE/space
    echo 1 > /sys/kernel/debug/$FAILTYPE/verbose

    modprobe xe
    echo $DEVICE > /sys/bus/pci/drivers/xe/unbind

    grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | \
    cut -d ' ' -f 1 | while read -r FUNCTION ; do
        echo "Injecting fault in $FUNCTION"
        echo "" > /sys/kernel/debug/$FAILTYPE/inject
        echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
        printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
        echo $DEVICE > /sys/bus/pci/drivers/xe/bind
    done

    rmmod xe

It will also be integrated into IGT for systematic execution by CI.

v2: Wrappers are not needed in the cases covered by this patch, so
    remove them and use ALLOW_ERROR_INJECTION() directly.

Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c    | 17 +++++++++++++++++
 drivers/gpu/drm/xe/xe_ggtt.c      |  8 ++++++++
 drivers/gpu/drm/xe/xe_guc_ads.c   | 10 ++++++++++
 drivers/gpu/drm/xe/xe_guc_ct.c    |  9 +++++++++
 drivers/gpu/drm/xe/xe_guc_log.c   | 10 ++++++++++
 drivers/gpu/drm/xe/xe_guc_relay.c |  8 ++++++++
 drivers/gpu/drm/xe/xe_pm.c        |  9 +++++++++
 drivers/gpu/drm/xe/xe_sriov.c     | 10 ++++++++++
 drivers/gpu/drm/xe/xe_tile.c      |  9 +++++++++
 drivers/gpu/drm/xe/xe_uc_fw.c     |  9 +++++++++
 drivers/gpu/drm/xe/xe_wa.c        |  8 ++++++++
 drivers/gpu/drm/xe/xe_wopcm.c     |  9 +++++++++
 12 files changed, 116 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 53dcece40fc5..f84327f8d2bf 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -6,6 +6,7 @@
 #include "xe_device.h"
 
 #include <linux/delay.h>
+#include <linux/fault-inject.h>
 #include <linux/units.h>
 
 #include <drm/drm_aperture.h>
@@ -298,6 +299,13 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
 	ttm_device_fini(&xe->ttm);
 }
 
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
 struct xe_device *xe_device_create(struct pci_dev *pdev,
 				   const struct pci_device_id *ent)
 {
@@ -378,6 +386,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
 err:
 	return ERR_PTR(err);
 }
+ALLOW_ERROR_INJECTION(xe_device_create, ERRNO);
 
 static bool xe_driver_flr_disabled(struct xe_device *xe)
 {
@@ -500,6 +509,13 @@ static bool verify_lmem_ready(struct xe_device *xe)
 	return !!val;
 }
 
+/*
+ * This function fullfills the requirements for the error injectable functions
+ * fault-injection.html#requirements-for-the-error-injectable-functions so the
+ * ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in
+ * the callers.
+ */
 static int wait_for_lmem_ready(struct xe_device *xe)
 {
 	unsigned long timeout, start;
@@ -546,6 +562,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
 
 static void update_device_info(struct xe_device *xe)
 {
diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c
index f68af56c3f86..a55f4afff4d6 100644
--- a/drivers/gpu/drm/xe/xe_ggtt.c
+++ b/drivers/gpu/drm/xe/xe_ggtt.c
@@ -5,6 +5,7 @@
 
 #include "xe_ggtt.h"
 
+#include <linux/fault-inject.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/sizes.h>
 
@@ -209,6 +210,12 @@ static const struct xe_ggtt_pt_ops xelpg_pt_wa_ops = {
  * initial clear done to it yet. That will happen in the regular, non-early
  * GGTT initialization.
  *
+ * This function fullfills the requirements for the error injectable functions
+ * fault-injection.html#requirements-for-the-error-injectable-functions so the
+ * ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in
+ * the callers.
+ *
  * Return: 0 on success or a negative error code on failure.
  */
 int xe_ggtt_init_early(struct xe_ggtt *ggtt)
@@ -264,6 +271,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_ggtt_init_early, ERRNO);
 
 static void xe_ggtt_invalidate(struct xe_ggtt *ggtt);
 
diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
index 66d4e5e95abd..74a2d600e811 100644
--- a/drivers/gpu/drm/xe/xe_guc_ads.c
+++ b/drivers/gpu/drm/xe/xe_guc_ads.c
@@ -5,6 +5,8 @@
 
 #include "xe_guc_ads.h"
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include <generated/xe_wa_oob.h>
@@ -396,6 +398,13 @@ static int calculate_waklv_size(struct xe_guc_ads *ads)
 
 #define MAX_GOLDEN_LRC_SIZE	(SZ_4K * 64)
 
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
 int xe_guc_ads_init(struct xe_guc_ads *ads)
 {
 	struct xe_device *xe = ads_to_xe(ads);
@@ -418,6 +427,7 @@ int xe_guc_ads_init(struct xe_guc_ads *ads)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_guc_ads_init, ERRNO);
 
 /**
  * xe_guc_ads_init_post_hwconfig - initialize ADS post hwconfig load
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 4b95f75b1546..41900d1da5e8 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -8,6 +8,7 @@
 #include <linux/bitfield.h>
 #include <linux/circ_buf.h>
 #include <linux/delay.h>
+#include <linux/fault-inject.h>
 
 #include <kunit/static_stub.h>
 
@@ -165,6 +166,13 @@ static void primelockdep(struct xe_guc_ct *ct)
 	fs_reclaim_release(GFP_KERNEL);
 }
 
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
 int xe_guc_ct_init(struct xe_guc_ct *ct)
 {
 	struct xe_device *xe = ct_to_xe(ct);
@@ -209,6 +217,7 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
 	ct->state = XE_GUC_CT_STATE_DISABLED;
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO);
 
 #define desc_read(xe_, guc_ctb__, field_)			\
 	xe_map_rd_field(xe_, &guc_ctb__->desc, 0,		\
diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c
index a37ee3419428..d448c7f09c01 100644
--- a/drivers/gpu/drm/xe/xe_guc_log.c
+++ b/drivers/gpu/drm/xe/xe_guc_log.c
@@ -5,6 +5,8 @@
 
 #include "xe_guc_log.h"
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include "xe_bo.h"
@@ -77,6 +79,13 @@ void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p)
 	}
 }
 
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
 int xe_guc_log_init(struct xe_guc_log *log)
 {
 	struct xe_device *xe = log_to_xe(log);
@@ -96,3 +105,4 @@ int xe_guc_log_init(struct xe_guc_log *log)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_guc_log_init, ERRNO);
diff --git a/drivers/gpu/drm/xe/xe_guc_relay.c b/drivers/gpu/drm/xe/xe_guc_relay.c
index ade6162dc259..d077e2ddb531 100644
--- a/drivers/gpu/drm/xe/xe_guc_relay.c
+++ b/drivers/gpu/drm/xe/xe_guc_relay.c
@@ -5,6 +5,7 @@
 
 #include <linux/bitfield.h>
 #include <linux/delay.h>
+#include <linux/fault-inject.h>
 
 #include <drm/drm_managed.h>
 
@@ -327,6 +328,12 @@ static void __fini_relay(struct drm_device *drm, void *arg)
  * Initialize remaining members of &xe_guc_relay that may depend
  * on the SR-IOV mode.
  *
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ *
  * Return: 0 on success or a negative error code on failure.
  */
 int xe_guc_relay_init(struct xe_guc_relay *relay)
@@ -355,6 +362,7 @@ int xe_guc_relay_init(struct xe_guc_relay *relay)
 
 	return drmm_add_action_or_reset(&xe->drm, __fini_relay, relay);
 }
+ALLOW_ERROR_INJECTION(xe_guc_relay_init, ERRNO);
 
 static u32 to_relay_error(int err)
 {
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index 33eb039053e4..b2f0cd1adc88 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -5,6 +5,7 @@
 
 #include "xe_pm.h"
 
+#include <linux/fault-inject.h>
 #include <linux/pm_runtime.h>
 
 #include <drm/drm_managed.h>
@@ -247,6 +248,13 @@ static void xe_pm_runtime_init(struct xe_device *xe)
 	pm_runtime_put(dev);
 }
 
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
 int xe_pm_init_early(struct xe_device *xe)
 {
 	int err;
@@ -263,6 +271,7 @@ int xe_pm_init_early(struct xe_device *xe)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_pm_init_early, ERRNO);
 
 /**
  * xe_pm_init - Initialize Xe Power Management
diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c
index 69a066ef20c0..b714825c209c 100644
--- a/drivers/gpu/drm/xe/xe_sriov.c
+++ b/drivers/gpu/drm/xe/xe_sriov.c
@@ -3,6 +3,8 @@
  * Copyright © 2023 Intel Corporation
  */
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include "regs/xe_regs.h"
@@ -98,6 +100,13 @@ static void fini_sriov(struct drm_device *drm, void *arg)
  * In this function we create dedicated workqueue that will be used
  * by the SR-IOV specific workers.
  *
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip
+ * execution at runtime and use a provided return value, in order to
+ * test errors paths in the callers. The requirements for the error
+ * injectable functions are not strictly fullfilled but this is
+ * acceptable because the caller only propagates the error up the
+ * stack without cleanup of resources potentially allocated here.
+ *
  * Return: 0 on success or a negative error code on failure.
  */
 int xe_sriov_init(struct xe_device *xe)
@@ -119,6 +128,7 @@ int xe_sriov_init(struct xe_device *xe)
 
 	return drmm_add_action_or_reset(&xe->drm, fini_sriov, xe);
 }
+ALLOW_ERROR_INJECTION(xe_sriov_init, ERRNO);
 
 /**
  * xe_sriov_print_info - Print basic SR-IOV information.
diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
index dda5268507d8..fe56fb6327a9 100644
--- a/drivers/gpu/drm/xe/xe_tile.c
+++ b/drivers/gpu/drm/xe/xe_tile.c
@@ -3,6 +3,8 @@
  * Copyright © 2023 Intel Corporation
  */
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include "xe_device.h"
@@ -108,6 +110,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
  * Initializes per-tile resources that don't require any interactions with the
  * hardware or any knowledge about the Graphics/Media IP version.
  *
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ *
  * Returns: 0 on success, negative error code on error.
  */
 int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
@@ -129,6 +137,7 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_tile_init_early, ERRNO);
 
 static int tile_ttm_mgr_init(struct xe_tile *tile)
 {
diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c
index eab9456e051f..8f43b0990947 100644
--- a/drivers/gpu/drm/xe/xe_uc_fw.c
+++ b/drivers/gpu/drm/xe/xe_uc_fw.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/fault-inject.h>
 #include <linux/firmware.h>
 
 #include <drm/drm_managed.h>
@@ -776,6 +777,13 @@ static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32
 	return err;
 }
 
+/*
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ */
 int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
 {
 	const struct firmware *fw = NULL;
@@ -797,6 +805,7 @@ int xe_uc_fw_init(struct xe_uc_fw *uc_fw)
 
 	return err;
 }
+ALLOW_ERROR_INJECTION(xe_uc_fw_init, ERRNO);
 
 static u32 uc_fw_ggtt_offset(struct xe_uc_fw *uc_fw)
 {
diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index 22c148b1e996..3c9d36fd1a2c 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -8,6 +8,7 @@
 #include <drm/drm_managed.h>
 #include <kunit/visibility.h>
 #include <linux/compiler_types.h>
+#include <linux/fault-inject.h>
 
 #include <generated/xe_wa_oob.h>
 
@@ -822,6 +823,12 @@ void xe_wa_process_lrc(struct xe_hw_engine *hwe)
  * xe_wa_init - initialize gt with workaround bookkeeping
  * @gt: GT instance to initialize
  *
+ * The ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in the
+ * callers. The requirements for the error injectable functions are not strictly
+ * fullfilled but this is acceptable because the caller only propagates the error
+ * up the stack without cleanup of resources potentially allocated here.
+ *
  * Returns 0 for success, negative error code otherwise.
  */
 int xe_wa_init(struct xe_gt *gt)
@@ -850,6 +857,7 @@ int xe_wa_init(struct xe_gt *gt)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_wa_init, ERRNO);
 
 void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p)
 {
diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
index 93c82825d896..1ea9ddf32c2a 100644
--- a/drivers/gpu/drm/xe/xe_wopcm.c
+++ b/drivers/gpu/drm/xe/xe_wopcm.c
@@ -5,6 +5,8 @@
 
 #include "xe_wopcm.h"
 
+#include <linux/fault-inject.h>
+
 #include "regs/xe_guc_regs.h"
 #include "xe_device.h"
 #include "xe_force_wake.h"
@@ -193,6 +195,12 @@ u32 xe_wopcm_size(struct xe_device *xe)
  * enforce platform dependent hardware restrictions on GuC WOPCM offset and
  * size. It will fail the WOPCM init if any of these checks fail, so that the
  * following WOPCM registers setup and GuC firmware uploading would be aborted.
+ *
+ * This function fullfills the requirements for the error injectable functions
+ * fault-injection.html#requirements-for-the-error-injectable-functions so the
+ * ALLOW_ERROR_INJECTION() macro is added to conditionally skip execution at
+ * runtime and use a provided return value, in order to test errors paths in
+ * the callers.
  */
 int xe_wopcm_init(struct xe_wopcm *wopcm)
 {
@@ -268,3 +276,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
 
 	return ret;
 }
+ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-09-26 18:36 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-09-25 15:55 [PATCH v3] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
2024-09-26  9:43 ` Jani Nikula
2024-09-26 11:46   ` Francois Dugast
2024-09-26 11:57     ` Jani Nikula
2024-09-26 13:06       ` Lucas De Marchi
2024-09-26 13:32         ` Jani Nikula
2024-09-26 18:35           ` Rodrigo Vivi
2024-09-26 15:54 ` ✓ CI.Patch_applied: success for drm/xe: Use fault injection infrastructure to find issues at probe time (rev3) Patchwork
2024-09-26 15:55 ` ✓ CI.checkpatch: " Patchwork
2024-09-26 15:55 ` ✗ CI.KUnit: failure " Patchwork

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).