Intel-XE Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time
@ 2024-09-13 14:33 Francois Dugast
  2024-09-13 18:15 ` ✓ CI.Patch_applied: success for " Patchwork
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Francois Dugast @ 2024-09-13 14:33 UTC (permalink / raw)
  To: intel-xe; +Cc: Francois Dugast, Lucas De Marchi, Matthew Brost, Rodrigo Vivi

This RFC is a rework of https://patchwork.freedesktop.org/series/137314/
but instead of importing custom macros from i915, it makes use of the
standard kernel fault injection infrastructure, see fault-injection.rst.

In particular, it leverages error injectable functions with the macro
ALLOW_ERROR_INJECTION(). This macro requires minimal code changes to the
faulting function, if it meets the injectable functions requirements:
fault-injection.html#requirements-for-the-error-injectable-functions

Unfortunately this is not the case in most of the injection points of the
original series, so a wrapper is added if needed. Only a few examples are
shown in this RFC to discuss the proper pattern to apply.

The return code of the functions using ALLOW_ERROR_INJECTION() can be
conditionnally modified at runtime by tuning some debugfs entries. This
requires CONFIG_FUNCTION_ERROR_INJECTION (among others).

One way to use fault injection at probe time by making each of those
functions fail one at a time is:

    FAILTYPE=fail_function
    DEVICE="0000:00:08.0" # depends on the system
    ERRNO=-12 # -ENOMEM, other value might be needed depending on error handling

    echo N > /sys/kernel/debug/$FAILTYPE/task-filter
    echo 100 > /sys/kernel/debug/$FAILTYPE/probability
    echo 0 > /sys/kernel/debug/$FAILTYPE/interval
    echo -1 > /sys/kernel/debug/$FAILTYPE/times
    echo 0 > /sys/kernel/debug/$FAILTYPE/space
    echo 1 > /sys/kernel/debug/$FAILTYPE/verbose

    modprobe xe
    echo $DEVICE > /sys/bus/pci/drivers/xe/unbind

    grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | cut -d ' ' -f 1 | while read -r FUNCTION ; do
        echo "Injecting fault in $FUNCTION"
        echo "" > /sys/kernel/debug/$FAILTYPE/inject
        echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
        printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
        echo $DEVICE > /sys/bus/pci/drivers/xe/bind
    done

    rmmod xe

Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c | 37 +++++++++++++++++++++++++++++++---
 drivers/gpu/drm/xe/xe_tile.c   | 18 +++++++++++++++++
 drivers/gpu/drm/xe/xe_wopcm.c  |  3 +++
 3 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 4d3c794f134c..2db62aa45b39 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -6,6 +6,7 @@
 #include "xe_device.h"
 
 #include <linux/delay.h>
+#include <linux/fault-inject.h>
 #include <linux/units.h>
 
 #include <drm/drm_aperture.h>
@@ -300,6 +301,37 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
 	ttm_device_fini(&xe->ttm);
 }
 
+/* Wrapper for fault injection */
+static noinline int device_create_ttm_device_init(struct xe_device *xe)
+{
+	/*
+	 * In case of error injection, the flow is modified because ttm_device_init()
+	 * executes (likely without error) but the return code is overridden in any
+	 * case to indicate an error in ttm_device_init(), which likely did not
+	 * occur.
+	 */
+	return ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
+			       xe->drm.anon_inode->i_mapping,
+			       xe->drm.vma_offset_manager, false, false);
+
+	/*
+	 * The alternative below would also change the flow because in case of error
+	 * injection, ttm_device_init() would not be executed at all.
+	 *
+	 * int err = device_create_ttm_device_init_inject_fault();
+	 * if (err)
+	 *	return err;
+	 * return ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
+	 *		       xe->drm.anon_inode->i_mapping,
+	 *		       xe->drm.vma_offset_manager, false, false);
+	 *
+	 * ...
+	 * static noinline int device_create_ttm_device_init_inject_fault() { return 0; }
+	 * ALLOW_ERROR_INJECTION(device_create_ttm_device_init_inject_fault, ERRNO);
+	 */
+}
+ALLOW_ERROR_INJECTION(device_create_ttm_device_init, ERRNO);
+
 struct xe_device *xe_device_create(struct pci_dev *pdev,
 				   const struct pci_device_id *ent)
 {
@@ -316,9 +348,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
 	if (IS_ERR(xe))
 		return xe;
 
-	err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
-			      xe->drm.anon_inode->i_mapping,
-			      xe->drm.vma_offset_manager, false, false);
+	err = device_create_ttm_device_init(xe);
 	if (WARN_ON(err))
 		goto err;
 
@@ -550,6 +580,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
 
 	return 0;
 }
+ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
 
 static void update_device_info(struct xe_device *xe)
 {
diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
index dda5268507d8..22c0a42af9c2 100644
--- a/drivers/gpu/drm/xe/xe_tile.c
+++ b/drivers/gpu/drm/xe/xe_tile.c
@@ -3,6 +3,8 @@
  * Copyright © 2023 Intel Corporation
  */
 
+#include <linux/fault-inject.h>
+
 #include <drm/drm_managed.h>
 
 #include "xe_device.h"
@@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
 	return 0;
 }
 
+static noinline int tile_init_early_inject_fault(void)
+{
+	return 0;
+}
+ALLOW_ERROR_INJECTION(tile_init_early_inject_fault, ERRNO);
+
 /**
  * xe_tile_init_early - Initialize the tile and primary GT
  * @tile: Tile to initialize
@@ -117,6 +125,16 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
 	tile->xe = xe;
 	tile->id = id;
 
+	/*
+	 * The flow is modified because xe_tile_init_early() fails before the
+	 * first possible error, from xe_tile_alloc(). It is does not match the
+	 * 2nd requirement of
+	 * fault-injection.html#requirements-for-the-error-injectable-functions
+	 */
+	err = tile_init_early_inject_fault();
+	if (err)
+		return err;
+
 	err = xe_tile_alloc(tile);
 	if (err)
 		return err;
diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
index 93c82825d896..88a201122a22 100644
--- a/drivers/gpu/drm/xe/xe_wopcm.c
+++ b/drivers/gpu/drm/xe/xe_wopcm.c
@@ -5,6 +5,8 @@
 
 #include "xe_wopcm.h"
 
+#include <linux/fault-inject.h>
+
 #include "regs/xe_guc_regs.h"
 #include "xe_device.h"
 #include "xe_force_wake.h"
@@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
 
 	return ret;
 }
+ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* ✓ CI.Patch_applied: success for drm/xe: Use fault injection infrastructure to find issues at probe time
  2024-09-13 14:33 [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
@ 2024-09-13 18:15 ` Patchwork
  2024-09-13 18:15 ` ✗ CI.checkpatch: warning " Patchwork
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2024-09-13 18:15 UTC (permalink / raw)
  To: Francois Dugast; +Cc: intel-xe

== Series Details ==

Series: drm/xe: Use fault injection infrastructure to find issues at probe time
URL   : https://patchwork.freedesktop.org/series/138654/
State : success

== Summary ==

=== Applying kernel patches on branch 'drm-tip' with base: ===
Base commit: df3eecb1fa97 drm-tip: 2024y-09m-13d-16h-04m-40s UTC integration manifest
=== git am output follows ===
Applying: drm/xe: Use fault injection infrastructure to find issues at probe time



^ permalink raw reply	[flat|nested] 7+ messages in thread

* ✗ CI.checkpatch: warning for drm/xe: Use fault injection infrastructure to find issues at probe time
  2024-09-13 14:33 [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
  2024-09-13 18:15 ` ✓ CI.Patch_applied: success for " Patchwork
@ 2024-09-13 18:15 ` Patchwork
  2024-09-13 18:15 ` ✗ CI.KUnit: failure " Patchwork
  2024-09-13 20:15 ` [RFC v1] " Rodrigo Vivi
  3 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2024-09-13 18:15 UTC (permalink / raw)
  To: Francois Dugast; +Cc: intel-xe

== Series Details ==

Series: drm/xe: Use fault injection infrastructure to find issues at probe time
URL   : https://patchwork.freedesktop.org/series/138654/
State : warning

== Summary ==

+ KERNEL=/kernel
+ git clone https://gitlab.freedesktop.org/drm/maintainer-tools mt
Cloning into 'mt'...
warning: redirecting to https://gitlab.freedesktop.org/drm/maintainer-tools.git/
+ git -C mt rev-list -n1 origin/master
c62d7e164862503a3662a095da1c6c9014248cb2
+ cd /kernel
+ git config --global --add safe.directory /kernel
+ git log -n1
commit 8b96a301d4dd3321de7c422834dd165737733088
Author: Francois Dugast <francois.dugast@intel.com>
Date:   Fri Sep 13 16:33:05 2024 +0200

    drm/xe: Use fault injection infrastructure to find issues at probe time
    
    This RFC is a rework of https://patchwork.freedesktop.org/series/137314/
    but instead of importing custom macros from i915, it makes use of the
    standard kernel fault injection infrastructure, see fault-injection.rst.
    
    In particular, it leverages error injectable functions with the macro
    ALLOW_ERROR_INJECTION(). This macro requires minimal code changes to the
    faulting function, if it meets the injectable functions requirements:
    fault-injection.html#requirements-for-the-error-injectable-functions
    
    Unfortunately this is not the case in most of the injection points of the
    original series, so a wrapper is added if needed. Only a few examples are
    shown in this RFC to discuss the proper pattern to apply.
    
    The return code of the functions using ALLOW_ERROR_INJECTION() can be
    conditionnally modified at runtime by tuning some debugfs entries. This
    requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
    
    One way to use fault injection at probe time by making each of those
    functions fail one at a time is:
    
        FAILTYPE=fail_function
        DEVICE="0000:00:08.0" # depends on the system
        ERRNO=-12 # -ENOMEM, other value might be needed depending on error handling
    
        echo N > /sys/kernel/debug/$FAILTYPE/task-filter
        echo 100 > /sys/kernel/debug/$FAILTYPE/probability
        echo 0 > /sys/kernel/debug/$FAILTYPE/interval
        echo -1 > /sys/kernel/debug/$FAILTYPE/times
        echo 0 > /sys/kernel/debug/$FAILTYPE/space
        echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
    
        modprobe xe
        echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
    
        grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | cut -d ' ' -f 1 | while read -r FUNCTION ; do
            echo "Injecting fault in $FUNCTION"
            echo "" > /sys/kernel/debug/$FAILTYPE/inject
            echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
            printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
            echo $DEVICE > /sys/bus/pci/drivers/xe/bind
        done
    
        rmmod xe
    
    Signed-off-by: Francois Dugast <francois.dugast@intel.com>
    Cc: Lucas De Marchi <lucas.demarchi@intel.com>
    Cc: Matthew Brost <matthew.brost@intel.com>
    Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
+ /mt/dim checkpatch df3eecb1fa97d6d35569c2fa39f0da78b9f13328 drm-intel
8b96a301d4dd drm/xe: Use fault injection infrastructure to find issues at probe time
-:29: WARNING:COMMIT_LOG_LONG_LINE: Prefer a maximum 75 chars per line (possible unwrapped commit description?)
#29: 
    ERRNO=-12 # -ENOMEM, other value might be needed depending on error handling

total: 0 errors, 1 warnings, 0 checks, 109 lines checked



^ permalink raw reply	[flat|nested] 7+ messages in thread

* ✗ CI.KUnit: failure for drm/xe: Use fault injection infrastructure to find issues at probe time
  2024-09-13 14:33 [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
  2024-09-13 18:15 ` ✓ CI.Patch_applied: success for " Patchwork
  2024-09-13 18:15 ` ✗ CI.checkpatch: warning " Patchwork
@ 2024-09-13 18:15 ` Patchwork
  2024-09-13 20:15 ` [RFC v1] " Rodrigo Vivi
  3 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2024-09-13 18:15 UTC (permalink / raw)
  To: Francois Dugast; +Cc: intel-xe

== Series Details ==

Series: drm/xe: Use fault injection infrastructure to find issues at probe time
URL   : https://patchwork.freedesktop.org/series/138654/
State : failure

== Summary ==

+ trap cleanup EXIT
+ /kernel/tools/testing/kunit/kunit.py run --kunitconfig /kernel/drivers/gpu/drm/xe/.kunitconfig
ERROR:root:In file included from ../drivers/gpu/drm/xe/xe_tile.c:6:
../include/linux/fault-inject.h:97:15: error: unknown type name ‘bool’
   97 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
      |               ^~~~
../include/linux/fault-inject.h:97:43: error: unknown type name ‘gfp_t’
   97 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
      |                                           ^~~~~
../include/linux/fault-inject.h:106:57: error: unknown type name ‘gfp_t’
  106 | static inline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
      |                                                         ^~~~~
../lib/iomap.c:156:5: warning: no previous prototype for ‘ioread64_lo_hi’ [-Wmissing-prototypes]
  156 | u64 ioread64_lo_hi(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~
../lib/iomap.c:163:5: warning: no previous prototype for ‘ioread64_hi_lo’ [-Wmissing-prototypes]
  163 | u64 ioread64_hi_lo(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~
../lib/iomap.c:170:5: warning: no previous prototype for ‘ioread64be_lo_hi’ [-Wmissing-prototypes]
  170 | u64 ioread64be_lo_hi(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~~~
../lib/iomap.c:178:5: warning: no previous prototype for ‘ioread64be_hi_lo’ [-Wmissing-prototypes]
  178 | u64 ioread64be_hi_lo(const void __iomem *addr)
      |     ^~~~~~~~~~~~~~~~
../lib/iomap.c:264:6: warning: no previous prototype for ‘iowrite64_lo_hi’ [-Wmissing-prototypes]
  264 | void iowrite64_lo_hi(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~
../lib/iomap.c:272:6: warning: no previous prototype for ‘iowrite64_hi_lo’ [-Wmissing-prototypes]
  272 | void iowrite64_hi_lo(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~
../lib/iomap.c:280:6: warning: no previous prototype for ‘iowrite64be_lo_hi’ [-Wmissing-prototypes]
  280 | void iowrite64be_lo_hi(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~~~
../lib/iomap.c:288:6: warning: no previous prototype for ‘iowrite64be_hi_lo’ [-Wmissing-prototypes]
  288 | void iowrite64be_hi_lo(u64 val, void __iomem *addr)
      |      ^~~~~~~~~~~~~~~~~
make[7]: *** [../scripts/Makefile.build:244: drivers/gpu/drm/xe/xe_tile.o] Error 1
make[7]: *** Waiting for unfinished jobs....
make[6]: *** [../scripts/Makefile.build:485: drivers/gpu/drm/xe] Error 2
make[5]: *** [../scripts/Makefile.build:485: drivers/gpu/drm] Error 2
make[4]: *** [../scripts/Makefile.build:485: drivers/gpu] Error 2
make[3]: *** [../scripts/Makefile.build:485: drivers] Error 2
make[2]: *** [/kernel/Makefile:1926: .] Error 2
make[1]: *** [/kernel/Makefile:224: __sub-make] Error 2
make: *** [Makefile:224: __sub-make] Error 2

[18:15:16] Configuring KUnit Kernel ...
Generating .config ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
[18:15:21] Building KUnit Kernel ...
Populating config with:
$ make ARCH=um O=.kunit olddefconfig
Building with:
$ make ARCH=um O=.kunit --jobs=48
+ cleanup
++ stat -c %u:%g /kernel
+ chown -R 1003:1003 /kernel



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time
  2024-09-13 14:33 [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
                   ` (2 preceding siblings ...)
  2024-09-13 18:15 ` ✗ CI.KUnit: failure " Patchwork
@ 2024-09-13 20:15 ` Rodrigo Vivi
  2024-09-16  8:45   ` Francois Dugast
  3 siblings, 1 reply; 7+ messages in thread
From: Rodrigo Vivi @ 2024-09-13 20:15 UTC (permalink / raw)
  To: Francois Dugast; +Cc: intel-xe, Lucas De Marchi, Matthew Brost

On Fri, Sep 13, 2024 at 04:33:05PM +0200, Francois Dugast wrote:
> This RFC is a rework of https://patchwork.freedesktop.org/series/137314/
> but instead of importing custom macros from i915, it makes use of the
> standard kernel fault injection infrastructure, see fault-injection.rst.
> 
> In particular, it leverages error injectable functions with the macro
> ALLOW_ERROR_INJECTION(). This macro requires minimal code changes to the
> faulting function, if it meets the injectable functions requirements:
> fault-injection.html#requirements-for-the-error-injectable-functions
> 
> Unfortunately this is not the case in most of the injection points of the
> original series, so a wrapper is added if needed. Only a few examples are
> shown in this RFC to discuss the proper pattern to apply.
> 
> The return code of the functions using ALLOW_ERROR_INJECTION() can be
> conditionnally modified at runtime by tuning some debugfs entries. This
> requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
> 
> One way to use fault injection at probe time by making each of those
> functions fail one at a time is:
> 
>     FAILTYPE=fail_function
>     DEVICE="0000:00:08.0" # depends on the system
>     ERRNO=-12 # -ENOMEM, other value might be needed depending on error handling
> 
>     echo N > /sys/kernel/debug/$FAILTYPE/task-filter
>     echo 100 > /sys/kernel/debug/$FAILTYPE/probability
>     echo 0 > /sys/kernel/debug/$FAILTYPE/interval
>     echo -1 > /sys/kernel/debug/$FAILTYPE/times
>     echo 0 > /sys/kernel/debug/$FAILTYPE/space
>     echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
> 
>     modprobe xe
>     echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
> 
>     grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | cut -d ' ' -f 1 | while read -r FUNCTION ; do
>         echo "Injecting fault in $FUNCTION"
>         echo "" > /sys/kernel/debug/$FAILTYPE/inject
>         echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
>         printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
>         echo $DEVICE > /sys/bus/pci/drivers/xe/bind
>     done
> 
>     rmmod xe
> 
> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device.c | 37 +++++++++++++++++++++++++++++++---
>  drivers/gpu/drm/xe/xe_tile.c   | 18 +++++++++++++++++
>  drivers/gpu/drm/xe/xe_wopcm.c  |  3 +++
>  3 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 4d3c794f134c..2db62aa45b39 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -6,6 +6,7 @@
>  #include "xe_device.h"
>  
>  #include <linux/delay.h>
> +#include <linux/fault-inject.h>
>  #include <linux/units.h>
>  
>  #include <drm/drm_aperture.h>
> @@ -300,6 +301,37 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
>  	ttm_device_fini(&xe->ttm);
>  }
>  
> +/* Wrapper for fault injection */
> +static noinline int device_create_ttm_device_init(struct xe_device *xe)
> +{
> +	/*
> +	 * In case of error injection, the flow is modified because ttm_device_init()
> +	 * executes (likely without error) but the return code is overridden in any
> +	 * case to indicate an error in ttm_device_init(), which likely did not
> +	 * occur.
> +	 */
> +	return ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> +			       xe->drm.anon_inode->i_mapping,
> +			       xe->drm.vma_offset_manager, false, false);
> +
> +	/*
> +	 * The alternative below would also change the flow because in case of error
> +	 * injection, ttm_device_init() would not be executed at all.
> +	 *
> +	 * int err = device_create_ttm_device_init_inject_fault();
> +	 * if (err)
> +	 *	return err;
> +	 * return ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> +	 *		       xe->drm.anon_inode->i_mapping,
> +	 *		       xe->drm.vma_offset_manager, false, false);
> +	 *
> +	 * ...
> +	 * static noinline int device_create_ttm_device_init_inject_fault() { return 0; }
> +	 * ALLOW_ERROR_INJECTION(device_create_ttm_device_init_inject_fault, ERRNO);
> +	 */
> +}
> +ALLOW_ERROR_INJECTION(device_create_ttm_device_init, ERRNO);
> +
>  struct xe_device *xe_device_create(struct pci_dev *pdev,
>  				   const struct pci_device_id *ent)
>  {
> @@ -316,9 +348,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
>  	if (IS_ERR(xe))
>  		return xe;
>  
> -	err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> -			      xe->drm.anon_inode->i_mapping,
> -			      xe->drm.vma_offset_manager, false, false);
> +	err = device_create_ttm_device_init(xe);

I believe the function name could be simplified to xe_device_init_ttm...

>  	if (WARN_ON(err))
>  		goto err;
>  
> @@ -550,6 +580,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
>  
>  	return 0;
>  }
> +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);

I liked how simple that is....

>  
>  static void update_device_info(struct xe_device *xe)
>  {
> diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> index dda5268507d8..22c0a42af9c2 100644
> --- a/drivers/gpu/drm/xe/xe_tile.c
> +++ b/drivers/gpu/drm/xe/xe_tile.c
> @@ -3,6 +3,8 @@
>   * Copyright © 2023 Intel Corporation
>   */
>  
> +#include <linux/fault-inject.h>
> +
>  #include <drm/drm_managed.h>
>  
>  #include "xe_device.h"
> @@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
>  	return 0;
>  }
>  
> +static noinline int tile_init_early_inject_fault(void)
> +{
> +	return 0;
> +}
> +ALLOW_ERROR_INJECTION(tile_init_early_inject_fault, ERRNO);
> +
>  /**
>   * xe_tile_init_early - Initialize the tile and primary GT
>   * @tile: Tile to initialize
> @@ -117,6 +125,16 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
>  	tile->xe = xe;
>  	tile->id = id;
>  
> +	/*
> +	 * The flow is modified because xe_tile_init_early() fails before the
> +	 * first possible error, from xe_tile_alloc(). It is does not match the
> +	 * 2nd requirement of
> +	 * fault-injection.html#requirements-for-the-error-injectable-functions
> +	 */

I'm afraid I didn't understand this.

and the name could perhaps start with fault_inject?!

or maybe:

xe_fault_inject_tile_init_early();

> +	err = tile_init_early_inject_fault();
> +	if (err)
> +		return err;
> +
>  	err = xe_tile_alloc(tile);
>  	if (err)
>  		return err;
> diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> index 93c82825d896..88a201122a22 100644
> --- a/drivers/gpu/drm/xe/xe_wopcm.c
> +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> @@ -5,6 +5,8 @@
>  
>  #include "xe_wopcm.h"
>  
> +#include <linux/fault-inject.h>
> +
>  #include "regs/xe_guc_regs.h"
>  #include "xe_device.h"
>  #include "xe_force_wake.h"
> @@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
>  
>  	return ret;
>  }
> +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
> -- 
> 2.43.0
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time
  2024-09-13 20:15 ` [RFC v1] " Rodrigo Vivi
@ 2024-09-16  8:45   ` Francois Dugast
  2024-09-18 12:55     ` Francois Dugast
  0 siblings, 1 reply; 7+ messages in thread
From: Francois Dugast @ 2024-09-16  8:45 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: intel-xe, Lucas De Marchi, Matthew Brost

On Fri, Sep 13, 2024 at 04:15:49PM -0400, Rodrigo Vivi wrote:
> On Fri, Sep 13, 2024 at 04:33:05PM +0200, Francois Dugast wrote:
> > This RFC is a rework of https://patchwork.freedesktop.org/series/137314/
> > but instead of importing custom macros from i915, it makes use of the
> > standard kernel fault injection infrastructure, see fault-injection.rst.
> > 
> > In particular, it leverages error injectable functions with the macro
> > ALLOW_ERROR_INJECTION(). This macro requires minimal code changes to the
> > faulting function, if it meets the injectable functions requirements:
> > fault-injection.html#requirements-for-the-error-injectable-functions
> > 
> > Unfortunately this is not the case in most of the injection points of the
> > original series, so a wrapper is added if needed. Only a few examples are
> > shown in this RFC to discuss the proper pattern to apply.
> > 
> > The return code of the functions using ALLOW_ERROR_INJECTION() can be
> > conditionnally modified at runtime by tuning some debugfs entries. This
> > requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
> > 
> > One way to use fault injection at probe time by making each of those
> > functions fail one at a time is:
> > 
> >     FAILTYPE=fail_function
> >     DEVICE="0000:00:08.0" # depends on the system
> >     ERRNO=-12 # -ENOMEM, other value might be needed depending on error handling
> > 
> >     echo N > /sys/kernel/debug/$FAILTYPE/task-filter
> >     echo 100 > /sys/kernel/debug/$FAILTYPE/probability
> >     echo 0 > /sys/kernel/debug/$FAILTYPE/interval
> >     echo -1 > /sys/kernel/debug/$FAILTYPE/times
> >     echo 0 > /sys/kernel/debug/$FAILTYPE/space
> >     echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
> > 
> >     modprobe xe
> >     echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
> > 
> >     grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | cut -d ' ' -f 1 | while read -r FUNCTION ; do
> >         echo "Injecting fault in $FUNCTION"
> >         echo "" > /sys/kernel/debug/$FAILTYPE/inject
> >         echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
> >         printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
> >         echo $DEVICE > /sys/bus/pci/drivers/xe/bind
> >     done
> > 
> >     rmmod xe
> > 
> > Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> > Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> > Cc: Matthew Brost <matthew.brost@intel.com>
> > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_device.c | 37 +++++++++++++++++++++++++++++++---
> >  drivers/gpu/drm/xe/xe_tile.c   | 18 +++++++++++++++++
> >  drivers/gpu/drm/xe/xe_wopcm.c  |  3 +++
> >  3 files changed, 55 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > index 4d3c794f134c..2db62aa45b39 100644
> > --- a/drivers/gpu/drm/xe/xe_device.c
> > +++ b/drivers/gpu/drm/xe/xe_device.c
> > @@ -6,6 +6,7 @@
> >  #include "xe_device.h"
> >  
> >  #include <linux/delay.h>
> > +#include <linux/fault-inject.h>
> >  #include <linux/units.h>
> >  
> >  #include <drm/drm_aperture.h>
> > @@ -300,6 +301,37 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
> >  	ttm_device_fini(&xe->ttm);
> >  }
> >  
> > +/* Wrapper for fault injection */
> > +static noinline int device_create_ttm_device_init(struct xe_device *xe)
> > +{
> > +	/*
> > +	 * In case of error injection, the flow is modified because ttm_device_init()
> > +	 * executes (likely without error) but the return code is overridden in any
> > +	 * case to indicate an error in ttm_device_init(), which likely did not
> > +	 * occur.
> > +	 */
> > +	return ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> > +			       xe->drm.anon_inode->i_mapping,
> > +			       xe->drm.vma_offset_manager, false, false);
> > +
> > +	/*
> > +	 * The alternative below would also change the flow because in case of error
> > +	 * injection, ttm_device_init() would not be executed at all.
> > +	 *
> > +	 * int err = device_create_ttm_device_init_inject_fault();
> > +	 * if (err)
> > +	 *	return err;
> > +	 * return ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> > +	 *		       xe->drm.anon_inode->i_mapping,
> > +	 *		       xe->drm.vma_offset_manager, false, false);
> > +	 *
> > +	 * ...
> > +	 * static noinline int device_create_ttm_device_init_inject_fault() { return 0; }
> > +	 * ALLOW_ERROR_INJECTION(device_create_ttm_device_init_inject_fault, ERRNO);
> > +	 */
> > +}
> > +ALLOW_ERROR_INJECTION(device_create_ttm_device_init, ERRNO);
> > +
> >  struct xe_device *xe_device_create(struct pci_dev *pdev,
> >  				   const struct pci_device_id *ent)
> >  {
> > @@ -316,9 +348,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
> >  	if (IS_ERR(xe))
> >  		return xe;
> >  
> > -	err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> > -			      xe->drm.anon_inode->i_mapping,
> > -			      xe->drm.vma_offset_manager, false, false);
> > +	err = device_create_ttm_device_init(xe);
> 
> I believe the function name could be simplified to xe_device_init_ttm...

Yes indeed.

> 
> >  	if (WARN_ON(err))
> >  		goto err;
> >  
> > @@ -550,6 +580,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
> >  
> >  	return 0;
> >  }
> > +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
> 
> I liked how simple that is....

It is quite straightforward when the function is error injectable, which is
often not the case. Especially this requirement is often not met: "The
function does not execute any code which can change any state before the first
error return."

This is why other examples in this patch require an extra wrapper, see below.

> 
> >  
> >  static void update_device_info(struct xe_device *xe)
> >  {
> > diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> > index dda5268507d8..22c0a42af9c2 100644
> > --- a/drivers/gpu/drm/xe/xe_tile.c
> > +++ b/drivers/gpu/drm/xe/xe_tile.c
> > @@ -3,6 +3,8 @@
> >   * Copyright © 2023 Intel Corporation
> >   */
> >  
> > +#include <linux/fault-inject.h>
> > +
> >  #include <drm/drm_managed.h>
> >  
> >  #include "xe_device.h"
> > @@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
> >  	return 0;
> >  }
> >  
> > +static noinline int tile_init_early_inject_fault(void)
> > +{
> > +	return 0;
> > +}
> > +ALLOW_ERROR_INJECTION(tile_init_early_inject_fault, ERRNO);
> > +
> >  /**
> >   * xe_tile_init_early - Initialize the tile and primary GT
> >   * @tile: Tile to initialize
> > @@ -117,6 +125,16 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
> >  	tile->xe = xe;
> >  	tile->id = id;
> >  
> > +	/*
> > +	 * The flow is modified because xe_tile_init_early() fails before the
> > +	 * first possible error, from xe_tile_alloc(). It is does not match the
> > +	 * 2nd requirement of
> > +	 * fault-injection.html#requirements-for-the-error-injectable-functions
> > +	 */
> 
> I'm afraid I didn't understand this.

In this example we end up returning an error from an artificial code location:

    err = tile_init_early_inject_fault();
    if (err)
        return err;

... before even starting to execute the first statement which could return a
real error, xe_tile_alloc(). Not focusing on this particular example but in
general this can have side effects and create false positives if the state
is not changed as expected, either because the first statement that can fail
was not executed at all, or because all was executed successfully but still
reported as error.

Not sure if this is overthinking, after all this issue is also present with
the i915 macro for error injection and maybe the pros outweigh the cons.

Francois

> 
> and the name could perhaps start with fault_inject?!
> 
> or maybe:
> 
> xe_fault_inject_tile_init_early();

Sure, whatever helps easily identify wrappers only added for fault
injection. Maybe we can even remove the xe_ prefix as the function can be
static?

Francois

> 
> > +	err = tile_init_early_inject_fault();
> > +	if (err)
> > +		return err;
> > +
> >  	err = xe_tile_alloc(tile);
> >  	if (err)
> >  		return err;
> > diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> > index 93c82825d896..88a201122a22 100644
> > --- a/drivers/gpu/drm/xe/xe_wopcm.c
> > +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> > @@ -5,6 +5,8 @@
> >  
> >  #include "xe_wopcm.h"
> >  
> > +#include <linux/fault-inject.h>
> > +
> >  #include "regs/xe_guc_regs.h"
> >  #include "xe_device.h"
> >  #include "xe_force_wake.h"
> > @@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
> >  
> >  	return ret;
> >  }
> > +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
> > -- 
> > 2.43.0
> > 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time
  2024-09-16  8:45   ` Francois Dugast
@ 2024-09-18 12:55     ` Francois Dugast
  0 siblings, 0 replies; 7+ messages in thread
From: Francois Dugast @ 2024-09-18 12:55 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: intel-xe, Lucas De Marchi, Matthew Brost, Michal Wajdeczko

On Mon, Sep 16, 2024 at 10:45:37AM +0200, Francois Dugast wrote:
> On Fri, Sep 13, 2024 at 04:15:49PM -0400, Rodrigo Vivi wrote:
> > On Fri, Sep 13, 2024 at 04:33:05PM +0200, Francois Dugast wrote:
> > > This RFC is a rework of https://patchwork.freedesktop.org/series/137314/
> > > but instead of importing custom macros from i915, it makes use of the
> > > standard kernel fault injection infrastructure, see fault-injection.rst.
> > > 
> > > In particular, it leverages error injectable functions with the macro
> > > ALLOW_ERROR_INJECTION(). This macro requires minimal code changes to the
> > > faulting function, if it meets the injectable functions requirements:
> > > fault-injection.html#requirements-for-the-error-injectable-functions
> > > 
> > > Unfortunately this is not the case in most of the injection points of the
> > > original series, so a wrapper is added if needed. Only a few examples are
> > > shown in this RFC to discuss the proper pattern to apply.
> > > 
> > > The return code of the functions using ALLOW_ERROR_INJECTION() can be
> > > conditionnally modified at runtime by tuning some debugfs entries. This
> > > requires CONFIG_FUNCTION_ERROR_INJECTION (among others).
> > > 
> > > One way to use fault injection at probe time by making each of those
> > > functions fail one at a time is:
> > > 
> > >     FAILTYPE=fail_function
> > >     DEVICE="0000:00:08.0" # depends on the system
> > >     ERRNO=-12 # -ENOMEM, other value might be needed depending on error handling
> > > 
> > >     echo N > /sys/kernel/debug/$FAILTYPE/task-filter
> > >     echo 100 > /sys/kernel/debug/$FAILTYPE/probability
> > >     echo 0 > /sys/kernel/debug/$FAILTYPE/interval
> > >     echo -1 > /sys/kernel/debug/$FAILTYPE/times
> > >     echo 0 > /sys/kernel/debug/$FAILTYPE/space
> > >     echo 1 > /sys/kernel/debug/$FAILTYPE/verbose
> > > 
> > >     modprobe xe
> > >     echo $DEVICE > /sys/bus/pci/drivers/xe/unbind
> > > 
> > >     grep -oP "^.* \[xe\]" /sys/kernel/debug/$FAILTYPE/injectable | cut -d ' ' -f 1 | while read -r FUNCTION ; do
> > >         echo "Injecting fault in $FUNCTION"
> > >         echo "" > /sys/kernel/debug/$FAILTYPE/inject
> > >         echo $FUNCTION > /sys/kernel/debug/$FAILTYPE/inject
> > >         printf %#x $ERRNO > /sys/kernel/debug/$FAILTYPE/$FUNCTION/retval
> > >         echo $DEVICE > /sys/bus/pci/drivers/xe/bind
> > >     done
> > > 
> > >     rmmod xe
> > > 
> > > Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> > > Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> > > Cc: Matthew Brost <matthew.brost@intel.com>
> > > Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>

+ Michal

Any thoughts on this approach?

> > > ---
> > >  drivers/gpu/drm/xe/xe_device.c | 37 +++++++++++++++++++++++++++++++---
> > >  drivers/gpu/drm/xe/xe_tile.c   | 18 +++++++++++++++++
> > >  drivers/gpu/drm/xe/xe_wopcm.c  |  3 +++
> > >  3 files changed, 55 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > > index 4d3c794f134c..2db62aa45b39 100644
> > > --- a/drivers/gpu/drm/xe/xe_device.c
> > > +++ b/drivers/gpu/drm/xe/xe_device.c
> > > @@ -6,6 +6,7 @@
> > >  #include "xe_device.h"
> > >  
> > >  #include <linux/delay.h>
> > > +#include <linux/fault-inject.h>
> > >  #include <linux/units.h>
> > >  
> > >  #include <drm/drm_aperture.h>
> > > @@ -300,6 +301,37 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
> > >  	ttm_device_fini(&xe->ttm);
> > >  }
> > >  
> > > +/* Wrapper for fault injection */
> > > +static noinline int device_create_ttm_device_init(struct xe_device *xe)
> > > +{
> > > +	/*
> > > +	 * In case of error injection, the flow is modified because ttm_device_init()
> > > +	 * executes (likely without error) but the return code is overridden in any
> > > +	 * case to indicate an error in ttm_device_init(), which likely did not
> > > +	 * occur.
> > > +	 */
> > > +	return ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> > > +			       xe->drm.anon_inode->i_mapping,
> > > +			       xe->drm.vma_offset_manager, false, false);
> > > +
> > > +	/*
> > > +	 * The alternative below would also change the flow because in case of error
> > > +	 * injection, ttm_device_init() would not be executed at all.
> > > +	 *
> > > +	 * int err = device_create_ttm_device_init_inject_fault();
> > > +	 * if (err)
> > > +	 *	return err;
> > > +	 * return ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> > > +	 *		       xe->drm.anon_inode->i_mapping,
> > > +	 *		       xe->drm.vma_offset_manager, false, false);
> > > +	 *
> > > +	 * ...
> > > +	 * static noinline int device_create_ttm_device_init_inject_fault() { return 0; }
> > > +	 * ALLOW_ERROR_INJECTION(device_create_ttm_device_init_inject_fault, ERRNO);
> > > +	 */
> > > +}
> > > +ALLOW_ERROR_INJECTION(device_create_ttm_device_init, ERRNO);
> > > +
> > >  struct xe_device *xe_device_create(struct pci_dev *pdev,
> > >  				   const struct pci_device_id *ent)
> > >  {
> > > @@ -316,9 +348,7 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
> > >  	if (IS_ERR(xe))
> > >  		return xe;
> > >  
> > > -	err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev,
> > > -			      xe->drm.anon_inode->i_mapping,
> > > -			      xe->drm.vma_offset_manager, false, false);
> > > +	err = device_create_ttm_device_init(xe);
> > 
> > I believe the function name could be simplified to xe_device_init_ttm...
> 
> Yes indeed.
> 
> > 
> > >  	if (WARN_ON(err))
> > >  		goto err;
> > >  
> > > @@ -550,6 +580,7 @@ static int wait_for_lmem_ready(struct xe_device *xe)
> > >  
> > >  	return 0;
> > >  }
> > > +ALLOW_ERROR_INJECTION(wait_for_lmem_ready, ERRNO);
> > 
> > I liked how simple that is....
> 
> It is quite straightforward when the function is error injectable, which is
> often not the case. Especially this requirement is often not met: "The
> function does not execute any code which can change any state before the first
> error return."
> 
> This is why other examples in this patch require an extra wrapper, see below.
> 
> > 
> > >  
> > >  static void update_device_info(struct xe_device *xe)
> > >  {
> > > diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c
> > > index dda5268507d8..22c0a42af9c2 100644
> > > --- a/drivers/gpu/drm/xe/xe_tile.c
> > > +++ b/drivers/gpu/drm/xe/xe_tile.c
> > > @@ -3,6 +3,8 @@
> > >   * Copyright © 2023 Intel Corporation
> > >   */
> > >  
> > > +#include <linux/fault-inject.h>
> > > +
> > >  #include <drm/drm_managed.h>
> > >  
> > >  #include "xe_device.h"
> > > @@ -99,6 +101,12 @@ static int xe_tile_alloc(struct xe_tile *tile)
> > >  	return 0;
> > >  }
> > >  
> > > +static noinline int tile_init_early_inject_fault(void)
> > > +{
> > > +	return 0;
> > > +}
> > > +ALLOW_ERROR_INJECTION(tile_init_early_inject_fault, ERRNO);
> > > +
> > >  /**
> > >   * xe_tile_init_early - Initialize the tile and primary GT
> > >   * @tile: Tile to initialize
> > > @@ -117,6 +125,16 @@ int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id)
> > >  	tile->xe = xe;
> > >  	tile->id = id;
> > >  
> > > +	/*
> > > +	 * The flow is modified because xe_tile_init_early() fails before the
> > > +	 * first possible error, from xe_tile_alloc(). It is does not match the
> > > +	 * 2nd requirement of
> > > +	 * fault-injection.html#requirements-for-the-error-injectable-functions
> > > +	 */
> > 
> > I'm afraid I didn't understand this.
> 
> In this example we end up returning an error from an artificial code location:
> 
>     err = tile_init_early_inject_fault();
>     if (err)
>         return err;
> 
> ... before even starting to execute the first statement which could return a
> real error, xe_tile_alloc(). Not focusing on this particular example but in
> general this can have side effects and create false positives if the state
> is not changed as expected, either because the first statement that can fail
> was not executed at all, or because all was executed successfully but still
> reported as error.
> 
> Not sure if this is overthinking, after all this issue is also present with
> the i915 macro for error injection and maybe the pros outweigh the cons.
> 
> Francois
> 
> > 
> > and the name could perhaps start with fault_inject?!
> > 
> > or maybe:
> > 
> > xe_fault_inject_tile_init_early();
> 
> Sure, whatever helps easily identify wrappers only added for fault
> injection. Maybe we can even remove the xe_ prefix as the function can be
> static?
> 
> Francois
> 
> > 
> > > +	err = tile_init_early_inject_fault();
> > > +	if (err)
> > > +		return err;
> > > +
> > >  	err = xe_tile_alloc(tile);
> > >  	if (err)
> > >  		return err;
> > > diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c
> > > index 93c82825d896..88a201122a22 100644
> > > --- a/drivers/gpu/drm/xe/xe_wopcm.c
> > > +++ b/drivers/gpu/drm/xe/xe_wopcm.c
> > > @@ -5,6 +5,8 @@
> > >  
> > >  #include "xe_wopcm.h"
> > >  
> > > +#include <linux/fault-inject.h>
> > > +
> > >  #include "regs/xe_guc_regs.h"
> > >  #include "xe_device.h"
> > >  #include "xe_force_wake.h"
> > > @@ -268,3 +270,4 @@ int xe_wopcm_init(struct xe_wopcm *wopcm)
> > >  
> > >  	return ret;
> > >  }
> > > +ALLOW_ERROR_INJECTION(xe_wopcm_init, ERRNO);
> > > -- 
> > > 2.43.0
> > > 

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-09-18 12:55 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-09-13 14:33 [RFC v1] drm/xe: Use fault injection infrastructure to find issues at probe time Francois Dugast
2024-09-13 18:15 ` ✓ CI.Patch_applied: success for " Patchwork
2024-09-13 18:15 ` ✗ CI.checkpatch: warning " Patchwork
2024-09-13 18:15 ` ✗ CI.KUnit: failure " Patchwork
2024-09-13 20:15 ` [RFC v1] " Rodrigo Vivi
2024-09-16  8:45   ` Francois Dugast
2024-09-18 12:55     ` Francois Dugast

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox