All of lore.kernel.org
 help / color / mirror / Atom feed
From: Rodrigo Vivi <rodrigo.vivi@intel.com>
To: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>,
	intel-xe@lists.freedesktop.org
Subject: Re: [Intel-xe] [PATCH v5 2/2] drm/xe: Introduce fault injection for gt reset
Date: Thu, 20 Jul 2023 12:28:51 -0400	[thread overview]
Message-ID: <ZLlgw5A0DebwmGqv@intel.com> (raw)
In-Reply-To: <20230718133216.3079521-3-himal.prasad.ghimiray@intel.com>

On Tue, Jul 18, 2023 at 07:02:16PM +0530, Himal Prasad Ghimiray wrote:
> To trigger gt reset failure:
>  echo 100 >  /sys/kernel/debug/dri/<cardX>/fail_gt_reset/probability
>  echo 2 >  /sys/kernel/debug/dri/<cardX>/fail_gt_reset/times

Why 2 and not 1?

Anyway, neat solution!

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>

> 
> Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_debugfs.c | 10 ++++++++++
>  drivers/gpu/drm/xe/xe_gt.c      |  8 +++++++-
>  drivers/gpu/drm/xe/xe_gt.h      | 14 ++++++++++++++
>  3 files changed, 31 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
> index 7827a785b020..08d5bdf4cf61 100644
> --- a/drivers/gpu/drm/xe/xe_debugfs.c
> +++ b/drivers/gpu/drm/xe/xe_debugfs.c
> @@ -5,6 +5,7 @@
>  
>  #include "xe_debugfs.h"
>  
> +#include <linux/fault-inject.h>
>  #include <linux/string_helpers.h>
>  
>  #include <drm/drm_debugfs.h>
> @@ -20,6 +21,10 @@
>  #include "xe_vm.h"
>  #endif
>  
> +#ifdef CONFIG_FAULT_INJECTION
> +DECLARE_FAULT_ATTR(gt_reset_failure);
> +#endif
> +
>  static struct xe_device *node_to_xe(struct drm_info_node *node)
>  {
>  	return to_xe_device(node->minor->dev);
> @@ -131,4 +136,9 @@ void xe_debugfs_register(struct xe_device *xe)
>  
>  	for_each_gt(gt, xe, id)
>  		xe_gt_debugfs_register(gt);
> +
> +#ifdef CONFIG_FAULT_INJECTION
> +	fault_create_debugfs_attr("fail_gt_reset", root, &gt_reset_failure);
> +#endif
> +
>  }
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 1db4d610f2fd..370d4b96e616 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -525,6 +525,11 @@ static int gt_reset(struct xe_gt *gt)
>  
>  	xe_gt_info(gt, "reset started\n");
>  
> +	if (xe_fault_inject_gt_reset()) {
> +		err = -ECANCELED;
> +		goto err_fail;
> +	}
> +
>  	xe_gt_sanitize(gt);
>  
>  	xe_device_mem_access_get(gt_to_xe(gt));
> @@ -562,6 +567,7 @@ static int gt_reset(struct xe_gt *gt)
>  err_msg:
>  	XE_WARN_ON(xe_uc_start(&gt->uc));
>  	xe_device_mem_access_put(gt_to_xe(gt));
> +err_fail:
>  	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
>  
>  	/* Notify userspace about gt reset failure */
> @@ -582,7 +588,7 @@ void xe_gt_reset_async(struct xe_gt *gt)
>  	xe_gt_info(gt, "trying reset\n");
>  
>  	/* Don't do a reset while one is already in flight */
> -	if (xe_uc_reset_prepare(&gt->uc))
> +	if (!xe_fault_inject_gt_reset() && xe_uc_reset_prepare(&gt->uc))
>  		return;
>  
>  	xe_gt_info(gt, "reset queued\n");
> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
> index 7298653a73de..caded203a8a0 100644
> --- a/drivers/gpu/drm/xe/xe_gt.h
> +++ b/drivers/gpu/drm/xe/xe_gt.h
> @@ -7,6 +7,7 @@
>  #define _XE_GT_H_
>  
>  #include <drm/drm_util.h>
> +#include <linux/fault-inject.h>
>  
>  #include "xe_device_types.h"
>  #include "xe_hw_engine.h"
> @@ -16,6 +17,19 @@
>  		for_each_if(((hwe__) = (gt__)->hw_engines + (id__)) && \
>  			  xe_hw_engine_is_valid((hwe__)))
>  
> +#ifdef CONFIG_FAULT_INJECTION
> +extern struct fault_attr gt_reset_failure;
> +static inline bool xe_fault_inject_gt_reset(void)
> +{
> +	return should_fail(&gt_reset_failure, 1);
> +}
> +#else
> +static inline bool xe_fault_inject_gt_reset(void)
> +{
> +	return false;
> +}
> +#endif
> +
>  struct xe_gt *xe_gt_alloc(struct xe_tile *tile);
>  int xe_gt_init_early(struct xe_gt *gt);
>  int xe_gt_init(struct xe_gt *gt);
> -- 
> 2.25.1
> 

  reply	other threads:[~2023-07-20 16:29 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-18 13:32 [Intel-xe] [PATCH v5 0/2] Notify userspace about uevent failure Himal Prasad Ghimiray
2023-07-18 13:32 ` [Intel-xe] [PATCH v5 1/2] drm/xe: Notify Userspace when gt reset fails Himal Prasad Ghimiray
2023-07-18 15:01   ` Nilawar, Badal
2023-07-18 23:52   ` Roper, Matthew D
2023-07-19  1:51     ` Ghimiray, Himal Prasad
2023-07-20 16:32       ` Rodrigo Vivi
2023-07-24  8:35         ` Ghimiray, Himal Prasad
2023-07-18 13:32 ` [Intel-xe] [PATCH v5 2/2] drm/xe: Introduce fault injection for gt reset Himal Prasad Ghimiray
2023-07-20 16:28   ` Rodrigo Vivi [this message]
2023-07-24  8:38     ` Ghimiray, Himal Prasad
2023-07-18 15:52 ` [Intel-xe] ✓ CI.Patch_applied: success for Notify userspace about uevent failure Patchwork
2023-07-18 15:52 ` [Intel-xe] ✗ CI.checkpatch: warning " Patchwork
2023-07-18 15:52 ` [Intel-xe] ✗ CI.KUnit: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZLlgw5A0DebwmGqv@intel.com \
    --to=rodrigo.vivi@intel.com \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=lucas.demarchi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.