From: Rodrigo Vivi <rodrigo.vivi@intel.com>
To: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: <intel-xe@lists.freedesktop.org>
Subject: Re: [PATCH 5/6] drm/xe: Introduce the busted_mode debugfs
Date: Mon, 18 Mar 2024 15:45:57 -0400 [thread overview]
Message-ID: <ZfiZ9Y9M_tF-GBNv@intel.com> (raw)
In-Reply-To: <qntepfekkplyecpcjgvosunoxvg7xdtrrgo6d6j6kgoai6lz54@3wc7rqxj3pdk>
On Mon, Mar 18, 2024 at 02:16:49PM -0500, Lucas De Marchi wrote:
> On Fri, Mar 15, 2024 at 10:01:07AM -0400, Rodrigo Vivi wrote:
> > So, the busted mode can be selected per device at runtime,
> > before the tests or before reproducing the issue.
> >
> > Cc: Lucas De Marchi <lucas.demarchi@intel.com>
> > Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_debugfs.c | 44 ++++++++++++++++++++++++++++
> > drivers/gpu/drm/xe/xe_device.c | 12 ++++++--
> > drivers/gpu/drm/xe/xe_device.h | 2 +-
> > drivers/gpu/drm/xe/xe_device_types.h | 11 +++++--
> > drivers/gpu/drm/xe/xe_guc_ads.c | 6 ++--
> > drivers/gpu/drm/xe/xe_guc_submit.c | 5 ++--
> > 6 files changed, 71 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
> > index 8abdf3c17e1d..175ba306c3eb 100644
> > --- a/drivers/gpu/drm/xe/xe_debugfs.c
> > +++ b/drivers/gpu/drm/xe/xe_debugfs.c
> > @@ -106,6 +106,47 @@ static const struct file_operations forcewake_all_fops = {
> > .release = forcewake_release,
> > };
> >
> > +static ssize_t busted_mode_show(struct file *f, char __user *ubuf,
> > + size_t size, loff_t *pos)
> > +{
> > + struct xe_device *xe = file_inode(f)->i_private;
> > + char buf[32];
> > + int len = 0;
> > +
> > + mutex_lock(&xe->busted.lock);
> > + len = scnprintf(buf, sizeof(buf), "%d\n", xe->busted.mode);
> > + mutex_unlock(&xe->busted.lock);
> > +
> > + return simple_read_from_buffer(ubuf, size, pos, buf, len);
> > +}
> > +
> > +static ssize_t busted_mode_set(struct file *f, const char __user *ubuf,
> > + size_t size, loff_t *pos)
> > +{
> > + struct xe_device *xe = file_inode(f)->i_private;
> > + u32 busted_mode;
> > + ssize_t ret;
> > +
> > + ret = kstrtouint_from_user(ubuf, size, 0, &busted_mode);
> > + if (ret)
> > + return ret;
> > +
> > + if (busted_mode > 2)
> > + return -EINVAL;
> > +
> > + mutex_lock(&xe->busted.lock);
> > + xe->busted.mode = busted_mode;
> > + mutex_unlock(&xe->busted.lock);
> > +
> > + return size;
> > +}
> > +
> > +static const struct file_operations busted_mode_fops = {
> > + .owner = THIS_MODULE,
> > + .read = busted_mode_show,
> > + .write = busted_mode_set,
> > +};
> > +
> > void xe_debugfs_register(struct xe_device *xe)
> > {
> > struct ttm_device *bdev = &xe->ttm;
> > @@ -123,6 +164,9 @@ void xe_debugfs_register(struct xe_device *xe)
> > debugfs_create_file("forcewake_all", 0400, root, xe,
> > &forcewake_all_fops);
> >
> > + debugfs_create_file("busted_mode", 0400, root, xe,
> > + &busted_mode_fops);
> > +
> > for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
> > man = ttm_manager_type(bdev, mem_type);
> >
> > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > index e28e3628744f..49dc2a061463 100644
> > --- a/drivers/gpu/drm/xe/xe_device.c
> > +++ b/drivers/gpu/drm/xe/xe_device.c
> > @@ -434,6 +434,9 @@ int xe_device_probe_early(struct xe_device *xe)
> > if (err)
> > return err;
> >
> > + mutex_init(&xe->busted.lock);
> > + xe->busted.mode = xe_modparam.busted_mode;
> > +
> > return 0;
> > }
> >
> > @@ -791,10 +794,15 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
> > */
> > void xe_device_declare_busted(struct xe_device *xe)
> > {
> > - if (xe_modparam.busted_mode == 0)
> > + /*
> > + * It is okay to check unlocked.
> > + * busted disabled should be done at boot with parameter or with debugsfs
> > + * way before starting the workload.
> > + */
> > + if (xe->busted.mode == 0)
> > return;
> >
> > - if (!atomic_xchg(&xe->busted, 1))
> > + if (!atomic_xchg(&xe->busted.flag, 1))
> > drm_err(&xe->drm,
> > "CRITICAL: Xe has declared device %s as busted.\n"
> > "IOCTLs and executions are blocked until device is probed again with unbind and bind operations:\n"
> > diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> > index e6edf2d3ee4a..a8074975538e 100644
> > --- a/drivers/gpu/drm/xe/xe_device.h
> > +++ b/drivers/gpu/drm/xe/xe_device.h
> > @@ -178,7 +178,7 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address);
> >
> > static inline bool xe_device_busted(struct xe_device *xe)
> > {
> > - return atomic_read(&xe->busted);
> > + return atomic_read(&xe->busted.flag);
> > }
> >
> > void xe_device_declare_busted(struct xe_device *xe);
> > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
> > index 2633fdfc1a38..f68beb660f90 100644
> > --- a/drivers/gpu/drm/xe/xe_device_types.h
> > +++ b/drivers/gpu/drm/xe/xe_device_types.h
> > @@ -455,8 +455,15 @@ struct xe_device {
> > /** @needs_flr_on_fini: requests function-reset on fini */
> > bool needs_flr_on_fini;
> >
> > - /** @busted: Xe device faced a critical error and is now blocked. */
> > - atomic_t busted;
> > + /** @busted: Struct to control Busted States and mode */
> > + struct {
> > + /** @busted.flag: Xe device faced a critical error and is now blocked. */
> > + atomic_t flag;
> > + /** @busted.mode: Mode controlled by kernel parameter and debugfs */
> > + int mode;
> > + /** @busted.lock: To protect @busted.mode */
> > + struct mutex lock;
> > + } busted;
> >
> > /* private: */
> >
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c
> > index ecf45289b187..43f0a88bbe8a 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ads.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ads.c
> > @@ -18,7 +18,6 @@
> > #include "xe_lrc.h"
> > #include "xe_map.h"
> > #include "xe_mmio.h"
> > -#include "xe_module.h"
> > #include "xe_platform_types.h"
> >
> > /* Slack of a few additional entries per engine */
> > @@ -313,6 +312,7 @@ int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads)
> >
> > static void guc_policies_init(struct xe_guc_ads *ads)
> > {
> > + struct xe_device *xe = ads_to_xe(ads);
> > u32 global_flags = 0;
> >
> > ads_blob_write(ads, policies.dpc_promote_time,
> > @@ -320,8 +320,10 @@ static void guc_policies_init(struct xe_guc_ads *ads)
> > ads_blob_write(ads, policies.max_num_work_items,
> > GLOBAL_POLICY_MAX_NUM_WI);
> >
> > - if (xe_modparam.busted_mode == 2)
> > + mutex_lock(&xe->busted.lock);
> > + if (xe->busted.mode == 2)
>
> I'm confused about this added lock. Why is it ok to check it unlocked
> in xe_device_declare_busted() and not here... we don't even have debugfs
> here yet.
right... we could simply remove the locks from here as well.
or perhaps we could then make the declare_busted to always
grab the lock.
but then we need to split that in 2 or change that to be something
like
void xe_device_possibly_busted(struct xe_device *xe, bool in_timeout_path)
{
mutex_lock();
if (mode == 0)
goto out;
if (in_timeout_path && mode != 2)
goto out;
if (!atomic_xchg(&xe->busted.flag, 1))
drm_err();
out:
mutex_unlock();
}
thoughts? preferences?
>
> Lucas De Marchi
>
> > global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
> > + mutex_unlock(&xe->busted.lock);
> >
> > ads_blob_write(ads, policies.global_flags, 0);
> > ads_blob_write(ads, policies.is_valid, 1);
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 3f3160373631..cdda533ab317 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -34,7 +34,6 @@
> > #include "xe_macros.h"
> > #include "xe_map.h"
> > #include "xe_mocs.h"
> > -#include "xe_module.h"
> > #include "xe_ring_ops_types.h"
> > #include "xe_sched_job.h"
> > #include "xe_trace.h"
> > @@ -951,8 +950,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > simple_error_capture(q);
> > xe_devcoredump(job);
> >
> > - if (xe_modparam.busted_mode == 2)
> > + mutex_lock(&xe->busted.lock);
> > + if (xe->busted.mode == 2)
> > xe_device_declare_busted(xe);
> > + mutex_unlock(&xe->busted.lock);
> >
> > trace_xe_sched_job_timedout(job);
> >
> > --
> > 2.44.0
> >
next prev parent reply other threads:[~2024-03-18 19:46 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-15 14:01 [PATCH 1/6] drm/xe: Introduce a simple busted state Rodrigo Vivi
2024-03-15 14:01 ` [PATCH 2/6] drm/xe: declare busted upon GuC load failure Rodrigo Vivi
2024-03-15 14:01 ` [PATCH 3/6] drm/xe: Add extra busted protection for the no GuC reset Rodrigo Vivi
2024-03-15 14:01 ` [PATCH 4/6] drm/xe: Force busted state and block GT reset upon any GPU hang Rodrigo Vivi
2024-03-18 21:08 ` Dafna Hirschfeld
2024-03-18 21:18 ` Rodrigo Vivi
2024-03-15 14:01 ` [PATCH 5/6] drm/xe: Introduce the busted_mode debugfs Rodrigo Vivi
2024-03-18 19:16 ` Lucas De Marchi
2024-03-18 19:45 ` Rodrigo Vivi [this message]
2024-03-15 14:01 ` [PATCH 6/6] " Rodrigo Vivi
2024-03-18 19:31 ` Lucas De Marchi
2024-03-18 20:14 ` Rodrigo Vivi
2024-03-21 13:16 ` Lucas De Marchi
2024-03-18 21:12 ` Dafna Hirschfeld
2024-03-18 21:25 ` Rodrigo Vivi
2024-03-15 14:06 ` ✓ CI.Patch_applied: success for series starting with [1/6] drm/xe: Introduce a simple busted state Patchwork
2024-03-15 14:07 ` ✓ CI.checkpatch: " Patchwork
2024-03-15 14:07 ` ✓ CI.KUnit: " Patchwork
2024-03-15 14:18 ` ✓ CI.Build: " Patchwork
2024-03-15 14:20 ` ✓ CI.Hooks: " Patchwork
2024-03-15 14:22 ` ✓ CI.checksparse: " Patchwork
2024-03-15 14:47 ` ✓ CI.BAT: " Patchwork
2024-03-18 19:04 ` [PATCH 1/6] " Lucas De Marchi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=ZfiZ9Y9M_tF-GBNv@intel.com \
--to=rodrigo.vivi@intel.com \
--cc=intel-xe@lists.freedesktop.org \
--cc=lucas.demarchi@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.