All of lore.kernel.org
 help / color / mirror / Atom feed
From: Raag Jadav <raag.jadav@intel.com>
To: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>,
	lucas.demarchi@intel.com, thomas.hellstrom@linux.intel.com,
	intel-xe@lists.freedesktop.org, dri-devel@lists.freedesktop.org,
	himal.prasad.ghimiray@intel.com, francois.dugast@intel.com,
	anshuman.gupta@intel.com
Subject: Re: [PATCH v2] drm/xe/uapi: Bring back reset uevent
Date: Wed, 14 Aug 2024 12:12:51 +0300	[thread overview]
Message-ID: <Zrx1E-Xd1bAl5kBw@black.fi.intel.com> (raw)
In-Reply-To: <deb48edf-0a35-49bf-a4b3-b7d60f127b44@linux.intel.com>

On Wed, Aug 14, 2024 at 12:16:41PM +0530, Aravind Iddamsetty wrote:
>
>On 13/08/24 22:24, Rodrigo Vivi wrote:
>> On Tue, Aug 13, 2024 at 04:28:32PM +0300, Raag Jadav wrote:
>>> On Mon, Aug 12, 2024 at 03:08:14PM +0530, Aravind Iddamsetty wrote:
>>>> On 12/08/24 13:18, Raag Jadav wrote:
>>>>> From: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
>>>>>
>>>>> This was dropped in commit 77a0d4d1cea2 ("drm/xe/uapi: Remove reset
>>>>> uevent for now") as part of refactoring.
>>>>>
>>>>> Now that we have better uapi semantics and naming for the uevent,
>>>>> bring it back. With this in place, userspace will be notified of
>>>>> wedged device along with its reason.
>>>>>
>>>>> $ udevadm monitor --property --kernel
>>>>> monitor will print the received events for:
>>>>> KERNEL - the kernel uevent
>>>>>
>>>>> KERNEL[871.188570] change   /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 (pci)
>>>>> ACTION=change
>>>>> DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0
>>>>> SUBSYSTEM=pci
>>>>> DEVICE_STATUS=NEEDS_RESET
>>>>> REASON=GT_RESET_FAILED
>>>>> TILE_ID=0
>>>>> GT_ID=0
>>>>> DRIVER=xe
>>>>> PCI_CLASS=30000
>>>>> PCI_ID=8086:56B1
>>>>> PCI_SUBSYS_ID=8086:1210
>>>>> PCI_SLOT_NAME=0000:03:00.0
>>>>> MODALIAS=pci:v00008086d000056B1sv00008086sd00001210bc03sc00i00
>>>>> SEQNUM=6104
>>>>>
>>>>> v2: Change authorship to Himal (Aravind)
>>>>>     Add uevent for all device wedged cases (Aravind)
>>>>>
>>>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
>>>>> Signed-off-by: Raag Jadav <raag.jadav@intel.com>
>>>>> ---
>>>>>  drivers/gpu/drm/xe/xe_device.c     | 10 +++++++++-
>>>>>  drivers/gpu/drm/xe/xe_device.h     |  2 +-
>>>>>  drivers/gpu/drm/xe/xe_gt.c         | 23 +++++++++++++++++++----
>>>>>  drivers/gpu/drm/xe/xe_guc.c        | 13 ++++++++++++-
>>>>>  drivers/gpu/drm/xe/xe_guc_submit.c | 13 ++++++++++++-
>>>>>  include/uapi/drm/xe_drm.h          | 29 +++++++++++++++++++++++++++++
>>>>>  6 files changed, 82 insertions(+), 8 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
>>>>> index 1aba6f9eaa19..d975bdce4a7d 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_device.c
>>>>> +++ b/drivers/gpu/drm/xe/xe_device.c
>>>>> @@ -955,6 +955,7 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg)
>>>>>  /**
>>>>>   * xe_device_declare_wedged - Declare device wedged
>>>>>   * @xe: xe device instance
>>>>> + * @event_params: parameters to be sent along with uevent
>>>>>   *
>>>>>   * This is a final state that can only be cleared with a mudule
>>>>>   * re-probe (unbind + bind).
>>>>> @@ -965,8 +966,10 @@ static void xe_device_wedged_fini(struct drm_device *drm, void *arg)
>>>>>   * on every single execution timeout (a.k.a. GPU hang) right after devcoredump
>>>>>   * snapshot capture. In this mode, GT reset won't be attempted so the state of
>>>>>   * the issue is preserved for further debugging.
>>>>> + * Caller is expected to pass respective parameters to be sent along with
>>>>> + * uevent. Pass NULL in case of no params.
>>>>>   */
>>>>> -void xe_device_declare_wedged(struct xe_device *xe)
>>>>> +void xe_device_declare_wedged(struct xe_device *xe, char **event_params)
>>>>>  {
>>>>>  	struct xe_gt *gt;
>>>>>  	u8 id;
>>>>> @@ -984,12 +987,17 @@ void xe_device_declare_wedged(struct xe_device *xe)
>>>>>  	xe_pm_runtime_get_noresume(xe);
>>>>>  
>>>>>  	if (!atomic_xchg(&xe->wedged.flag, 1)) {
>>>>> +		struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
>>>>> +
>>>>>  		xe->needs_flr_on_fini = true;
>>>>>  		drm_err(&xe->drm,
>>>>>  			"CRITICAL: Xe has declared device %s as wedged.\n"
>>>>>  			"IOCTLs and executions are blocked. Only a rebind may clear the failure\n"
>>>>>  			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
>>>>>  			dev_name(xe->drm.dev));
>>>>> +
>>>>> +		/* Notify userspace about reset required */
>>>>> +		kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, event_params);
>>>>>  	}
>>>>>  
>>>>>  	for_each_gt(gt, xe, id)
>>>>> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
>>>>> index db6cc8d0d6b8..5d40fc6f0904 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_device.h
>>>>> +++ b/drivers/gpu/drm/xe/xe_device.h
>>>>> @@ -174,7 +174,7 @@ static inline bool xe_device_wedged(struct xe_device *xe)
>>>>>  	return atomic_read(&xe->wedged.flag);
>>>>>  }
>>>>>  
>>>>> -void xe_device_declare_wedged(struct xe_device *xe);
>>>>> +void xe_device_declare_wedged(struct xe_device *xe, char **reset_event);
>>>>>  
>>>>>  struct xe_file *xe_file_get(struct xe_file *xef);
>>>>>  void xe_file_put(struct xe_file *xef);
>>>>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>>>>> index 58895ed22f6e..519f3c2cf9e2 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_gt.c
>>>>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>>>>> @@ -741,6 +741,24 @@ static int do_gt_restart(struct xe_gt *gt)
>>>>>  	return 0;
>>>>>  }
>>>>>  
>>>>> +static void xe_gt_reset_failed(struct xe_gt *gt, int err)
>>>>> +{
>>>>> +	char *event_params[5];
>>>>> +
>>>>> +	xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
>>>>> +
>>>>> +	event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT;
>>>>> +	event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT;
>>>>> +	event_params[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", gt_to_tile(gt)->id);
>>>>> +	event_params[3] = kasprintf(GFP_KERNEL, "GT_ID=%d", gt->info.id);
>>>> the TILE_ID and GT_ID can be passed for other events as well, with that you can
>>>> have a common function to send uevent which would take reason as an input.
>>> But is that required for all cases? There could be potential cases atleast
>>> in the future where it is not needed.
> 
> 
> At least in these cases it makes sense as they (other reasons)
> can be associated to a GT and a Tile. If in future they arises a
> reason where these details are not needed i guess we can handle that.

But then we'll have to modify it with every new addition, which doesn't
look like a win. With current implementation the callers atleast have
the autonomy to send params as-needed.

Raag

  reply	other threads:[~2024-08-14  9:13 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-08-12  7:48 [PATCH v2] drm/xe/uapi: Bring back reset uevent Raag Jadav
2024-08-12  7:47 ` ✓ CI.Patch_applied: success for drm/xe/uapi: Bring back reset uevent (rev2) Patchwork
2024-08-12  7:47 ` ✗ CI.checkpatch: warning " Patchwork
2024-08-12  7:48 ` ✓ CI.KUnit: success " Patchwork
2024-08-12  8:00 ` ✓ CI.Build: " Patchwork
2024-08-12  8:02 ` ✓ CI.Hooks: " Patchwork
2024-08-12  8:04 ` ✓ CI.checksparse: " Patchwork
2024-08-12  8:06 ` [PATCH v2] drm/xe/uapi: Bring back reset uevent Ghimiray, Himal Prasad
2024-08-12  8:27 ` ✗ CI.BAT: failure for drm/xe/uapi: Bring back reset uevent (rev2) Patchwork
2024-08-12  9:38 ` [PATCH v2] drm/xe/uapi: Bring back reset uevent Aravind Iddamsetty
2024-08-13 13:28   ` Raag Jadav
2024-08-13 16:54     ` Rodrigo Vivi
2024-08-14  6:46       ` Aravind Iddamsetty
2024-08-14  9:12         ` Raag Jadav [this message]
2024-08-14 12:49           ` Aravind Iddamsetty
2024-08-14 14:24         ` Lucas De Marchi
2024-08-16  4:29           ` Aravind Iddamsetty
2024-08-16  5:41             ` Lucas De Marchi
2024-08-12  9:55 ` ✗ CI.FULL: failure for drm/xe/uapi: Bring back reset uevent (rev2) Patchwork
2024-08-13 14:13 ` [PATCH v2] drm/xe/uapi: Bring back reset uevent Lucas De Marchi
2024-08-14  6:31   ` Aravind Iddamsetty

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Zrx1E-Xd1bAl5kBw@black.fi.intel.com \
    --to=raag.jadav@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=aravind.iddamsetty@linux.intel.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=francois.dugast@intel.com \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=lucas.demarchi@intel.com \
    --cc=rodrigo.vivi@intel.com \
    --cc=thomas.hellstrom@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.