All of lore.kernel.org
 help / color / mirror / Atom feed
From: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
To: "Ghimiray, Himal Prasad" <himal.prasad.ghimiray@intel.com>,
	"intel-xe@lists.freedesktop.org" <intel-xe@lists.freedesktop.org>
Subject: Re: [Intel-xe] [PATCH 06/11] drm/xe: Notify userspace about GSC HW errors.
Date: Thu, 12 Oct 2023 08:42:55 +0530	[thread overview]
Message-ID: <9a320ea2-94c0-dd15-d58e-d18dc8a9e5a2@linux.intel.com> (raw)
In-Reply-To: <MW4PR11MB70569F78CD647F840CD2DF0BB3CCA@MW4PR11MB7056.namprd11.prod.outlook.com>


On 11/10/23 12:55, Ghimiray, Himal Prasad wrote:
>
>> -----Original Message-----
>> From: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
>> Sent: 11 October 2023 12:53
>> To: Ghimiray, Himal Prasad <himal.prasad.ghimiray@intel.com>; intel-
>> xe@lists.freedesktop.org
>> Subject: Re: [Intel-xe] [PATCH 06/11] drm/xe: Notify userspace about GSC
>> HW errors.
>>
>>
>> On 27/09/23 17:16, Himal Prasad Ghimiray wrote:
>>> Send uevent in case of nonfatal errors reported by GSC.
>>>
>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
>>> ---
>>>  drivers/gpu/drm/xe/xe_device_types.h |  3 +++
>>>  drivers/gpu/drm/xe/xe_hw_error.c     | 20 ++++++++++++++++++++
>>>  drivers/gpu/drm/xe/xe_hw_error.h     |  3 ++-
>>>  drivers/gpu/drm/xe/xe_irq.c          |  4 ++++
>>>  include/uapi/drm/xe_drm.h            |  9 +++++++++
>>>  5 files changed, 38 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h
>>> b/drivers/gpu/drm/xe/xe_device_types.h
>>> index 6aa4f4801d81..ff476a167be4 100644
>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>> @@ -179,6 +179,9 @@ struct xe_tile {
>>>  	struct tile_hw_errors {
>>>  		unsigned long count[XE_TILE_HW_ERROR_MAX];
>>>  	} errors;
>>> +
>>> +	/** @gsc_hw_err_work: worker for uevent to report GSC HW errors
>> */
>>> +	struct work_struct gsc_hw_err_work;
>>>  };
>>>
>>>  /**
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c
>>> b/drivers/gpu/drm/xe/xe_hw_error.c
>>> index eb76b8e6a338..76ae12df013c 100644
>>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>>> @@ -3,6 +3,8 @@
>>>   * Copyright © 2023 Intel Corporation
>>>   */
>>>
>>> +#include <drm/xe_drm.h>
>>> +
>>>  #include "xe_hw_error.h"
>>>
>>>  #include "regs/xe_regs.h"
>>> @@ -366,6 +368,22 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const
>> enum hardware_error hw_err)
>>>  		xe_gt_hw_error_status_reg_handler(gt, hw_err);  }
>>>
>>> +void xe_gsc_hw_error_work(struct work_struct *work) {
>>> +	struct xe_tile *tile = container_of(work, typeof(*tile),
>> gsc_hw_err_work);
>>> +	char *csc_hw_error_event[4];
>>> +
>>> +	csc_hw_error_event[0] = XE_GSC_HW_HEALTH_UEVENT "=1";
>>> +	csc_hw_error_event[1] = "RESET_REQUIRED=1";
>>> +	csc_hw_error_event[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", tile-
>>> id);
>>> +	csc_hw_error_event[3] = NULL;
>>> +
>>> +	kobject_uevent_env(&tile->xe->drm.primary->kdev->kobj,
>> KOBJ_CHANGE,
>>> +			   csc_hw_error_event);
>>> +
>>> +	kfree(csc_hw_error_event[2]);
>>> +}
>>> +
>>>  static void
>>>  xe_gsc_hw_error_handler(struct xe_tile *tile, const enum
>>> hardware_error hw_err)  { @@ -423,6 +441,8 @@
>>> xe_gsc_hw_error_handler(struct xe_tile *tile, const enum hardware_error
>> hw_err)
>>>  			drm_err_ratelimited(&tile_to_xe(tile)->drm,
>>>  					    HW_ERR "GSC detected %s %s
>> error, bit[%d] is set\n",
>>>  					    errmsg, hw_err_str, errbit);
>>> +
>>> +			schedule_work(&tile->gsc_hw_err_work);
>>>  		}
>>>  		tile->errors.count[indx]++;
>>>  	}
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h
>>> b/drivers/gpu/drm/xe/xe_hw_error.h
>>> index 155722a0af4c..ee7705b3343b 100644
>>> --- a/drivers/gpu/drm/xe/xe_hw_error.h
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>>> @@ -7,6 +7,7 @@
>>>
>>>  #include <linux/stddef.h>
>>>  #include <linux/types.h>
>>> +#include <linux/workqueue.h>
>>>
>>>  /* Error categories reported by hardware */  enum hardware_error { @@
>>> -121,5 +122,5 @@ struct xe_tile;
>>>
>>>  void xe_hw_error_irq_handler(struct xe_tile *tile, const u32
>>> master_ctl);  void xe_process_hw_errors(struct xe_device *xe);
>>> -
>>> +void xe_gsc_hw_error_work(struct work_struct *work);
>>>  #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>> index 06c9b43e2c71..285c657cc789 100644
>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>> @@ -586,6 +586,10 @@ int xe_irq_install(struct xe_device *xe)
>>>  	irq_handler_t irq_handler;
>>>  	int err, irq;
>>>
>>> +	struct xe_tile *tile = xe_device_get_root_tile(xe);
>>> +
>>> +	INIT_WORK(&tile->gsc_hw_err_work, xe_gsc_hw_error_work);
>>> +
>>>  	irq_handler = xe_irq_handler(xe);
>>>  	if (!irq_handler) {
>>>  		drm_err(&xe->drm, "No supported interrupt handler"); diff --
>> git
>>> a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index
>>> d48d8e3c898c..c45833defcc7 100644
>>> --- a/include/uapi/drm/xe_drm.h
>>> +++ b/include/uapi/drm/xe_drm.h
>>> @@ -16,6 +16,15 @@ extern "C" {
>>>   * subject to backwards-compatibility constraints.
>>>   */
>>>
>>> +/**
>>> + * DOC: uevent generated by xe on its tile node.
>>> + *
>>> + * XE_GSC_HW_HEALTH_UEVENT - Event is generated when GSC reports
>> HW
>>> + * errors. The value supplied with the event is always
>> "RESET_REQUIRED=1".
>>> + * Additional information supplied is tile id on which error is reported.
>> What is the relevance of the tile id if it is always reported on tile 0 only?
> Hmm. Ya right. Any other information we would like to send ?
> Instead of DEVICE_STATUS is it ok to send GSC_HW_STATUS ?
I think RESET_REQUIRED is sufficient, but maybe you have to add more details
to the UAPI DOC on why RESET is needed.

Thanks,

Aravind.
>> Thanks,
>>
>> Aravind
>>> + */
>>> +#define XE_GSC_HW_HEALTH_UEVENT "DEVICE_STATUS"
>>> +
>>>  /**
>>>   * DOC: uevent generated by xe on it's pci node.
>>>   *

  reply	other threads:[~2023-10-12  3:11 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-27 11:46 [Intel-xe] [PATCH 00/11] Supporting CSC and SOC HARDWARE ERROR HANDLING on PVC Himal Prasad Ghimiray
2023-09-27 11:43 ` [Intel-xe] ✓ CI.Patch_applied: success for " Patchwork
2023-09-27 11:43 ` [Intel-xe] ✗ CI.checkpatch: warning " Patchwork
2023-09-27 11:44 ` [Intel-xe] ✓ CI.KUnit: success " Patchwork
2023-09-27 11:46 ` [Intel-xe] [PATCH 01/11] drm/xe: Handle errors from various components Himal Prasad Ghimiray
2023-09-27 11:46 ` [Intel-xe] [PATCH 02/11] drm/xe: Log and count the GT hardware errors Himal Prasad Ghimiray
2023-09-27 11:46 ` [Intel-xe] [PATCH 03/11] drm/xe: Support GT hardware error reporting for PVC Himal Prasad Ghimiray
2023-09-27 11:46 ` [Intel-xe] [PATCH 04/11] drm/xe: Process fatal hardware errors Himal Prasad Ghimiray
2023-09-27 11:46 ` [Intel-xe] [PATCH 05/11] drm/xe: Support GSC hardware error reporting for PVC Himal Prasad Ghimiray
2023-10-11  7:18   ` Aravind Iddamsetty
2023-09-27 11:46 ` [Intel-xe] [PATCH 06/11] drm/xe: Notify userspace about GSC HW errors Himal Prasad Ghimiray
2023-10-11  7:23   ` Aravind Iddamsetty
2023-10-11  7:25     ` Ghimiray, Himal Prasad
2023-10-12  3:12       ` Aravind Iddamsetty [this message]
2023-09-27 11:46 ` [Intel-xe] [PATCH 07/11] drm/xe: Support SOC FATAL error handling for PVC Himal Prasad Ghimiray
2023-10-04  6:38   ` Aravind Iddamsetty
2023-10-04  6:50     ` Ghimiray, Himal Prasad
2023-10-08  9:32       ` Aravind Iddamsetty
2023-10-09  4:11         ` Ghimiray, Himal Prasad
2023-10-09  9:00           ` Aravind Iddamsetty
2023-10-09  9:15             ` Ghimiray, Himal Prasad
2023-10-10  6:27               ` Aravind Iddamsetty
2023-10-09  9:52   ` Aravind Iddamsetty
2023-10-09 10:14     ` Ghimiray, Himal Prasad
2023-09-27 11:46 ` [Intel-xe] [PATCH 08/11] drm/xe: Support SOC NONFATAL " Himal Prasad Ghimiray
2023-10-11  6:07   ` Aravind Iddamsetty
2023-09-27 11:46 ` [Intel-xe] [PATCH 09/11] drm/xe: Handle MDFI error severity Himal Prasad Ghimiray
2023-10-04 12:11   ` Aravind Iddamsetty
2023-09-27 11:46 ` [Intel-xe] [PATCH 10/11] drm/xe: Clear SOC CORRECTABLE error registers Himal Prasad Ghimiray
2023-10-09  9:58   ` Aravind Iddamsetty
2023-10-11  6:48   ` Aravind Iddamsetty
2023-10-11  6:52     ` Ghimiray, Himal Prasad
2023-10-12  2:59       ` Aravind Iddamsetty
2023-10-12  4:01         ` Ghimiray, Himal Prasad
2023-09-27 11:46 ` [Intel-xe] [PATCH 11/11] drm/xe: Clear all SoC errors post warm reset Himal Prasad Ghimiray
2023-10-11  6:56   ` Aravind Iddamsetty
2023-10-11  6:59     ` Ghimiray, Himal Prasad
2023-10-12  3:05       ` Aravind Iddamsetty
2023-09-27 11:51 ` [Intel-xe] ✓ CI.Build: success for Supporting CSC and SOC HARDWARE ERROR HANDLING on PVC Patchwork
2023-09-27 11:52 ` [Intel-xe] ✗ CI.Hooks: failure " Patchwork
2023-09-27 11:53 ` [Intel-xe] ✓ CI.checksparse: success " Patchwork
2023-09-27 12:28 ` [Intel-xe] ✗ CI.BAT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=9a320ea2-94c0-dd15-d58e-d18dc8a9e5a2@linux.intel.com \
    --to=aravind.iddamsetty@linux.intel.com \
    --cc=himal.prasad.ghimiray@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.