Re: [PATCH v2 03/11] drm/xe/xe_pci_error: Implement PCI error recovery callbacks

public inbox for intel-xe@lists.freedesktop.org
 help / color / mirror / Atom feed

From: "Tauro, Riana" <riana.tauro@intel.com>
To: "Mallesh, Koujalagi" <mallesh.koujalagi@intel.com>,
	<intel-xe@lists.freedesktop.org>
Cc: <anshuman.gupta@intel.com>, <rodrigo.vivi@intel.com>,
	<aravind.iddamsetty@linux.intel.com>, <badal.nilawar@intel.com>,
	<raag.jadav@intel.com>, <ravi.kishore.koppuravuri@intel.com>,
	"Michal Wajdeczko" <michal.wajdeczko@intel.com>,
	Matthew Brost <matthew.brost@intel.com>,
	Matt Roper <matthew.d.roper@intel.com>
Subject: Re: [PATCH v2 03/11] drm/xe/xe_pci_error: Implement PCI error recovery callbacks
Date: Tue, 31 Mar 2026 10:48:52 +0530	[thread overview]
Message-ID: <9a2443d0-653f-462d-a661-dfa99658d569@intel.com> (raw)
In-Reply-To: <8976f5c8-fbc1-4ff3-aaf4-0584185e6cff@intel.com>


On 3/4/2026 4:08 PM, Mallesh, Koujalagi wrote:
>
> On 02-03-2026 03:51 pm, Riana Tauro wrote:
>> Add error_detected, mmio_enabled, slot_reset and resume
>> recovery callbacks to handle PCIe Advanced Error Reporting
>> (AER) errors.
>>
>> For fatal errors, the device is wedged and becomes
>> inaccessible. Return PCI_ERS_RESULT_NEED_RESET from
>> error_detected to request a Secondary Bus Reset (SBR).
>>
>> For non-fatal errors, return PCI_ERS_RESULT_CAN_RECOVER from
>> error_detected to trigger the mmio_enabled callback. In this callback,
>> the device is queried to determine the error cause and attempt
>> recovery based on the error type.
>>
>> Once the Secondary Bus Reset (SBR) has completed, the slot_reset callback
>> cleanly removes and reprobes the device to restore functionality.
>>
>> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> Cc: Matthew Brost <matthew.brost@intel.com>
>> Cc: Matt Roper <matthew.d.roper@intel.com>
>> Signed-off-by: Riana Tauro <riana.tauro@intel.com>
>> ---
>> v2: re-order linux headers
>>      reword error messages
>>      do not clear in_recovery after remove
>>      return PCI_ERS_RESULT_DISCONNECT if probe fails (Michal)
>>      only wedge device do not send uevent (Raag)
>>      set recovery flag in error_detected and clear on resume
>>      add default switch case (Mallesh)
>> ---
>>   drivers/gpu/drm/xe/Makefile          |  1 +
>>   drivers/gpu/drm/xe/xe_device.h       | 15 +++++
>>   drivers/gpu/drm/xe/xe_device_types.h |  3 +
>>   drivers/gpu/drm/xe/xe_pci.c          |  3 +
>>   drivers/gpu/drm/xe/xe_pci_error.c    | 99 ++++++++++++++++++++++++++++
>>   5 files changed, 121 insertions(+)
>>   create mode 100644 drivers/gpu/drm/xe/xe_pci_error.c
>>
>> diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
>> index 1890bbd1b28d..417b030e5ce7 100644
>> --- a/drivers/gpu/drm/xe/Makefile
>> +++ b/drivers/gpu/drm/xe/Makefile
>> @@ -99,6 +99,7 @@ xe-y += xe_bb.o \
>>       xe_page_reclaim.o \
>>       xe_pat.o \
>>       xe_pci.o \
>> +    xe_pci_error.o \
>>       xe_pci_rebar.o \
>>       xe_pcode.o \
>>       xe_pm.o \
>> diff --git a/drivers/gpu/drm/xe/xe_device.h 
>> b/drivers/gpu/drm/xe/xe_device.h
>> index 39464650533b..972f43d20f1a 100644
>> --- a/drivers/gpu/drm/xe/xe_device.h
>> +++ b/drivers/gpu/drm/xe/xe_device.h
>> @@ -43,6 +43,21 @@ static inline struct xe_device 
>> *ttm_to_xe_device(struct ttm_device *ttm)
>>       return container_of(ttm, struct xe_device, ttm);
>>   }
>>   +static inline bool xe_device_is_in_recovery(struct xe_device *xe)
>> +{
>> +    return atomic_read(&xe->in_recovery);
>> +}
>> +
>> +static inline void xe_device_set_in_recovery(struct xe_device *xe)
>> +{
>> +    atomic_set(&xe->in_recovery, 1);
>> +}
>> +
>> +static inline void xe_device_clear_in_recovery(struct xe_device *xe)
>> +{
>> +     atomic_set(&xe->in_recovery, 0);
>> +}
>> +
>>   struct xe_device *xe_device_create(struct pci_dev *pdev,
>>                      const struct pci_device_id *ent);
>>   int xe_device_probe_early(struct xe_device *xe);
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
>> b/drivers/gpu/drm/xe/xe_device_types.h
>> index 5599534384fa..616d74792902 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -504,6 +504,9 @@ struct xe_device {
>>           bool inconsistent_reset;
>>       } wedged;
>>   +    /** @in_recovery: Indicates if device is in recovery */
>> +    atomic_t in_recovery;
>> +
>>       /** @bo_device: Struct to control async free of BOs */
>>       struct xe_bo_dev {
>>           /** @bo_device.async_free: Free worker */
>> diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
>> index ad1e5ef2ee89..825489287f28 100644
>> --- a/drivers/gpu/drm/xe/xe_pci.c
>> +++ b/drivers/gpu/drm/xe/xe_pci.c
>> @@ -1313,6 +1313,8 @@ static const struct dev_pm_ops xe_pm_ops = {
>>   };
>>   #endif
>>   +extern const struct pci_error_handlers xe_pci_error_handlers;
>> +
>>   static struct pci_driver xe_pci_driver = {
>>       .name = DRIVER_NAME,
>>       .id_table = pciidlist,
>> @@ -1320,6 +1322,7 @@ static struct pci_driver xe_pci_driver = {
>>       .remove = xe_pci_remove,
>>       .shutdown = xe_pci_shutdown,
>>       .sriov_configure = xe_pci_sriov_configure,
>> +    .err_handler = &xe_pci_error_handlers,
>>   #ifdef CONFIG_PM_SLEEP
>>       .driver.pm = &xe_pm_ops,
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_pci_error.c 
>> b/drivers/gpu/drm/xe/xe_pci_error.c
>> new file mode 100644
>> index 000000000000..d4896a4a5014
>> --- /dev/null
>> +++ b/drivers/gpu/drm/xe/xe_pci_error.c
>> @@ -0,0 +1,99 @@
>> +// SPDX-License-Identifier: MIT
>> +/*
>> + * Copyright © 2026 Intel Corporation
>> + */
>> +#include <linux/pci.h>
>> +
>> +#include <drm/drm_drv.h>
>> +
>> +#include "xe_device.h"
>> +#include "xe_gt.h"
>> +#include "xe_pci.h"
>> +#include "xe_uc.h"
>> +
>> +static void xe_pci_error_handling(struct pci_dev *pdev)
>> +{
>> +    struct xe_device *xe = pdev_to_xe_device(pdev);
>> +    struct xe_gt *gt;
>> +    u8 id;
>> +
> What is the behavior when the wedged mode is XE_WEDGED_MODE_NEVER and a
> PCI error occurs?

Even in that mode, we need to stop all ioctls and job submissions before
attempting a slot reset. Here we are reusing the wedged flag to do that.
We are not asking userspace to recover, so this should be fine.
My intention was to reuse the wedged flag instead of using in_recovery
everywhere.

>> +    /* Wedge the device to prevent userspace access but don't send 
>> the event yet */
>> +    atomic_set(&xe->wedged.flag, 1);
>> +
>> +    for_each_gt(gt, xe, id)
>> +        xe_gt_declare_wedged(gt);
>> +
>> +    pci_disable_device(pdev);
>> +}
>> +
>> +static pci_ers_result_t xe_pci_error_detected(struct pci_dev *pdev, 
>> pci_channel_state_t state)
>> +{
>> +    struct xe_device *xe = pdev_to_xe_device(pdev);
>> +
>> +    dev_err(&pdev->dev, "Xe Pci error recovery: error detected state 
>> %d\n", state);
>> +
>> +    xe_device_set_in_recovery(xe);
>> +
>> +    switch (state) {
>> +    case pci_channel_io_normal:
>> +        return PCI_ERS_RESULT_CAN_RECOVER;
>> +    case pci_channel_io_frozen:
>> +        xe_pci_error_handling(pdev);
>> +        return PCI_ERS_RESULT_NEED_RESET;
>> +    case pci_channel_io_perm_failure:
>
> Clear the in_recovery flag before disconnecting; a dead/unusable device
> should not appear to be recovering.
>
> If the in_recovery flag is still set, it suggests that recovery is
> actively happening.

Agreed. In the last revision this was handled correctly: in_recovery was
set only before attempting a slot reset.
I will either move it back or return early for the disconnect case.

Reverting to the previous revision makes more sense here, since we are
using in_recovery during job timeouts and GT resets when the device is
wedged.

Thanks
Riana

>
> Thanks
>
> -/Mallesh
>
>> +        return PCI_ERS_RESULT_DISCONNECT;
>> +    default:
>> +        dev_err(&pdev->dev, "Unknown state %d\n", state);
>> +        return PCI_ERS_RESULT_NEED_RESET;
>> +    }
>> +}
>> +
>> +static pci_ers_result_t xe_pci_error_mmio_enabled(struct pci_dev *pdev)
>> +{
>> +    dev_err(&pdev->dev, "Xe Pci error recovery: MMIO enabled\n");
>> +
>> +    return PCI_ERS_RESULT_NEED_RESET;
>> +}
>> +
>> +static pci_ers_result_t xe_pci_error_slot_reset(struct pci_dev *pdev)
>> +{
>> +    const struct pci_device_id *ent = 
>> pci_match_id(pdev->driver->id_table, pdev);
>> +    struct xe_device *xe = pdev_to_xe_device(pdev);
>> +
>> +    dev_err(&pdev->dev, "Xe Pci error recovery: Slot reset\n");
>> +
>> +    pci_restore_state(pdev);
>> +
>> +    if (pci_enable_device(pdev)) {
>> +        dev_err(&pdev->dev,
>> +            "Cannot re-enable PCI device after reset\n");
>> +        return PCI_ERS_RESULT_DISCONNECT;
>> +    }
>> +
>> +    /*
>> +     * Secondary Bus Reset wipes out all device memory
>> +     * requiring XE KMD to perform a device removal and reprobe.
>> +     */
>> +    pdev->driver->remove(pdev);
>> +
>> +    if (!pdev->driver->probe(pdev, ent))
>> +        return PCI_ERS_RESULT_RECOVERED;
>> +
>> +    return PCI_ERS_RESULT_DISCONNECT;
>> +}
>> +
>> +static void xe_pci_error_resume(struct pci_dev *pdev)
>> +{
>> +    struct xe_device *xe = pdev_to_xe_device(pdev);
>> +
>> +    dev_info(&pdev->dev, "Xe Pci error recovery: Recovered\n");
>> +
>> +    xe_device_clear_in_recovery(xe);
>> +}
>> +
>> +const struct pci_error_handlers xe_pci_error_handlers = {
>> +    .error_detected    = xe_pci_error_detected,
>> +    .mmio_enabled    = xe_pci_error_mmio_enabled,
>> +    .slot_reset    = xe_pci_error_slot_reset,
>> +    .resume        = xe_pci_error_resume,
>> +};

next prev parent reply	other threads:[~2026-03-31  5:19 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-02 10:21 [PATCH v2 00/11] Introduce Xe Uncorrectable Error Handling Riana Tauro
2026-03-02 10:21 ` [PATCH v2 01/11] drm/xe/xe_sysctrl: Add System controller patch Riana Tauro
2026-03-02 10:21 ` [PATCH v2 02/11] drm/xe/xe_survivability: Decouple survivability info from boot survivability Riana Tauro
2026-03-02 17:00   ` Raag Jadav
2026-03-03  8:18     ` Mallesh, Koujalagi
2026-03-30 12:56       ` Tauro, Riana
2026-03-30 13:00     ` Tauro, Riana
2026-03-02 10:21 ` [PATCH v2 03/11] drm/xe/xe_pci_error: Implement PCI error recovery callbacks Riana Tauro
2026-03-02 17:37   ` Raag Jadav
2026-03-03  5:09     ` Riana Tauro
2026-03-04 10:38   ` Mallesh, Koujalagi
2026-03-31  5:18     ` Tauro, Riana [this message]
2026-03-02 10:21 ` [PATCH v2 04/11] drm/xe/xe_pci_error: Group all devres to release them on PCIe slot reset Riana Tauro
2026-03-02 10:22 ` [PATCH v2 05/11] drm/xe: Skip device access during PCI error recovery Riana Tauro
2026-03-04 10:59   ` Mallesh, Koujalagi
2026-03-02 10:22 ` [PATCH v2 06/11] drm/xe/xe_ras: Initialize Uncorrectable AER Registers Riana Tauro
2026-03-02 10:22 ` [PATCH v2 07/11] drm/xe/xe_ras: Add structures and commands for Uncorrectable Core Compute Errors Riana Tauro
2026-03-04 16:32   ` Raag Jadav
2026-03-31 16:14     ` Tauro, Riana
2026-04-01  6:25       ` Raag Jadav
2026-04-01  6:39         ` Tauro, Riana
2026-03-02 10:22 ` [PATCH v2 08/11] drm/xe/xe_ras: Add support for Uncorrectable Core-Compute errors Riana Tauro
2026-03-04 16:52   ` Raag Jadav
2026-03-06 18:37     ` Raag Jadav
2026-03-31 16:24     ` Tauro, Riana
2026-04-01  6:34       ` Raag Jadav
2026-04-01  6:47         ` Tauro, Riana
2026-03-06  3:50   ` [v2,08/11] " Purkait, Soham
2026-03-31 16:16     ` Tauro, Riana
2026-03-02 10:22 ` [PATCH v2 09/11] drm/xe/xe_ras: Add structures for SoC Internal errors Riana Tauro
2026-03-10 13:02   ` Mallesh, Koujalagi
2026-03-11 14:51     ` Riana Tauro
2026-03-02 10:22 ` [PATCH v2 10/11] drm/xe/xe_ras: Handle Uncorrectable " Riana Tauro
2026-03-10 13:29   ` Mallesh, Koujalagi
2026-03-11 14:55     ` Riana Tauro
2026-03-02 10:22 ` [PATCH v2 11/11] drm/xe/xe_pci_error: Process errors in mmio_enabled Riana Tauro
2026-03-11  7:10   ` Mallesh, Koujalagi
2026-03-11 14:39     ` Riana Tauro
2026-03-12  8:08       ` Mallesh, Koujalagi
2026-03-02 16:10 ` ✗ CI.checkpatch: warning for Introduce Xe Uncorrectable Error Handling (rev2) Patchwork
2026-03-02 16:11 ` ✓ CI.KUnit: success " Patchwork
2026-03-02 16:48 ` ✓ Xe.CI.BAT: " Patchwork
2026-03-02 18:29 ` ✗ Xe.CI.FULL: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=9a2443d0-653f-462d-a661-dfa99658d569@intel.com \
    --to=riana.tauro@intel.com \
    --cc=anshuman.gupta@intel.com \
    --cc=aravind.iddamsetty@linux.intel.com \
    --cc=badal.nilawar@intel.com \
    --cc=intel-xe@lists.freedesktop.org \
    --cc=mallesh.koujalagi@intel.com \
    --cc=matthew.brost@intel.com \
    --cc=matthew.d.roper@intel.com \
    --cc=michal.wajdeczko@intel.com \
    --cc=raag.jadav@intel.com \
    --cc=ravi.kishore.koppuravuri@intel.com \
    --cc=rodrigo.vivi@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox