Re: [PATCH v7 2/5] PCI/DPC: Run recovery on device that detected the error

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Jonathan Cameron <jonathan.cameron@huawei.com>
To: Shuai Xue <xueshuai@linux.alibaba.com>
Cc: <linux-pci@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<linuxppc-dev@lists.ozlabs.org>, <bhelgaas@google.com>,
	<kbusch@kernel.org>, <sathyanarayanan.kuppuswamy@linux.intel.com>,
	<mahesh@linux.ibm.com>, <oohall@gmail.com>,
	<terry.bowman@amd.com>, <tianruidong@linux.alibaba.com>,
	<lukas@wunner.de>
Subject: Re: [PATCH v7 2/5] PCI/DPC: Run recovery on device that detected the error
Date: Tue, 27 Jan 2026 10:24:02 +0000	[thread overview]
Message-ID: <20260127102402.00004da2@huawei.com> (raw)
In-Reply-To: <20260124074557.73961-3-xueshuai@linux.alibaba.com>

On Sat, 24 Jan 2026 15:45:54 +0800
Shuai Xue <xueshuai@linux.alibaba.com> wrote:

> The current implementation of pcie_do_recovery() assumes that the
> recovery process is executed for the device that detected the error.
> However, the DPC driver currently passes the error port that experienced
> the DPC event to pcie_do_recovery().
> 
> Use the SOURCE ID register to correctly identify the device that
> detected the error. When passing the error device, the
> pcie_do_recovery() will find the upstream bridge and walk bridges
> potentially AER affected. And subsequent commits will be able to
> accurately access AER status of the error device.
> 
> Should not observe any functional changes.
> 
> Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
> ---
>  drivers/pci/pci.h      |  2 +-
>  drivers/pci/pcie/dpc.c | 25 +++++++++++++++++++++----
>  drivers/pci/pcie/edr.c |  7 ++++---
>  3 files changed, 26 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 0e67014aa001..58640e656897 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -771,7 +771,7 @@ struct rcec_ea {
>  void pci_save_dpc_state(struct pci_dev *dev);
>  void pci_restore_dpc_state(struct pci_dev *dev);
>  void pci_dpc_init(struct pci_dev *pdev);
> -void dpc_process_error(struct pci_dev *pdev);
> +struct pci_dev *dpc_process_error(struct pci_dev *pdev);
>  pci_ers_result_t dpc_reset_link(struct pci_dev *pdev);
>  bool pci_dpc_recovered(struct pci_dev *pdev);
>  unsigned int dpc_tlp_log_len(struct pci_dev *dev);
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index bff29726c6a5..f6069f621683 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -260,10 +260,20 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev,
>  	return 1;
>  }
>  
> -void dpc_process_error(struct pci_dev *pdev)
> +/**
> + * dpc_process_error - handle the DPC error status
> + * @pdev: the port that experienced the containment event
> + *
> + * Return: the device that detected the error.
> + *
> + * NOTE: The device reference count is increased, the caller must decrement
> + * the reference count by calling pci_dev_put().
> + */
> +struct pci_dev *dpc_process_error(struct pci_dev *pdev)

Maybe it makes sense to carry the err_port naming for the pci_dev
in here as well?  Seems stronger than just relying on people
reading the documentation you've added.
 
>  {
>  	u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
>  	struct aer_err_info info = {};
> +	struct pci_dev *err_dev;
>  
>  	pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
>  
> @@ -279,6 +289,7 @@ void dpc_process_error(struct pci_dev *pdev)
>  			pci_aer_clear_nonfatal_status(pdev);
>  			pci_aer_clear_fatal_status(pdev);
>  		}
> +		err_dev = pci_dev_get(pdev);
>  		break;
>  	case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
>  	case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
> @@ -290,6 +301,8 @@ void dpc_process_error(struct pci_dev *pdev)
>  				"ERR_FATAL" : "ERR_NONFATAL",
>  			 pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
>  			 PCI_SLOT(source), PCI_FUNC(source));
> +		err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
> +					    PCI_BUS_NUM(source), source & 0xff);

Bunch of replication in her with the pci_warn(). Maybe some local variables?
Maybe even rebuild the final parameter from PCI_DEVFN(slot, func) just to make the
association with the print really obvious?

Is there any chance that this might return NULL?   Feels like maybe that's
only a possibility on a broken setup, but I'm not sure of all the wonderful
races around hotplug and DPC occurring before the OS has caught up.

>  		break;
>  	case PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT:
>  		ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT;
> @@ -304,8 +317,11 @@ void dpc_process_error(struct pci_dev *pdev)
>  		if (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO &&
>  		    pdev->dpc_rp_extensions)
>  			dpc_process_rp_pio_error(pdev);
> +		err_dev = pci_dev_get(pdev);
>  		break;
>  	}
> +
> +	return err_dev;
>  }

> diff --git a/drivers/pci/pcie/edr.c b/drivers/pci/pcie/edr.c
> index 521fca2f40cb..b6e9d652297e 100644
> --- a/drivers/pci/pcie/edr.c
> +++ b/drivers/pci/pcie/edr.c
> @@ -150,7 +150,7 @@ static int acpi_send_edr_status(struct pci_dev *pdev, struct pci_dev *edev,
>  
>  static void edr_handle_event(acpi_handle handle, u32 event, void *data)
>  {
> -	struct pci_dev *pdev = data, *err_port;
> +	struct pci_dev *pdev = data, *err_port, *err_dev;
>  	pci_ers_result_t estate = PCI_ERS_RESULT_DISCONNECT;
>  	u16 status;
>  
> @@ -190,7 +190,7 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
>  		goto send_ost;
>  	}
>  
> -	dpc_process_error(err_port);
> +	err_dev = dpc_process_error(err_port);
>  	pci_aer_raw_clear_status(err_port);
>  
>  	/*
> @@ -198,7 +198,8 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
>  	 * or ERR_NONFATAL, since the link is already down, use the FATAL
>  	 * error recovery path for both cases.
>  	 */
> -	estate = pcie_do_recovery(err_port, pci_channel_io_frozen, dpc_reset_link);
> +	estate = pcie_do_recovery(err_dev, pci_channel_io_frozen, dpc_reset_link);
> +	pci_dev_put(err_dev);
>  
>  send_ost:
>

next prev parent reply	other threads:[~2026-01-27 10:24 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-24  7:45 [PATCH v7 0/5] PCI/AER: Report fatal errors of RCiEP and EP if link recoverd Shuai Xue
2026-01-24  7:45 ` [PATCH v7 1/5] PCI/DPC: Clarify naming for error port in DPC Handling Shuai Xue
2026-01-27 10:10   ` Jonathan Cameron
2026-01-24  7:45 ` [PATCH v7 2/5] PCI/DPC: Run recovery on device that detected the error Shuai Xue
2026-01-27 10:24   ` Jonathan Cameron [this message]
2026-01-28 12:27     ` Shuai Xue
2026-01-28 15:02       ` Jonathan Cameron
2026-01-29  5:49         ` Shuai Xue
2026-02-02 14:02   ` Lukas Wunner
2026-02-02 21:09     ` Lukas Wunner
2026-02-07  7:48       ` Shuai Xue
2026-02-27  8:28         ` Shuai Xue
2026-02-27 10:47           ` Lukas Wunner
2026-02-27 12:28             ` Shuai Xue
2026-02-06  8:41     ` Shuai Xue
2026-01-24  7:45 ` [PATCH v7 3/5] PCI/AER: Report fatal errors of RCiEP and EP if link recoverd Shuai Xue
2026-01-27 10:36   ` Jonathan Cameron
2026-01-28 12:29     ` Shuai Xue
2026-01-28 16:50   ` Kuppuswamy Sathyanarayanan
2026-01-29 11:46     ` Shuai Xue
2026-01-24  7:45 ` [PATCH v7 4/5] PCI/AER: Clear both AER fatal and non-fatal status Shuai Xue
2026-01-27 10:39   ` Jonathan Cameron
2026-01-28 12:30     ` Shuai Xue
2026-01-28 16:58   ` Kuppuswamy Sathyanarayanan
2026-02-03  8:06   ` Lukas Wunner
2026-02-07  8:34     ` Shuai Xue
2026-01-24  7:45 ` [PATCH v7 5/5] PCI/AER: Only clear error bits in pcie_clear_device_status() Shuai Xue
2026-01-27 10:45   ` Jonathan Cameron
2026-01-28 12:45     ` Shuai Xue
2026-02-03  7:44       ` Lukas Wunner
2026-02-06  8:12         ` Shuai Xue
2026-01-28 17:01   ` Kuppuswamy Sathyanarayanan
2026-01-29 12:09     ` Shuai Xue
2026-02-03  7:53   ` Lukas Wunner
2026-02-06  7:39     ` Shuai Xue

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260127102402.00004da2@huawei.com \
    --to=jonathan.cameron@huawei.com \
    --cc=bhelgaas@google.com \
    --cc=kbusch@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=lukas@wunner.de \
    --cc=mahesh@linux.ibm.com \
    --cc=oohall@gmail.com \
    --cc=sathyanarayanan.kuppuswamy@linux.intel.com \
    --cc=terry.bowman@amd.com \
    --cc=tianruidong@linux.alibaba.com \
    --cc=xueshuai@linux.alibaba.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.