From: Timothy Pearson <tpearson@raptorengineering.com>
To: Timothy Pearson <tpearson@raptorengineering.com>
Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>,
linux-kernel <linux-kernel@vger.kernel.org>,
linux-pci <linux-pci@vger.kernel.org>,
Madhavan Srinivasan <maddy@linux.ibm.com>,
Michael Ellerman <mpe@ellerman.id.au>,
christophe leroy <christophe.leroy@csgroup.eu>,
Naveen N Rao <naveen@kernel.org>,
Bjorn Helgaas <bhelgaas@google.com>,
Shawn Anastasio <sanastasio@raptorengineering.com>
Subject: [PATCH v3 4/6] powerpc/eeh: Make EEH driver device hotplug safe
Date: Tue, 15 Jul 2025 16:38:23 -0500 (CDT) [thread overview]
Message-ID: <1334208367.1359861.1752615503144.JavaMail.zimbra@raptorengineeringinc.com> (raw)
In-Reply-To: <1268570622.1359844.1752615109932.JavaMail.zimbra@raptorengineeringinc.com>
Multiple race conditions existed between the PCIe hotplug driver and
the EEH driver, leading to a variety of kernel oopses of the same
general nature:
<pcie device unplug>
<eeh driver trigger>
<hotplug removal trigger>
<pcie tree reconfiguration>
<eeh recovery next step>
<oops in EEH driver bus iteration loop>
A second class of oops is also seen when the underling bus disappears
during device recovery.
Refactor the EEH module to be PCI rescan and remove safe. Also clean
up a few minor formatting / readability issues.
Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
---
arch/powerpc/kernel/eeh_driver.c | 48 +++++++++++++++++++++-----------
arch/powerpc/kernel/eeh_pe.c | 10 ++++---
2 files changed, 38 insertions(+), 20 deletions(-)
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7efe04c68f0f..dd50de91c438 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -257,13 +257,12 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
struct pci_driver *driver;
enum pci_ers_result new_result;
- pci_lock_rescan_remove();
pdev = edev->pdev;
if (pdev)
get_device(&pdev->dev);
- pci_unlock_rescan_remove();
if (!pdev) {
eeh_edev_info(edev, "no device");
+ *result = PCI_ERS_RESULT_DISCONNECT;
return;
}
device_lock(&pdev->dev);
@@ -304,8 +303,9 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root,
struct eeh_dev *edev, *tmp;
pr_info("EEH: Beginning: '%s'\n", name);
- eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
- eeh_pe_report_edev(edev, fn, result);
+ eeh_for_each_pe(root, pe)
+ eeh_pe_for_each_dev(pe, edev, tmp)
+ eeh_pe_report_edev(edev, fn, result);
if (result)
pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n",
name, pci_ers_result_name(*result));
@@ -383,6 +383,8 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
if (!edev)
return;
+ pci_lock_rescan_remove();
+
/*
* The content in the config space isn't saved because
* the blocked config space on some adapters. We have
@@ -393,14 +395,19 @@ static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
if (list_is_last(&edev->entry, &edev->pe->edevs))
eeh_pe_restore_bars(edev->pe);
+ pci_unlock_rescan_remove();
return;
}
pdev = eeh_dev_to_pci_dev(edev);
- if (!pdev)
+ if (!pdev) {
+ pci_unlock_rescan_remove();
return;
+ }
pci_restore_state(pdev);
+
+ pci_unlock_rescan_remove();
}
/**
@@ -647,9 +654,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
} else {
- pci_lock_rescan_remove();
pci_hp_remove_devices(bus);
- pci_unlock_rescan_remove();
}
/*
@@ -665,8 +670,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
if (rc)
return rc;
- pci_lock_rescan_remove();
-
/* Restore PE */
eeh_ops->configure_bridge(pe);
eeh_pe_restore_bars(pe);
@@ -674,7 +677,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
/* Clear frozen state */
rc = eeh_clear_pe_frozen_state(pe, false);
if (rc) {
- pci_unlock_rescan_remove();
return rc;
}
@@ -709,7 +711,6 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
pe->tstamp = tstamp;
pe->freeze_count = cnt;
- pci_unlock_rescan_remove();
return 0;
}
@@ -843,10 +844,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
int devices = 0;
+ pci_lock_rescan_remove();
+
bus = eeh_pe_bus_get(pe);
if (!bus) {
pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
__func__, pe->phb->global_number, pe->addr);
+ pci_unlock_rescan_remove();
return;
}
@@ -1094,10 +1098,15 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
- pci_lock_rescan_remove();
- pci_hp_remove_devices(bus);
- pci_unlock_rescan_remove();
+ bus = eeh_pe_bus_get(pe);
+ if (bus)
+ pci_hp_remove_devices(bus);
+ else
+ pr_err("%s: PCI bus for PHB#%x-PE#%x disappeared\n",
+ __func__, pe->phb->global_number, pe->addr);
+
/* The passed PE should no longer be used */
+ pci_unlock_rescan_remove();
return;
}
@@ -1114,6 +1123,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
eeh_clear_slot_attention(edev->pdev);
eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
+
+ pci_unlock_rescan_remove();
}
/**
@@ -1132,6 +1143,7 @@ void eeh_handle_special_event(void)
unsigned long flags;
int rc;
+ pci_lock_rescan_remove();
do {
rc = eeh_ops->next_error(&pe);
@@ -1171,10 +1183,12 @@ void eeh_handle_special_event(void)
break;
case EEH_NEXT_ERR_NONE:
+ pci_unlock_rescan_remove();
return;
default:
pr_warn("%s: Invalid value %d from next_error()\n",
__func__, rc);
+ pci_unlock_rescan_remove();
return;
}
@@ -1186,7 +1200,9 @@ void eeh_handle_special_event(void)
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
rc == EEH_NEXT_ERR_FENCED_PHB) {
eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+ pci_unlock_rescan_remove();
eeh_handle_normal_event(pe);
+ pci_lock_rescan_remove();
} else {
eeh_for_each_pe(pe, tmp_pe)
eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
@@ -1199,7 +1215,6 @@ void eeh_handle_special_event(void)
eeh_report_failure, NULL);
eeh_set_channel_state(pe, pci_channel_io_perm_failure);
- pci_lock_rescan_remove();
list_for_each_entry(hose, &hose_list, list_node) {
phb_pe = eeh_phb_pe_get(hose);
if (!phb_pe ||
@@ -1218,7 +1233,6 @@ void eeh_handle_special_event(void)
}
pci_hp_remove_devices(bus);
}
- pci_unlock_rescan_remove();
}
/*
@@ -1228,4 +1242,6 @@ void eeh_handle_special_event(void)
if (rc == EEH_NEXT_ERR_DEAD_IOC)
break;
} while (rc != EEH_NEXT_ERR_NONE);
+
+ pci_unlock_rescan_remove();
}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index d283d281d28e..e740101fadf3 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -671,10 +671,12 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
eeh_ops->write_config(edev, cap + PCI_EXP_LNKCTL, 2, val);
/* Check link */
- if (!edev->pdev->link_active_reporting) {
- eeh_edev_dbg(edev, "No link reporting capability\n");
- msleep(1000);
- return;
+ if (edev->pdev) {
+ if (!edev->pdev->link_active_reporting) {
+ eeh_edev_dbg(edev, "No link reporting capability\n");
+ msleep(1000);
+ return;
+ }
}
/* Wait the link is up until timeout (5s) */
--
2.39.5
next prev parent reply other threads:[~2025-07-15 21:38 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-07-15 21:31 [PATCH v3 0/6] PowerNV PCIe Hotplug Driver Fixes Timothy Pearson
2025-07-15 21:36 ` [PATCH v3 1/6] PCI: pnv_php: Properly clean up allocated IRQs on unplug Timothy Pearson
2025-07-15 21:36 ` [PATCH v3 2/6] PCI: pnv_php: Work around switches with broken presence detection Timothy Pearson
2025-07-15 21:37 ` [PATCH v3 3/6] powerpc/eeh: Export eeh_unfreeze_pe() Timothy Pearson
2025-07-15 21:38 ` Timothy Pearson [this message]
2025-07-15 21:39 ` [PATCH v3 5/6] PCI: pnv_php: Fix surprise plug detection and recovery Timothy Pearson
2025-07-17 23:27 ` Bjorn Helgaas
2025-07-18 0:05 ` Timothy Pearson
2025-07-15 21:39 ` [PATCH v3 6/6] PCI: pnv_php: Enable third attention indicator state Timothy Pearson
2025-07-17 23:27 ` [PATCH v3 0/6] PowerNV PCIe Hotplug Driver Fixes Bjorn Helgaas
2025-07-22 20:47 ` Bjorn Helgaas
2025-07-23 11:00 ` Madhavan Srinivasan
2025-07-23 11:39 ` Bjorn Helgaas
2025-07-23 11:47 ` Ganesh G R
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1334208367.1359861.1752615503144.JavaMail.zimbra@raptorengineeringinc.com \
--to=tpearson@raptorengineering.com \
--cc=bhelgaas@google.com \
--cc=christophe.leroy@csgroup.eu \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pci@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=maddy@linux.ibm.com \
--cc=mpe@ellerman.id.au \
--cc=naveen@kernel.org \
--cc=sanastasio@raptorengineering.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).