From: David Matlack <dmatlack@google.com>
To: iommu@lists.linux.dev, kexec@lists.infradead.org,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-mm@kvack.org, linux-pci@vger.kernel.org
Cc: Adithya Jayachandran <ajayachandra@nvidia.com>,
Alexander Graf <graf@amazon.com>,
Alex Williamson <alex@shazbot.org>,
Bjorn Helgaas <bhelgaas@google.com>, Chris Li <chrisl@kernel.org>,
David Matlack <dmatlack@google.com>,
David Rientjes <rientjes@google.com>,
Jacob Pan <jacob.pan@linux.microsoft.com>,
Jason Gunthorpe <jgg@nvidia.com>, Joerg Roedel <joro@8bytes.org>,
Jonathan Corbet <corbet@lwn.net>, Josh Hilke <jrhilke@google.com>,
Leon Romanovsky <leonro@nvidia.com>,
Lukas Wunner <lukas@wunner.de>, Mike Rapoport <rppt@kernel.org>,
Parav Pandit <parav@nvidia.com>,
Pasha Tatashin <pasha.tatashin@soleen.com>,
Pranjal Shrivastava <praan@google.com>,
Pratyush Yadav <pratyush@kernel.org>,
Robin Murphy <robin.murphy@arm.com>,
Saeed Mahameed <saeedm@nvidia.com>,
Samiullah Khawaja <skhawaja@google.com>,
Shuah Khan <skhan@linuxfoundation.org>,
Will Deacon <will@kernel.org>, William Tu <witu@nvidia.com>,
Yi Liu <yi.l.liu@intel.com>
Subject: [PATCH v4 03/11] PCI: liveupdate: Track incoming preserved PCI devices
Date: Thu, 23 Apr 2026 21:23:07 +0000 [thread overview]
Message-ID: <20260423212316.3431746-4-dmatlack@google.com> (raw)
In-Reply-To: <20260423212316.3431746-1-dmatlack@google.com>
The previous kernel might have passed state about devices that were
preserved across kexec. During PCI enumeration, the PCI core needs to
fetch this state to identify which devices are "incoming" and require
special handling.
Add pci_liveupdate_setup_device() which is called during device setup
to fetch the serialized state (struct pci_ser) from the Live Update
Orchestrator. The first time this happens, pci_flb_retrieve() will run
and convert the array of pci_dev_ser structs into an xarray so that it
can be looked up efficiently.
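If a device is found in the xarray, the PCI core stores a pointer to its
state in dev->liveupdate_incoming and holds a reference to the incoming
FLB until pci_liveupdate_finish() is called by the driver, or until the
device is cleaned up via pci_liveupdate_cleanup_device() without ever
having been finished.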
This ensures proper lifecycle management for incoming preserved devices
and allows the PCI core and drivers to apply specific Live Update
logic to them in subsequent commits.
Signed-off-by: David Matlack <dmatlack@google.com>
---
drivers/pci/liveupdate.c | 189 ++++++++++++++++++++++++++++++++++++++-
drivers/pci/pci.h | 13 +++
drivers/pci/probe.c | 4 +
include/linux/pci.h | 16 ++++
4 files changed, 218 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/liveupdate.c b/drivers/pci/liveupdate.c
index 2dd8daa2f17c..e616cecc37c8 100644
--- a/drivers/pci/liveupdate.c
+++ b/drivers/pci/liveupdate.c
@@ -56,6 +56,20 @@
* This allows the PCI core to keep it's FLB data (struct pci_ser) up to date
* with the list of **outgoing** preserved devices for the next kernel.
*
+ * After kexec, whenever a device is enumerated, the PCI core will check if it
+ * is an **incoming** preserved device (i.e. preserved by the previous kernel)
+ * by checking the incoming FLB data (struct pci_ser).
+ *
+ * Drivers must notify the PCI core when an **incoming** device is done
+ * participating in the incoming Live Update with the following API:
+ *
+ * * ``pci_liveupdate_finish(pci_dev)``
+ *
+ * The PCI core does not enforce any ordering of ``pci_liveupdate_finish()`` and
+ * ``pci_liveupdate_preserve()``, i.e. a PCI device can be **outgoing**
+ * (preserved for next kernel) and **incoming** (preserved by previous kernel)
+ * at the same time.
+ *
* Restrictions
* ============
*
@@ -67,7 +81,6 @@
#define pr_fmt(fmt) "PCI: liveupdate: " fmt
-#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/pci.h>
@@ -75,10 +88,24 @@
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/pci.h>
-#include <linux/sort.h>
+
+#include "pci.h"
static DEFINE_MUTEX(pci_flb_outgoing_lock);
+struct pci_flb_incoming {
+ /* The pci_ser struct passed by the previous kernel. */
+ struct pci_ser *ser;
+
+ /* xarray used to quickly find a device in ser->devices[] */
+ struct xarray xa;
+};
+
+static unsigned long pci_ser_xa_key(unsigned long domain, unsigned long bdf)
+{
+ return domain << 16 | bdf;
+}
+
static int pci_flb_preserve(struct liveupdate_flb_op_args *args)
{
struct pci_dev *dev = NULL;
@@ -124,13 +151,44 @@ static void pci_flb_unpreserve(struct liveupdate_flb_op_args *args)
static int pci_flb_retrieve(struct liveupdate_flb_op_args *args)
{
- args->obj = phys_to_virt(args->data);
+ struct pci_flb_incoming *incoming;
+ int i, ret;
+
+ incoming = kmalloc(sizeof(*incoming), GFP_KERNEL);
+ if (!incoming)
+ return -ENOMEM;
+
+ incoming->ser = phys_to_virt(args->data);
+
+ xa_init(&incoming->xa);
+
+ for (i = 0; i < incoming->ser->max_nr_devices; i++) {
+ struct pci_dev_ser *dev_ser = &incoming->ser->devices[i];
+ unsigned long key;
+
+ if (!dev_ser->refcount)
+ continue;
+
+ key = pci_ser_xa_key(dev_ser->domain, dev_ser->bdf);
+ ret = xa_err(xa_store(&incoming->xa, key, dev_ser, GFP_KERNEL));
+ if (ret) {
+ xa_destroy(&incoming->xa);
+ kfree(incoming);
+ return ret;
+ }
+ }
+
+ args->obj = incoming;
return 0;
}
static void pci_flb_finish(struct liveupdate_flb_op_args *args)
{
- kho_restore_free(args->obj);
+ struct pci_flb_incoming *incoming = args->obj;
+
+ xa_destroy(&incoming->xa);
+ kho_restore_free(incoming->ser);
+ kfree(incoming);
}
static struct liveupdate_flb_ops pci_liveupdate_flb_ops = {
@@ -225,6 +283,129 @@ void pci_liveupdate_unpreserve(struct pci_dev *dev)
}
EXPORT_SYMBOL_GPL(pci_liveupdate_unpreserve);
+static struct xarray *pci_liveupdate_flb_get_incoming(void)
+{
+ struct pci_flb_incoming *incoming;
+ int ret;
+
+ ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&incoming);
+
+ /* Live Update is not enabled. */
+ if (ret == -EOPNOTSUPP)
+ return NULL;
+
+ /* Live Update is enabled, but there is no incoming FLB data. */
+ if (ret == -ENODATA)
+ return NULL;
+
+ /*
+ * Live Update is enabled and there is incoming FLB data, but none of it
+ * matches pci_liveupdate_flb.compatible.
+ *
+ * This could mean that no PCI FLB data was passed by the previous
+ * kernel, but it could also mean the previous kernel used a different
+ * compatibility string (i.e. a different ABI).
+ */
+ if (ret == -ENOENT) {
+ pr_info_once("No incoming FLB matched %s\n", pci_liveupdate_flb.compatible);
+ return NULL;
+ }
+
+ /*
+ * There is incoming FLB data that matches pci_liveupdate_flb.compatible
+ * but it cannot be retrieved.
+ */
+ if (ret) {
+ WARN_ONCE(ret, "Failed to retrieve incoming FLB data\n");
+ return NULL;
+ }
+
+ return &incoming->xa;
+}
+
+static void pci_liveupdate_flb_put_incoming(void)
+{
+ liveupdate_flb_put_incoming(&pci_liveupdate_flb);
+}
+
+void pci_liveupdate_setup_device(struct pci_dev *dev)
+{
+ struct pci_dev_ser *dev_ser;
+ struct xarray *xa;
+ unsigned long key;
+
+ xa = pci_liveupdate_flb_get_incoming();
+ if (!xa)
+ return;
+
+ key = pci_ser_xa_key(pci_domain_nr(dev->bus), pci_dev_id(dev));
+ dev_ser = xa_load(xa, key);
+
+ /* This device was not preserved across Live Update */
+ if (!dev_ser) {
+ pci_liveupdate_flb_put_incoming();
+ return;
+ }
+
+ /*
+ * This device was preserved, but has already been probed and gone
+ * through pci_liveupdate_finish(). This can happen if PCI core probes
+ * the same device multiple times, e.g. due to hotplug.
+ */
+ if (!dev_ser->refcount) {
+ pci_liveupdate_flb_put_incoming();
+ return;
+ }
+
+ pci_info(dev, "Device was preserved by previous kernel across Live Update\n");
+
+ /*
+ * Hold the ref on the incoming FLB until pci_liveupdate_finish() so
+ * that dev_ser does not get freed while it is in use.
+ */
+ dev->liveupdate_incoming = dev_ser;
+}
+
+void pci_liveupdate_cleanup_device(struct pci_dev *dev)
+{
+ /*
+ * Drop the FLB reference acquired in pci_liveupdate_setup_device() if
+ * the device is being cleaned up before pci_liveupdate_finish(), e.g.
+ * due to allocation failure during setup.
+ *
+ * Do not drop dev->liveupdate_incoming->refcount since this device has
+ * not gone through pci_liveupdate_finish() and thus is still an
+ * incoming preserved device.
+ *
+ * Note: This cannot race with pci_liveupdate_finish() since it is only
+ * called in cleanup paths when there are no users of the pci_dev.
+ */
+ if (dev->liveupdate_incoming)
+ pci_liveupdate_flb_put_incoming();
+}
+
+void pci_liveupdate_finish(struct pci_dev *dev)
+{
+ if (!dev->liveupdate_incoming) {
+ pci_warn(dev, "Cannot finish preserving an unpreserved device\n");
+ return;
+ }
+
+ pci_info(dev, "Device is finished participating in Live Update\n");
+
+ /*
+ * Drop the refcount so this device does not get treated as an incoming
+ * device again, e.g. in case pci_liveupdate_setup_device() gets called
+ * again because the device is hot-plugged.
+ */
+ dev->liveupdate_incoming->refcount = 0;
+ dev->liveupdate_incoming = NULL;
+
+ /* Drop this device's reference on the incoming FLB. */
+ pci_liveupdate_flb_put_incoming();
+}
+EXPORT_SYMBOL_GPL(pci_liveupdate_finish);
+
int pci_liveupdate_register_flb(struct liveupdate_file_handler *fh)
{
pr_debug("Registering file handler \"%s\"\n", fh->compatible);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4a14f88e543a..09bab39738d7 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1439,4 +1439,17 @@ static inline int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int inde
(PCI_CONF1_ADDRESS(bus, dev, func, reg) | \
PCI_CONF1_EXT_REG(reg))
+#ifdef CONFIG_PCI_LIVEUPDATE
+void pci_liveupdate_setup_device(struct pci_dev *dev);
+void pci_liveupdate_cleanup_device(struct pci_dev *dev);
+#else
+static inline void pci_liveupdate_setup_device(struct pci_dev *dev)
+{
+}
+
+static inline void pci_liveupdate_cleanup_device(struct pci_dev *dev)
+{
+}
+#endif
+
#endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index b63cd0c310bc..938a28e4a7a0 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2069,6 +2069,8 @@ int pci_setup_device(struct pci_dev *dev)
if (pci_early_dump)
early_dump_pci_device(dev);
+ pci_liveupdate_setup_device(dev);
+
/* Need to have dev->class ready */
dev->cfg_size = pci_cfg_space_size(dev);
@@ -2192,6 +2194,7 @@ int pci_setup_device(struct pci_dev *dev)
default: /* unknown header */
pci_err(dev, "unknown header type %02x, ignoring device\n",
dev->hdr_type);
+ pci_liveupdate_cleanup_device(dev);
pci_release_of_node(dev);
return -EIO;
@@ -2490,6 +2493,7 @@ static void pci_release_dev(struct device *dev)
pci_dev = to_pci_dev(dev);
pci_release_capabilities(pci_dev);
+ pci_liveupdate_cleanup_device(pci_dev);
pci_release_of_node(pci_dev);
pcibios_release_device(pci_dev);
pci_bus_put(pci_dev->bus);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index eb94cbd8ab9d..dd6b26ca9462 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -597,6 +597,7 @@ struct pci_dev {
#endif
#ifdef CONFIG_PCI_LIVEUPDATE
struct pci_dev_ser *liveupdate_outgoing; /* State preserved for next kernel */
+ struct pci_dev_ser *liveupdate_incoming; /* State preserved by previous kernel */
#endif
};
@@ -2887,11 +2888,17 @@ void pci_liveupdate_unregister_flb(struct liveupdate_file_handler *fh);
int pci_liveupdate_preserve(struct pci_dev *dev);
void pci_liveupdate_unpreserve(struct pci_dev *dev);
+void pci_liveupdate_finish(struct pci_dev *dev);
static inline struct pci_dev_ser *pci_liveupdate_outgoing(struct pci_dev *dev)
{
return dev->liveupdate_outgoing;
}
+
+static inline struct pci_dev_ser *pci_liveupdate_incoming(struct pci_dev *dev)
+{
+ return dev->liveupdate_incoming;
+}
#else
static inline int pci_liveupdate_register_flb(struct liveupdate_file_handler *fh)
{
@@ -2911,10 +2918,19 @@ static inline void pci_liveupdate_unpreserve(struct pci_dev *dev)
{
}
+static inline void pci_liveupdate_finish(struct pci_dev *dev)
+{
+}
+
static inline struct pci_dev_ser *pci_liveupdate_outgoing(struct pci_dev *dev)
{
return NULL;
}
+
+static inline struct pci_dev_ser *pci_liveupdate_incoming(struct pci_dev *dev)
+{
+ return NULL;
+}
#endif
#endif /* LINUX_PCI_H */
--
2.54.0.rc2.544.gc7ae2d5bb8-goog
next prev parent reply other threads:[~2026-04-23 21:23 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-23 21:23 [PATCH v4 00/11] PCI: liveupdate: PCI core support for Live Update David Matlack
2026-04-23 21:23 ` [PATCH v4 01/11] PCI: liveupdate: Set up FLB handler for the PCI core David Matlack
2026-04-23 21:23 ` [PATCH v4 02/11] PCI: liveupdate: Track outgoing preserved PCI devices David Matlack
2026-04-23 21:23 ` David Matlack [this message]
2026-04-23 21:23 ` [PATCH v4 04/11] PCI: liveupdate: Document driver binding responsibilities David Matlack
2026-04-23 21:23 ` [PATCH v4 05/11] PCI: liveupdate: Inherit bus numbers during Live Update David Matlack
2026-04-23 21:23 ` [PATCH v4 06/11] PCI: liveupdate: Auto-preserve upstream bridges across " David Matlack
2026-04-23 21:23 ` [PATCH v4 07/11] PCI: liveupdate: Inherit ACS flags in incoming preserved devices David Matlack
2026-04-23 21:23 ` [PATCH v4 08/11] PCI: liveupdate: Require preserved devices are in immutable singleton IOMMU groups David Matlack
2026-04-23 22:10 ` David Matlack
2026-04-23 22:52 ` Jason Gunthorpe
2026-04-23 23:09 ` David Matlack
2026-04-23 23:27 ` Samiullah Khawaja
2026-04-23 21:23 ` [PATCH v4 09/11] PCI: liveupdate: Inherit ARI Forwarding Enable on preserved bridges David Matlack
2026-04-23 21:23 ` [PATCH v4 10/11] PCI: liveupdate: Do not disable bus mastering on preserved devices during kexec David Matlack
2026-04-23 21:23 ` [PATCH v4 11/11] Documentation: PCI: Add documentation for Live Update David Matlack
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260423212316.3431746-4-dmatlack@google.com \
--to=dmatlack@google.com \
--cc=ajayachandra@nvidia.com \
--cc=alex@shazbot.org \
--cc=bhelgaas@google.com \
--cc=chrisl@kernel.org \
--cc=corbet@lwn.net \
--cc=graf@amazon.com \
--cc=iommu@lists.linux.dev \
--cc=jacob.pan@linux.microsoft.com \
--cc=jgg@nvidia.com \
--cc=joro@8bytes.org \
--cc=jrhilke@google.com \
--cc=kexec@lists.infradead.org \
--cc=leonro@nvidia.com \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-pci@vger.kernel.org \
--cc=lukas@wunner.de \
--cc=parav@nvidia.com \
--cc=pasha.tatashin@soleen.com \
--cc=praan@google.com \
--cc=pratyush@kernel.org \
--cc=rientjes@google.com \
--cc=robin.murphy@arm.com \
--cc=rppt@kernel.org \
--cc=saeedm@nvidia.com \
--cc=skhan@linuxfoundation.org \
--cc=skhawaja@google.com \
--cc=will@kernel.org \
--cc=witu@nvidia.com \
--cc=yi.l.liu@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox