All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v18 0/4] [VFIO] Error recovery for vfio-pci devices on s390x
@ 2026-06-03 18:24 Farhan Ali
  2026-06-03 18:24 ` [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices Farhan Ali
                   ` (3 more replies)
  0 siblings, 4 replies; 23+ messages in thread
From: Farhan Ali @ 2026-06-03 18:24 UTC (permalink / raw)
  To: linux-s390, linux-kernel, linux-pci, alex
  Cc: helgaas, alifm, schnelle, mjrosato

Hi Alex,

This patch set includes only the VFIO patches of the original series for
error recovery for vfio-pci devices on s390x [1]. Breaking up the patch
series into PCI and VFIO only patches to make merging easier based on our
discussion [2].

Thanks
Farhan

[1] https://lore.kernel.org/all/20260520171113.1111-1-alifm@linux.ibm.com/
[2] https://lore.kernel.org/all/20260602163344.1eda12d2@shazbot.org/

ChangeLog
---------
v17 -> v18
 - Print a warn with dev_warn_ratelimited() for copy_to_user failure.
 - Rebase on 7.1-rc6.
 - Break patch series into VFIO patch set.

Farhan Ali (4):
  s390/pci: Store PCI error information for passthrough devices
  vfio-pci/zdev: Add a device feature for error information
  vfio/pci: Add a reset_done callback for vfio-pci driver
  vfio/pci: Remove the pcie check for VFIO_PCI_ERR_IRQ_INDEX

 arch/s390/include/asm/pci.h       |  32 +++++++
 arch/s390/pci/pci.c               |   1 +
 arch/s390/pci/pci_event.c         | 135 +++++++++++++++++++-----------
 drivers/vfio/pci/vfio_pci_core.c  |  22 +++--
 drivers/vfio/pci/vfio_pci_intrs.c |   3 +-
 drivers/vfio/pci/vfio_pci_priv.h  |   9 ++
 drivers/vfio/pci/vfio_pci_zdev.c  |  61 +++++++++++++-
 include/uapi/linux/vfio.h         |  30 +++++++
 8 files changed, 231 insertions(+), 62 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices
  2026-06-03 18:24 [PATCH v18 0/4] [VFIO] Error recovery for vfio-pci devices on s390x Farhan Ali
@ 2026-06-03 18:24 ` Farhan Ali
  2026-06-03 22:20   ` Alex Williamson
  2026-06-03 18:24 ` [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information Farhan Ali
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 23+ messages in thread
From: Farhan Ali @ 2026-06-03 18:24 UTC (permalink / raw)
  To: linux-s390, linux-kernel, linux-pci, alex
  Cc: helgaas, alifm, schnelle, mjrosato

For a passthrough device we need co-operation from user space to recover
the device. This would require to bubble up any error information to user
space.  Let's store this error information for passthrough devices, so it
can be retrieved later.

We can now have userspace drivers (vfio-pci based) on s390x. The userspace
drivers will not have any KVM fd and so no kzdev associated with them. So
we need to update the logic for detecting passthrough devices to not depend
on struct kvm_zdev.

Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
 arch/s390/include/asm/pci.h      |  30 ++++++++
 arch/s390/pci/pci.c              |   1 +
 arch/s390/pci/pci_event.c        | 116 +++++++++++++++++--------------
 drivers/vfio/pci/vfio_pci_zdev.c |  18 ++++-
 4 files changed, 111 insertions(+), 54 deletions(-)

diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 5dcf35f0f325..016386f7ef4a 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -118,6 +118,32 @@ struct zpci_bus {
 	enum pci_bus_speed	max_bus_speed;
 };
 
+/* Content Code Description for PCI Function Error */
+struct zpci_ccdf_err {
+	u32 reserved1;
+	u32 fh;                         /* function handle */
+	u32 fid;                        /* function id */
+	u32 ett         :  4;           /* expected table type */
+	u32 mvn         : 12;           /* MSI vector number */
+	u32 dmaas       :  8;           /* DMA address space */
+	u32 reserved2   :  6;
+	u32 q           :  1;           /* event qualifier */
+	u32 rw          :  1;           /* read/write */
+	u64 faddr;                      /* failing address */
+	u32 reserved3;
+	u16 reserved4;
+	u16 pec;                        /* PCI event code */
+} __packed;
+
+#define ZPCI_ERR_PENDING_MAX 4
+struct zpci_ccdf_pending {
+	bool mediated_recovery;
+	u8 count;
+	u8 head;
+	u8 tail;
+	struct zpci_ccdf_err err[ZPCI_ERR_PENDING_MAX];
+};
+
 /* Private data per function */
 struct zpci_dev {
 	struct zpci_bus *zbus;
@@ -192,6 +218,8 @@ struct zpci_dev {
 	struct iommu_domain *s390_domain; /* attached IOMMU domain */
 	struct kvm_zdev *kzdev;
 	struct mutex kzdev_lock;
+	struct zpci_ccdf_pending pending_errs;
+	struct mutex pending_errs_lock;
 	spinlock_t dom_lock;		/* protect s390_domain change */
 };
 
@@ -334,6 +362,8 @@ void zpci_debug_exit_device(struct zpci_dev *);
 int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *);
 int zpci_clear_error_state(struct zpci_dev *zdev);
 int zpci_reset_load_store_blocked(struct zpci_dev *zdev);
+void zpci_start_mediated_recovery(struct zpci_dev *zdev);
+void zpci_stop_mediated_recovery(struct zpci_dev *zdev);
 
 #ifdef CONFIG_NUMA
 
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 39bd2adfc240..2d377c2e194d 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -842,6 +842,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
 	mutex_init(&zdev->state_lock);
 	mutex_init(&zdev->fmb_lock);
 	mutex_init(&zdev->kzdev_lock);
+	mutex_init(&zdev->pending_errs_lock);
 
 	return zdev;
 
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index 839bd91c056e..cf2ffa21ab8c 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -17,23 +17,6 @@
 #include "pci_bus.h"
 #include "pci_report.h"
 
-/* Content Code Description for PCI Function Error */
-struct zpci_ccdf_err {
-	u32 reserved1;
-	u32 fh;				/* function handle */
-	u32 fid;			/* function id */
-	u32 ett		:  4;		/* expected table type */
-	u32 mvn		: 12;		/* MSI vector number */
-	u32 dmaas	:  8;		/* DMA address space */
-	u32		:  6;
-	u32 q		:  1;		/* event qualifier */
-	u32 rw		:  1;		/* read/write */
-	u64 faddr;			/* failing address */
-	u32 reserved3;
-	u16 reserved4;
-	u16 pec;			/* PCI event code */
-} __packed;
-
 /* Content Code Description for PCI Function Availability */
 struct zpci_ccdf_avail {
 	u32 reserved1;
@@ -60,18 +43,6 @@ static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res)
 	}
 }
 
-static bool is_passed_through(struct pci_dev *pdev)
-{
-	struct zpci_dev *zdev = to_zpci(pdev);
-	bool ret;
-
-	mutex_lock(&zdev->kzdev_lock);
-	ret = !!zdev->kzdev;
-	mutex_unlock(&zdev->kzdev_lock);
-
-	return ret;
-}
-
 static bool is_driver_supported(struct pci_driver *driver)
 {
 	if (!driver || !driver->err_handler)
@@ -81,6 +52,47 @@ static bool is_driver_supported(struct pci_driver *driver)
 	return true;
 }
 
+static int zpci_store_pci_error(struct pci_dev *pdev,
+				 struct zpci_ccdf_err *ccdf)
+{
+	struct zpci_dev *zdev = to_zpci(pdev);
+	int i;
+
+	guard(mutex)(&zdev->pending_errs_lock);
+	if (!zdev->pending_errs.mediated_recovery)
+		return -EINVAL;
+
+	if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
+		pr_err("%s: Maximum number (%d) of pending error events queued\n",
+		       pci_name(pdev), ZPCI_ERR_PENDING_MAX);
+		return -ENOMEM;
+	}
+
+	i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
+	memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
+	zdev->pending_errs.tail++;
+	zdev->pending_errs.count++;
+	return 0;
+}
+
+void zpci_start_mediated_recovery(struct zpci_dev *zdev)
+{
+	guard(mutex)(&zdev->pending_errs_lock);
+	zdev->pending_errs.mediated_recovery = true;
+}
+EXPORT_SYMBOL_GPL(zpci_start_mediated_recovery);
+
+void zpci_stop_mediated_recovery(struct zpci_dev *zdev)
+{
+	guard(mutex)(&zdev->pending_errs_lock);
+	zdev->pending_errs.mediated_recovery = false;
+	if (zdev->pending_errs.count)
+		pr_info("Unhandled PCI error events count=%d for PCI function 0x%x\n",
+			zdev->pending_errs.count, zdev->fid);
+	memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
+}
+EXPORT_SYMBOL_GPL(zpci_stop_mediated_recovery);
+
 static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
 							 struct pci_driver *driver)
 {
@@ -175,12 +187,15 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
  * and the platform determines which functions are affected for
  * multi-function devices.
  */
-static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
+static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev,
+							  struct zpci_ccdf_err *ccdf)
 {
 	pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
 	struct zpci_dev *zdev = to_zpci(pdev);
+	bool mediated_recovery = false;
 	char *status_str = "success";
 	struct pci_driver *driver;
+	int rc;
 
 	/*
 	 * Ensure that the PCI function is not removed concurrently, no driver
@@ -194,13 +209,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
 	}
 	pdev->error_state = pci_channel_io_frozen;
 
-	if (is_passed_through(pdev)) {
-		pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
-			pci_name(pdev));
-		status_str = "failed (pass-through)";
-		goto out_unlock;
-	}
-
 	driver = to_pci_driver(pdev->dev.driver);
 	if (!is_driver_supported(driver)) {
 		if (!driver) {
@@ -216,12 +224,24 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
 		goto out_unlock;
 	}
 
+	rc = zpci_store_pci_error(pdev, ccdf);
+	if (!rc || rc == -ENOMEM)
+		mediated_recovery = true;
+
 	ers_res = zpci_event_notify_error_detected(pdev, driver);
 	if (ers_result_indicates_abort(ers_res)) {
 		status_str = "failed (abort on detection)";
 		goto out_unlock;
 	}
 
+	if (mediated_recovery) {
+		pr_info("%s: Leaving recovery of pass-through device to user-space\n",
+			pci_name(pdev));
+		ers_res = PCI_ERS_RESULT_RECOVERED;
+		status_str = "in progress";
+		goto out_unlock;
+	}
+
 	if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
 		ers_res = zpci_event_do_error_state_clear(pdev, driver);
 		if (ers_result_indicates_abort(ers_res)) {
@@ -266,25 +286,19 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
  * @pdev: PCI function for which to report
  * @es: PCI channel failure state to report
  */
-static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
+static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
+				  struct zpci_ccdf_err *ccdf)
 {
 	struct pci_driver *driver;
 
 	pci_dev_lock(pdev);
 	pdev->error_state = es;
-	/**
-	 * While vfio-pci's error_detected callback notifies user-space QEMU
-	 * reacts to this by freezing the guest. In an s390 environment PCI
-	 * errors are rarely fatal so this is overkill. Instead in the future
-	 * we will inject the error event and let the guest recover the device
-	 * itself.
-	 */
-	if (is_passed_through(pdev))
-		goto out;
+
+	zpci_store_pci_error(pdev, ccdf);
 	driver = to_pci_driver(pdev->dev.driver);
 	if (driver && driver->err_handler && driver->err_handler->error_detected)
 		driver->err_handler->error_detected(pdev, pdev->error_state);
-out:
+
 	pci_dev_unlock(pdev);
 }
 
@@ -330,12 +344,12 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
 		break;
 	case 0x0040: /* Service Action or Error Recovery Failed */
 	case 0x003b:
-		zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
+		zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf);
 		break;
 	default: /* PCI function left in the error state attempt to recover */
-		ers_res = zpci_event_attempt_error_recovery(pdev);
+		ers_res = zpci_event_attempt_error_recovery(pdev, ccdf);
 		if (ers_res != PCI_ERS_RESULT_RECOVERED)
-			zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
+			zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf);
 		break;
 	}
 	pci_dev_put(pdev);
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index 0990fdb146b7..78a28db00c6d 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -144,24 +144,36 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
 {
 	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+	int ret;
 
 	if (!zdev)
 		return -ENODEV;
 
+	zpci_start_mediated_recovery(zdev);
+
 	if (!vdev->vdev.kvm)
 		return 0;
 
+	ret = -ENOENT;
 	if (zpci_kvm_hook.kvm_register)
-		return zpci_kvm_hook.kvm_register(zdev, vdev->vdev.kvm);
+		ret = zpci_kvm_hook.kvm_register(zdev, vdev->vdev.kvm);
 
-	return -ENOENT;
+	if (ret)
+		zpci_stop_mediated_recovery(zdev);
+
+	return ret;
 }
 
 void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
 {
 	struct zpci_dev *zdev = to_zpci(vdev->pdev);
 
-	if (!zdev || !vdev->vdev.kvm)
+	if (!zdev)
+		return;
+
+	zpci_stop_mediated_recovery(zdev);
+
+	if (!vdev->vdev.kvm)
 		return;
 
 	if (zpci_kvm_hook.kvm_unregister)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information
  2026-06-03 18:24 [PATCH v18 0/4] [VFIO] Error recovery for vfio-pci devices on s390x Farhan Ali
  2026-06-03 18:24 ` [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices Farhan Ali
@ 2026-06-03 18:24 ` Farhan Ali
  2026-06-03 18:49   ` sashiko-bot
  2026-06-03 22:37   ` Alex Williamson
  2026-06-03 18:24 ` [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver Farhan Ali
  2026-06-03 18:24 ` [PATCH v18 4/4] vfio/pci: Remove the pcie check for VFIO_PCI_ERR_IRQ_INDEX Farhan Ali
  3 siblings, 2 replies; 23+ messages in thread
From: Farhan Ali @ 2026-06-03 18:24 UTC (permalink / raw)
  To: linux-s390, linux-kernel, linux-pci, alex
  Cc: helgaas, alifm, schnelle, mjrosato

For zPCI devices, we have platform specific error information. The platform
firmware provides this error information to the operating system in an
architecture specific mechanism. To enable recovery from userspace for
these devices, we want to expose this error information to userspace. Add a
new device feature to expose this information.

Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
 arch/s390/include/asm/pci.h      |  2 ++
 arch/s390/pci/pci_event.c        | 19 ++++++++++++++
 drivers/vfio/pci/vfio_pci_core.c |  2 ++
 drivers/vfio/pci/vfio_pci_priv.h |  9 +++++++
 drivers/vfio/pci/vfio_pci_zdev.c | 43 ++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        | 30 ++++++++++++++++++++++
 6 files changed, 105 insertions(+)

diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 016386f7ef4a..88a125b92bdd 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -364,6 +364,8 @@ int zpci_clear_error_state(struct zpci_dev *zdev);
 int zpci_reset_load_store_blocked(struct zpci_dev *zdev);
 void zpci_start_mediated_recovery(struct zpci_dev *zdev);
 void zpci_stop_mediated_recovery(struct zpci_dev *zdev);
+int zpci_get_pending_error(struct zpci_dev *zdev,
+			   struct zpci_ccdf_err *ccdf);
 
 #ifdef CONFIG_NUMA
 
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index cf2ffa21ab8c..db1b44baf8fa 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -75,6 +75,25 @@ static int zpci_store_pci_error(struct pci_dev *pdev,
 	return 0;
 }
 
+int zpci_get_pending_error(struct zpci_dev *zdev,
+			   struct zpci_ccdf_err *ccdf)
+{
+	int head = 0;
+
+	guard(mutex)(&zdev->pending_errs_lock);
+
+	if (!zdev->pending_errs.count)
+		return -ENOMSG;
+
+	head = zdev->pending_errs.head % ZPCI_ERR_PENDING_MAX;
+	memcpy(ccdf, &zdev->pending_errs.err[head],
+	       sizeof(struct zpci_ccdf_err));
+	zdev->pending_errs.head++;
+	zdev->pending_errs.count--;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(zpci_get_pending_error);
+
 void zpci_start_mediated_recovery(struct zpci_dev *zdev)
 {
 	guard(mutex)(&zdev->pending_errs_lock);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 050e7542952e..27642f10fe97 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1569,6 +1569,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_DMA_BUF:
 		return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_ZPCI_ERROR:
+		return vfio_pci_zdev_feature_err(device, flags, arg, argsz);
 	default:
 		return -ENOTTY;
 	}
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index fca9d0dfac90..4e7162234a2e 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -93,6 +93,8 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 				struct vfio_info_cap *caps);
 int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev);
 void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev);
+int vfio_pci_zdev_feature_err(struct vfio_device *device, u32 flags,
+			      void __user *arg, size_t argsz);
 #else
 static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 					      struct vfio_info_cap *caps)
@@ -107,6 +109,13 @@ static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
 
 static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
 {}
+
+static inline int vfio_pci_zdev_feature_err(struct vfio_device *device,
+					    u32 flags, void __user *arg,
+					    size_t argsz)
+{
+	return -ENOTTY;
+}
 #endif
 
 static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index 78a28db00c6d..c0d6362cc4eb 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -141,6 +141,49 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 	return ret;
 }
 
+int vfio_pci_zdev_feature_err(struct vfio_device *device, u32 flags,
+			      void __user *arg, size_t argsz)
+{
+	struct vfio_device_feature_zpci_err err = {};
+	struct vfio_pci_core_device *vdev;
+	struct zpci_ccdf_err ccdf = {};
+	struct zpci_dev *zdev;
+	int ret;
+
+	vdev = container_of(device, struct vfio_pci_core_device, vdev);
+	zdev = to_zpci(vdev->pdev);
+	if (!zdev)
+		return -ENODEV;
+
+	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+				 sizeof(err));
+	if (ret != 1)
+		return ret;
+
+	ret = zpci_get_pending_error(zdev, &ccdf);
+	if (ret)
+		return ret;
+
+	err.fh = ccdf.fh;
+	err.fid = ccdf.fid;
+	err.ett = ccdf.ett;
+	err.mvn = ccdf.mvn;
+	err.dmaas = ccdf.dmaas;
+	err.q = ccdf.q;
+	err.rw = ccdf.rw;
+	err.faddr = ccdf.faddr;
+	err.pec = ccdf.pec;
+
+	if (copy_to_user(arg, &err, sizeof(err))) {
+		dev_warn_ratelimited(device->dev,
+				     "Failed to handle PCI error event for PCI function 0x%x",
+				     zdev->fid);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
 int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
 {
 	struct zpci_dev *zdev = to_zpci(vdev->pdev);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5de618a3a5ee..3eead86a00ab 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1534,6 +1534,36 @@ struct vfio_device_feature_dma_buf {
  */
 #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2  12
 
+/**
+ * VFIO_DEVICE_FEATURE_ZPCI_ERROR feature provides PCI error information to
+ * userspace for vfio-pci devices on s390. On s390, PCI error recovery
+ * involves platform firmware and notification to operating systems is done
+ * by architecture specific mechanism. Exposing this information to
+ * userspace allows it to take appropriate actions to handle an
+ * error on the device. The ioctl returns -ENOMSG if there are no pending
+ * PCI errors.
+ */
+
+struct vfio_device_feature_zpci_err {
+	__u32 feature_flags;		/* Indicate future features */
+	__u32 reserved1;
+	__u32 fh;			/* function handle */
+	__u32 fid;			/* function id */
+	__u32 ett		:  4;	/* expected table type */
+	__u32 mvn		: 12;	/* MSI vector number */
+	__u32 dmaas		:  8;	/* DMA address space */
+	__u32 reserved2		:  6;
+	__u32 q			:  1;	/* event qualifier */
+	__u32 rw		:  1;	/* read/write */
+	__u64 faddr;			/* failing address */
+	__u32 reserved3;
+	__u16 reserved4;
+	__u16 pec;			/* PCI event code */
+	__u8 reserved5[28];		/* Allow for future expansion */
+};
+
+#define VFIO_DEVICE_FEATURE_ZPCI_ERROR 13
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-03 18:24 [PATCH v18 0/4] [VFIO] Error recovery for vfio-pci devices on s390x Farhan Ali
  2026-06-03 18:24 ` [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices Farhan Ali
  2026-06-03 18:24 ` [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information Farhan Ali
@ 2026-06-03 18:24 ` Farhan Ali
  2026-06-03 19:04   ` sashiko-bot
                     ` (2 more replies)
  2026-06-03 18:24 ` [PATCH v18 4/4] vfio/pci: Remove the pcie check for VFIO_PCI_ERR_IRQ_INDEX Farhan Ali
  3 siblings, 3 replies; 23+ messages in thread
From: Farhan Ali @ 2026-06-03 18:24 UTC (permalink / raw)
  To: linux-s390, linux-kernel, linux-pci, alex
  Cc: helgaas, alifm, schnelle, mjrosato, Julian Ruess

On error recovery for a PCI device bound to vfio-pci driver, we want to
recover the state of the device to its last known saved state. The callback
restores the state of the device to its initial saved state.

Reviewed-by: Julian Ruess <julianr@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Acked-by: Alex Williamson <alex@shazbot.org>
Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 27642f10fe97..95af3e827ee0 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -2290,6 +2290,17 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
 
+static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+
+	if (!vdev->pci_saved_state)
+		return;
+
+	pci_load_saved_state(pdev, vdev->pci_saved_state);
+	pci_restore_state(pdev);
+}
+
 int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
 				  int nr_virtfn)
 {
@@ -2354,6 +2365,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
 
 const struct pci_error_handlers vfio_pci_core_err_handlers = {
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.reset_done = vfio_pci_core_aer_reset_done,
 };
 EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH v18 4/4] vfio/pci: Remove the pcie check for VFIO_PCI_ERR_IRQ_INDEX
  2026-06-03 18:24 [PATCH v18 0/4] [VFIO] Error recovery for vfio-pci devices on s390x Farhan Ali
                   ` (2 preceding siblings ...)
  2026-06-03 18:24 ` [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver Farhan Ali
@ 2026-06-03 18:24 ` Farhan Ali
  3 siblings, 0 replies; 23+ messages in thread
From: Farhan Ali @ 2026-06-03 18:24 UTC (permalink / raw)
  To: linux-s390, linux-kernel, linux-pci, alex
  Cc: helgaas, alifm, schnelle, mjrosato, Julian Ruess

The error signaling is configured for the vast majority of devices and it's
extremely rare that it fires anyway. Removing the pcie check will allow
userspace to be notified on errors for legacy PCI devices. The Internal
Shared Memory (ISM) device on s390 is one such device. For PCI devices on
IBM s390 error recovery involves platform firmware and notification to
operating system is done by architecture specific way. So the ISM device
can still be recovered when notified of an error.

Reviewed-by: Julian Ruess <julianr@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Reviewed-by: Alex Williamson <alex@shazbot.org>
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
 drivers/vfio/pci/vfio_pci_core.c  | 8 ++------
 drivers/vfio/pci/vfio_pci_intrs.c | 3 +--
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 95af3e827ee0..e41669fa045f 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -821,8 +821,7 @@ static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_typ
 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
 		}
 	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
-		if (pci_is_pcie(vdev->pdev))
-			return 1;
+		return 1;
 	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
 		return 1;
 	}
@@ -1198,11 +1197,8 @@ static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
 	switch (info.index) {
 	case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
 	case VFIO_PCI_REQ_IRQ_INDEX:
-		break;
 	case VFIO_PCI_ERR_IRQ_INDEX:
-		if (pci_is_pcie(vdev->pdev))
-			break;
-		fallthrough;
+		break;
 	default:
 		return -EINVAL;
 	}
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 33944d4d9dc4..64f80f64ff57 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -859,8 +859,7 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
 	case VFIO_PCI_ERR_IRQ_INDEX:
 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
 		case VFIO_IRQ_SET_ACTION_TRIGGER:
-			if (pci_is_pcie(vdev->pdev))
-				func = vfio_pci_set_err_trigger;
+			func = vfio_pci_set_err_trigger;
 			break;
 		}
 		break;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information
  2026-06-03 18:24 ` [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information Farhan Ali
@ 2026-06-03 18:49   ` sashiko-bot
  2026-06-03 22:37   ` Alex Williamson
  1 sibling, 0 replies; 23+ messages in thread
From: sashiko-bot @ 2026-06-03 18:49 UTC (permalink / raw)
  To: Farhan Ali; +Cc: linux-pci

Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [High] The UAPI structure `vfio_device_feature_zpci_err` uses C bitfields and contains implicit compiler padding holes, violating ABI stability guidelines.
- [Medium] User-triggerable kernel warning log due to failed `copy_to_user()`.
--

commit 91bf6efc558a8f6b752d8cbe1ecebd56aaccfea2
Author: Farhan Ali <alifm@linux.ibm.com>

vfio-pci/zdev: Add a device feature for error information

This commit adds a new device feature to expose zPCI platform specific
error information to userspace. This allows userspace to recover from
these errors.

> diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
> index 78a28db00c6d6..c0d6362cc4eb0 100644
> --- a/drivers/vfio/pci/vfio_pci_zdev.c
> +++ b/drivers/vfio/pci/vfio_pci_zdev.c
[ ... ]
> @@ -141,6 +141,49 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
[ ... ]
> +	if (copy_to_user(arg, &err, sizeof(err))) {
> +		dev_warn_ratelimited(device->dev,
> +				     "Failed to handle PCI error event for PCI function 0x%x",
> +				     zdev->fid);
> +		return -EFAULT;
> +	}

[Severity: Medium]
Is it intended to log a kernel warning when copy_to_user() fails?

Because copy_to_user() failures are caused directly by user space providing
an invalid, unmapped, or read-only memory address in the arg pointer,
could an unprivileged user intentionally pass invalid pointers to trigger
these warnings and spam the host kernel logs?

> +
> +	return 0;
> +}
[ ... ]
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5de618a3a5eee..3eead86a00ab2 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
[ ... ]
> @@ -1534,6 +1534,36 @@ struct vfio_device_feature_dma_buf {
[ ... ]
> +struct vfio_device_feature_zpci_err {
> +	__u32 feature_flags;		/* Indicate future features */
> +	__u32 reserved1;
> +	__u32 fh;			/* function handle */
> +	__u32 fid;			/* function id */
> +	__u32 ett		:  4;	/* expected table type */
> +	__u32 mvn		: 12;	/* MSI vector number */
> +	__u32 dmaas		:  8;	/* DMA address space */
> +	__u32 reserved2		:  6;
> +	__u32 q			:  1;	/* event qualifier */
> +	__u32 rw		:  1;	/* read/write */
> +	__u64 faddr;			/* failing address */
> +	__u32 reserved3;
> +	__u16 reserved4;
> +	__u16 pec;			/* PCI event code */
> +	__u8 reserved5[28];		/* Allow for future expansion */
> +};

[Severity: High]
Does this struct layout rely on C bitfields and implicit compiler padding in
a UAPI header?

Bitfields are highly compiler- and architecture-dependent, which can break
cross-architecture compatibility.

Additionally, is there a 4-byte implicit padding hole between the bitfields
(ending at offset 20) and faddr (which must start at offset 24 due to
8-byte alignment on 64-bit platforms)? Is there another 4-byte implicit
padding hole at the end of the struct to satisfy the overall 8-byte
alignment requirement?

Should explicit padding fields and bitmask macros be used instead of C
bitfields to ensure ABI stability?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260603182415.2324-1-alifm@linux.ibm.com?part=2

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-03 18:24 ` [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver Farhan Ali
@ 2026-06-03 19:04   ` sashiko-bot
  2026-06-03 22:46   ` Alex Williamson
  2026-06-04  8:28   ` Keith Busch
  2 siblings, 0 replies; 23+ messages in thread
From: sashiko-bot @ 2026-06-03 19:04 UTC (permalink / raw)
  To: Farhan Ali; +Cc: linux-pci

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
- [Critical] The `reset_done` callback breaks standard device resets by irreversibly overwriting the runtime PCI configuration space.
- [High] Concurrent execution of `vfio_pci_core_aer_reset_done` and `vfio_pci_core_disable` leads to a Use-After-Free on `vdev->pci_saved_state`.
- [High] The `reset_done` callback is never invoked during AER error recovery.
--

commit ea3a78b14c5b561020bd16d25cbb09f0536e6e7a
Author: Farhan Ali <alifm@linux.ibm.com>

vfio/pci: Add a reset_done callback for vfio-pci driver

On error recovery for a PCI device bound to vfio-pci driver, we want to
recover the state of the device to its last known saved state. The callback
restores the state of the device to its initial saved state.

> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 27642f10fe973..95af3e827ee05 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -2290,6 +2290,17 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
>  }
>  EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
>  
> +static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev)
> +{
> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
> +
> +	if (!vdev->pci_saved_state)
> +		return;

[Severity: High]
Can this race with vfio_pci_core_disable() and cause a use-after-free on
vdev->pci_saved_state?

When a user closes the device fd, vfio_pci_core_disable() is invoked:

vfio_pci_core_disable()
    pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state);

This frees the memory and sets the pointer to NULL without holding the PCI
device_lock.

Concurrently, if a system-initiated function reset occurs, the PCI core
holds device_lock and invokes this reset_done callback via
pci_dev_restore().

Because there is no common lock between these two paths, can the pointer be
freed by vfio_pci_core_disable() immediately after the NULL check here, but
before it is dereferenced in pci_load_saved_state()?

> +
> +	pci_load_saved_state(pdev, vdev->pci_saved_state);
> +	pci_restore_state(pdev);

[Severity: Critical]
Does this unintentionally overwrite the runtime PCI configuration space
during normal device resets?

The PCI core invokes reset_done at the end of standard PCI function resets
via pci_dev_restore(), which includes pci_try_reset_function() triggered by
the VFIO_DEVICE_RESET ioctl.

Normally, pci_try_reset_function() transparently saves the runtime state,
resets the device, and restores it.

However, restoring vdev->pci_saved_state here unconditionally applies the
state from when the device was initially opened. Could this revert all
runtime configuration changes made by a guest VM, such as BAR programming
and MSI-X enablement?

> +}
> +
>  int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
>  				  int nr_virtfn)

[ ... ]

> @@ -2354,6 +2365,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
>  
>  const struct pci_error_handlers vfio_pci_core_err_handlers = {
>  	.error_detected = vfio_pci_core_aer_err_detected,
> +	.reset_done = vfio_pci_core_aer_reset_done,

[Severity: High]
Will this callback actually be invoked during AER error recovery?

The commit message mentions restoring the device state during PCIe AER error
recovery. However, the AER core in pcie_do_recovery() coordinates recovery
by calling .error_detected, .mmio_enabled, .slot_reset, and .resume.

It uses low-level hardware resets like pci_bus_error_reset() or
pcie_reset_flr() which do not trigger the pci_dev_restore() wrapper.

Since reset_done is exclusively called from pci_dev_restore() during
standard function resets, does the state restoration logic belong in
.slot_reset instead to ensure it executes during an AER event?

>  };
>  EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260603182415.2324-1-alifm@linux.ibm.com?part=3

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices
  2026-06-03 18:24 ` [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices Farhan Ali
@ 2026-06-03 22:20   ` Alex Williamson
  2026-06-03 23:35     ` Farhan Ali
  0 siblings, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2026-06-03 22:20 UTC (permalink / raw)
  To: Farhan Ali
  Cc: linux-s390, linux-kernel, linux-pci, helgaas, schnelle, mjrosato,
	alex

On Wed,  3 Jun 2026 11:24:12 -0700
Farhan Ali <alifm@linux.ibm.com> wrote:
> @@ -81,6 +52,47 @@ static bool is_driver_supported(struct pci_driver *driver)
>  	return true;
>  }
>  
> +static int zpci_store_pci_error(struct pci_dev *pdev,
> +				 struct zpci_ccdf_err *ccdf)
> +{
> +	struct zpci_dev *zdev = to_zpci(pdev);
> +	int i;
> +
> +	guard(mutex)(&zdev->pending_errs_lock);
> +	if (!zdev->pending_errs.mediated_recovery)
> +		return -EINVAL;
> +
> +	if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
> +		pr_err("%s: Maximum number (%d) of pending error events queued\n",
> +		       pci_name(pdev), ZPCI_ERR_PENDING_MAX);

Is this really an err condition or just a warn?  Nothing is
fundamentally broken here, the queue is just full and we're losing
errors.  Maybe this should be a warn?

Can this create a DoS if a device continues to generate errors and
nobody is consuming them?  Userspace could ignore the error.  This
should probably be _ratelimited.

pr_err + pci_name suggests this should be a pci_ or dev_ call and since
the pci variant doesn't exist, use dev_warn_ratelimited().

> +		return -ENOMEM;
> +	}
> +
> +	i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
> +	memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
> +	zdev->pending_errs.tail++;
> +	zdev->pending_errs.count++;
> +	return 0;
> +}
> +
> +void zpci_start_mediated_recovery(struct zpci_dev *zdev)
> +{
> +	guard(mutex)(&zdev->pending_errs_lock);
> +	zdev->pending_errs.mediated_recovery = true;
> +}
> +EXPORT_SYMBOL_GPL(zpci_start_mediated_recovery);
> +
> +void zpci_stop_mediated_recovery(struct zpci_dev *zdev)
> +{
> +	guard(mutex)(&zdev->pending_errs_lock);
> +	zdev->pending_errs.mediated_recovery = false;
> +	if (zdev->pending_errs.count)
> +		pr_info("Unhandled PCI error events count=%d for PCI function 0x%x\n",
> +			zdev->pending_errs.count, zdev->fid);

It seems like there's always a race that an error could occur as the
user is closing the device.  Is this really worth logging at anything
more than a dbg level, pci_dbg() in this case?

> +	memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
> +}
> +EXPORT_SYMBOL_GPL(zpci_stop_mediated_recovery);
> +
>  static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
>  							 struct pci_driver *driver)
>  {
> @@ -175,12 +187,15 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
>   * and the platform determines which functions are affected for
>   * multi-function devices.
>   */
> -static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev,
> +							  struct zpci_ccdf_err *ccdf)
>  {
>  	pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
>  	struct zpci_dev *zdev = to_zpci(pdev);
> +	bool mediated_recovery = false;
>  	char *status_str = "success";
>  	struct pci_driver *driver;
> +	int rc;
>  
>  	/*
>  	 * Ensure that the PCI function is not removed concurrently, no driver
> @@ -194,13 +209,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
>  	}
>  	pdev->error_state = pci_channel_io_frozen;
>  
> -	if (is_passed_through(pdev)) {
> -		pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
> -			pci_name(pdev));
> -		status_str = "failed (pass-through)";
> -		goto out_unlock;
> -	}
> -
>  	driver = to_pci_driver(pdev->dev.driver);
>  	if (!is_driver_supported(driver)) {
>  		if (!driver) {
> @@ -216,12 +224,24 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
>  		goto out_unlock;
>  	}
>  
> +	rc = zpci_store_pci_error(pdev, ccdf);
> +	if (!rc || rc == -ENOMEM)
> +		mediated_recovery = true;

This is a convoluted way to get the state of
zdev->pending_errs.mediated_recovery, which becomes invalid out of
pending_errs_lock anyway.

> +
>  	ers_res = zpci_event_notify_error_detected(pdev, driver);
>  	if (ers_result_indicates_abort(ers_res)) {
>  		status_str = "failed (abort on detection)";
>  		goto out_unlock;
>  	}
>  
> +	if (mediated_recovery) {
> +		pr_info("%s: Leaving recovery of pass-through device to user-space\n",
> +			pci_name(pdev));
> +		ers_res = PCI_ERS_RESULT_RECOVERED;
> +		status_str = "in progress";
> +		goto out_unlock;
> +	}

Since zdev->pending_errs.mediated_recovery is only valid while holding
pending_errs_lock, this is really no better than the
is_passed_through() test.

> +
>  	if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
>  		ers_res = zpci_event_do_error_state_clear(pdev, driver);
>  		if (ers_result_indicates_abort(ers_res)) {
> @@ -266,25 +286,19 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
>   * @pdev: PCI function for which to report
>   * @es: PCI channel failure state to report
>   */
> -static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
> +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
> +				  struct zpci_ccdf_err *ccdf)
>  {
>  	struct pci_driver *driver;
>  
>  	pci_dev_lock(pdev);
>  	pdev->error_state = es;
> -	/**
> -	 * While vfio-pci's error_detected callback notifies user-space QEMU
> -	 * reacts to this by freezing the guest. In an s390 environment PCI
> -	 * errors are rarely fatal so this is overkill. Instead in the future
> -	 * we will inject the error event and let the guest recover the device
> -	 * itself.
> -	 */
> -	if (is_passed_through(pdev))
> -		goto out;
> +
> +	zpci_store_pci_error(pdev, ccdf);
>  	driver = to_pci_driver(pdev->dev.driver);
>  	if (driver && driver->err_handler && driver->err_handler->error_detected)
>  		driver->err_handler->error_detected(pdev, pdev->error_state);

How do you intend to stage this versus QEMU changes?  This seems like a
big regression if we're suddenly triggering the eventfd that causes
QEMU to halt.  Do you need userspace to opt-in to mediated recovery
rather than automatically enabling it on open?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information
  2026-06-03 18:24 ` [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information Farhan Ali
  2026-06-03 18:49   ` sashiko-bot
@ 2026-06-03 22:37   ` Alex Williamson
  2026-06-03 23:40     ` Farhan Ali
  1 sibling, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2026-06-03 22:37 UTC (permalink / raw)
  To: Farhan Ali
  Cc: linux-s390, linux-kernel, linux-pci, helgaas, schnelle, mjrosato,
	alex

On Wed,  3 Jun 2026 11:24:13 -0700
Farhan Ali <alifm@linux.ibm.com> wrote:
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5de618a3a5ee..3eead86a00ab 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1534,6 +1534,36 @@ struct vfio_device_feature_dma_buf {
>   */
>  #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2  12
>  
> +/**
> + * VFIO_DEVICE_FEATURE_ZPCI_ERROR feature provides PCI error information to
> + * userspace for vfio-pci devices on s390. On s390, PCI error recovery
> + * involves platform firmware and notification to operating systems is done
> + * by architecture specific mechanism. Exposing this information to
> + * userspace allows it to take appropriate actions to handle an
> + * error on the device. The ioctl returns -ENOMSG if there are no pending
> + * PCI errors.
> + */
> +
> +struct vfio_device_feature_zpci_err {
> +	__u32 feature_flags;		/* Indicate future features */
> +	__u32 reserved1;
> +	__u32 fh;			/* function handle */
> +	__u32 fid;			/* function id */
> +	__u32 ett		:  4;	/* expected table type */
> +	__u32 mvn		: 12;	/* MSI vector number */
> +	__u32 dmaas		:  8;	/* DMA address space */
> +	__u32 reserved2		:  6;
> +	__u32 q			:  1;	/* event qualifier */
> +	__u32 rw		:  1;	/* read/write */

There's a 4-byte hole here.

> +	__u64 faddr;			/* failing address */
> +	__u32 reserved3;
> +	__u16 reserved4;
> +	__u16 pec;			/* PCI event code */
> +	__u8 reserved5[28];		/* Allow for future expansion */
> +};
> +
> +#define VFIO_DEVICE_FEATURE_ZPCI_ERROR 13

Based on the discussion with Omar and FMB reporting, I don't suppose
this could also be copied to a user buffer as opaque data with size
reported elsewhere, could it?  Thanks,

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-03 18:24 ` [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver Farhan Ali
  2026-06-03 19:04   ` sashiko-bot
@ 2026-06-03 22:46   ` Alex Williamson
  2026-06-04  0:01     ` Farhan Ali
  2026-06-04  8:28   ` Keith Busch
  2 siblings, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2026-06-03 22:46 UTC (permalink / raw)
  To: Farhan Ali
  Cc: linux-s390, linux-kernel, linux-pci, helgaas, schnelle, mjrosato,
	Julian Ruess, alex

On Wed,  3 Jun 2026 11:24:14 -0700
Farhan Ali <alifm@linux.ibm.com> wrote:

> On error recovery for a PCI device bound to vfio-pci driver, we want to
> recover the state of the device to its last known saved state. The callback
> restores the state of the device to its initial saved state.
> 
> Reviewed-by: Julian Ruess <julianr@linux.ibm.com>
> Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
> Acked-by: Alex Williamson <alex@shazbot.org>
> Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 12 ++++++++++++
>  1 file changed, 12 insertions(+)

Sashiko has some interesting comments:

https://sashiko.dev/#/patchset/20260603182415.2324-4-alifm@linux.ibm.com

I think the critical is a calculated risk we're taking, restoring the
hand-off state after reset.  I don't think we'd considered the race,
and it seems the commit log could use some improvement, especially if
we do need to revisit this based on the calculated risk.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices
  2026-06-03 22:20   ` Alex Williamson
@ 2026-06-03 23:35     ` Farhan Ali
  2026-06-04 18:27       ` Alex Williamson
  0 siblings, 1 reply; 23+ messages in thread
From: Farhan Ali @ 2026-06-03 23:35 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-s390, linux-kernel, linux-pci, helgaas, schnelle, mjrosato


On 6/3/2026 3:20 PM, Alex Williamson wrote:
> On Wed,  3 Jun 2026 11:24:12 -0700
> Farhan Ali <alifm@linux.ibm.com> wrote:
>> @@ -81,6 +52,47 @@ static bool is_driver_supported(struct pci_driver *driver)
>>   	return true;
>>   }
>>   
>> +static int zpci_store_pci_error(struct pci_dev *pdev,
>> +				 struct zpci_ccdf_err *ccdf)
>> +{
>> +	struct zpci_dev *zdev = to_zpci(pdev);
>> +	int i;
>> +
>> +	guard(mutex)(&zdev->pending_errs_lock);
>> +	if (!zdev->pending_errs.mediated_recovery)
>> +		return -EINVAL;
>> +
>> +	if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
>> +		pr_err("%s: Maximum number (%d) of pending error events queued\n",
>> +		       pci_name(pdev), ZPCI_ERR_PENDING_MAX);
> Is this really an err condition or just a warn?  Nothing is
> fundamentally broken here, the queue is just full and we're losing
> errors.  Maybe this should be a warn?
>
> Can this create a DoS if a device continues to generate errors and
> nobody is consuming them?  Userspace could ignore the error.  This
> should probably be _ratelimited.

Architecturally once a device is an error state, then it doesn't 
generate any further error events. It normally will have only one event 
pending, we used a value of 4 for ZPCI_ERR_PENDING_MAX just to be 
cautious. Though it could be improved to be ratelimited.


>
> pr_err + pci_name suggests this should be a pci_ or dev_ call and since
> the pci variant doesn't exist, use dev_warn_ratelimited().
>
>> +		return -ENOMEM;
>> +	}
>> +
>> +	i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
>> +	memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
>> +	zdev->pending_errs.tail++;
>> +	zdev->pending_errs.count++;
>> +	return 0;
>> +}
>> +
>> +void zpci_start_mediated_recovery(struct zpci_dev *zdev)
>> +{
>> +	guard(mutex)(&zdev->pending_errs_lock);
>> +	zdev->pending_errs.mediated_recovery = true;
>> +}
>> +EXPORT_SYMBOL_GPL(zpci_start_mediated_recovery);
>> +
>> +void zpci_stop_mediated_recovery(struct zpci_dev *zdev)
>> +{
>> +	guard(mutex)(&zdev->pending_errs_lock);
>> +	zdev->pending_errs.mediated_recovery = false;
>> +	if (zdev->pending_errs.count)
>> +		pr_info("Unhandled PCI error events count=%d for PCI function 0x%x\n",
>> +			zdev->pending_errs.count, zdev->fid);
> It seems like there's always a race that an error could occur as the
> user is closing the device.  Is this really worth logging at anything
> more than a dbg level, pci_dbg() in this case?

I wanted to avoid pci_* log messages as it would require getting a 
handle of struct pci_dev and there were some concerns indicated by 
sashiko as mentioned in the thread

https://lore.kernel.org/all/c6363f08f898c7f4bb3e291ab2d3f7d8e3280459.camel@linux.ibm.com/


>
>> +	memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
>> +}
>> +EXPORT_SYMBOL_GPL(zpci_stop_mediated_recovery);
>> +
>>   static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
>>   							 struct pci_driver *driver)
>>   {
>> @@ -175,12 +187,15 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
>>    * and the platform determines which functions are affected for
>>    * multi-function devices.
>>    */
>> -static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
>> +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev,
>> +							  struct zpci_ccdf_err *ccdf)
>>   {
>>   	pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
>>   	struct zpci_dev *zdev = to_zpci(pdev);
>> +	bool mediated_recovery = false;
>>   	char *status_str = "success";
>>   	struct pci_driver *driver;
>> +	int rc;
>>   
>>   	/*
>>   	 * Ensure that the PCI function is not removed concurrently, no driver
>> @@ -194,13 +209,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
>>   	}
>>   	pdev->error_state = pci_channel_io_frozen;
>>   
>> -	if (is_passed_through(pdev)) {
>> -		pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
>> -			pci_name(pdev));
>> -		status_str = "failed (pass-through)";
>> -		goto out_unlock;
>> -	}
>> -
>>   	driver = to_pci_driver(pdev->dev.driver);
>>   	if (!is_driver_supported(driver)) {
>>   		if (!driver) {
>> @@ -216,12 +224,24 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
>>   		goto out_unlock;
>>   	}
>>   
>> +	rc = zpci_store_pci_error(pdev, ccdf);
>> +	if (!rc || rc == -ENOMEM)
>> +		mediated_recovery = true;
> This is a convoluted way to get the state of
> zdev->pending_errs.mediated_recovery, which becomes invalid out of
> pending_errs_lock anyway.

We had some discussion around this on a previous version, to simplify 
the error recovery to be driven by host or userspace recovery we decided 
with this approach 
https://lore.kernel.org/all/6dd6de60d72a70ff85ecab2a9d14f45a617f05e7.camel@linux.ibm.com/.


>
>> +
>>   	ers_res = zpci_event_notify_error_detected(pdev, driver);
>>   	if (ers_result_indicates_abort(ers_res)) {
>>   		status_str = "failed (abort on detection)";
>>   		goto out_unlock;
>>   	}
>>   
>> +	if (mediated_recovery) {
>> +		pr_info("%s: Leaving recovery of pass-through device to user-space\n",
>> +			pci_name(pdev));
>> +		ers_res = PCI_ERS_RESULT_RECOVERED;
>> +		status_str = "in progress";
>> +		goto out_unlock;
>> +	}
> Since zdev->pending_errs.mediated_recovery is only valid while holding
> pending_errs_lock, this is really no better than the
> is_passed_through() test.
>
>> +
>>   	if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
>>   		ers_res = zpci_event_do_error_state_clear(pdev, driver);
>>   		if (ers_result_indicates_abort(ers_res)) {
>> @@ -266,25 +286,19 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
>>    * @pdev: PCI function for which to report
>>    * @es: PCI channel failure state to report
>>    */
>> -static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
>> +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
>> +				  struct zpci_ccdf_err *ccdf)
>>   {
>>   	struct pci_driver *driver;
>>   
>>   	pci_dev_lock(pdev);
>>   	pdev->error_state = es;
>> -	/**
>> -	 * While vfio-pci's error_detected callback notifies user-space QEMU
>> -	 * reacts to this by freezing the guest. In an s390 environment PCI
>> -	 * errors are rarely fatal so this is overkill. Instead in the future
>> -	 * we will inject the error event and let the guest recover the device
>> -	 * itself.
>> -	 */
>> -	if (is_passed_through(pdev))
>> -		goto out;
>> +
>> +	zpci_store_pci_error(pdev, ccdf);
>>   	driver = to_pci_driver(pdev->dev.driver);
>>   	if (driver && driver->err_handler && driver->err_handler->error_detected)
>>   		driver->err_handler->error_detected(pdev, pdev->error_state);
> How do you intend to stage this versus QEMU changes?  This seems like a
> big regression if we're suddenly triggering the eventfd that causes
> QEMU to halt.  Do you need userspace to opt-in to mediated recovery
> rather than automatically enabling it on open?  Thanks,
>
> Alex

AFAIU userspace registering an eventfd to receive notification for error 
events is an opt-in? And yes for QEMU the current behavior halts the 
guest, but even today on an error device becomes unusable and requires 
manual intervention. I am not sure if we need to add another opt-in 
mechanism for QEMU.

Thanks

Farhan


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information
  2026-06-03 22:37   ` Alex Williamson
@ 2026-06-03 23:40     ` Farhan Ali
  0 siblings, 0 replies; 23+ messages in thread
From: Farhan Ali @ 2026-06-03 23:40 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-s390, linux-kernel, linux-pci, helgaas, schnelle, mjrosato


On 6/3/2026 3:37 PM, Alex Williamson wrote:
> On Wed,  3 Jun 2026 11:24:13 -0700
> Farhan Ali <alifm@linux.ibm.com> wrote:
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 5de618a3a5ee..3eead86a00ab 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -1534,6 +1534,36 @@ struct vfio_device_feature_dma_buf {
>>    */
>>   #define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2  12
>>   
>> +/**
>> + * VFIO_DEVICE_FEATURE_ZPCI_ERROR feature provides PCI error information to
>> + * userspace for vfio-pci devices on s390. On s390, PCI error recovery
>> + * involves platform firmware and notification to operating systems is done
>> + * by architecture specific mechanism. Exposing this information to
>> + * userspace allows it to take appropriate actions to handle an
>> + * error on the device. The ioctl returns -ENOMSG if there are no pending
>> + * PCI errors.
>> + */
>> +
>> +struct vfio_device_feature_zpci_err {
>> +	__u32 feature_flags;		/* Indicate future features */
>> +	__u32 reserved1;
>> +	__u32 fh;			/* function handle */
>> +	__u32 fid;			/* function id */
>> +	__u32 ett		:  4;	/* expected table type */
>> +	__u32 mvn		: 12;	/* MSI vector number */
>> +	__u32 dmaas		:  8;	/* DMA address space */
>> +	__u32 reserved2		:  6;
>> +	__u32 q			:  1;	/* event qualifier */
>> +	__u32 rw		:  1;	/* read/write */
> There's a 4-byte hole here.
>
>> +	__u64 faddr;			/* failing address */
>> +	__u32 reserved3;
>> +	__u16 reserved4;
>> +	__u16 pec;			/* PCI event code */
>> +	__u8 reserved5[28];		/* Allow for future expansion */
>> +};
>> +
>> +#define VFIO_DEVICE_FEATURE_ZPCI_ERROR 13
> Based on the discussion with Omar and FMB reporting, I don't suppose
> this could also be copied to a user buffer as opaque data with size
> reported elsewhere, could it?  Thanks,
>
> Alex

We currently don't expose any PCI error data to userspace and so 
userspace has no way of knowing the size of the buffer. Unless we 
provide the size, i don't think we can copy the user buffer as opaque data.

Thanks

Farhan



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-03 22:46   ` Alex Williamson
@ 2026-06-04  0:01     ` Farhan Ali
  0 siblings, 0 replies; 23+ messages in thread
From: Farhan Ali @ 2026-06-04  0:01 UTC (permalink / raw)
  To: Alex Williamson
  Cc: linux-s390, linux-kernel, linux-pci, helgaas, schnelle, mjrosato,
	Julian Ruess


On 6/3/2026 3:46 PM, Alex Williamson wrote:
> On Wed,  3 Jun 2026 11:24:14 -0700
> Farhan Ali <alifm@linux.ibm.com> wrote:
>
>> On error recovery for a PCI device bound to vfio-pci driver, we want to
>> recover the state of the device to its last known saved state. The callback
>> restores the state of the device to its initial saved state.
>>
>> Reviewed-by: Julian Ruess <julianr@linux.ibm.com>
>> Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
>> Acked-by: Alex Williamson <alex@shazbot.org>
>> Reviewed-by: Matthew Rosato <mjrosato@linux.ibm.com>
>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>> ---
>>   drivers/vfio/pci/vfio_pci_core.c | 12 ++++++++++++
>>   1 file changed, 12 insertions(+)
> Sashiko has some interesting comments:
>
> https://sashiko.dev/#/patchset/20260603182415.2324-4-alifm@linux.ibm.com
>
> I think the critical is a calculated risk we're taking, restoring the
> hand-off state after reset.  I don't think we'd considered the race,
> and it seems the commit log could use some improvement, especially if
> we do need to revisit this based on the calculated risk.  Thanks,
>
> Alex

Regarding the race mentioned by sashiko, pci_load_saved_state() also has 
a check to make sure the state passed is not NULL. However the race 
mentioned by sashiko seems plausible. The reset_done() callback is 
called with pci device lock held, but it might not be enough to protect 
against the race and so we need to grab the dev_set->lock here? For the 
commit message how about adding something like this? "This will 
intentionally reset all the runtime PCI configuration changes such as 
BAR programming and MSI-X enablement"

Thanks

Farhan


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-03 18:24 ` [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver Farhan Ali
  2026-06-03 19:04   ` sashiko-bot
  2026-06-03 22:46   ` Alex Williamson
@ 2026-06-04  8:28   ` Keith Busch
  2026-06-04 17:17     ` Farhan Ali
  2 siblings, 1 reply; 23+ messages in thread
From: Keith Busch @ 2026-06-04  8:28 UTC (permalink / raw)
  To: Farhan Ali
  Cc: linux-s390, linux-kernel, linux-pci, alex, helgaas, schnelle,
	mjrosato, Julian Ruess

On Wed, Jun 03, 2026 at 11:24:14AM -0700, Farhan Ali wrote:
> +static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev)
> +{
> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
> +
> +	if (!vdev->pci_saved_state)
> +		return;
> +
> +	pci_load_saved_state(pdev, vdev->pci_saved_state);
> +	pci_restore_state(pdev);
> +}

Shouldn't there be a cooresponding user space notification that the
device has been restored? There's an eventfd on the error detected side
so user space can know the device needs recovery, but how does it come
to know that the reset is completed?

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-04  8:28   ` Keith Busch
@ 2026-06-04 17:17     ` Farhan Ali
  2026-06-04 19:57       ` Alex Williamson
  2026-06-04 20:42       ` Keith Busch
  0 siblings, 2 replies; 23+ messages in thread
From: Farhan Ali @ 2026-06-04 17:17 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-s390, linux-kernel, linux-pci, alex, helgaas, schnelle,
	mjrosato, Julian Ruess


On 6/4/2026 1:28 AM, Keith Busch wrote:
> On Wed, Jun 03, 2026 at 11:24:14AM -0700, Farhan Ali wrote:
>> +static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev)
>> +{
>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
>> +
>> +	if (!vdev->pci_saved_state)
>> +		return;
>> +
>> +	pci_load_saved_state(pdev, vdev->pci_saved_state);
>> +	pci_restore_state(pdev);
>> +}
> Shouldn't there be a cooresponding user space notification that the
> device has been restored? There's an eventfd on the error detected side
> so user space can know the device needs recovery, but how does it come
> to know that the reset is completed?

I think if the VFIO_DEVICE_RESET ioctl completes successfully it should 
be an indication that the reset has completed? AFAIU the ioctl will 
drive a reset via pci_try_reset_function(). If reset completes completes 
successfully the reset_done() callback is called via pci_dev_restore(). 
So I don't think we need an eventfd to notify on reset completion. 
Otherwise we would have the same problem today, where userspace is 
unaware that VFIO_DEVICE_RESET did indeed successfully reset the device, 
no? Or am I missing something?

Thanks

Farhan



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices
  2026-06-03 23:35     ` Farhan Ali
@ 2026-06-04 18:27       ` Alex Williamson
  0 siblings, 0 replies; 23+ messages in thread
From: Alex Williamson @ 2026-06-04 18:27 UTC (permalink / raw)
  To: Farhan Ali
  Cc: linux-s390, linux-kernel, linux-pci, helgaas, schnelle, mjrosato,
	alex

On Wed, 3 Jun 2026 16:35:11 -0700
Farhan Ali <alifm@linux.ibm.com> wrote:
> On 6/3/2026 3:20 PM, Alex Williamson wrote:
> > On Wed,  3 Jun 2026 11:24:12 -0700
> > Farhan Ali <alifm@linux.ibm.com> wrote:  
> >> @@ -266,25 +286,19 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> >>    * @pdev: PCI function for which to report
> >>    * @es: PCI channel failure state to report
> >>    */
> >> -static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
> >> +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
> >> +				  struct zpci_ccdf_err *ccdf)
> >>   {
> >>   	struct pci_driver *driver;
> >>   
> >>   	pci_dev_lock(pdev);
> >>   	pdev->error_state = es;
> >> -	/**
> >> -	 * While vfio-pci's error_detected callback notifies user-space QEMU
> >> -	 * reacts to this by freezing the guest. In an s390 environment PCI
> >> -	 * errors are rarely fatal so this is overkill. Instead in the future
> >> -	 * we will inject the error event and let the guest recover the device
> >> -	 * itself.
> >> -	 */
> >> -	if (is_passed_through(pdev))
> >> -		goto out;
> >> +
> >> +	zpci_store_pci_error(pdev, ccdf);
> >>   	driver = to_pci_driver(pdev->dev.driver);
> >>   	if (driver && driver->err_handler && driver->err_handler->error_detected)
> >>   		driver->err_handler->error_detected(pdev, pdev->error_state);  
> > How do you intend to stage this versus QEMU changes?  This seems like a
> > big regression if we're suddenly triggering the eventfd that causes
> > QEMU to halt.  Do you need userspace to opt-in to mediated recovery
> > rather than automatically enabling it on open?  Thanks,
> >
> > Alex  
> 
> AFAIU userspace registering an eventfd to receive notification for error 
> events is an opt-in? And yes for QEMU the current behavior halts the 
> guest, but even today on an error device becomes unusable and requires 
> manual intervention. I am not sure if we need to add another opt-in 
> mechanism for QEMU.

Yes, QEMU is performing an opt-in, but we're also now calling through
to that opt-in in more cases.  Arguably this is coming more in line
with AER handling where I believe only uncorrected errors trigger this
path and we signal through the error eventfd for all uncorrected AER
errors.  So long as you've considered the implications for existing
userspace, I won't object. Thanks,

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-04 17:17     ` Farhan Ali
@ 2026-06-04 19:57       ` Alex Williamson
  2026-06-08 19:26         ` Farhan Ali
  2026-06-04 20:42       ` Keith Busch
  1 sibling, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2026-06-04 19:57 UTC (permalink / raw)
  To: Farhan Ali
  Cc: Keith Busch, linux-s390, linux-kernel, linux-pci, helgaas,
	schnelle, mjrosato, Julian Ruess, alex, Chengwen Feng

On Thu, 4 Jun 2026 10:17:04 -0700
Farhan Ali <alifm@linux.ibm.com> wrote:

> On 6/4/2026 1:28 AM, Keith Busch wrote:
> > On Wed, Jun 03, 2026 at 11:24:14AM -0700, Farhan Ali wrote:  
> >> +static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev)
> >> +{
> >> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
> >> +
> >> +	if (!vdev->pci_saved_state)
> >> +		return;
> >> +
> >> +	pci_load_saved_state(pdev, vdev->pci_saved_state);
> >> +	pci_restore_state(pdev);
> >> +}  
> > Shouldn't there be a cooresponding user space notification that the
> > device has been restored? There's an eventfd on the error detected side
> > so user space can know the device needs recovery, but how does it come
> > to know that the reset is completed?  
> 
> I think if the VFIO_DEVICE_RESET ioctl completes successfully it should 
> be an indication that the reset has completed? AFAIU the ioctl will 
> drive a reset via pci_try_reset_function(). If reset completes completes 
> successfully the reset_done() callback is called via pci_dev_restore(). 
> So I don't think we need an eventfd to notify on reset completion. 
> Otherwise we would have the same problem today, where userspace is 
> unaware that VFIO_DEVICE_RESET did indeed successfully reset the device, 
> no? Or am I missing something?

I'm starting to feel a little sketchy about this.  I asked claude to
enumerate the state restores and the source of that restored state.
Hopefully this ascii table survives:

  ┌──────────────────────────┬────────────────────────┬─────────────────────┐
  │           Step           │         Source         │ Snapshot-dependent? │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │                          │ EXP cap save buffer    │                     │
  │ pci_restore_pcie_state   │ (pci_find_saved_cap,   │ YES                 │
  │                          │ cap.data)              │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │                          │ live                   │                     │
  │ pci_restore_pasid_state  │ pdev->pasid_enabled +  │ no                  │
  │                          │ pasid_features         │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_pri_state    │ live pdev->pri_enabled │ no                  │
  │                          │  + pri_reqs_alloc      │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_ats_state    │ live dev->ats_enabled  │ no                  │
  │                          │ + ats_stu              │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_vc_state     │ VC ext-cap save buffer │ YES                 │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │                          │ live resource_size()   │                     │
  │ pci_restore_rebar_state  │ (re-derived, written   │ no                  │
  │                          │ to hw)                 │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_dpc_state    │ DPC ext-cap save       │ YES                 │
  │                          │ buffer                 │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_ptm_state    │ PTM ext-cap save       │ YES                 │
  │                          │ buffer                 │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │                          │ TPH ext-cap save       │                     │
  │ pci_restore_tph_state    │ buffer, gated on live  │ YES (gated)         │
  │                          │ tph_enabled            │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_aer_clear_status     │ clears hw status (not  │ n/a                 │
  │                          │ a restore)             │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_aer_state    │ ERR ext-cap save       │ YES                 │
  │                          │ buffer                 │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │                          │ saved_config_space[16] │                     │
  │ pci_restore_config_space │  — type-0 header       │ YES                 │
  │                          │ (COMMAND, BARs,        │                     │
  │                          │ cacheline…)            │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_pcix_state   │ PCI-X cap save buffer  │ YES                 │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_msi_state    │ live msi_desc list +   │ no                  │
  │                          │ msi(x)_enabled         │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_enable_acs           │ re-derived from ACS    │ no                  │
  │                          │ policy                 │                     │
  ├──────────────────────────┼────────────────────────┼─────────────────────┤
  │ pci_restore_iov_state    │ live dev->sriov        │ no                  │
  │                          │ (num_VFs, ctrl)        │                     │
  └──────────────────────────┴────────────────────────┴─────────────────────┘

For things like MSI/X, SR-IOV, RE-BAR, etc. we're actually restoring
from the kernel internal state rather than the save buffer state, so
this is a no-op.  However, one thing in that list stands out, TPH.

We don't yet support enabling TPH, but there are series on the list
that propose to add this.  The TPH buffer space in the saved state is
allocated just by the capability being present.  On open TPH is
disabled and the saved state is untouched, zeros.  If TPH is then
enabled and the device reset, the pre-reset save state populates the
TPH save buffer and we restore that state post-reset.  With the change
here, reset_done would then push the open saved state.  The live TPH
state is enabled, therefore the restore pushes the original open state,
zeros.

This would result in a visible user change and maybe more importantly
shows that we're relying on ad-hoc behavior, without really any specific
policy to have this work reliably.  It actually seems like only in the
close function, where we've disabled anything the user might have
enabled, is it really valid to restore the original state.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-04 17:17     ` Farhan Ali
  2026-06-04 19:57       ` Alex Williamson
@ 2026-06-04 20:42       ` Keith Busch
  2026-06-05 18:41         ` Farhan Ali
  1 sibling, 1 reply; 23+ messages in thread
From: Keith Busch @ 2026-06-04 20:42 UTC (permalink / raw)
  To: Farhan Ali
  Cc: linux-s390, linux-kernel, linux-pci, alex, helgaas, schnelle,
	mjrosato, Julian Ruess

On Thu, Jun 04, 2026 at 10:17:04AM -0700, Farhan Ali wrote:
> 
> On 6/4/2026 1:28 AM, Keith Busch wrote:
> > On Wed, Jun 03, 2026 at 11:24:14AM -0700, Farhan Ali wrote:
> > > +static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev)
> > > +{
> > > +	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
> > > +
> > > +	if (!vdev->pci_saved_state)
> > > +		return;
> > > +
> > > +	pci_load_saved_state(pdev, vdev->pci_saved_state);
> > > +	pci_restore_state(pdev);
> > > +}
> > Shouldn't there be a cooresponding user space notification that the
> > device has been restored? There's an eventfd on the error detected side
> > so user space can know the device needs recovery, but how does it come
> > to know that the reset is completed?
> 
> I think if the VFIO_DEVICE_RESET ioctl completes successfully it should be
> an indication that the reset has completed? 

But isn't this reset initiated by the kernel via the kernel's AER
handler? The user space driven ioctl has nothing to do with it, unless
I'm missing something. I'm just mentioning it as I was recently asked to
look into DPC and AER handling for vfio, and I think there needs to be
coordination with userspace here for a more reliable recovery.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-04 20:42       ` Keith Busch
@ 2026-06-05 18:41         ` Farhan Ali
  2026-06-09 21:38           ` Keith Busch
  0 siblings, 1 reply; 23+ messages in thread
From: Farhan Ali @ 2026-06-05 18:41 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-s390, linux-kernel, linux-pci, alex, helgaas, schnelle,
	mjrosato, Julian Ruess


On 6/4/2026 1:42 PM, Keith Busch wrote:
> On Thu, Jun 04, 2026 at 10:17:04AM -0700, Farhan Ali wrote:
>> On 6/4/2026 1:28 AM, Keith Busch wrote:
>>> On Wed, Jun 03, 2026 at 11:24:14AM -0700, Farhan Ali wrote:
>>>> +static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev)
>>>> +{
>>>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
>>>> +
>>>> +	if (!vdev->pci_saved_state)
>>>> +		return;
>>>> +
>>>> +	pci_load_saved_state(pdev, vdev->pci_saved_state);
>>>> +	pci_restore_state(pdev);
>>>> +}
>>> Shouldn't there be a cooresponding user space notification that the
>>> device has been restored? There's an eventfd on the error detected side
>>> so user space can know the device needs recovery, but how does it come
>>> to know that the reset is completed?
>> I think if the VFIO_DEVICE_RESET ioctl completes successfully it should be
>> an indication that the reset has completed?
> But isn't this reset initiated by the kernel via the kernel's AER
> handler? The user space driven ioctl has nothing to do with it, unless
> I'm missing something. I'm just mentioning it as I was recently asked to
> look into DPC and AER handling for vfio, and I think there needs to be
> coordination with userspace here for a more reliable recovery.

The approach I have taken for s390x, is on an error for PCI devices 
bound to vfio, we bypass host recovery completely so the kernel doesn't 
drive the reset (see patch 1 of this series). The recovery will then 
have to be driven by userspace. The error_detected() callback and 
eventfd notifies userspace on an error, and then userspace can drive the 
recovery via VFIO_DEVICE_RESET. For our primary use case of QEMU, once 
notified it then injects this error into the VM so device drivers in the 
VM can take recovery actions. For example for a passthrough NVMe device, 
the VM's OS NVMe driver will access the device. At this point the VM's 
NVMe driver's error_detected() will drive the recovery by returning 
PCI_ERS_RESULT_NEED_RESET, and the s390x error recovery in the VM's OS 
will try to do a reset. Resets are privileged operations and so the VM 
will need intervention from QEMU to perform the reset. QEMU will invoke 
the VFIO_DEVICE_RESET ioctl to now notify the host that the VM is 
requesting a reset of the device. The vfio-pci driver on the host will 
then perform the reset on the device to recover it. This also aligns 
architecturally for us as on s390 as PCI devices are exposed as 
functions to the OS, so an OS can issue resets per function (with 
platform firmware doing the heavy lifting). But I am curious to learn 
about your thoughts for DPC/AER with vfio for other platforms 
(x86/ARM?). Thanks

Farhan



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-04 19:57       ` Alex Williamson
@ 2026-06-08 19:26         ` Farhan Ali
  2026-06-09 19:16           ` Alex Williamson
  0 siblings, 1 reply; 23+ messages in thread
From: Farhan Ali @ 2026-06-08 19:26 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Keith Busch, linux-s390, linux-kernel, linux-pci, helgaas,
	schnelle, mjrosato, Julian Ruess, Chengwen Feng


On 6/4/2026 12:57 PM, Alex Williamson wrote:
> On Thu, 4 Jun 2026 10:17:04 -0700
> Farhan Ali <alifm@linux.ibm.com> wrote:
>
>> On 6/4/2026 1:28 AM, Keith Busch wrote:
>>> On Wed, Jun 03, 2026 at 11:24:14AM -0700, Farhan Ali wrote:
>>>> +static void vfio_pci_core_aer_reset_done(struct pci_dev *pdev)
>>>> +{
>>>> +	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
>>>> +
>>>> +	if (!vdev->pci_saved_state)
>>>> +		return;
>>>> +
>>>> +	pci_load_saved_state(pdev, vdev->pci_saved_state);
>>>> +	pci_restore_state(pdev);
>>>> +}
>>> Shouldn't there be a cooresponding user space notification that the
>>> device has been restored? There's an eventfd on the error detected side
>>> so user space can know the device needs recovery, but how does it come
>>> to know that the reset is completed?
>> I think if the VFIO_DEVICE_RESET ioctl completes successfully it should
>> be an indication that the reset has completed? AFAIU the ioctl will
>> drive a reset via pci_try_reset_function(). If reset completes completes
>> successfully the reset_done() callback is called via pci_dev_restore().
>> So I don't think we need an eventfd to notify on reset completion.
>> Otherwise we would have the same problem today, where userspace is
>> unaware that VFIO_DEVICE_RESET did indeed successfully reset the device,
>> no? Or am I missing something?
> I'm starting to feel a little sketchy about this.  I asked claude to
> enumerate the state restores and the source of that restored state.
> Hopefully this ascii table survives:
>
>    ┌──────────────────────────┬────────────────────────┬─────────────────────┐
>    │           Step           │         Source         │ Snapshot-dependent? │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │                          │ EXP cap save buffer    │                     │
>    │ pci_restore_pcie_state   │ (pci_find_saved_cap,   │ YES                 │
>    │                          │ cap.data)              │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │                          │ live                   │                     │
>    │ pci_restore_pasid_state  │ pdev->pasid_enabled +  │ no                  │
>    │                          │ pasid_features         │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_pri_state    │ live pdev->pri_enabled │ no                  │
>    │                          │  + pri_reqs_alloc      │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_ats_state    │ live dev->ats_enabled  │ no                  │
>    │                          │ + ats_stu              │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_vc_state     │ VC ext-cap save buffer │ YES                 │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │                          │ live resource_size()   │                     │
>    │ pci_restore_rebar_state  │ (re-derived, written   │ no                  │
>    │                          │ to hw)                 │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_dpc_state    │ DPC ext-cap save       │ YES                 │
>    │                          │ buffer                 │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_ptm_state    │ PTM ext-cap save       │ YES                 │
>    │                          │ buffer                 │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │                          │ TPH ext-cap save       │                     │
>    │ pci_restore_tph_state    │ buffer, gated on live  │ YES (gated)         │
>    │                          │ tph_enabled            │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_aer_clear_status     │ clears hw status (not  │ n/a                 │
>    │                          │ a restore)             │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_aer_state    │ ERR ext-cap save       │ YES                 │
>    │                          │ buffer                 │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │                          │ saved_config_space[16] │                     │
>    │ pci_restore_config_space │  — type-0 header       │ YES                 │
>    │                          │ (COMMAND, BARs,        │                     │
>    │                          │ cacheline…)            │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_pcix_state   │ PCI-X cap save buffer  │ YES                 │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_msi_state    │ live msi_desc list +   │ no                  │
>    │                          │ msi(x)_enabled         │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_enable_acs           │ re-derived from ACS    │ no                  │
>    │                          │ policy                 │                     │
>    ├──────────────────────────┼────────────────────────┼─────────────────────┤
>    │ pci_restore_iov_state    │ live dev->sriov        │ no                  │
>    │                          │ (num_VFs, ctrl)        │                     │
>    └──────────────────────────┴────────────────────────┴─────────────────────┘
>
> For things like MSI/X, SR-IOV, RE-BAR, etc. we're actually restoring
> from the kernel internal state rather than the save buffer state, so
> this is a no-op.  However, one thing in that list stands out, TPH.
>
> We don't yet support enabling TPH, but there are series on the list
> that propose to add this.  The TPH buffer space in the saved state is
> allocated just by the capability being present.  On open TPH is
> disabled and the saved state is untouched, zeros.  If TPH is then
> enabled and the device reset, the pre-reset save state populates the
> TPH save buffer and we restore that state post-reset.  With the change
> here, reset_done would then push the open saved state.  The live TPH
> state is enabled, therefore the restore pushes the original open state,
> zeros.
>
> This would result in a visible user change and maybe more importantly
> shows that we're relying on ad-hoc behavior, without really any specific
> policy to have this work reliably.  It actually seems like only in the
> close function, where we've disabled anything the user might have
> enabled, is it really valid to restore the original state.  Thanks,
>
> Alex

I was trying to see if we can remove the callback and still restore the 
device. The original reason why we wanted the callback, was to restore 
the device state into something sane[1]. Since commit a2f1e22390ac 
("PCI/ERR: Ensure error recoverability at all times"), which removed the 
saved_state check from pci_restore_state(), we will always restore from 
the last saved state. However, the last saved state in vfio can have the 
Command register Memory bit disabled (for example could be done through 
vfio_pci_pre_reset() in QEMU). This becomes problematic when we try to 
restore MSI-X from in kernel data and when MSI-X is enabled. AFAICT the 
msix restore path doesn't check if the Memory bit is enabled before 
writing the MSI-X message, which could cause an Unsupported Request 
error from the device. From my experiments, enabling Memory bit before 
restoring MSI-X state was sufficient to get the device in a sane state 
without this patch.

So we could remove this patch. But maybe there is a gap in MSI-X 
restoration path?

[1] 
https://lore.kernel.org/all/20250814145743.204ca19a.alex.williamson@redhat.com/

Thanks

Farhan



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-08 19:26         ` Farhan Ali
@ 2026-06-09 19:16           ` Alex Williamson
  2026-06-09 20:13             ` Farhan Ali
  0 siblings, 1 reply; 23+ messages in thread
From: Alex Williamson @ 2026-06-09 19:16 UTC (permalink / raw)
  To: Farhan Ali
  Cc: Keith Busch, linux-s390, linux-kernel, linux-pci, helgaas,
	schnelle, mjrosato, Julian Ruess, Chengwen Feng, alex

On Mon, 8 Jun 2026 12:26:36 -0700
Farhan Ali <alifm@linux.ibm.com> wrote:
> On 6/4/2026 12:57 PM, Alex Williamson wrote:
> > For things like MSI/X, SR-IOV, RE-BAR, etc. we're actually restoring
> > from the kernel internal state rather than the save buffer state, so
> > this is a no-op.  However, one thing in that list stands out, TPH.
> >
> > We don't yet support enabling TPH, but there are series on the list
> > that propose to add this.  The TPH buffer space in the saved state is
> > allocated just by the capability being present.  On open TPH is
> > disabled and the saved state is untouched, zeros.  If TPH is then
> > enabled and the device reset, the pre-reset save state populates the
> > TPH save buffer and we restore that state post-reset.  With the change
> > here, reset_done would then push the open saved state.  The live TPH
> > state is enabled, therefore the restore pushes the original open state,
> > zeros.
> >
> > This would result in a visible user change and maybe more importantly
> > shows that we're relying on ad-hoc behavior, without really any specific
> > policy to have this work reliably.  It actually seems like only in the
> > close function, where we've disabled anything the user might have
> > enabled, is it really valid to restore the original state.  Thanks,
> >
> > Alex  
> 
> I was trying to see if we can remove the callback and still restore the 
> device. The original reason why we wanted the callback, was to restore 
> the device state into something sane[1]. Since commit a2f1e22390ac 
> ("PCI/ERR: Ensure error recoverability at all times"), which removed the 
> saved_state check from pci_restore_state(), we will always restore from 
> the last saved state. However, the last saved state in vfio can have the 
> Command register Memory bit disabled (for example could be done through 
> vfio_pci_pre_reset() in QEMU). This becomes problematic when we try to 
> restore MSI-X from in kernel data and when MSI-X is enabled. AFAICT the 
> msix restore path doesn't check if the Memory bit is enabled before 
> writing the MSI-X message, which could cause an Unsupported Request 
> error from the device. From my experiments, enabling Memory bit before 
> restoring MSI-X state was sufficient to get the device in a sane state 
> without this patch.
> 
> So we could remove this patch. But maybe there is a gap in MSI-X 
> restoration path?

I'd say yes, that's a gap in __pci_restore_msix_state() that it's
writing to device MMIO space under the assumption that the device
entered save/reset/restore with memory enabled.  It should really force
memory enable to be set around the access and restored after.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-09 19:16           ` Alex Williamson
@ 2026-06-09 20:13             ` Farhan Ali
  0 siblings, 0 replies; 23+ messages in thread
From: Farhan Ali @ 2026-06-09 20:13 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Keith Busch, linux-s390, linux-kernel, linux-pci, helgaas,
	schnelle, mjrosato, Julian Ruess, Chengwen Feng


On 6/9/2026 12:16 PM, Alex Williamson wrote:
> On Mon, 8 Jun 2026 12:26:36 -0700
> Farhan Ali <alifm@linux.ibm.com> wrote:
>> On 6/4/2026 12:57 PM, Alex Williamson wrote:
>>> For things like MSI/X, SR-IOV, RE-BAR, etc. we're actually restoring
>>> from the kernel internal state rather than the save buffer state, so
>>> this is a no-op.  However, one thing in that list stands out, TPH.
>>>
>>> We don't yet support enabling TPH, but there are series on the list
>>> that propose to add this.  The TPH buffer space in the saved state is
>>> allocated just by the capability being present.  On open TPH is
>>> disabled and the saved state is untouched, zeros.  If TPH is then
>>> enabled and the device reset, the pre-reset save state populates the
>>> TPH save buffer and we restore that state post-reset.  With the change
>>> here, reset_done would then push the open saved state.  The live TPH
>>> state is enabled, therefore the restore pushes the original open state,
>>> zeros.
>>>
>>> This would result in a visible user change and maybe more importantly
>>> shows that we're relying on ad-hoc behavior, without really any specific
>>> policy to have this work reliably.  It actually seems like only in the
>>> close function, where we've disabled anything the user might have
>>> enabled, is it really valid to restore the original state.  Thanks,
>>>
>>> Alex
>> I was trying to see if we can remove the callback and still restore the
>> device. The original reason why we wanted the callback, was to restore
>> the device state into something sane[1]. Since commit a2f1e22390ac
>> ("PCI/ERR: Ensure error recoverability at all times"), which removed the
>> saved_state check from pci_restore_state(), we will always restore from
>> the last saved state. However, the last saved state in vfio can have the
>> Command register Memory bit disabled (for example could be done through
>> vfio_pci_pre_reset() in QEMU). This becomes problematic when we try to
>> restore MSI-X from in kernel data and when MSI-X is enabled. AFAICT the
>> msix restore path doesn't check if the Memory bit is enabled before
>> writing the MSI-X message, which could cause an Unsupported Request
>> error from the device. From my experiments, enabling Memory bit before
>> restoring MSI-X state was sufficient to get the device in a sane state
>> without this patch.
>>
>> So we could remove this patch. But maybe there is a gap in MSI-X
>> restoration path?
> I'd say yes, that's a gap in __pci_restore_msix_state() that it's
> writing to device MMIO space under the assumption that the device
> entered save/reset/restore with memory enabled.  It should really force
> memory enable to be set around the access and restored after.  Thanks,
>
> Alex

Yeah, that makes sense. I can add a fix for it (in the PCI patch 
series), and will remove this patch.

Thanks

Farhan



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver
  2026-06-05 18:41         ` Farhan Ali
@ 2026-06-09 21:38           ` Keith Busch
  0 siblings, 0 replies; 23+ messages in thread
From: Keith Busch @ 2026-06-09 21:38 UTC (permalink / raw)
  To: Farhan Ali
  Cc: linux-s390, linux-kernel, linux-pci, alex, helgaas, schnelle,
	mjrosato, Julian Ruess

On Fri, Jun 05, 2026 at 11:41:07AM -0700, Farhan Ali wrote:
> But I am curious to learn about your thoughts for DPC/AER with vfio for
> other platforms (x86/ARM?). Thanks

Sure. I think we just need a checkpoint mechanism between the user and
kernel space, then the kernel's existing pci error recovery can proceed
as-is. No architecture specifics needed.

The below is a quick example of what I had in mind for the kernel side.
User space just needs to cooperate through the new feature here for this
to work.

---
commit f67c0a83a75ee5f69a116894cfedc5c6de7b2195
Author: Keith Busch <kbusch@kernel.org>
Date:   Wed Apr 22 17:11:21 2026 -0700

    vfio/pci: add error recovery handshake with userspace
    
    The existing VFIO PCI error handling fires an eventfd to notify
    userspace of an AER error but immediately returns CAN_RECOVER, allowing
    the kernel's AER recovery state machine to proceed with bus resets
    while userspace may still be actively accessing the device.
    
    Introduce VFIO_PCI_ERR_NOTIFY_IRQ_INDEX, a new IRQ index that userspace
    can register to opt into a proper error recovery handshake. When
    registered, the error_detected callback signals the eventfd and waits
    up to 60 seconds for userspace to acknowledge via VFIO_IRQ_SET_ACTION_UNMASK,
    giving userspace time to quiesce device access before the kernel proceeds
    with the reset. After the bus reset completes, the new slot_reset callback
    signals the same eventfd again so userspace knows the device is ready for
    reinitialization.
    
    The legacy VFIO_PCI_ERR_IRQ_INDEX behavior is preserved unchanged for
    backward compatibility with existing userspace drivers.
    
    Signed-off-by: Keith Busch <kbusch@kernel.org>

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index bb121f635b9f7..cd045003093b7 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1755,6 +1755,7 @@ static const struct pci_error_handlers hisi_acc_vf_err_handlers = {
 	.reset_prepare = hisi_acc_vf_pci_reset_prepare,
 	.reset_done = hisi_acc_vf_pci_aer_reset_done,
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.slot_reset = vfio_pci_aer_slot_reset,
 };
 
 static struct pci_driver hisi_acc_vfio_pci_driver = {
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index de306dee1d1ad..3262918abe5ed 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1444,6 +1444,7 @@ MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
 static const struct pci_error_handlers mlx5vf_err_handlers = {
 	.reset_done = mlx5vf_pci_aer_reset_done,
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.slot_reset = vfio_pci_aer_slot_reset,
 };
 
 static struct pci_driver mlx5vf_pci_driver = {
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index fa056b69f899a..6f3472e2ce11a 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -1273,6 +1273,7 @@ static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev)
 static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = {
 	.reset_done = nvgrace_gpu_vfio_pci_reset_done,
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.slot_reset = vfio_pci_aer_slot_reset,
 };
 
 static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index 4923f18231263..50aba9e703ed0 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -174,6 +174,7 @@ static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
 static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
 	.reset_done = pds_vfio_pci_aer_reset_done,
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.slot_reset = vfio_pci_aer_slot_reset,
 };
 
 static struct pci_driver pds_vfio_pci_driver = {
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
index ac9652539d66a..dc83b52162bbf 100644
--- a/drivers/vfio/pci/qat/main.c
+++ b/drivers/vfio/pci/qat/main.c
@@ -683,6 +683,7 @@ MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table);
 static const struct pci_error_handlers qat_vf_err_handlers = {
 	.reset_done = qat_vf_pci_aer_reset_done,
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.slot_reset = vfio_pci_aer_slot_reset,
 };
 
 static struct pci_driver qat_vf_vfio_pci_driver = {
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index ad52abc46c04d..b2de978ac5221 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -738,8 +738,11 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
 
 	vfio_pci_dma_buf_cleanup(vdev);
 
+	complete(&vdev->err_notify_ack);
+
 	mutex_lock(&vdev->igate);
 	vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL);
+	vfio_pci_eventfd_replace_locked(vdev, &vdev->err_notify_trigger, NULL);
 	vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL);
 	mutex_unlock(&vdev->igate);
 }
@@ -785,7 +788,8 @@ static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_typ
 
 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
 		}
-	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
+	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX ||
+		   irq_type == VFIO_PCI_ERR_NOTIFY_IRQ_INDEX) {
 		if (pci_is_pcie(vdev->pdev))
 			return 1;
 	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
@@ -2105,6 +2109,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
 		return ret;
 	INIT_LIST_HEAD(&vdev->dmabufs);
 	init_rwsem(&vdev->memory_lock);
+	init_completion(&vdev->err_notify_ack);
 	xa_init(&vdev->ctx);
 
 	return 0;
@@ -2242,16 +2247,53 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 {
 	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
 	struct vfio_pci_eventfd *eventfd;
+	unsigned long timeout;
+
+	rcu_read_lock();
+	eventfd = rcu_dereference(vdev->err_notify_trigger);
+	if (!eventfd) {
+		eventfd = rcu_dereference(vdev->err_trigger);
+		if (eventfd)
+			eventfd_signal(eventfd->ctx);
+		rcu_read_unlock();
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	}
+
+	reinit_completion(&vdev->err_notify_ack);
+	eventfd_signal(eventfd->ctx);
+	rcu_read_unlock();
+
+	timeout = wait_for_completion_timeout(&vdev->err_notify_ack,
+					      msecs_to_jiffies(60000));
+	if (!timeout) {
+		pci_warn(pdev, "vfio error handler timed out waiting for userspace ack\n");
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	}
 
 	rcu_read_lock();
-	eventfd = rcu_dereference(vdev->err_trigger);
+	eventfd = rcu_dereference(vdev->err_notify_trigger);
+	rcu_read_unlock();
+	if (!eventfd)
+		return PCI_ERS_RESULT_CAN_RECOVER;
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
+
+pci_ers_result_t vfio_pci_aer_slot_reset(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+	struct vfio_pci_eventfd *eventfd;
+
+	rcu_read_lock();
+	eventfd = rcu_dereference(vdev->err_notify_trigger);
 	if (eventfd)
 		eventfd_signal(eventfd->ctx);
 	rcu_read_unlock();
 
-	return PCI_ERS_RESULT_CAN_RECOVER;
+	return PCI_ERS_RESULT_RECOVERED;
 }
-EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
+EXPORT_SYMBOL_GPL(vfio_pci_aer_slot_reset);
 
 int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
 				  int nr_virtfn)
@@ -2317,6 +2359,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
 
 const struct pci_error_handlers vfio_pci_core_err_handlers = {
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.slot_reset = vfio_pci_aer_slot_reset,
 };
 EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
 
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 33944d4d9dc43..d3a32d3695f0a 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -811,6 +811,31 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
 					       count, flags, data);
 }
 
+static int vfio_pci_set_err_notify_trigger(struct vfio_pci_core_device *vdev,
+					   unsigned index, unsigned start,
+					   unsigned count, uint32_t flags,
+					   void *data)
+{
+	if (index != VFIO_PCI_ERR_NOTIFY_IRQ_INDEX || start != 0 || count > 1)
+		return -EINVAL;
+
+	return vfio_pci_set_ctx_trigger_single(vdev,
+					       &vdev->err_notify_trigger,
+					       count, flags, data);
+}
+
+static int vfio_pci_set_err_notify_unmask(struct vfio_pci_core_device *vdev,
+					  unsigned index, unsigned start,
+					  unsigned count, uint32_t flags,
+					  void *data)
+{
+	if (index != VFIO_PCI_ERR_NOTIFY_IRQ_INDEX || start != 0 || count != 0)
+		return -EINVAL;
+
+	complete(&vdev->err_notify_ack);
+	return 0;
+}
+
 static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
 				    unsigned index, unsigned start,
 				    unsigned count, uint32_t flags, void *data)
@@ -864,6 +889,18 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
 			break;
 		}
 		break;
+	case VFIO_PCI_ERR_NOTIFY_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			if (pci_is_pcie(vdev->pdev))
+				func = vfio_pci_set_err_notify_trigger;
+			break;
+		case VFIO_IRQ_SET_ACTION_UNMASK:
+			if (pci_is_pcie(vdev->pdev))
+				func = vfio_pci_set_err_notify_unmask;
+			break;
+		}
+		break;
 	case VFIO_PCI_REQ_IRQ_INDEX:
 		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
 		case VFIO_IRQ_SET_ACTION_TRIGGER:
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index d2e5cbca13c85..8ca48140d0f0c 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -212,6 +212,7 @@ static void virtiovf_pci_aer_reset_done(struct pci_dev *pdev)
 static const struct pci_error_handlers virtiovf_err_handlers = {
 	.reset_done = virtiovf_pci_aer_reset_done,
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.slot_reset = vfio_pci_aer_slot_reset,
 };
 
 static struct pci_driver virtiovf_pci_driver = {
diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c
index 4ecadbbfd86ec..f7158d3913740 100644
--- a/drivers/vfio/pci/xe/main.c
+++ b/drivers/vfio/pci/xe/main.c
@@ -143,6 +143,7 @@ static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
 	.reset_prepare = xe_vfio_pci_reset_prepare,
 	.reset_done = xe_vfio_pci_reset_done,
 	.error_detected = vfio_pci_core_aer_err_detected,
+	.slot_reset = vfio_pci_aer_slot_reset,
 };
 
 static int xe_vfio_pci_open_device(struct vfio_device *core_vdev)
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 2ebba746c18f7..74ede0c9f7bd4 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -131,6 +131,8 @@ struct vfio_pci_core_device {
 	struct pci_saved_state	*pm_save;
 	int			ioeventfds_nr;
 	struct vfio_pci_eventfd __rcu *err_trigger;
+	struct vfio_pci_eventfd __rcu *err_notify_trigger;
+	struct completion	err_notify_ack;
 	struct vfio_pci_eventfd __rcu *req_trigger;
 	struct eventfd_ctx	*pm_wake_eventfd_ctx;
 	struct list_head	dummy_resources_list;
@@ -191,6 +193,7 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
 int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar);
 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 						pci_channel_state_t state);
+pci_ers_result_t vfio_pci_aer_slot_reset(struct pci_dev *pdev);
 ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 			       void __iomem *io, char __user *buf,
 			       loff_t off, size_t count, size_t x_start,
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5de618a3a5eee..9ce3d17c6d755 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -645,6 +645,7 @@ enum {
 	VFIO_PCI_MSIX_IRQ_INDEX,
 	VFIO_PCI_ERR_IRQ_INDEX,
 	VFIO_PCI_REQ_IRQ_INDEX,
+	VFIO_PCI_ERR_NOTIFY_IRQ_INDEX,
 	VFIO_PCI_NUM_IRQS
 };
 
--

^ permalink raw reply related	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2026-06-09 21:38 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-03 18:24 [PATCH v18 0/4] [VFIO] Error recovery for vfio-pci devices on s390x Farhan Ali
2026-06-03 18:24 ` [PATCH v18 1/4] s390/pci: Store PCI error information for passthrough devices Farhan Ali
2026-06-03 22:20   ` Alex Williamson
2026-06-03 23:35     ` Farhan Ali
2026-06-04 18:27       ` Alex Williamson
2026-06-03 18:24 ` [PATCH v18 2/4] vfio-pci/zdev: Add a device feature for error information Farhan Ali
2026-06-03 18:49   ` sashiko-bot
2026-06-03 22:37   ` Alex Williamson
2026-06-03 23:40     ` Farhan Ali
2026-06-03 18:24 ` [PATCH v18 3/4] vfio/pci: Add a reset_done callback for vfio-pci driver Farhan Ali
2026-06-03 19:04   ` sashiko-bot
2026-06-03 22:46   ` Alex Williamson
2026-06-04  0:01     ` Farhan Ali
2026-06-04  8:28   ` Keith Busch
2026-06-04 17:17     ` Farhan Ali
2026-06-04 19:57       ` Alex Williamson
2026-06-08 19:26         ` Farhan Ali
2026-06-09 19:16           ` Alex Williamson
2026-06-09 20:13             ` Farhan Ali
2026-06-04 20:42       ` Keith Busch
2026-06-05 18:41         ` Farhan Ali
2026-06-09 21:38           ` Keith Busch
2026-06-03 18:24 ` [PATCH v18 4/4] vfio/pci: Remove the pcie check for VFIO_PCI_ERR_IRQ_INDEX Farhan Ali

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.