From: Niklas Schnelle <schnelle@linux.ibm.com>
To: Farhan Ali <alifm@linux.ibm.com>,
linux-s390@vger.kernel.org, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-pci@vger.kernel.org
Cc: alex.williamson@redhat.com, helgaas@kernel.org, clg@redhat.com,
mjrosato@linux.ibm.com
Subject: Re: [PATCH v4 07/10] s390/pci: Store PCI error information for passthrough devices
Date: Thu, 25 Sep 2025 16:28:04 +0200 [thread overview]
Message-ID: <d22cb26b864362454ace07ed5fcb9758c40ee32e.camel@linux.ibm.com> (raw)
In-Reply-To: <20250924171628.826-8-alifm@linux.ibm.com>
On Wed, 2025-09-24 at 10:16 -0700, Farhan Ali wrote:
> For a passthrough device we need co-operation from user space to recover
> the device. This would require to bubble up any error information to user
> space. Let's store this error information for passthrough devices, so it
> can be retrieved later.
>
> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
> ---
> arch/s390/include/asm/pci.h | 28 ++++++++++
> arch/s390/pci/pci.c | 1 +
> arch/s390/pci/pci_event.c | 95 +++++++++++++++++++-------------
> drivers/vfio/pci/vfio_pci_zdev.c | 2 +
> 4 files changed, 88 insertions(+), 38 deletions(-)
>
> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
> index f47f62fc3bfd..40bfe5721109 100644
> --- a/arch/s390/include/asm/pci.h
> +++ b/arch/s390/include/asm/pci.h
> @@ -116,6 +116,31 @@ struct zpci_bus {
> enum pci_bus_speed max_bus_speed;
> };
>
> +/* Content Code Description for PCI Function Error */
> +struct zpci_ccdf_err {
> + u32 reserved1;
> + u32 fh; /* function handle */
> + u32 fid; /* function id */
> + u32 ett : 4; /* expected table type */
> + u32 mvn : 12; /* MSI vector number */
> + u32 dmaas : 8; /* DMA address space */
> + u32 reserved2 : 6;
> + u32 q : 1; /* event qualifier */
> + u32 rw : 1; /* read/write */
> + u64 faddr; /* failing address */
> + u32 reserved3;
> + u16 reserved4;
> + u16 pec; /* PCI event code */
> +} __packed;
> +
> +#define ZPCI_ERR_PENDING_MAX 4
> +struct zpci_ccdf_pending {
> + u8 count;
> + u8 head;
> + u8 tail;
> + struct zpci_ccdf_err err[ZPCI_ERR_PENDING_MAX];
> +};
Thanks, this looks more reasonably sized.
> +
> /* Private data per function */
> struct zpci_dev {
> struct zpci_bus *zbus;
> @@ -191,6 +216,8 @@ struct zpci_dev {
> struct iommu_domain *s390_domain; /* attached IOMMU domain */
> struct kvm_zdev *kzdev;
> struct mutex kzdev_lock;
> + struct zpci_ccdf_pending pending_errs;
> + struct mutex pending_errs_lock;
> spinlock_t dom_lock; /* protect s390_domain change */
> };
>
--- snip ---
> +static void zpci_store_pci_error(struct pci_dev *pdev,
> + struct zpci_ccdf_err *ccdf)
> +{
> + struct zpci_dev *zdev = to_zpci(pdev);
> + int i;
> +
> + mutex_lock(&zdev->pending_errs_lock);
> + if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
> + pr_err("%s: Maximum number (%d) of pending error events queued",
> + pci_name(pdev), ZPCI_ERR_PENDING_MAX);
So for a vfio-pci user that doesn't pick up the error information but
does reset on error and thus recovers, we would keep just the first 4
errors that occurred in pending_errs and get this message once. I
think that is okay and maybe even preferable since most errors are,
well, errors. And often the first time something went wrong is the
interesting one. So I think this makes sense.
> + mutex_unlock(&zdev->pending_errs_lock);
> + return;
> + }
> +
> + i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
> + memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
> + zdev->pending_errs.tail++;
> + zdev->pending_errs.count++;
> + mutex_unlock(&zdev->pending_errs_lock);
> +}
> +
> +void zpci_cleanup_pending_errors(struct zpci_dev *zdev)
> +{
> + struct pci_dev *pdev = NULL;
> +
> + mutex_lock(&zdev->pending_errs_lock);
> + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
I think you missed my comment on the previous version. This is missing
the matching pci_dev_put() for the pci_get_slot().
> + if (zdev->pending_errs.count)
> + pr_info("%s: Unhandled PCI error events count=%d",
> + pci_name(pdev), zdev->pending_errs.count);
> + memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
> + mutex_unlock(&zdev->pending_errs_lock);
> +}
> +EXPORT_SYMBOL_GPL(zpci_cleanup_pending_errors);
> +
>
--- snip ---
>
> @@ -322,12 +340,13 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
> break;
> case 0x0040: /* Service Action or Error Recovery Failed */
> case 0x003b:
> - zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
> + zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf);
> break;
> default: /* PCI function left in the error state attempt to recover */
> - ers_res = zpci_event_attempt_error_recovery(pdev);
> + ers_res = zpci_event_attempt_error_recovery(pdev, ccdf);
> if (ers_res != PCI_ERS_RESULT_RECOVERED)
> - zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
> + zpci_event_io_failure(pdev, pci_channel_io_perm_failure,
> + ccdf);
Nit: I'd just keep the above on one line. It's still below the
100-column limit and is cleaner that way.
> break;
> }
> pci_dev_put(pdev);
> diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
> index a7bc23ce8483..2be37eab9279 100644
> --- a/drivers/vfio/pci/vfio_pci_zdev.c
> +++ b/drivers/vfio/pci/vfio_pci_zdev.c
> @@ -168,6 +168,8 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
>
> zdev->mediated_recovery = false;
>
> + zpci_cleanup_pending_errors(zdev);
> +
> if (!vdev->vdev.kvm)
> return;
>
next prev parent reply other threads:[~2025-09-25 14:28 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-24 17:16 [PATCH v4 00/10] Error recovery for vfio-pci devices on s390x Farhan Ali
2025-09-24 17:16 ` [PATCH v4 01/10] PCI: Avoid saving error values for config space Farhan Ali
2025-10-01 15:15 ` Benjamin Block
2025-10-01 17:12 ` Farhan Ali
2025-10-02 9:16 ` Benjamin Block
2025-10-04 14:54 ` Lukas Wunner
2025-10-06 17:54 ` Farhan Ali
2025-10-06 19:26 ` Lukas Wunner
2025-10-06 21:35 ` Farhan Ali
2025-10-08 13:34 ` Lukas Wunner
2025-10-08 17:56 ` Farhan Ali
2025-10-08 18:14 ` Lukas Wunner
2025-10-08 21:55 ` Farhan Ali
2025-10-09 4:52 ` Lukas Wunner
2025-10-09 17:02 ` Farhan Ali
2025-10-12 6:43 ` Lukas Wunner
2025-10-09 9:12 ` Niklas Schnelle
2025-10-12 6:34 ` Lukas Wunner
2025-10-14 12:07 ` Niklas Schnelle
2025-10-16 21:00 ` Farhan Ali
2025-10-19 14:34 ` Lukas Wunner
2025-10-20 8:59 ` Niklas Schnelle
2025-11-22 10:58 ` Lukas Wunner
2025-09-24 17:16 ` [PATCH v4 02/10] PCI: Add additional checks for flr reset Farhan Ali
2025-09-30 10:03 ` Benjamin Block
2025-09-30 17:04 ` Farhan Ali
2025-10-01 8:33 ` Benjamin Block
2025-10-01 14:37 ` Benjamin Block
2025-09-24 17:16 ` [PATCH v4 03/10] PCI: Allow per function PCI slots Farhan Ali
2025-10-01 14:34 ` Benjamin Block
2025-09-24 17:16 ` [PATCH v4 04/10] s390/pci: Add architecture specific resource/bus address translation Farhan Ali
2025-09-25 10:54 ` Niklas Schnelle
2025-10-01 16:04 ` Benjamin Block
2025-10-01 18:01 ` Farhan Ali
2025-10-02 12:58 ` Niklas Schnelle
2025-10-02 17:00 ` Bjorn Helgaas
2025-10-02 17:16 ` Ilpo Järvinen
2025-10-02 18:14 ` Niklas Schnelle
2025-09-24 17:16 ` [PATCH v4 05/10] s390/pci: Restore IRQ unconditionally for the zPCI device Farhan Ali
2025-09-24 17:16 ` [PATCH v4 06/10] s390/pci: Update the logic for detecting passthrough device Farhan Ali
2025-09-24 17:16 ` [PATCH v4 07/10] s390/pci: Store PCI error information for passthrough devices Farhan Ali
2025-09-25 14:28 ` Niklas Schnelle [this message]
2025-09-25 16:29 ` Farhan Ali
2025-09-24 17:16 ` [PATCH v4 08/10] vfio-pci/zdev: Add a device feature for error information Farhan Ali
2025-09-25 8:04 ` kernel test robot
2025-09-24 17:16 ` [PATCH v4 09/10] vfio: Add a reset_done callback for vfio-pci driver Farhan Ali
2025-09-24 17:16 ` [PATCH v4 10/10] vfio: Remove the pcie check for VFIO_PCI_ERR_IRQ_INDEX Farhan Ali
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d22cb26b864362454ace07ed5fcb9758c40ee32e.camel@linux.ibm.com \
--to=schnelle@linux.ibm.com \
--cc=alex.williamson@redhat.com \
--cc=alifm@linux.ibm.com \
--cc=clg@redhat.com \
--cc=helgaas@kernel.org \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pci@vger.kernel.org \
--cc=linux-s390@vger.kernel.org \
--cc=mjrosato@linux.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).