From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Jan Beulich <JBeulich@suse.com>,
xen-devel <xen-devel@lists.xenproject.org>
Cc: Keir Fraser <keir@xen.org>
Subject: Re: [PATCH 1/4] x86/MSI-X: be more careful during teardown
Date: Fri, 13 Mar 2015 18:10:50 +0000 [thread overview]
Message-ID: <5503282A.6090000@citrix.com> (raw)
In-Reply-To: <54FF29970200007800068355@mail.emea.novell.com>
On 10/03/15 16:27, Jan Beulich wrote:
> When a device gets detached from a guest, pciback will clear its
> command register, thus disabling both memory and I/O decoding. The
> disabled memory decoding, however, has an effect on the MSI-X table
> accesses the hypervisor does: These won't have the intended effect
> anymore. Even worse, for PCIe devices (but not SR-IOV virtual
> functions) such accesses may (will?) be treated as Unsupported
> Requests
Will, as the root port will issue a write and get no reply.
VF config space is part of a memory bar on the PF, which itself will
still be valid for requests even if the VF has memory decoding disabled.
On the other hand, if the PF has memory decoding disabled, I expect any
VF config access to result in a UR.
> , causing respective errors to be surfaced, potentially in the
> form of NMIs that may be fatal to the hypervisor or Dom0 is different
> ways. Hence rather than carrying out these accesses, we should avoid
> them where we can, and use alternative (e.g. PCI config space based)
> mechanisms to achieve at least the same effect.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Backporting note (largely to myself):
> Depends on (not yet backported) commit 061eebe0e "x86/MSI: drop
> workaround for insecure Dom0 kernels" (due to re-use of struct
> arch_msix's warned field).
>
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -121,6 +121,27 @@ static void msix_put_fixmap(struct arch_
> spin_unlock(&msix->table_lock);
> }
>
> +static bool_t memory_decoded(const struct pci_dev *dev)
> +{
> + u8 bus, slot, func;
> +
> + if ( !dev->info.is_virtfn )
> + {
> + bus = dev->bus;
> + slot = PCI_SLOT(dev->devfn);
> + func = PCI_FUNC(dev->devfn);
> + }
> + else
> + {
> + bus = dev->info.physfn.bus;
> + slot = PCI_SLOT(dev->info.physfn.devfn);
> + func = PCI_FUNC(dev->info.physfn.devfn);
> + }
> +
> + return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
> + PCI_COMMAND_MEMORY);
> +}
> +
> /*
> * MSI message composition
> */
> @@ -162,7 +183,7 @@ void msi_compose_msg(unsigned vector, co
> }
> }
>
> -static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> +static bool_t read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> {
> switch ( entry->msi_attrib.type )
> {
> @@ -198,6 +219,8 @@ static void read_msi_msg(struct msi_desc
> void __iomem *base;
> base = entry->mask_base;
>
> + if ( unlikely(!memory_decoded(entry->dev)) )
> + return 0;
> msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
> msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
> msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
> @@ -209,6 +232,8 @@ static void read_msi_msg(struct msi_desc
>
> if ( iommu_intremap )
> iommu_read_msi_from_ire(entry, msg);
> +
> + return 1;
> }
>
> static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> @@ -260,6 +285,8 @@ static int write_msi_msg(struct msi_desc
> void __iomem *base;
> base = entry->mask_base;
>
> + if ( unlikely(!memory_decoded(entry->dev)) )
> + return -ENXIO;
> writel(msg->address_lo,
> base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
> writel(msg->address_hi,
> @@ -287,7 +314,8 @@ void set_msi_affinity(struct irq_desc *d
> ASSERT(spin_is_locked(&desc->lock));
>
> memset(&msg, 0, sizeof(msg));
> - read_msi_msg(msi_desc, &msg);
> + if ( !read_msi_msg(msi_desc, &msg) )
> + return;
>
> msg.data &= ~MSI_DATA_VECTOR_MASK;
> msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
Assuming a non-racy setup, this hunk makes the memory decode check in
write_msi_msg() redundant. I cant however think of a neat way to elide
the second access, because all other callers of write_msi_msg() need it.
> @@ -347,20 +375,24 @@ int msi_maskable_irq(const struct msi_de
> || entry->msi_attrib.maskbit;
> }
>
> -static void msi_set_mask_bit(struct irq_desc *desc, int flag)
> +static bool_t msi_set_mask_bit(struct irq_desc *desc, int flag)
> {
> struct msi_desc *entry = desc->msi_desc;
> + struct pci_dev *pdev;
> + u16 seg;
> + u8 bus, slot, func;
>
> ASSERT(spin_is_locked(&desc->lock));
> BUG_ON(!entry || !entry->dev);
> + pdev = entry->dev;
> + seg = pdev->seg;
> + bus = pdev->bus;
> + slot = PCI_SLOT(pdev->devfn);
> + func = PCI_FUNC(pdev->devfn);
> switch (entry->msi_attrib.type) {
> case PCI_CAP_ID_MSI:
> if (entry->msi_attrib.maskbit) {
> u32 mask_bits;
> - u16 seg = entry->dev->seg;
> - u8 bus = entry->dev->bus;
> - u8 slot = PCI_SLOT(entry->dev->devfn);
> - u8 func = PCI_FUNC(entry->dev->devfn);
>
> mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
> mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
> @@ -369,24 +401,52 @@ static void msi_set_mask_bit(struct irq_
> }
> break;
> case PCI_CAP_ID_MSIX:
> - {
> - int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
> - writel(flag, entry->mask_base + offset);
> - readl(entry->mask_base + offset);
> - break;
> - }
> + if ( likely(memory_decoded(pdev)) )
> + {
> + writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> + readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> + break;
> + }
> + if ( flag )
> + {
> + u16 control;
> + domid_t domid = pdev->domain->domain_id;
> +
> + control = pci_conf_read16(seg, bus, slot, func,
> + msix_control_reg(entry->msi_attrib.pos));
> + if ( control & PCI_MSIX_FLAGS_MASKALL )
> + break;
> + pci_conf_write16(seg, bus, slot, func,
> + msix_control_reg(entry->msi_attrib.pos),
> + control | PCI_MSIX_FLAGS_MASKALL);
> + if ( pdev->msix->warned != domid )
> + {
> + pdev->msix->warned = domid;
> + printk(XENLOG_G_WARNING
> + "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
"masked all MSI-X" to make it slightly clearer that the global mask bit
has been hit.
> + desc->irq, domid, pdev->seg, pdev->bus,
> + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
> + }
> + break;
> + }
> + /* fall through */
> default:
> - BUG();
> - break;
> + return 0;
> }
> entry->msi_attrib.masked = !!flag;
> +
> + return 1;
> }
>
> static int msi_get_mask_bit(const struct msi_desc *entry)
> {
> - switch (entry->msi_attrib.type) {
> + if ( !entry->dev )
> + return -1;
> +
> + switch ( entry->msi_attrib.type )
> + {
> case PCI_CAP_ID_MSI:
> - if (!entry->dev || !entry->msi_attrib.maskbit)
> + if ( !entry->msi_attrib.maskbit )
> break;
> return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
> PCI_SLOT(entry->dev->devfn),
> @@ -394,6 +454,8 @@ static int msi_get_mask_bit(const struct
> entry->msi.mpos) >>
> entry->msi_attrib.entry_nr) & 1;
> case PCI_CAP_ID_MSIX:
> + if ( unlikely(!memory_decoded(entry->dev)) )
> + break;
> return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
> }
> return -1;
> @@ -401,12 +463,14 @@ static int msi_get_mask_bit(const struct
>
> void mask_msi_irq(struct irq_desc *desc)
> {
> - msi_set_mask_bit(desc, 1);
> + if ( unlikely(!msi_set_mask_bit(desc, 1)) )
> + BUG();
Previously, BUG() was only on a codepath controlled by the
msi_attrib.type, whereas now it includes failures based on failure to
mask an issue.
Is this wise, as it can be guest-influenced?
> }
>
> void unmask_msi_irq(struct irq_desc *desc)
> {
> - msi_set_mask_bit(desc, 0);
> + if ( unlikely(!msi_set_mask_bit(desc, 0)) )
> + WARN();
> }
>
> static unsigned int startup_msi_irq(struct irq_desc *desc)
> @@ -713,6 +777,9 @@ static int msix_capability_init(struct p
> control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
> msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
>
> + if ( unlikely(!memory_decoded(dev)) )
> + return -ENXIO;
> +
> if ( desc )
> {
> entry = alloc_msi_entry(1);
> @@ -845,7 +912,8 @@ static int msix_capability_init(struct p
> ++msix->used_entries;
>
> /* Restore MSI-X enabled bits */
> - pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
> + pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
> + control & ~PCI_MSIX_FLAGS_MASKALL);
>
> return 0;
> }
> @@ -998,8 +1066,16 @@ static void __pci_disable_msix(struct ms
>
> BUG_ON(list_empty(&dev->msi_list));
>
> - writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> -
> + if ( likely(memory_decoded(dev)) )
> + writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> + else if ( !(control & PCI_MSIX_FLAGS_MASKALL) )
> + {
> + printk(XENLOG_WARNING
> + "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",
"masking all"
~Andrew
> + entry->irq, dev->seg, dev->bus,
> + PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
> + control |= PCI_MSIX_FLAGS_MASKALL;
> + }
> pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
>
> _pci_cleanup_msix(dev->msix);
> @@ -1137,14 +1213,23 @@ int pci_restore_msi_state(struct pci_dev
> nr = entry->msi.nvec;
> }
> else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
> + {
> msix_set_enable(pdev, 0);
> + if ( unlikely(!memory_decoded(pdev)) )
> + {
> + spin_unlock_irqrestore(&desc->lock, flags);
> + return -ENXIO;
> + }
> + }
>
> msg = entry->msg;
> write_msi_msg(entry, &msg);
>
> for ( i = 0; ; )
> {
> - msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
> + if ( unlikely(!msi_set_mask_bit(desc,
> + entry[i].msi_attrib.masked)) )
> + BUG();
>
> if ( !--nr )
> break;
>
>
next prev parent reply other threads:[~2015-03-13 18:10 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-03-10 16:16 [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Jan Beulich
2015-03-10 16:27 ` [PATCH 1/4] x86/MSI-X: be more careful during teardown Jan Beulich
2015-03-10 21:05 ` Andrew Cooper
2015-03-11 8:22 ` Jan Beulich
2015-03-13 18:10 ` Andrew Cooper [this message]
2015-03-16 11:03 ` Jan Beulich
2015-03-10 16:28 ` [PATCH 2/4] x86/MSI-X: access MSI‑X table only after having enabled MSI‑X Jan Beulich
2015-03-13 19:18 ` Andrew Cooper
2015-03-16 11:13 ` Jan Beulich
2015-03-10 16:29 ` [PATCH 3/4] x86/MSI-X: reduce fiddling with control register during restore Jan Beulich
2015-03-13 19:38 ` Andrew Cooper
2015-03-10 16:29 ` [PATCH 4/4] x86/MSI-X: cleanup Jan Beulich
2015-03-13 19:56 ` Andrew Cooper
2015-03-13 20:16 ` [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Andrew Cooper
2015-03-16 15:38 ` Jan Beulich
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5503282A.6090000@citrix.com \
--to=andrew.cooper3@citrix.com \
--cc=JBeulich@suse.com \
--cc=keir@xen.org \
--cc=xen-devel@lists.xenproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.