[PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code

xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed

* [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
@ 2014-05-21 16:09 Jan Beulich
  2014-05-21 16:33 ` Andrew Cooper
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Jan Beulich @ 2014-05-21 16:09 UTC (permalink / raw)
  To: Andrew Cooper, Malcolm Crossley
  Cc: Yang Z Zhang, xen-devel, Kevin Tian, Donald D Dugger

[-- Attachment #1: Type: text/plain, Size: 14756 bytes --]

When firmware-first mode is being indicated by firmware, we shouldn't
be modifying AER registers - these are considered to be owned by
firmware in that case. Violating this is being reported to result in
SMI (or was it SCI?) storms. While circumventing the workaround means
re-exposing affected hosts to the XSA-59 issues, this in any event
seems better than not booting at all. Respective messages are being
issued to the log, so the situation can be diagnosed.

The basic building blocks were taken from Linux 3.15-rc.

Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reported-by: Malcolm Crossley <malcolm.crossley@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -785,6 +785,8 @@ int __init acpi_boot_init(void)
 
 	erst_init();
 
+	acpi_hest_init();
+
 	acpi_table_parse(ACPI_SIG_BGRT, acpi_invalidate_bgrt);
 
 	return 0;
--- a/xen/drivers/acpi/apei/Makefile
+++ b/xen/drivers/acpi/apei/Makefile
@@ -1,3 +1,4 @@
 obj-y += erst.o
+obj-y += hest.o
 obj-y += apei-base.o
 obj-y += apei-io.o
--- /dev/null
+++ b/xen/drivers/acpi/apei/hest.c
@@ -0,0 +1,200 @@
+/*
+ * APEI Hardware Error Source Table support
+ *
+ * HEST describes error sources in detail; communicates operational
+ * parameters (i.e. severity levels, masking bits, and threshold
+ * values) to Linux as necessary. It also allows the BIOS to report
+ * non-standard error sources to Linux (for example, chipset-specific
+ * error registers).
+ *
+ * For more information about HEST, please refer to ACPI Specification
+ * version 4.0, section 17.3.2.
+ *
+ * Copyright 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/errno.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/mm.h>
+#include <xen/pfn.h>
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+#include "apei-internal.h"
+
+#define HEST_PFX "HEST: "
+
+static bool_t hest_disable;
+boolean_param("hest_disable", hest_disable);
+
+/* HEST table parsing */
+
+static struct acpi_table_hest *__read_mostly hest_tab;
+
+static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
+	[ACPI_HEST_TYPE_IA32_CHECK] = -1,	/* need further calculation */
+	[ACPI_HEST_TYPE_IA32_CORRECTED_CHECK] = -1,
+	[ACPI_HEST_TYPE_IA32_NMI] = sizeof(struct acpi_hest_ia_nmi),
+	[ACPI_HEST_TYPE_AER_ROOT_PORT] = sizeof(struct acpi_hest_aer_root),
+	[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
+	[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
+	[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+};
+
+static int hest_esrc_len(const struct acpi_hest_header *hest_hdr)
+{
+	u16 hest_type = hest_hdr->type;
+	int len;
+
+	if (hest_type >= ACPI_HEST_TYPE_RESERVED)
+		return 0;
+
+	len = hest_esrc_len_tab[hest_type];
+
+	if (hest_type == ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) {
+		const struct acpi_hest_ia_corrected *cmc =
+			container_of(hest_hdr,
+				     const struct acpi_hest_ia_corrected,
+				     header);
+
+		len = sizeof(*cmc) + cmc->num_hardware_banks *
+		      sizeof(struct acpi_hest_ia_error_bank);
+	} else if (hest_type == ACPI_HEST_TYPE_IA32_CHECK) {
+		const struct acpi_hest_ia_machine_check *mc =
+			container_of(hest_hdr,
+				     const struct acpi_hest_ia_machine_check,
+				     header);
+
+		len = sizeof(*mc) + mc->num_hardware_banks *
+		      sizeof(struct acpi_hest_ia_error_bank);
+	}
+	BUG_ON(len == -1);
+
+	return len;
+};
+
+int apei_hest_parse(apei_hest_func_t func, void *data)
+{
+	struct acpi_hest_header *hest_hdr;
+	int i, rc, len;
+
+	if (hest_disable || !hest_tab)
+		return -EINVAL;
+
+	hest_hdr = (struct acpi_hest_header *)(hest_tab + 1);
+	for (i = 0; i < hest_tab->error_source_count; i++) {
+		len = hest_esrc_len(hest_hdr);
+		if (!len) {
+			printk(XENLOG_WARNING HEST_PFX
+			       "Unknown or unused hardware error source "
+			       "type: %d for hardware error source: %d\n",
+			       hest_hdr->type, hest_hdr->source_id);
+			return -EINVAL;
+		}
+		if ((void *)hest_hdr + len >
+		    (void *)hest_tab + hest_tab->header.length) {
+			printk(XENLOG_WARNING HEST_PFX
+			       "Table contents overflow for hardware error source: %d\n",
+			       hest_hdr->source_id);
+			return -EINVAL;
+		}
+
+		rc = func(hest_hdr, data);
+		if (rc)
+			return rc;
+
+		hest_hdr = (void *)hest_hdr + len;
+	}
+
+	return 0;
+}
+
+/*
+ * Check if firmware advertises firmware first mode. We need FF bit to be set
+ * along with a set of MC banks which work in FF mode.
+ */
+static int __init hest_parse_cmc(const struct acpi_hest_header *hest_hdr,
+				 void *data)
+{
+#ifdef CONFIG_X86_MCE
+	unsigned int i;
+	const struct acpi_hest_ia_corrected *cmc;
+	const struct acpi_hest_ia_error_bank *mc_bank;
+
+	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+		return 0;
+
+	cmc = container_of(hest_hdr, const struct acpi_hest_ia_corrected, header);
+	if (!cmc->enabled)
+		return 0;
+
+	/*
+	 * We expect HEST to provide a list of MC banks that report errors
+	 * in firmware first mode. Otherwise, return non-zero value to
+	 * indicate that we are done parsing HEST.
+	 */
+	if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks)
+		return 1;
+
+	printk(XENLOG_INFO HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
+
+	mc_bank = (const struct acpi_hest_ia_error_bank *)(cmc + 1);
+	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+		mce_disable_bank(mc_bank->bank_number);
+#else
+# define acpi_disable_cmcff 1
+#endif
+
+	return 1;
+}
+
+void __init acpi_hest_init(void)
+{
+	acpi_status status;
+	acpi_physical_address hest_addr;
+	acpi_native_uint hest_len;
+
+	if (acpi_disabled)
+		return;
+
+	if (hest_disable) {
+		printk(XENLOG_INFO HEST_PFX "Table parsing disabled.\n");
+		return;
+	}
+
+	status = acpi_get_table_phys(ACPI_SIG_HEST, 0, &hest_addr, &hest_len);
+	if (status == AE_NOT_FOUND)
+		goto err;
+	if (ACPI_FAILURE(status)) {
+		printk(XENLOG_ERR HEST_PFX "Failed to get table, %s\n",
+		       acpi_format_exception(status));
+		goto err;
+	}
+	map_pages_to_xen((unsigned long)__va(hest_addr), PFN_DOWN(hest_addr),
+			 PFN_UP(hest_addr + hest_len) - PFN_DOWN(hest_addr),
+			 PAGE_HYPERVISOR);
+	hest_tab = __va(hest_addr);
+
+	if (!acpi_disable_cmcff)
+		apei_hest_parse(hest_parse_cmc, NULL);
+
+	printk(XENLOG_INFO HEST_PFX "Table parsing has been initialized\n");
+	return;
+err:
+	hest_disable = 1;
+}
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1066,6 +1066,105 @@ void __hwdom_init setup_hwdom_pci_device
     spin_unlock(&pcidevs_lock);
 }
 
+#ifdef CONFIG_ACPI
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+static int hest_match_pci(const struct acpi_hest_aer_common *p,
+                          const struct pci_dev *pdev)
+{
+    return ACPI_HEST_SEGMENT(p->bus) == pdev->seg &&
+           ACPI_HEST_BUS(p->bus)     == pdev->bus &&
+           p->device                 == PCI_SLOT(pdev->devfn) &&
+           p->function               == PCI_FUNC(pdev->devfn);
+}
+
+static bool_t hest_match_type(const struct acpi_hest_header *hest_hdr,
+                              const struct pci_dev *pdev)
+{
+    unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus,
+                                           PCI_SLOT(pdev->devfn),
+                                           PCI_FUNC(pdev->devfn),
+                                           PCI_CAP_ID_EXP);
+    u8 pcie = (pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), pos + PCI_EXP_FLAGS) &
+               PCI_EXP_FLAGS_TYPE) /
+              (PCI_EXP_FLAGS_TYPE & -PCI_EXP_FLAGS_TYPE);
+
+    switch ( hest_hdr->type )
+    {
+    case ACPI_HEST_TYPE_AER_ROOT_PORT:
+        return pcie == PCI_EXP_TYPE_ROOT_PORT;
+    case ACPI_HEST_TYPE_AER_ENDPOINT:
+        return pcie == PCI_EXP_TYPE_ENDPOINT;
+    case ACPI_HEST_TYPE_AER_BRIDGE:
+        return pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CLASS_DEVICE) ==
+               PCI_CLASS_BRIDGE_PCI;
+    }
+
+    return 0;
+}
+
+struct aer_hest_parse_info {
+    const struct pci_dev *pdev;
+    bool_t firmware_first;
+};
+
+static bool_t hest_source_is_pcie_aer(const struct acpi_hest_header *hest_hdr)
+{
+    if ( hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE )
+        return 1;
+    return 0;
+}
+
+static int aer_hest_parse(const struct acpi_hest_header *hest_hdr, void *data)
+{
+    struct aer_hest_parse_info *info = data;
+    const struct acpi_hest_aer_common *p;
+    bool_t ff;
+
+    if ( !hest_source_is_pcie_aer(hest_hdr) )
+        return 0;
+
+    p = (const struct acpi_hest_aer_common *)(hest_hdr + 1);
+    ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST);
+
+    /*
+     * If no specific device is supplied, determine whether
+     * FIRMWARE_FIRST is set for *any* PCIe device.
+     */
+    if ( !info->pdev )
+    {
+        info->firmware_first |= ff;
+        return 0;
+    }
+
+    /* Otherwise, check the specific device */
+    if ( p->flags & ACPI_HEST_GLOBAL ?
+         hest_match_type(hest_hdr, info->pdev) :
+         hest_match_pci(p, info->pdev) )
+    {
+        info->firmware_first = ff;
+        return 1;
+    }
+
+    return 0;
+}
+
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *pdev)
+{
+    struct aer_hest_parse_info info = { .pdev = pdev };
+
+    return pci_find_cap_offset(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CAP_ID_EXP) &&
+           apei_hest_parse(aer_hest_parse, &info) >= 0 &&
+           info.firmware_first;
+}
+#endif
+
 static int _dump_pci_devices(struct pci_seg *pseg, void *arg)
 {
     struct pci_dev *pdev;
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -386,9 +386,10 @@ void pci_vtd_quirk(const struct pci_dev 
     int dev = PCI_SLOT(pdev->devfn);
     int func = PCI_FUNC(pdev->devfn);
     int pos;
-    u32 val;
+    u32 val, val2;
     u64 bar;
     paddr_t pa;
+    const char *action;
 
     if ( pci_conf_read16(seg, bus, dev, func, PCI_VENDOR_ID) !=
          PCI_VENDOR_ID_INTEL )
@@ -447,18 +448,26 @@ void pci_vtd_quirk(const struct pci_dev 
         }
 
         val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
-                         val | PCI_ERR_UNC_UNSUP);
-        val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
-                         val | PCI_ERR_COR_ADV_NFAT);
+        val2 = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
+        if ( (val & PCI_ERR_UNC_UNSUP) && (val2 & PCI_ERR_COR_ADV_NFAT) )
+            action = "Found masked";
+        else if ( !pcie_aer_get_firmware_first(pdev) )
+        {
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
+                             val | PCI_ERR_UNC_UNSUP);
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
+                             val2 | PCI_ERR_COR_ADV_NFAT);
+            action = "Masked";
+        }
+        else
+            action = "Cannot mask";
 
         /* XPUNCERRMSK Send Completion with Unsupported Request */
         val = pci_conf_read32(seg, bus, dev, func, 0x20c);
         pci_conf_write32(seg, bus, dev, func, 0x20c, val | (1 << 4));
 
-        printk(XENLOG_INFO "Masked UR signaling on %04x:%02x:%02x.%u\n",
-               seg, bus, dev, func);
+        printk(XENLOG_INFO "%s UR signaling on %04x:%02x:%02x.%u\n",
+               action, seg, bus, dev, func);
         break;
 
     case 0x100: case 0x104: case 0x108: /* Sandybridge */
--- a/xen/include/acpi/actbl1.h
+++ b/xen/include/acpi/actbl1.h
@@ -445,6 +445,14 @@ struct acpi_hest_aer_common {
 #define ACPI_HEST_FIRMWARE_FIRST        (1)
 #define ACPI_HEST_GLOBAL                (1<<1)
 
+/*
+ * Macros to access the bus/segment numbers in Bus field above:
+ *  Bus number is encoded in bits 7:0
+ *  Segment number is encoded in bits 23:8
+ */
+#define ACPI_HEST_BUS(bus)              ((bus) & 0xFF)
+#define ACPI_HEST_SEGMENT(bus)          (((bus) >> 8) & 0xFFFF)
+
 /* Hardware Error Notification */
 
 struct acpi_hest_notify {
--- a/xen/include/acpi/apei.h
+++ b/xen/include/acpi/apei.h
@@ -12,6 +12,9 @@
 
 #define FIX_APEI_RANGE_MAX 64
 
+typedef int (*apei_hest_func_t)(const struct acpi_hest_header *, void *);
+int apei_hest_parse(apei_hest_func_t, void *);
+
 int erst_write(const struct cper_record_header *record);
 ssize_t erst_get_record_count(void);
 int erst_get_next_record_id(u64 *record_id);
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -61,6 +61,7 @@ int acpi_boot_init (void);
 int acpi_boot_table_init (void);
 int acpi_numa_init (void);
 int erst_init(void);
+void acpi_hest_init(void);
 
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_table_handler handler);
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -144,6 +144,8 @@ int pci_find_next_ext_capability(int seg
 const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
                       unsigned int *dev, unsigned int *func);
 
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *);
+
 struct pirq;
 int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);
 void msixtbl_pt_unregister(struct domain *, struct pirq *);



[-- Attachment #2: VT-d-mask-UR-honor-firmware-first.patch --]
[-- Type: text/plain, Size: 14813 bytes --]

VT-d: honor firmware-first mode in XSA-59 workaround code

When firmware-first mode is being indicated by firmware, we shouldn't
be modifying AER registers - these are considered to be owned by
firmware in that case. Violating this is being reported to result in
SMI (or was it SCI?) storms. While circumventing the workaround means
re-exposing affected hosts to the XSA-59 issues, this in any event
seems better than not booting at all. Respective messages are being
issued to the log, so the situation can be diagnosed.

The basic building blocks were taken from Linux 3.15-rc.

Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reported-by: Malcolm Crossley <malcolm.crossley@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -785,6 +785,8 @@ int __init acpi_boot_init(void)
 
 	erst_init();
 
+	acpi_hest_init();
+
 	acpi_table_parse(ACPI_SIG_BGRT, acpi_invalidate_bgrt);
 
 	return 0;
--- a/xen/drivers/acpi/apei/Makefile
+++ b/xen/drivers/acpi/apei/Makefile
@@ -1,3 +1,4 @@
 obj-y += erst.o
+obj-y += hest.o
 obj-y += apei-base.o
 obj-y += apei-io.o
--- /dev/null
+++ b/xen/drivers/acpi/apei/hest.c
@@ -0,0 +1,200 @@
+/*
+ * APEI Hardware Error Source Table support
+ *
+ * HEST describes error sources in detail; communicates operational
+ * parameters (i.e. severity levels, masking bits, and threshold
+ * values) to Linux as necessary. It also allows the BIOS to report
+ * non-standard error sources to Linux (for example, chipset-specific
+ * error registers).
+ *
+ * For more information about HEST, please refer to ACPI Specification
+ * version 4.0, section 17.3.2.
+ *
+ * Copyright 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/errno.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/mm.h>
+#include <xen/pfn.h>
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+#include "apei-internal.h"
+
+#define HEST_PFX "HEST: "
+
+static bool_t hest_disable;
+boolean_param("hest_disable", hest_disable);
+
+/* HEST table parsing */
+
+static struct acpi_table_hest *__read_mostly hest_tab;
+
+static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
+	[ACPI_HEST_TYPE_IA32_CHECK] = -1,	/* need further calculation */
+	[ACPI_HEST_TYPE_IA32_CORRECTED_CHECK] = -1,
+	[ACPI_HEST_TYPE_IA32_NMI] = sizeof(struct acpi_hest_ia_nmi),
+	[ACPI_HEST_TYPE_AER_ROOT_PORT] = sizeof(struct acpi_hest_aer_root),
+	[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
+	[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
+	[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+};
+
+static int hest_esrc_len(const struct acpi_hest_header *hest_hdr)
+{
+	u16 hest_type = hest_hdr->type;
+	int len;
+
+	if (hest_type >= ACPI_HEST_TYPE_RESERVED)
+		return 0;
+
+	len = hest_esrc_len_tab[hest_type];
+
+	if (hest_type == ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) {
+		const struct acpi_hest_ia_corrected *cmc =
+			container_of(hest_hdr,
+				     const struct acpi_hest_ia_corrected,
+				     header);
+
+		len = sizeof(*cmc) + cmc->num_hardware_banks *
+		      sizeof(struct acpi_hest_ia_error_bank);
+	} else if (hest_type == ACPI_HEST_TYPE_IA32_CHECK) {
+		const struct acpi_hest_ia_machine_check *mc =
+			container_of(hest_hdr,
+				     const struct acpi_hest_ia_machine_check,
+				     header);
+
+		len = sizeof(*mc) + mc->num_hardware_banks *
+		      sizeof(struct acpi_hest_ia_error_bank);
+	}
+	BUG_ON(len == -1);
+
+	return len;
+};
+
+int apei_hest_parse(apei_hest_func_t func, void *data)
+{
+	struct acpi_hest_header *hest_hdr;
+	int i, rc, len;
+
+	if (hest_disable || !hest_tab)
+		return -EINVAL;
+
+	hest_hdr = (struct acpi_hest_header *)(hest_tab + 1);
+	for (i = 0; i < hest_tab->error_source_count; i++) {
+		len = hest_esrc_len(hest_hdr);
+		if (!len) {
+			printk(XENLOG_WARNING HEST_PFX
+			       "Unknown or unused hardware error source "
+			       "type: %d for hardware error source: %d\n",
+			       hest_hdr->type, hest_hdr->source_id);
+			return -EINVAL;
+		}
+		if ((void *)hest_hdr + len >
+		    (void *)hest_tab + hest_tab->header.length) {
+			printk(XENLOG_WARNING HEST_PFX
+			       "Table contents overflow for hardware error source: %d\n",
+			       hest_hdr->source_id);
+			return -EINVAL;
+		}
+
+		rc = func(hest_hdr, data);
+		if (rc)
+			return rc;
+
+		hest_hdr = (void *)hest_hdr + len;
+	}
+
+	return 0;
+}
+
+/*
+ * Check if firmware advertises firmware first mode. We need FF bit to be set
+ * along with a set of MC banks which work in FF mode.
+ */
+static int __init hest_parse_cmc(const struct acpi_hest_header *hest_hdr,
+				 void *data)
+{
+#ifdef CONFIG_X86_MCE
+	unsigned int i;
+	const struct acpi_hest_ia_corrected *cmc;
+	const struct acpi_hest_ia_error_bank *mc_bank;
+
+	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+		return 0;
+
+	cmc = container_of(hest_hdr, const struct acpi_hest_ia_corrected, header);
+	if (!cmc->enabled)
+		return 0;
+
+	/*
+	 * We expect HEST to provide a list of MC banks that report errors
+	 * in firmware first mode. Otherwise, return non-zero value to
+	 * indicate that we are done parsing HEST.
+	 */
+	if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks)
+		return 1;
+
+	printk(XENLOG_INFO HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
+
+	mc_bank = (const struct acpi_hest_ia_error_bank *)(cmc + 1);
+	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+		mce_disable_bank(mc_bank->bank_number);
+#else
+# define acpi_disable_cmcff 1
+#endif
+
+	return 1;
+}
+
+void __init acpi_hest_init(void)
+{
+	acpi_status status;
+	acpi_physical_address hest_addr;
+	acpi_native_uint hest_len;
+
+	if (acpi_disabled)
+		return;
+
+	if (hest_disable) {
+		printk(XENLOG_INFO HEST_PFX "Table parsing disabled.\n");
+		return;
+	}
+
+	status = acpi_get_table_phys(ACPI_SIG_HEST, 0, &hest_addr, &hest_len);
+	if (status == AE_NOT_FOUND)
+		goto err;
+	if (ACPI_FAILURE(status)) {
+		printk(XENLOG_ERR HEST_PFX "Failed to get table, %s\n",
+		       acpi_format_exception(status));
+		goto err;
+	}
+	map_pages_to_xen((unsigned long)__va(hest_addr), PFN_DOWN(hest_addr),
+			 PFN_UP(hest_addr + hest_len) - PFN_DOWN(hest_addr),
+			 PAGE_HYPERVISOR);
+	hest_tab = __va(hest_addr);
+
+	if (!acpi_disable_cmcff)
+		apei_hest_parse(hest_parse_cmc, NULL);
+
+	printk(XENLOG_INFO HEST_PFX "Table parsing has been initialized\n");
+	return;
+err:
+	hest_disable = 1;
+}
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1066,6 +1066,105 @@ void __hwdom_init setup_hwdom_pci_device
     spin_unlock(&pcidevs_lock);
 }
 
+#ifdef CONFIG_ACPI
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+static int hest_match_pci(const struct acpi_hest_aer_common *p,
+                          const struct pci_dev *pdev)
+{
+    return ACPI_HEST_SEGMENT(p->bus) == pdev->seg &&
+           ACPI_HEST_BUS(p->bus)     == pdev->bus &&
+           p->device                 == PCI_SLOT(pdev->devfn) &&
+           p->function               == PCI_FUNC(pdev->devfn);
+}
+
+static bool_t hest_match_type(const struct acpi_hest_header *hest_hdr,
+                              const struct pci_dev *pdev)
+{
+    unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus,
+                                           PCI_SLOT(pdev->devfn),
+                                           PCI_FUNC(pdev->devfn),
+                                           PCI_CAP_ID_EXP);
+    u8 pcie = (pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), pos + PCI_EXP_FLAGS) &
+               PCI_EXP_FLAGS_TYPE) /
+              (PCI_EXP_FLAGS_TYPE & -PCI_EXP_FLAGS_TYPE);
+
+    switch ( hest_hdr->type )
+    {
+    case ACPI_HEST_TYPE_AER_ROOT_PORT:
+        return pcie == PCI_EXP_TYPE_ROOT_PORT;
+    case ACPI_HEST_TYPE_AER_ENDPOINT:
+        return pcie == PCI_EXP_TYPE_ENDPOINT;
+    case ACPI_HEST_TYPE_AER_BRIDGE:
+        return pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CLASS_DEVICE) ==
+               PCI_CLASS_BRIDGE_PCI;
+    }
+
+    return 0;
+}
+
+struct aer_hest_parse_info {
+    const struct pci_dev *pdev;
+    bool_t firmware_first;
+};
+
+static bool_t hest_source_is_pcie_aer(const struct acpi_hest_header *hest_hdr)
+{
+    if ( hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE )
+        return 1;
+    return 0;
+}
+
+static int aer_hest_parse(const struct acpi_hest_header *hest_hdr, void *data)
+{
+    struct aer_hest_parse_info *info = data;
+    const struct acpi_hest_aer_common *p;
+    bool_t ff;
+
+    if ( !hest_source_is_pcie_aer(hest_hdr) )
+        return 0;
+
+    p = (const struct acpi_hest_aer_common *)(hest_hdr + 1);
+    ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST);
+
+    /*
+     * If no specific device is supplied, determine whether
+     * FIRMWARE_FIRST is set for *any* PCIe device.
+     */
+    if ( !info->pdev )
+    {
+        info->firmware_first |= ff;
+        return 0;
+    }
+
+    /* Otherwise, check the specific device */
+    if ( p->flags & ACPI_HEST_GLOBAL ?
+         hest_match_type(hest_hdr, info->pdev) :
+         hest_match_pci(p, info->pdev) )
+    {
+        info->firmware_first = ff;
+        return 1;
+    }
+
+    return 0;
+}
+
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *pdev)
+{
+    struct aer_hest_parse_info info = { .pdev = pdev };
+
+    return pci_find_cap_offset(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CAP_ID_EXP) &&
+           apei_hest_parse(aer_hest_parse, &info) >= 0 &&
+           info.firmware_first;
+}
+#endif
+
 static int _dump_pci_devices(struct pci_seg *pseg, void *arg)
 {
     struct pci_dev *pdev;
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -386,9 +386,10 @@ void pci_vtd_quirk(const struct pci_dev 
     int dev = PCI_SLOT(pdev->devfn);
     int func = PCI_FUNC(pdev->devfn);
     int pos;
-    u32 val;
+    u32 val, val2;
     u64 bar;
     paddr_t pa;
+    const char *action;
 
     if ( pci_conf_read16(seg, bus, dev, func, PCI_VENDOR_ID) !=
          PCI_VENDOR_ID_INTEL )
@@ -447,18 +448,26 @@ void pci_vtd_quirk(const struct pci_dev 
         }
 
         val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
-                         val | PCI_ERR_UNC_UNSUP);
-        val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
-                         val | PCI_ERR_COR_ADV_NFAT);
+        val2 = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
+        if ( (val & PCI_ERR_UNC_UNSUP) && (val2 & PCI_ERR_COR_ADV_NFAT) )
+            action = "Found masked";
+        else if ( !pcie_aer_get_firmware_first(pdev) )
+        {
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
+                             val | PCI_ERR_UNC_UNSUP);
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
+                             val2 | PCI_ERR_COR_ADV_NFAT);
+            action = "Masked";
+        }
+        else
+            action = "Cannot mask";
 
         /* XPUNCERRMSK Send Completion with Unsupported Request */
         val = pci_conf_read32(seg, bus, dev, func, 0x20c);
         pci_conf_write32(seg, bus, dev, func, 0x20c, val | (1 << 4));
 
-        printk(XENLOG_INFO "Masked UR signaling on %04x:%02x:%02x.%u\n",
-               seg, bus, dev, func);
+        printk(XENLOG_INFO "%s UR signaling on %04x:%02x:%02x.%u\n",
+               action, seg, bus, dev, func);
         break;
 
     case 0x100: case 0x104: case 0x108: /* Sandybridge */
--- a/xen/include/acpi/actbl1.h
+++ b/xen/include/acpi/actbl1.h
@@ -445,6 +445,14 @@ struct acpi_hest_aer_common {
 #define ACPI_HEST_FIRMWARE_FIRST        (1)
 #define ACPI_HEST_GLOBAL                (1<<1)
 
+/*
+ * Macros to access the bus/segment numbers in Bus field above:
+ *  Bus number is encoded in bits 7:0
+ *  Segment number is encoded in bits 23:8
+ */
+#define ACPI_HEST_BUS(bus)              ((bus) & 0xFF)
+#define ACPI_HEST_SEGMENT(bus)          (((bus) >> 8) & 0xFFFF)
+
 /* Hardware Error Notification */
 
 struct acpi_hest_notify {
--- a/xen/include/acpi/apei.h
+++ b/xen/include/acpi/apei.h
@@ -12,6 +12,9 @@
 
 #define FIX_APEI_RANGE_MAX 64
 
+typedef int (*apei_hest_func_t)(const struct acpi_hest_header *, void *);
+int apei_hest_parse(apei_hest_func_t, void *);
+
 int erst_write(const struct cper_record_header *record);
 ssize_t erst_get_record_count(void);
 int erst_get_next_record_id(u64 *record_id);
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -61,6 +61,7 @@ int acpi_boot_init (void);
 int acpi_boot_table_init (void);
 int acpi_numa_init (void);
 int erst_init(void);
+void acpi_hest_init(void);
 
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_table_handler handler);
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -144,6 +144,8 @@ int pci_find_next_ext_capability(int seg
 const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
                       unsigned int *dev, unsigned int *func);
 
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *);
+
 struct pirq;
 int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);
 void msixtbl_pt_unregister(struct domain *, struct pirq *);

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-21 16:09 [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code Jan Beulich
@ 2014-05-21 16:33 ` Andrew Cooper
  2014-05-22  7:13   ` Jan Beulich
  2014-05-23  2:32 ` Zhang, Yang Z
  2014-05-26 10:19 ` [PATCH v2 " Jan Beulich
  2 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2014-05-21 16:33 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Yang Z Zhang, xen-devel, Malcolm Crossley, Kevin Tian,
	Donald D Dugger

On 21/05/14 17:09, Jan Beulich wrote:
> When firmware-first mode is being indicated by firmware, we shouldn't
> be modifying AER registers - these ar considered to be owned by

"these are"

> firmware in that case. Violating this is being reported to result in
> SMI (or was it SCI?) storms.

SMM livelock.

The exact problem was a 2.6.32 kernel ignoring firmware-first and
disabling reporting on the root ports but not endpoint devices, causing
the SMM handler to fail to find the issue, and fail to clear the SMI.

However, as part of the investigation, we identified that Xen was also
violating firmware-first as part of XSA-59

~Andrew

>  While circumventing the workaround means
> re-exposing affected hosts to the XSA-59 issues, this in any event
> seems better than not booting at all. Respective messages are being
> issued to the log, so the situation can be diagnosed.
>
> The basic building blocks were taken from Linux 3.15-rc.
>
> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reported-by: Malcolm Crossley <malcolm.crossley@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-21 16:33 ` Andrew Cooper
@ 2014-05-22  7:13   ` Jan Beulich
  2014-05-22  9:34     ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2014-05-22  7:13 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Yang Z Zhang, xen-devel, Malcolm Crossley, Kevin Tian,
	Donald D Dugger

>>> On 21.05.14 at 18:33, <andrew.cooper3@citrix.com> wrote:
> On 21/05/14 17:09, Jan Beulich wrote:
>> When firmware-first mode is being indicated by firmware, we shouldn't
>> be modifying AER registers - these ar considered to be owned by
> 
> "these are"
> 
>> firmware in that case. Violating this is being reported to result in
>> SMI (or was it SCI?) storms.
> 
> SMM livelock.

IOW "SMI storm".

> The exact problem was a 2.6.32 kernel ignoring firmware-first and
> disabling reporting on the root ports but not endpoint devices, causing
> the SMM handler to fail to find the issue, and fail to clear the SMI.
> 
> However, as part of the investigation, we identified that Xen was also
> violating firmware-first as part of XSA-59

So has a problem solely with the XSA-59 workaround (i.e. without an
ignorant kernel) then been observed at all?

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-22  7:13   ` Jan Beulich
@ 2014-05-22  9:34     ` Andrew Cooper
  2014-05-22 10:06       ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2014-05-22  9:34 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Yang Z Zhang, xen-devel, Malcolm Crossley, Kevin Tian,
	Donald D Dugger

On 22/05/14 08:13, Jan Beulich wrote:
>>>> On 21.05.14 at 18:33, <andrew.cooper3@citrix.com> wrote:
>> On 21/05/14 17:09, Jan Beulich wrote:
>>> When firmware-first mode is being indicated by firmware, we shouldn't
>>> be modifying AER registers - these ar considered to be owned by
>> "these are"
>>
>>> firmware in that case. Violating this is being reported to result in
>>> SMI (or was it SCI?) storms.
>> SMM livelock.
> IOW "SMI storm".

It wasn't completely clear from the BIOS engineer's description whether
it was a single infinite loop in SMM mode, or an SMI storm, but the
overall effects are still the same.

>
>> The exact problem was a 2.6.32 kernel ignoring firmware-first and
>> disabling reporting on the root ports but not endpoint devices, causing
>> the SMM handler to fail to find the issue, and fail to clear the SMI.
>>
>> However, as part of the investigation, we identified that Xen was also
>> violating firmware-first as part of XSA-59
> So has a problem solely with the XSA-59 workaround (i.e. without an
> ignorant kernel) then been observed at all?
>
> Jan
>

Correct

~Andrew

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-22  9:34     ` Andrew Cooper
@ 2014-05-22 10:06       ` Jan Beulich
  2014-05-22 10:19         ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2014-05-22 10:06 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Yang Z Zhang, xen-devel, Malcolm Crossley, Kevin Tian,
	Donald D Dugger

>>> On 22.05.14 at 11:34, <andrew.cooper3@citrix.com> wrote:
> On 22/05/14 08:13, Jan Beulich wrote:
>>>>> On 21.05.14 at 18:33, <andrew.cooper3@citrix.com> wrote:
>>> The exact problem was a 2.6.32 kernel ignoring firmware-first and
>>> disabling reporting on the root ports but not endpoint devices, causing
>>> the SMM handler to fail to find the issue, and fail to clear the SMI.
>>>
>>> However, as part of the investigation, we identified that Xen was also
>>> violating firmware-first as part of XSA-59
>> So has a problem solely with the XSA-59 workaround (i.e. without an
>> ignorant kernel) then been observed at all?
> 
> Correct

"Correct" isn't really a meaningful answer to the question; I guess
you mean "yes".

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-22 10:06       ` Jan Beulich
@ 2014-05-22 10:19         ` Andrew Cooper
  2014-05-22 10:33           ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2014-05-22 10:19 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Yang Z Zhang, xen-devel, Malcolm Crossley, Kevin Tian,
	Donald D Dugger

On 22/05/14 11:06, Jan Beulich wrote:
>>>> On 22.05.14 at 11:34, <andrew.cooper3@citrix.com> wrote:
>> On 22/05/14 08:13, Jan Beulich wrote:
>>>>>> On 21.05.14 at 18:33, <andrew.cooper3@citrix.com> wrote:
>>>> The exact problem was a 2.6.32 kernel ignoring firmware-first and
>>>> disabling reporting on the root ports but not endpoint devices, causing
>>>> the SMM handler to fail to find the issue, and fail to clear the SMI.
>>>>
>>>> However, as part of the investigation, we identified that Xen was also
>>>> violating firmware-first as part of XSA-59
>>> So has a problem solely with the XSA-59 workaround (i.e. without an
>>> ignorant kernel) then been observed at all?
>> Correct
> "Correct" isn't really a meaningful answer to the question; I guess
> you mean "yes".
>
> Jan
>

Sorry - I misread the question.

No.  We have not observed an issue from XSA-59.

The version of XenServer we had the issue with didn't contain any of the
XSA-59 fixes at the point that the problem was observed.

~Andrew

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-22 10:19         ` Andrew Cooper
@ 2014-05-22 10:33           ` Jan Beulich
  2014-05-22 14:19             ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2014-05-22 10:33 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Yang Z Zhang, xen-devel, Malcolm Crossley, Kevin Tian,
	Donald D Dugger

>>> On 22.05.14 at 12:19, <andrew.cooper3@citrix.com> wrote:
> No.  We have not observed an issue from XSA-59.
> 
> The version of XenServer we had the issue with didn't contain any of the
> XSA-59 fixes at the point that the problem was observed.

Then what was yesterday's alert about? I.e. do we have any
indication that the workaround as is may cause problems, and that
hence the (relatively involved) patch here is needed at all? And, how
are you intending to test this patch if you haven't even seen an
issue?

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-22 10:33           ` Jan Beulich
@ 2014-05-22 14:19             ` Andrew Cooper
  2014-05-23  1:03               ` Zhang, Yang Z
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2014-05-22 14:19 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Yang Z Zhang, xen-devel, Malcolm Crossley, Kevin Tian,
	Donald D Dugger

On 22/05/14 11:33, Jan Beulich wrote:
>>>> On 22.05.14 at 12:19, <andrew.cooper3@citrix.com> wrote:
>> No.  We have not observed an issue from XSA-59.
>>
>> The version of XenServer we had the issue with didn't contain any of the
>> XSA-59 fixes at the point that the problem was observed.
> Then what was yesterday's alert about then? I.e. do we have any
> indication that the workaround as is may cause problems, and that
> hence the (relatively involved) patch here is needed at all? And, how
> are you intending to test this patch if you haven't even seen an
> issue?
>
> Jan
>

As part of finding the root cause of our issue, we identified that just
as Dom0 must not play with AER in firmware first mode, Xen must not play
either.

I believe that we have XSA-59 affected hardware with both firmware-first
and non-firmware-first HEST tables, so we should be able to
behaviourally test the patch.

~Andrew

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-22 14:19             ` Andrew Cooper
@ 2014-05-23  1:03               ` Zhang, Yang Z
  2014-05-23  6:13                 ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Zhang, Yang Z @ 2014-05-23  1:03 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich
  Cc: xen-devel, Malcolm Crossley, Tian, Kevin, Dugger, Donald D

Andrew Cooper wrote on 2014-05-22:
> On 22/05/14 11:33, Jan Beulich wrote:
>>>>> On 22.05.14 at 12:19, <andrew.cooper3@citrix.com> wrote:
>>> No.  We have not observed an issue from XSA-59.
>>> 
>>> The version of XenServer we had the issue with didn't contain any
>>> of the
>>> XSA-59 fixes at the point that the problem was observed.
>> Then what was yesterday's alert about then? I.e. do we have any
>> indication that the workaround as is may cause problems, and that
>> hence the (relatively involved) patch here is needed at all? And,
>> how are you intending to test this patch if you haven't even seen an
>> issue?
>> 
>> Jan
>> 
> 
> As part of finding the root cause of our issue, we identified that
> just as Dom0 must not play with AER in firmware first mode, Xen must not play either.

I see that upstream Linux has had a patch to handle this case since 2009.

commit 0584396157ad2d008e2cc76b4ed6254151183a25
Author: Matt Domsch <Matt_Domsch@dell.com>
Date:   Mon Nov 2 11:51:24 2009 -0600

    PCI: PCIe AER: honor ACPI HEST FIRMWARE FIRST mode

    Feedback from Hidetoshi Seto and Kenji Kaneshige incorporated.  This
    correctly handles PCI-X bridges, PCIe root ports and endpoints, and
    prints debug messages when invalid/reserved types are found in the
    HEST.  PCI devices not in domain/segment 0 are not represented in
    HEST, thus will be ignored.

    Today, the PCIe Advanced Error Reporting (AER) driver attaches itself
    to every PCIe root port for which BIOS reports it should, via ACPI
    _OSC.

    However, _OSC alone is insufficient for newer BIOSes.  Part of ACPI
    4.0 is the new APEI (ACPI Platform Error Interfaces) which is a way
    for OS and BIOS to handshake over which errors for which components
    each will handle.  One table in ACPI 4.0 is the Hardware Error Source
    Table (HEST), where BIOS can define that errors for certain PCIe
    devices (or all devices), should be handled by BIOS ("Firmware First
    mode"), rather than be handled by the OS.

    Dell PowerEdge 11G server BIOS defines Firmware First mode in HEST, so
    that it may manage such errors, log them to the System Event Log, and
    possibly take other actions.  The aer driver should honor this, and
    not attach itself to devices noted as such.

    Furthermore, Kenji Kaneshige reminded us to disallow changing the AER
    registers when respecting Firmware First mode.  Platform firmware is
    expected to manage these, and if changes to them are allowed, it could
    break that firmware's behavior.

    The HEST parsing code may be replaced in the future by a more
    feature-rich implementation.  This patch provides the minimum needed
    to prevent breakage until that implementation is available.
> 
> I believe that we have XSA-59 affected hardware with both
> firmware-first and non-firmware-first HEST tables, so we should be

Why is non-firmware-first hardware also affected? It seems only firmware-first hardware is buggy.

> able to behaviourally test the patch.
> 
> ~Andrew


Best regards,
Yang

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-23  1:03               ` Zhang, Yang Z
@ 2014-05-23  6:13                 ` Jan Beulich
  2014-05-23  6:40                   ` Zhang, Yang Z
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2014-05-23  6:13 UTC (permalink / raw)
  To: Andrew Cooper, Yang Z Zhang
  Cc: xen-devel, Malcolm Crossley, Kevin Tian, Donald D Dugger

>>> On 23.05.14 at 03:03, <yang.z.zhang@intel.com> wrote:
> Andrew Cooper wrote on 2014-05-22:
>> On 22/05/14 11:33, Jan Beulich wrote:
>>>>>> On 22.05.14 at 12:19, <andrew.cooper3@citrix.com> wrote:
>>>> No.  We have not observed an issue from XSA-59.
>>>> 
>>>> The version of XenServer we had the issue with didn't contain any
>>>> of the
>>>> XSA-59 fixes at the point that the problem was observed.
>>> Then what was yesterday's alert about then? I.e. do we have any
>>> indication that the workaround as is may cause problems, and that
>>> hence the (relatively involved) patch here is needed at all? And,
>>> how are you intending to test this patch if you haven't even seen an
>>> issue?
>> 
>> As part of finding the root cause of our issue, we identified that
>> just as Dom0 must not play with AER in firmware first mode, Xen must not 
> play either.
> 
> I saw upstream Linux has the patch to handle this case in 2009.

Yes, yet all patches so far went through without anyone (me
included) pointing this out during review.

>> I believe that we have XSA-59 affected hardware with both
>> firmware-first and non-firmware-first HEST tables, so we should be
> 
> why non-firmware-first hardware also affected? It seems only firmware-first 
> hardware is buggy.

"Buggy" is perhaps the wrong term here; "problematic" would seem a
better fit. And I think you misread Andrew's reply - we certainly need
to test the patch on both HEST-with-FF and HEST-without-FF systems,
even if only the former would suffer from the presumed regression.

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-23  6:13                 ` Jan Beulich
@ 2014-05-23  6:40                   ` Zhang, Yang Z
  0 siblings, 0 replies; 18+ messages in thread
From: Zhang, Yang Z @ 2014-05-23  6:40 UTC (permalink / raw)
  To: Jan Beulich, Andrew Cooper
  Cc: xen-devel, Malcolm Crossley, Tian, Kevin, Dugger, Donald D

Jan Beulich wrote on 2014-05-23:
>>>> On 23.05.14 at 03:03, <yang.z.zhang@intel.com> wrote:
>> Andrew Cooper wrote on 2014-05-22:
>>> On 22/05/14 11:33, Jan Beulich wrote:
>>>>>>> On 22.05.14 at 12:19, <andrew.cooper3@citrix.com> wrote:
>>>>> No.  We have not observed an issue from XSA-59.
>>>>> 
>>>>> The version of XenServer we had the issue with didn't contain any
>>>>> of the
>>>>> XSA-59 fixes at the point that the problem was observed.
>>>> Then what was yesterday's alert about then? I.e. do we have any
>>>> indication that the workaround as is may cause problems, and that
>>>> hence the (relatively involved) patch here is needed at all? And,
>>>> how are you intending to test this patch if you haven't even seen
>>>> an issue?
>>> 
>>> As part of finding the root cause of our issue, we identified that
>>> just as Dom0 must not play with AER in firmware first mode, Xen
>>> must not
>> play either.
>> 
>> I saw upstream Linux has the patch to handle this case in 2009.
> 
> Yes, yet all patches so far went through without anyone (me
> included) pointing this out during review.
> 
>>> I believe that we have XSA-59 affected hardware with both
>>> firmware-first and non-firmware-first HEST tables, so we should be
>> 
>> why non-firmware-first hardware also affected? It seems only
>> firmware-first hardware is buggy.
> 
> "Buggy" is perhaps the wrong term here; "problematic" would seem a
> better fit. And I think you misread Andrew's reply - we certainly need
> to test the patch on both HEST-with-FF and HEST-without-FF systems,
> even if only the former would suffer from the presumed regression.
> 

Yes, we definitely need to test it on both kinds of machines. From all your previous discussions, the cause is that the XSA-59 workaround touches the AER registers without considering whether they are owned by firmware. But if the hardware doesn't support firmware-first, then there is no problem.

Best regards,
Yang

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-21 16:09 [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code Jan Beulich
  2014-05-21 16:33 ` Andrew Cooper
@ 2014-05-23  2:32 ` Zhang, Yang Z
  2014-05-23  6:22   ` Jan Beulich
  2014-05-26 10:19 ` [PATCH v2 " Jan Beulich
  2 siblings, 1 reply; 18+ messages in thread
From: Zhang, Yang Z @ 2014-05-23  2:32 UTC (permalink / raw)
  To: Jan Beulich, Andrew Cooper, Malcolm Crossley
  Cc: xen-devel, Tian, Kevin, Dugger, Donald D

Jan Beulich wrote on 2014-05-22:
> When firmware-first mode is being indicated by firmware, we shouldn't be
> modifying AER registers - these ar considered to be owned by firmware in that
> case. Violating this is being reported to result in SMI (or was it SCI?) storms.
> While circumventing the workaround means re-exposing affected hosts to the
> XSA-59 issues, this in any event seems better than not booting at all.
> Respective messages are being issued to the log, so the situation can be
> diagnosed.
> 
> The basic building blocks were taken from Linux 3.15-rc.
> 
> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reported-by: Malcolm Crossley <malcolm.crossley@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/acpi/boot.c
> +++ b/xen/arch/x86/acpi/boot.c
> @@ -785,6 +785,8 @@ int __init acpi_boot_init(void)
> 
>  	erst_init();
> 
> +	acpi_hest_init();
> +
>  	acpi_table_parse(ACPI_SIG_BGRT, acpi_invalidate_bgrt);
> 
>  	return 0;
> --- a/xen/drivers/acpi/apei/Makefile
> +++ b/xen/drivers/acpi/apei/Makefile
> @@ -1,3 +1,4 @@
>  obj-y += erst.o
> +obj-y += hest.o
>  obj-y += apei-base.o
>  obj-y += apei-io.o
> --- /dev/null
> +++ b/xen/drivers/acpi/apei/hest.c
> @@ -0,0 +1,200 @@
> +/*
> + * APEI Hardware Error Souce Table support
> + *
> + * HEST describes error sources in detail; communicates operational
> + * parameters (i.e. severity levels, masking bits, and threshold
> + * values) to Linux as necessary. It also allows the BIOS to report
> + * non-standard error sources to Linux (for example, chipset-specific
> + * error registers).
> + *
> + * For more information about HEST, please refer to ACPI Specification
> + * version 4.0, section 17.3.2.
> + *
> + * Copyright 2009 Intel Corp.
> + *   Author: Huang Ying <ying.huang@intel.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License version
> + * 2 as published by the Free Software Foundation;
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
> +USA  */
> +
> +#include <xen/errno.h>
> +#include <xen/init.h>
> +#include <xen/kernel.h>
> +#include <xen/mm.h>
> +#include <xen/pfn.h>
> +#include <acpi/acpi.h>
> +#include <acpi/apei.h>
> +
> +#include "apei-internal.h"
> +
> +#define HEST_PFX "HEST: "
> +
> +static bool_t hest_disable;
> +boolean_param("hest_disable", hest_disable);
> +
> +/* HEST table parsing */
> +
> +static struct acpi_table_hest *__read_mostly hest_tab;
> +
> +static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
> +	[ACPI_HEST_TYPE_IA32_CHECK] = -1,	/* need further calculation */
> +	[ACPI_HEST_TYPE_IA32_CORRECTED_CHECK] = -1,
> +	[ACPI_HEST_TYPE_IA32_NMI] = sizeof(struct acpi_hest_ia_nmi),
> +	[ACPI_HEST_TYPE_AER_ROOT_PORT] = sizeof(struct acpi_hest_aer_root),
> +	[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
> +	[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
> +	[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct
> acpi_hest_generic), };
> +
> +static int hest_esrc_len(const struct acpi_hest_header *hest_hdr) {
> +	u16 hest_type = hest_hdr->type;
> +	int len;
> +
> +	if (hest_type >= ACPI_HEST_TYPE_RESERVED)
> +		return 0;
> +
> +	len = hest_esrc_len_tab[hest_type];
> +
> +	if (hest_type == ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) {
> +		const struct acpi_hest_ia_corrected *cmc =
> +			container_of(hest_hdr,
> +				     const struct acpi_hest_ia_corrected,
> +				     header);
> +
> +		len = sizeof(*cmc) + cmc->num_hardware_banks *
> +		      sizeof(struct acpi_hest_ia_error_bank);
> +	} else if (hest_type == ACPI_HEST_TYPE_IA32_CHECK) {
> +		const struct acpi_hest_ia_machine_check *mc =
> +			container_of(hest_hdr,
> +				     const struct acpi_hest_ia_machine_check,
> +				     header);
> +
> +		len = sizeof(*mc) + mc->num_hardware_banks *
> +		      sizeof(struct acpi_hest_ia_error_bank);
> +	}
> +	BUG_ON(len == -1);
> +
> +	return len;
> +};
> +
> +int apei_hest_parse(apei_hest_func_t func, void *data) {
> +	struct acpi_hest_header *hest_hdr;
> +	int i, rc, len;
> +
> +	if (hest_disable || !hest_tab)
> +		return -EINVAL;
> +
> +	hest_hdr = (struct acpi_hest_header *)(hest_tab + 1);
> +	for (i = 0; i < hest_tab->error_source_count; i++) {
> +		len = hest_esrc_len(hest_hdr);
> +		if (!len) {
> +			printk(XENLOG_WARNING HEST_PFX
> +			       "Unknown or unused hardware error source "
> +			       "type: %d for hardware error source: %d\n",
> +			       hest_hdr->type, hest_hdr->source_id);
> +			return -EINVAL;
> +		}
> +		if ((void *)hest_hdr + len >
> +		    (void *)hest_tab + hest_tab->header.length) {
> +			printk(XENLOG_WARNING HEST_PFX
> +			       "Table contents overflow for hardware error
> source: %d\n",
> +			       hest_hdr->source_id);
> +			return -EINVAL;
> +		}
> +
> +		rc = func(hest_hdr, data);
> +		if (rc)
> +			return rc;
> +
> +		hest_hdr = (void *)hest_hdr + len;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Check if firmware advertises firmware first mode. We need FF bit to
> +be set
> + * along with a set of MC banks which work in FF mode.
> + */
> +static int __init hest_parse_cmc(const struct acpi_hest_header *hest_hdr,
> +				 void *data)
> +{
> +#ifdef CONFIG_X86_MCE

I didn't find where CONFIG_X86_MCE is defined. Do you have another patch to define it?

> +	unsigned int i;
> +	const struct acpi_hest_ia_corrected *cmc;
> +	const struct acpi_hest_ia_error_bank *mc_bank;
> +
> +	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
> +		return 0;
> +
> +	cmc = container_of(hest_hdr, const struct acpi_hest_ia_corrected,
> header);
> +	if (!cmc->enabled)
> +		return 0;
> +
> +	/*
> +	 * We expect HEST to provide a list of MC banks that report errors
> +	 * in firmware first mode. Otherwise, return non-zero value to
> +	 * indicate that we are done parsing HEST.
> +	 */
> +	if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST)
> || !cmc->num_hardware_banks)
> +		return 1;
> +
> +	printk(XENLOG_INFO HEST_PFX "Enabling Firmware First mode for
> +corrected errors.\n");
> +
> +	mc_bank = (const struct acpi_hest_ia_error_bank *)(cmc + 1);
> +	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
> +		mce_disable_bank(mc_bank->bank_number);

Also, no definition of mce_disable_bank can be found.

> +#else
> +# define acpi_disable_cmcff 1

You didn't define acpi_disable_cmcff for the case where CONFIG_X86_MCE is defined. That will cause a build error.

> +#endif
> +
> +	return 1;
> +}
> +
> +void __init acpi_hest_init(void)
> +{
> +	acpi_status status;
> +	acpi_physical_address hest_addr;
> +	acpi_native_uint hest_len;
> +
> +	if (acpi_disabled)
> +		return;
> +
> +	if (hest_disable) {
> +		printk(XENLOG_INFO HEST_PFX "Table parsing disabled.\n");
> +		return;
> +	}
> +
> +	status = acpi_get_table_phys(ACPI_SIG_HEST, 0, &hest_addr, &hest_len);
> +	if (status == AE_NOT_FOUND)
> +		goto err;
> +	if (ACPI_FAILURE(status)) {
> +		printk(XENLOG_ERR HEST_PFX "Failed to get table, %s\n",
> +		       acpi_format_exception(status));
> +		goto err;
> +	}
> +	map_pages_to_xen((unsigned long)__va(hest_addr),
> PFN_DOWN(hest_addr),
> +			 PFN_UP(hest_addr + hest_len) - PFN_DOWN(hest_addr),
> +			 PAGE_HYPERVISOR);
> +	hest_tab = __va(hest_addr);
> +
> +	if (!acpi_disable_cmcff)
> +		apei_hest_parse(hest_parse_cmc, NULL);
> +
> +	printk(XENLOG_INFO HEST_PFX "Table parsing has been initialized\n");
> +	return;
> +err:
> +	hest_disable = 1;
> +}
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -1066,6 +1066,105 @@ void __hwdom_init setup_hwdom_pci_device
>      spin_unlock(&pcidevs_lock);
>  }
> 
> +#ifdef CONFIG_ACPI
> +#include <acpi/acpi.h>
> +#include <acpi/apei.h>
> +
> +static int hest_match_pci(const struct acpi_hest_aer_common *p,
> +                          const struct pci_dev *pdev) {
> +    return ACPI_HEST_SEGMENT(p->bus) == pdev->seg &&
> +           ACPI_HEST_BUS(p->bus)     == pdev->bus &&
> +           p->device                 == PCI_SLOT(pdev->devfn) &&
> +           p->function               == PCI_FUNC(pdev->devfn);
> +}
> +
> +static bool_t hest_match_type(const struct acpi_hest_header *hest_hdr,
> +                              const struct pci_dev *pdev) {
> +    unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus,
> +                                           PCI_SLOT(pdev->devfn),
> +                                           PCI_FUNC(pdev->devfn),
> +                                           PCI_CAP_ID_EXP);
> +    u8 pcie = (pci_conf_read16(pdev->seg, pdev->bus,
> PCI_SLOT(pdev->devfn),
> +                               PCI_FUNC(pdev->devfn), pos +
> PCI_EXP_FLAGS) &
> +               PCI_EXP_FLAGS_TYPE) /
> +              (PCI_EXP_FLAGS_TYPE & -PCI_EXP_FLAGS_TYPE);

I think a right shift is much more intuitive.
u8 pcie = (pci_conf_read16(pdev->seg, pdev->bus,PCI_SLOT(pdev->devfn),PCI_FUNC(pdev->devfn), pos + PCI_EXP_FLAGS) &  PCI_EXP_FLAGS_TYPE) >> 4;

> +
> +    switch ( hest_hdr->type )
> +    {
> +    case ACPI_HEST_TYPE_AER_ROOT_PORT:
> +        return pcie == PCI_EXP_TYPE_ROOT_PORT;
> +    case ACPI_HEST_TYPE_AER_ENDPOINT:
> +        return pcie == PCI_EXP_TYPE_ENDPOINT;
> +    case ACPI_HEST_TYPE_AER_BRIDGE:
> +        return pci_conf_read16(pdev->seg, pdev->bus,
> PCI_SLOT(pdev->devfn),
> +                               PCI_FUNC(pdev->devfn),
> PCI_CLASS_DEVICE) ==
> +               PCI_CLASS_BRIDGE_PCI;
> +    }
> +
> +    return 0;
> +}
> +
> +struct aer_hest_parse_info {
> +    const struct pci_dev *pdev;
> +    bool_t firmware_first;
> +};
> +
> +static bool_t hest_source_is_pcie_aer(const struct acpi_hest_header
> +*hest_hdr) {
> +    if ( hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT ||
> +         hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT ||
> +         hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE )
> +        return 1;
> +    return 0;
> +}
> +
> +static int aer_hest_parse(const struct acpi_hest_header *hest_hdr, void
> +*data) {
> +    struct aer_hest_parse_info *info = data;
> +    const struct acpi_hest_aer_common *p;
> +    bool_t ff;
> +
> +    if ( !hest_source_is_pcie_aer(hest_hdr) )
> +        return 0;
> +
> +    p = (const struct acpi_hest_aer_common *)(hest_hdr + 1);
> +    ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST);
> +
> +    /*
> +     * If no specific device is supplied, determine whether
> +     * FIRMWARE_FIRST is set for *any* PCIe device.
> +     */
> +    if ( !info->pdev )
> +    {
> +        info->firmware_first |= ff;
> +        return 0;
> +    }
> +
> +    /* Otherwise, check the specific device */
> +    if ( p->flags & ACPI_HEST_GLOBAL ?
> +         hest_match_type(hest_hdr, info->pdev) :
> +         hest_match_pci(p, info->pdev) )
> +    {
> +        info->firmware_first = ff;
> +        return 1;
> +    }
> +
> +    return 0;
> +}
> +
> +bool_t pcie_aer_get_firmware_first(const struct pci_dev *pdev) {
> +    struct aer_hest_parse_info info = { .pdev = pdev };
> +
> +    return pci_find_cap_offset(pdev->seg, pdev->bus,
> PCI_SLOT(pdev->devfn),
> +                               PCI_FUNC(pdev->devfn),
> PCI_CAP_ID_EXP) &&
> +           apei_hest_parse(aer_hest_parse, &info) >= 0 &&
> +           info.firmware_first;
> +}
> +#endif
> +
>  static int _dump_pci_devices(struct pci_seg *pseg, void *arg)  {
>      struct pci_dev *pdev;
> --- a/xen/drivers/passthrough/vtd/quirks.c
> +++ b/xen/drivers/passthrough/vtd/quirks.c
> @@ -386,9 +386,10 @@ void pci_vtd_quirk(const struct pci_dev
>      int dev = PCI_SLOT(pdev->devfn);
>      int func = PCI_FUNC(pdev->devfn);
>      int pos;
> -    u32 val;
> +    u32 val, val2;
>      u64 bar;
>      paddr_t pa;
> +    const char *action;
> 
>      if ( pci_conf_read16(seg, bus, dev, func, PCI_VENDOR_ID) !=
>           PCI_VENDOR_ID_INTEL )
> @@ -447,18 +448,26 @@ void pci_vtd_quirk(const struct pci_dev
>          }
> 
>          val = pci_conf_read32(seg, bus, dev, func, pos +
> PCI_ERR_UNCOR_MASK);
> -        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
> -                         val | PCI_ERR_UNC_UNSUP);
> -        val = pci_conf_read32(seg, bus, dev, func, pos +
> PCI_ERR_COR_MASK);
> -        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
> -                         val | PCI_ERR_COR_ADV_NFAT);
> +        val2 = pci_conf_read32(seg, bus, dev, func, pos +
> PCI_ERR_COR_MASK);
> +        if ( (val & PCI_ERR_UNC_UNSUP) && (val2 &
> PCI_ERR_COR_ADV_NFAT) )
> +            action = "Found masked";

What happens if dom0 unmasks it later?

> +        else if ( !pcie_aer_get_firmware_first(pdev) )
> +        {
> +            pci_conf_write32(seg, bus, dev, func, pos +
> PCI_ERR_UNCOR_MASK,
> +                             val | PCI_ERR_UNC_UNSUP);
> +            pci_conf_write32(seg, bus, dev, func, pos +
> PCI_ERR_COR_MASK,
> +                             val2 | PCI_ERR_COR_ADV_NFAT);
> +            action = "Masked";
> +        }
> +        else
> +            action = "Cannot mask";
> 
>          /* XPUNCERRMSK Send Completion with Unsupported Request */
>          val = pci_conf_read32(seg, bus, dev, func, 0x20c);
>          pci_conf_write32(seg, bus, dev, func, 0x20c, val | (1 << 4));
> 
> -        printk(XENLOG_INFO "Masked UR signaling
> on %04x:%02x:%02x.%u\n",
> -               seg, bus, dev, func);
> +        printk(XENLOG_INFO "%s UR signaling on %04x:%02x:%02x.%u\n",
> +               action, seg, bus, dev, func);
>          break;
> 
>      case 0x100: case 0x104: case 0x108: /* Sandybridge */
> --- a/xen/include/acpi/actbl1.h
> +++ b/xen/include/acpi/actbl1.h
> @@ -445,6 +445,14 @@ struct acpi_hest_aer_common {
>  #define ACPI_HEST_FIRMWARE_FIRST        (1)
>  #define ACPI_HEST_GLOBAL                (1<<1)
> 
> +/*
> + * Macros to access the bus/segment numbers in Bus field above:
> + *  Bus number is encoded in bits 7:0
> + *  Segment number is encoded in bits 23:8  */
> +#define ACPI_HEST_BUS(bus)              ((bus) & 0xFF)
> +#define ACPI_HEST_SEGMENT(bus)          (((bus) >> 8) & 0xFFFF)
> +
>  /* Hardware Error Notification */
> 
>  struct acpi_hest_notify {
> --- a/xen/include/acpi/apei.h
> +++ b/xen/include/acpi/apei.h
> @@ -12,6 +12,9 @@
> 
>  #define FIX_APEI_RANGE_MAX 64
> 
> +typedef int (*apei_hest_func_t)(const struct acpi_hest_header *, void
> +*); int apei_hest_parse(apei_hest_func_t, void *);
> +
>  int erst_write(const struct cper_record_header *record);  ssize_t
> erst_get_record_count(void);  int erst_get_next_record_id(u64 *record_id);
> --- a/xen/include/xen/acpi.h
> +++ b/xen/include/xen/acpi.h
> @@ -61,6 +61,7 @@ int acpi_boot_init (void);  int acpi_boot_table_init (void);
> int acpi_numa_init (void);  int erst_init(void);
> +void acpi_hest_init(void);
> 
>  int acpi_table_init (void);
>  int acpi_table_parse(char *id, acpi_table_handler handler);
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -144,6 +144,8 @@ int pci_find_next_ext_capability(int seg  const char
> *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
>                        unsigned int *dev, unsigned int *func);
> 
> +bool_t pcie_aer_get_firmware_first(const struct pci_dev *);
> +
>  struct pirq;
>  int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);  void
> msixtbl_pt_unregister(struct domain *, struct pirq *);
> 

Best regards,
Yang

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-23  2:32 ` Zhang, Yang Z
@ 2014-05-23  6:22   ` Jan Beulich
  2014-05-23  6:46     ` Zhang, Yang Z
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2014-05-23  6:22 UTC (permalink / raw)
  To: Yang Z Zhang
  Cc: Andrew Cooper, Malcolm Crossley, Kevin Tian, Donald D Dugger,
	xen-devel

>>> On 23.05.14 at 04:32, <yang.z.zhang@intel.com> wrote:
> Jan Beulich wrote on 2014-05-22:
>> +static int __init hest_parse_cmc(const struct acpi_hest_header *hest_hdr,
>> +				 void *data)
>> +{
>> +#ifdef CONFIG_X86_MCE
> 
> I didn't find where CONFIG_X86_MCE is defined. Do you have another patch to 
> define it?

No, and intentionally not (for the moment), because of ...

>> +	unsigned int i;
>> +	const struct acpi_hest_ia_corrected *cmc;
>> +	const struct acpi_hest_ia_error_bank *mc_bank;
>> +
>> +	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
>> +		return 0;
>> +
>> +	cmc = container_of(hest_hdr, const struct acpi_hest_ia_corrected, header);
>> +	if (!cmc->enabled)
>> +		return 0;
>> +
>> +	/*
>> +	 * We expect HEST to provide a list of MC banks that report errors
>> +	 * in firmware first mode. Otherwise, return non-zero value to
>> +	 * indicate that we are done parsing HEST.
>> +	 */
>> +	if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks)
>> +		return 1;
>> +
>> +	printk(XENLOG_INFO HEST_PFX "Enabling Firmware First mode for
>> +corrected errors.\n");
>> +
>> +	mc_bank = (const struct acpi_hest_ia_error_bank *)(cmc + 1);
>> +	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
>> +		mce_disable_bank(mc_bank->bank_number);
> 
> Also, no mce_disable_bank is found.

... this ...

>> +#else
>> +# define acpi_disable_cmcff 1
> 
> You didn't define acpi_disable_cmcff if defined CONFIG_X86_MCE. It will 
> cause build error.

... and this. I simply didn't want to discard stuff from the Linux file
that we ought to be making use of eventually (I'd be hoping for the
machine check maintainers to perhaps try to flesh this out
subsequently). With the way it is now, things build and work. I'm
adding a note to the commit message explaining this.

>> +static bool_t hest_match_type(const struct acpi_hest_header *hest_hdr,
>> +                              const struct pci_dev *pdev) {
>> +    unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus,
>> +                                           PCI_SLOT(pdev->devfn),
>> +                                           PCI_FUNC(pdev->devfn),
>> +                                           PCI_CAP_ID_EXP);
>> +    u8 pcie = (pci_conf_read16(pdev->seg, pdev->bus,
>> PCI_SLOT(pdev->devfn),
>> +                               PCI_FUNC(pdev->devfn), pos +
>> PCI_EXP_FLAGS) &
>> +               PCI_EXP_FLAGS_TYPE) /
>> +              (PCI_EXP_FLAGS_TYPE & -PCI_EXP_FLAGS_TYPE);
> 
> I think a right shift is much more intuitive.
> u8 pcie = (pci_conf_read16(pdev->seg, 
> pdev->bus,PCI_SLOT(pdev->devfn),PCI_FUNC(pdev->devfn), pos + PCI_EXP_FLAGS) &  
> PCI_EXP_FLAGS_TYPE) >> 4;

But then who in the world is "4"? I sincerely dislike hardcoded
numbers, and _intentionally_ replaced the 4 Linux uses here.

>> @@ -447,18 +448,26 @@ void pci_vtd_quirk(const struct pci_dev
>>          }
>> 
>>          val = pci_conf_read32(seg, bus, dev, func, pos +
>> PCI_ERR_UNCOR_MASK);
>> -        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
>> -                         val | PCI_ERR_UNC_UNSUP);
>> -        val = pci_conf_read32(seg, bus, dev, func, pos +
>> PCI_ERR_COR_MASK);
>> -        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
>> -                         val | PCI_ERR_COR_ADV_NFAT);
>> +        val2 = pci_conf_read32(seg, bus, dev, func, pos +
>> PCI_ERR_COR_MASK);
>> +        if ( (val & PCI_ERR_UNC_UNSUP) && (val2 &
>> PCI_ERR_COR_ADV_NFAT) )
>> +            action = "Found masked";
> 
> What happened if dom0 unmasked it later?

That question you should have raised on the first of the XSA-59
related patch sets, but the answer is we expect Dom0 to be well
behaved. Of course, if you have a non-intrusive suggestion on
how to enforce this in Xen, I'm all ears.

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-23  6:22   ` Jan Beulich
@ 2014-05-23  6:46     ` Zhang, Yang Z
  2014-05-23  7:15       ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Zhang, Yang Z @ 2014-05-23  6:46 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, Malcolm Crossley, Tian, Kevin, Dugger, Donald D,
	xen-devel

Jan Beulich wrote on 2014-05-23:
>>>> On 23.05.14 at 04:32, <yang.z.zhang@intel.com> wrote:
>> Jan Beulich wrote on 2014-05-22:
>>> +static int __init hest_parse_cmc(const struct acpi_hest_header *hest_hdr,
>>> +				 void *data)
>>> +{
>>> +#ifdef CONFIG_X86_MCE
>> 
>> I didn't find where CONFIG_X86_MCE is defined. Do you have another
>> patch to define it?
> 
> No, and intentionally not (for the moment), because of ...
> 
>>> +	unsigned int i; +	const struct acpi_hest_ia_corrected *cmc; +	const
>>> struct acpi_hest_ia_error_bank *mc_bank; + +	if (hest_hdr->type !=
>>> ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) +		return 0; + +	cmc =
>>> container_of(hest_hdr, const struct acpi_hest_ia_corrected, header);
>>> +	if (!cmc->enabled) +		return 0; + +	/* +	 * We expect HEST to
>>> provide a list of MC banks that report errors +	 * in firmware first
>>> mode. Otherwise, return non-zero value to +	 * indicate that we are
>>> done parsing HEST. +	 */ +	if (!(cmc->flags &
>>> ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks) +		return 1; +
>>> +	printk(XENLOG_INFO HEST_PFX "Enabling Firmware First mode for
>>> +corrected errors.\n"); + +	mc_bank = (const struct
>>> acpi_hest_ia_error_bank *)(cmc + 1); +	for (i = 0; i <
>>> cmc->num_hardware_banks; i++, mc_bank++)
>>> +		mce_disable_bank(mc_bank->bank_number);
>> 
>> Also, no mce_disable_bank is found.
> 
> ... this ...
> 
>>> +#else
>>> +# define acpi_disable_cmcff 1
>> 
>> You didn't define acpi_disable_cmcff if defined CONFIG_X86_MCE. It
>> will cause build error.
> 
> ... and this. I simply didn't want to discard stuff from the Linux
> file that we ought to be making use of eventually (I'd be hoping for
> the machine check maintainers to perhaps try to flesh this out
> subsequently). With the way it is now, things build and work. I'm
> adding a note to the commit message explaining this.
> 
>>> +static bool_t hest_match_type(const struct acpi_hest_header
>>> *hest_hdr, +                              const struct pci_dev *pdev)
>>> { +    unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus, +
>>> PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), +                     
>>>                      PCI_CAP_ID_EXP); +    u8 pcie =
>>> (pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), +       
>>>                        PCI_FUNC(pdev->devfn), pos + PCI_EXP_FLAGS) & +
>>>               PCI_EXP_FLAGS_TYPE) / +              (PCI_EXP_FLAGS_TYPE
>>> & -PCI_EXP_FLAGS_TYPE);
>> 
>> I think a right shift is much more intuitive.
>> u8 pcie = (pci_conf_read16(pdev->seg,
>> pdev->bus,PCI_SLOT(pdev->devfn),PCI_FUNC(pdev->devfn), pos +
>> PCI_EXP_FLAGS) &
>> PCI_EXP_FLAGS_TYPE) >> 4;
> 
> But then who in the world is "4"? I sincerely dislike hardcoded
> numbers, and _intentionally_ replaced the 4 Linux uses here.
> 
>>> @@ -447,18 +448,26 @@ void pci_vtd_quirk(const struct pci_dev
>>>          }
>>>          
>>>          val = pci_conf_read32(seg, bus, dev, func, pos +
>>> PCI_ERR_UNCOR_MASK); -        pci_conf_write32(seg, bus, dev, func,
>>> pos + PCI_ERR_UNCOR_MASK, -                         val |
>>> PCI_ERR_UNC_UNSUP); -        val = pci_conf_read32(seg, bus, dev,
>>> func, pos + PCI_ERR_COR_MASK); -        pci_conf_write32(seg, bus,
>>> dev, func, pos + PCI_ERR_COR_MASK, -                         val |
>>> PCI_ERR_COR_ADV_NFAT); +        val2 = pci_conf_read32(seg, bus, dev,
>>> func, pos + PCI_ERR_COR_MASK); +        if ( (val & PCI_ERR_UNC_UNSUP)
>>> && (val2 & PCI_ERR_COR_ADV_NFAT) ) +            action = "Found
>>> masked";
>> 
>> What happened if dom0 unmasked it later?
> 
> That question you should have raised on the first of the XSA-59
> related patch sets, but the answer is we expect Dom0 to be well
> behaved. Of course, if you have a non-intrusive suggestion on how to enforce this in Xen, I'm all ears.

So you mean we need to stare at Linux upstream to make sure it doesn't do any un-friendly modification to Xen.

> 
> Jan


Best regards,
Yang

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-23  6:46     ` Zhang, Yang Z
@ 2014-05-23  7:15       ` Jan Beulich
  2014-05-26  0:39         ` Zhang, Yang Z
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2014-05-23  7:15 UTC (permalink / raw)
  To: Yang Z Zhang
  Cc: Andrew Cooper, Malcolm Crossley, Kevin Tian, Donald D Dugger,
	xen-devel

>>> On 23.05.14 at 08:46, <yang.z.zhang@intel.com> wrote:
> Jan Beulich wrote on 2014-05-23:
>>>>> On 23.05.14 at 04:32, <yang.z.zhang@intel.com> wrote:
>>> Jan Beulich wrote on 2014-05-22:
>>>> @@ -447,18 +448,26 @@ void pci_vtd_quirk(const struct pci_dev
>>>>          }
>>>>          
>>>>          val = pci_conf_read32(seg, bus, dev, func, pos +
>>>> PCI_ERR_UNCOR_MASK); -        pci_conf_write32(seg, bus, dev, func,
>>>> pos + PCI_ERR_UNCOR_MASK, -                         val |
>>>> PCI_ERR_UNC_UNSUP); -        val = pci_conf_read32(seg, bus, dev,
>>>> func, pos + PCI_ERR_COR_MASK); -        pci_conf_write32(seg, bus,
>>>> dev, func, pos + PCI_ERR_COR_MASK, -                         val |
>>>> PCI_ERR_COR_ADV_NFAT); +        val2 = pci_conf_read32(seg, bus, dev,
>>>> func, pos + PCI_ERR_COR_MASK); +        if ( (val & PCI_ERR_UNC_UNSUP)
>>>> && (val2 & PCI_ERR_COR_ADV_NFAT) ) +            action = "Found
>>>> masked";
>>> 
>>> What happened if dom0 unmasked it later?
>> 
>> That question you should have raised on the first of the XSA-59
>> related patch sets, but the answer is we expect Dom0 to be well
>> behaved. Of course, if you have a non-intrusive suggestion on how to enforce 
>> this in Xen, I'm all ears.
> 
> So you mean we need to stare at Linux upstream to make sure it doesn't do 
> any un-friendly modification to Xen.

Not sure what you mean by "stare", but relying on Dom0 to not do
bad things is a fundamental model with Xen - if the Dom0 kernel
wanted, it'd have other ways of doing bad things.

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-23  7:15       ` Jan Beulich
@ 2014-05-26  0:39         ` Zhang, Yang Z
  0 siblings, 0 replies; 18+ messages in thread
From: Zhang, Yang Z @ 2014-05-26  0:39 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Andrew Cooper, Malcolm Crossley, Tian, Kevin, Dugger, Donald D,
	xen-devel

Jan Beulich wrote on 2014-05-23:
>>>> On 23.05.14 at 08:46, <yang.z.zhang@intel.com> wrote:
>> Jan Beulich wrote on 2014-05-23:
>>>>>> On 23.05.14 at 04:32, <yang.z.zhang@intel.com> wrote:
>>>> Jan Beulich wrote on 2014-05-22:
>>>>> @@ -447,18 +448,26 @@ void pci_vtd_quirk(const struct pci_dev
>>>>>          }
>>>>>          
>>>>>          val = pci_conf_read32(seg, bus, dev, func, pos +
>>>>> PCI_ERR_UNCOR_MASK); -        pci_conf_write32(seg, bus, dev, func,
>>>>> pos + PCI_ERR_UNCOR_MASK, -                         val |
>>>>> PCI_ERR_UNC_UNSUP); -        val = pci_conf_read32(seg, bus, dev,
>>>>> func, pos + PCI_ERR_COR_MASK); -        pci_conf_write32(seg, bus,
>>>>> dev, func, pos + PCI_ERR_COR_MASK, -                         val |
>>>>> PCI_ERR_COR_ADV_NFAT); +        val2 = pci_conf_read32(seg, bus,
>>>>> dev, func, pos + PCI_ERR_COR_MASK); +        if ( (val &
>>>>> PCI_ERR_UNC_UNSUP) && (val2 & PCI_ERR_COR_ADV_NFAT) ) +           
>>>>> action = "Found masked";
>>>> 
>>>> What happened if dom0 unmasked it later?
>>> 
>>> That question you should have raised on the first of the XSA-59
>>> related patch sets, but the answer is we expect Dom0 to be well
>>> behaved. Of course, if you have a non-intrusive suggestion on how
>>> to enforce
>>> this in Xen, I'm all ears.
>> 
>> So you mean we need to stare at Linux upstream to make sure it
>> doesn't do any un-friendly modification to Xen.
> 
> Not sure what you mean by "stare", but relying on Dom0 to not do bad
> things is a fundamental model with Xen - if the Dom0 kernel wanted,
> it'd have other ways of doing bad things.

From dom0's point of view, the hypervisor is similar to firmware, especially in this case. So if the hypervisor wants to disable AER, it can act as firmware and set firmware-first mode. Then it is safe to mask it, and dom0 will never try to touch it. This is just an idea and may introduce side effects; it needs more thought.

> 
> Jan


Best regards,
Yang

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v2 RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-21 16:09 [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code Jan Beulich
  2014-05-21 16:33 ` Andrew Cooper
  2014-05-23  2:32 ` Zhang, Yang Z
@ 2014-05-26 10:19 ` Jan Beulich
  2014-06-03 14:38   ` Malcolm Crossley
  2 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2014-05-26 10:19 UTC (permalink / raw)
  To: Andrew Cooper, Malcolm Crossley
  Cc: Yang Z Zhang, xen-devel, Kevin Tian, Donald D Dugger

[-- Attachment #1: Type: text/plain, Size: 15684 bytes --]

When firmware-first mode is being indicated by firmware, we shouldn't
be modifying AER registers - these are considered to be owned by
firmware in that case. Violating this is being reported to result in
SMI storms. While circumventing the workaround means re-exposing
affected hosts to the XSA-59 issues, this in any event seems better
than not booting at all. Respective messages are being issued to the
log, so the situation can be diagnosed.

The basic building blocks were taken from Linux 3.15-rc. Note that
this includes a block of code enclosed in #ifdef CONFIG_X86_MCE - we
don't define that symbol, and that code also wouldn't build without
suitable machine check side code added; that should happen eventually,
but isn't subject of this change.

Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reported-by: Malcolm Crossley <malcolm.crossley@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Only check firmware-first for actual AER (i.e. ignore it for host
    bridges in DMI mode). Use MASK_EXTR() instead of open coding it.
    Re-word message from "cannot" to "must not".

--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -754,6 +754,8 @@ int __init acpi_boot_init(void)
 
 	erst_init();
 
+	acpi_hest_init();
+
 	acpi_table_parse(ACPI_SIG_BGRT, acpi_invalidate_bgrt);
 
 	return 0;
--- a/xen/drivers/acpi/apei/Makefile
+++ b/xen/drivers/acpi/apei/Makefile
@@ -1,3 +1,4 @@
 obj-y += erst.o
+obj-y += hest.o
 obj-y += apei-base.o
 obj-y += apei-io.o
--- /dev/null
+++ b/xen/drivers/acpi/apei/hest.c
@@ -0,0 +1,200 @@
+/*
+ * APEI Hardware Error Souce Table support
+ *
+ * HEST describes error sources in detail; communicates operational
+ * parameters (i.e. severity levels, masking bits, and threshold
+ * values) to Linux as necessary. It also allows the BIOS to report
+ * non-standard error sources to Linux (for example, chipset-specific
+ * error registers).
+ *
+ * For more information about HEST, please refer to ACPI Specification
+ * version 4.0, section 17.3.2.
+ *
+ * Copyright 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/errno.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/mm.h>
+#include <xen/pfn.h>
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+#include "apei-internal.h"
+
+#define HEST_PFX "HEST: "
+
+static bool_t hest_disable;
+boolean_param("hest_disable", hest_disable);
+
+/* HEST table parsing */
+
+static struct acpi_table_hest *__read_mostly hest_tab;
+
+static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
+	[ACPI_HEST_TYPE_IA32_CHECK] = -1,	/* need further calculation */
+	[ACPI_HEST_TYPE_IA32_CORRECTED_CHECK] = -1,
+	[ACPI_HEST_TYPE_IA32_NMI] = sizeof(struct acpi_hest_ia_nmi),
+	[ACPI_HEST_TYPE_AER_ROOT_PORT] = sizeof(struct acpi_hest_aer_root),
+	[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
+	[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
+	[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+};
+
+static int hest_esrc_len(const struct acpi_hest_header *hest_hdr)
+{
+	u16 hest_type = hest_hdr->type;
+	int len;
+
+	if (hest_type >= ACPI_HEST_TYPE_RESERVED)
+		return 0;
+
+	len = hest_esrc_len_tab[hest_type];
+
+	if (hest_type == ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) {
+		const struct acpi_hest_ia_corrected *cmc =
+			container_of(hest_hdr,
+				     const struct acpi_hest_ia_corrected,
+				     header);
+
+		len = sizeof(*cmc) + cmc->num_hardware_banks *
+		      sizeof(struct acpi_hest_ia_error_bank);
+	} else if (hest_type == ACPI_HEST_TYPE_IA32_CHECK) {
+		const struct acpi_hest_ia_machine_check *mc =
+			container_of(hest_hdr,
+				     const struct acpi_hest_ia_machine_check,
+				     header);
+
+		len = sizeof(*mc) + mc->num_hardware_banks *
+		      sizeof(struct acpi_hest_ia_error_bank);
+	}
+	BUG_ON(len == -1);
+
+	return len;
+};
+
+int apei_hest_parse(apei_hest_func_t func, void *data)
+{
+	struct acpi_hest_header *hest_hdr;
+	int i, rc, len;
+
+	if (hest_disable || !hest_tab)
+		return -EINVAL;
+
+	hest_hdr = (struct acpi_hest_header *)(hest_tab + 1);
+	for (i = 0; i < hest_tab->error_source_count; i++) {
+		len = hest_esrc_len(hest_hdr);
+		if (!len) {
+			printk(XENLOG_WARNING HEST_PFX
+			       "Unknown or unused hardware error source "
+			       "type: %d for hardware error source: %d\n",
+			       hest_hdr->type, hest_hdr->source_id);
+			return -EINVAL;
+		}
+		if ((void *)hest_hdr + len >
+		    (void *)hest_tab + hest_tab->header.length) {
+			printk(XENLOG_WARNING HEST_PFX
+			       "Table contents overflow for hardware error source: %d\n",
+			       hest_hdr->source_id);
+			return -EINVAL;
+		}
+
+		rc = func(hest_hdr, data);
+		if (rc)
+			return rc;
+
+		hest_hdr = (void *)hest_hdr + len;
+	}
+
+	return 0;
+}
+
+/*
+ * Check if firmware advertises firmware first mode. We need FF bit to be set
+ * along with a set of MC banks which work in FF mode.
+ */
+static int __init hest_parse_cmc(const struct acpi_hest_header *hest_hdr,
+				 void *data)
+{
+#ifdef CONFIG_X86_MCE
+	unsigned int i;
+	const struct acpi_hest_ia_corrected *cmc;
+	const struct acpi_hest_ia_error_bank *mc_bank;
+
+	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+		return 0;
+
+	cmc = container_of(hest_hdr, const struct acpi_hest_ia_corrected, header);
+	if (!cmc->enabled)
+		return 0;
+
+	/*
+	 * We expect HEST to provide a list of MC banks that report errors
+	 * in firmware first mode. Otherwise, return non-zero value to
+	 * indicate that we are done parsing HEST.
+	 */
+	if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks)
+		return 1;
+
+	printk(XENLOG_INFO HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
+
+	mc_bank = (const struct acpi_hest_ia_error_bank *)(cmc + 1);
+	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+		mce_disable_bank(mc_bank->bank_number);
+#else
+# define acpi_disable_cmcff 1
+#endif
+
+	return 1;
+}
+
+void __init acpi_hest_init(void)
+{
+	acpi_status status;
+	acpi_physical_address hest_addr;
+	acpi_native_uint hest_len;
+
+	if (acpi_disabled)
+		return;
+
+	if (hest_disable) {
+		printk(XENLOG_INFO HEST_PFX "Table parsing disabled.\n");
+		return;
+	}
+
+	status = acpi_get_table_phys(ACPI_SIG_HEST, 0, &hest_addr, &hest_len);
+	if (status == AE_NOT_FOUND)
+		goto err;
+	if (ACPI_FAILURE(status)) {
+		printk(XENLOG_ERR HEST_PFX "Failed to get table, %s\n",
+		       acpi_format_exception(status));
+		goto err;
+	}
+	map_pages_to_xen((unsigned long)__va(hest_addr), PFN_DOWN(hest_addr),
+			 PFN_UP(hest_addr + hest_len) - PFN_DOWN(hest_addr),
+			 PAGE_HYPERVISOR);
+	hest_tab = __va(hest_addr);
+
+	if (!acpi_disable_cmcff)
+		apei_hest_parse(hest_parse_cmc, NULL);
+
+	printk(XENLOG_INFO HEST_PFX "Table parsing has been initialized\n");
+	return;
+err:
+	hest_disable = 1;
+}
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1069,6 +1069,106 @@ void __hwdom_init setup_hwdom_pci_device
     spin_unlock(&pcidevs_lock);
 }
 
+#ifdef CONFIG_ACPI
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+static int hest_match_pci(const struct acpi_hest_aer_common *p,
+                          const struct pci_dev *pdev)
+{
+    return ACPI_HEST_SEGMENT(p->bus) == pdev->seg &&
+           ACPI_HEST_BUS(p->bus)     == pdev->bus &&
+           p->device                 == PCI_SLOT(pdev->devfn) &&
+           p->function               == PCI_FUNC(pdev->devfn);
+}
+
+static bool_t hest_match_type(const struct acpi_hest_header *hest_hdr,
+                              const struct pci_dev *pdev)
+{
+    unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus,
+                                           PCI_SLOT(pdev->devfn),
+                                           PCI_FUNC(pdev->devfn),
+                                           PCI_CAP_ID_EXP);
+    u8 pcie = MASK_EXTR(pci_conf_read16(pdev->seg, pdev->bus,
+                                        PCI_SLOT(pdev->devfn),
+                                        PCI_FUNC(pdev->devfn),
+                                        pos + PCI_EXP_FLAGS),
+                        PCI_EXP_FLAGS_TYPE);
+
+    switch ( hest_hdr->type )
+    {
+    case ACPI_HEST_TYPE_AER_ROOT_PORT:
+        return pcie == PCI_EXP_TYPE_ROOT_PORT;
+    case ACPI_HEST_TYPE_AER_ENDPOINT:
+        return pcie == PCI_EXP_TYPE_ENDPOINT;
+    case ACPI_HEST_TYPE_AER_BRIDGE:
+        return pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CLASS_DEVICE) ==
+               PCI_CLASS_BRIDGE_PCI;
+    }
+
+    return 0;
+}
+
+struct aer_hest_parse_info {
+    const struct pci_dev *pdev;
+    bool_t firmware_first;
+};
+
+static bool_t hest_source_is_pcie_aer(const struct acpi_hest_header *hest_hdr)
+{
+    if ( hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE )
+        return 1;
+    return 0;
+}
+
+static int aer_hest_parse(const struct acpi_hest_header *hest_hdr, void *data)
+{
+    struct aer_hest_parse_info *info = data;
+    const struct acpi_hest_aer_common *p;
+    bool_t ff;
+
+    if ( !hest_source_is_pcie_aer(hest_hdr) )
+        return 0;
+
+    p = (const struct acpi_hest_aer_common *)(hest_hdr + 1);
+    ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST);
+
+    /*
+     * If no specific device is supplied, determine whether
+     * FIRMWARE_FIRST is set for *any* PCIe device.
+     */
+    if ( !info->pdev )
+    {
+        info->firmware_first |= ff;
+        return 0;
+    }
+
+    /* Otherwise, check the specific device */
+    if ( p->flags & ACPI_HEST_GLOBAL ?
+         hest_match_type(hest_hdr, info->pdev) :
+         hest_match_pci(p, info->pdev) )
+    {
+        info->firmware_first = ff;
+        return 1;
+    }
+
+    return 0;
+}
+
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *pdev)
+{
+    struct aer_hest_parse_info info = { .pdev = pdev };
+
+    return pci_find_cap_offset(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CAP_ID_EXP) &&
+           apei_hest_parse(aer_hest_parse, &info) >= 0 &&
+           info.firmware_first;
+}
+#endif
+
 static int _dump_pci_devices(struct pci_seg *pseg, void *arg)
 {
     struct pci_dev *pdev;
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -386,9 +386,11 @@ void pci_vtd_quirk(const struct pci_dev 
     int dev = PCI_SLOT(pdev->devfn);
     int func = PCI_FUNC(pdev->devfn);
     int pos;
-    u32 val;
+    bool_t ff;
+    u32 val, val2;
     u64 bar;
     paddr_t pa;
+    const char *action;
 
     if ( pci_conf_read16(seg, bus, dev, func, PCI_VENDOR_ID) !=
          PCI_VENDOR_ID_INTEL )
@@ -438,7 +440,10 @@ void pci_vtd_quirk(const struct pci_dev 
                 pos = pci_find_next_ext_capability(seg, bus, pdev->devfn, pos,
                                                    PCI_EXT_CAP_ID_VNDR);
             }
+            ff = 0;
         }
+        else
+            ff = pcie_aer_get_firmware_first(pdev);
         if ( !pos )
         {
             printk(XENLOG_WARNING "%04x:%02x:%02x.%u without AER capability?\n",
@@ -447,18 +452,26 @@ void pci_vtd_quirk(const struct pci_dev 
         }
 
         val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
-                         val | PCI_ERR_UNC_UNSUP);
-        val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
-                         val | PCI_ERR_COR_ADV_NFAT);
+        val2 = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
+        if ( (val & PCI_ERR_UNC_UNSUP) && (val2 & PCI_ERR_COR_ADV_NFAT) )
+            action = "Found masked";
+        else if ( !ff )
+        {
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
+                             val | PCI_ERR_UNC_UNSUP);
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
+                             val2 | PCI_ERR_COR_ADV_NFAT);
+            action = "Masked";
+        }
+        else
+            action = "Must not mask";
 
         /* XPUNCERRMSK Send Completion with Unsupported Request */
         val = pci_conf_read32(seg, bus, dev, func, 0x20c);
         pci_conf_write32(seg, bus, dev, func, 0x20c, val | (1 << 4));
 
-        printk(XENLOG_INFO "Masked UR signaling on %04x:%02x:%02x.%u\n",
-               seg, bus, dev, func);
+        printk(XENLOG_INFO "%s UR signaling on %04x:%02x:%02x.%u\n",
+               action, seg, bus, dev, func);
         break;
 
     case 0x100: case 0x104: case 0x108: /* Sandybridge */
--- a/xen/include/acpi/actbl1.h
+++ b/xen/include/acpi/actbl1.h
@@ -445,6 +445,14 @@ struct acpi_hest_aer_common {
 #define ACPI_HEST_FIRMWARE_FIRST        (1)
 #define ACPI_HEST_GLOBAL                (1<<1)
 
+/*
+ * Macros to access the bus/segment numbers in Bus field above:
+ *  Bus number is encoded in bits 7:0
+ *  Segment number is encoded in bits 23:8
+ */
+#define ACPI_HEST_BUS(bus)              ((bus) & 0xFF)
+#define ACPI_HEST_SEGMENT(bus)          (((bus) >> 8) & 0xFFFF)
+
 /* Hardware Error Notification */
 
 struct acpi_hest_notify {
--- a/xen/include/acpi/apei.h
+++ b/xen/include/acpi/apei.h
@@ -12,6 +12,9 @@
 
 #define FIX_APEI_RANGE_MAX 64
 
+typedef int (*apei_hest_func_t)(const struct acpi_hest_header *, void *);
+int apei_hest_parse(apei_hest_func_t, void *);
+
 int erst_write(const struct cper_record_header *record);
 ssize_t erst_get_record_count(void);
 int erst_get_next_record_id(u64 *record_id);
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -61,6 +61,7 @@ int acpi_boot_init (void);
 int acpi_boot_table_init (void);
 int acpi_numa_init (void);
 int erst_init(void);
+void acpi_hest_init(void);
 
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_table_handler handler);
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -144,6 +144,8 @@ int pci_find_next_ext_capability(int seg
 const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
                       unsigned int *dev, unsigned int *func);
 
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *);
+
 struct pirq;
 int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);
 void msixtbl_pt_unregister(struct domain *, struct pirq *);



[-- Attachment #2: VT-d-mask-UR-honor-firmware-first.patch --]
[-- Type: text/plain, Size: 15741 bytes --]

VT-d: honor firmware-first mode in XSA-59 workaround code

When firmware-first mode is being indicated by firmware, we shouldn't
be modifying AER registers - these are considered to be owned by
firmware in that case. Violating this is being reported to result in
SMI storms. While circumventing the workaround means re-exposing
affected hosts to the XSA-59 issues, this in any event seems better
than not booting at all. Respective messages are being issued to the
log, so the situation can be diagnosed.

The basic building blocks were taken from Linux 3.15-rc. Note that
this includes a block of code enclosed in #ifdef CONFIG_X86_MCE - we
don't define that symbol, and that code also wouldn't build without
suitable machine check side code added; that should happen eventually,
but isn't subject of this change.

Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reported-by: Malcolm Crossley <malcolm.crossley@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Only check firmware-first for actual AER (i.e. ignore it for host
    bridges in DMI mode). Use MASK_EXTR() instead of open coding it.
    Re-word message from "cannot" to "must not".

--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -754,6 +754,8 @@ int __init acpi_boot_init(void)
 
 	erst_init();
 
+	acpi_hest_init();
+
 	acpi_table_parse(ACPI_SIG_BGRT, acpi_invalidate_bgrt);
 
 	return 0;
--- a/xen/drivers/acpi/apei/Makefile
+++ b/xen/drivers/acpi/apei/Makefile
@@ -1,3 +1,4 @@
 obj-y += erst.o
+obj-y += hest.o
 obj-y += apei-base.o
 obj-y += apei-io.o
--- /dev/null
+++ b/xen/drivers/acpi/apei/hest.c
@@ -0,0 +1,200 @@
+/*
+ * APEI Hardware Error Source Table support
+ *
+ * HEST describes error sources in detail; communicates operational
+ * parameters (i.e. severity levels, masking bits, and threshold
+ * values) to Linux as necessary. It also allows the BIOS to report
+ * non-standard error sources to Linux (for example, chipset-specific
+ * error registers).
+ *
+ * For more information about HEST, please refer to ACPI Specification
+ * version 4.0, section 17.3.2.
+ *
+ * Copyright 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/errno.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/mm.h>
+#include <xen/pfn.h>
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+#include "apei-internal.h"
+
+#define HEST_PFX "HEST: "
+
+static bool_t hest_disable;
+boolean_param("hest_disable", hest_disable);
+
+/* HEST table parsing */
+
+static struct acpi_table_hest *__read_mostly hest_tab;
+
+static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
+	[ACPI_HEST_TYPE_IA32_CHECK] = -1,	/* need further calculation */
+	[ACPI_HEST_TYPE_IA32_CORRECTED_CHECK] = -1,
+	[ACPI_HEST_TYPE_IA32_NMI] = sizeof(struct acpi_hest_ia_nmi),
+	[ACPI_HEST_TYPE_AER_ROOT_PORT] = sizeof(struct acpi_hest_aer_root),
+	[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
+	[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
+	[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+};
+
+static int hest_esrc_len(const struct acpi_hest_header *hest_hdr)
+{
+	u16 hest_type = hest_hdr->type;
+	int len;
+
+	if (hest_type >= ACPI_HEST_TYPE_RESERVED)
+		return 0;
+
+	len = hest_esrc_len_tab[hest_type];
+
+	if (hest_type == ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) {
+		const struct acpi_hest_ia_corrected *cmc =
+			container_of(hest_hdr,
+				     const struct acpi_hest_ia_corrected,
+				     header);
+
+		len = sizeof(*cmc) + cmc->num_hardware_banks *
+		      sizeof(struct acpi_hest_ia_error_bank);
+	} else if (hest_type == ACPI_HEST_TYPE_IA32_CHECK) {
+		const struct acpi_hest_ia_machine_check *mc =
+			container_of(hest_hdr,
+				     const struct acpi_hest_ia_machine_check,
+				     header);
+
+		len = sizeof(*mc) + mc->num_hardware_banks *
+		      sizeof(struct acpi_hest_ia_error_bank);
+	}
+	BUG_ON(len == -1);
+
+	return len;
+};
+
+int apei_hest_parse(apei_hest_func_t func, void *data)
+{
+	struct acpi_hest_header *hest_hdr;
+	int i, rc, len;
+
+	if (hest_disable || !hest_tab)
+		return -EINVAL;
+
+	hest_hdr = (struct acpi_hest_header *)(hest_tab + 1);
+	for (i = 0; i < hest_tab->error_source_count; i++) {
+		len = hest_esrc_len(hest_hdr);
+		if (!len) {
+			printk(XENLOG_WARNING HEST_PFX
+			       "Unknown or unused hardware error source "
+			       "type: %d for hardware error source: %d\n",
+			       hest_hdr->type, hest_hdr->source_id);
+			return -EINVAL;
+		}
+		if ((void *)hest_hdr + len >
+		    (void *)hest_tab + hest_tab->header.length) {
+			printk(XENLOG_WARNING HEST_PFX
+			       "Table contents overflow for hardware error source: %d\n",
+			       hest_hdr->source_id);
+			return -EINVAL;
+		}
+
+		rc = func(hest_hdr, data);
+		if (rc)
+			return rc;
+
+		hest_hdr = (void *)hest_hdr + len;
+	}
+
+	return 0;
+}
+
+/*
+ * Check if firmware advertises firmware first mode. We need FF bit to be set
+ * along with a set of MC banks which work in FF mode.
+ */
+static int __init hest_parse_cmc(const struct acpi_hest_header *hest_hdr,
+				 void *data)
+{
+#ifdef CONFIG_X86_MCE
+	unsigned int i;
+	const struct acpi_hest_ia_corrected *cmc;
+	const struct acpi_hest_ia_error_bank *mc_bank;
+
+	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+		return 0;
+
+	cmc = container_of(hest_hdr, const struct acpi_hest_ia_corrected, header);
+	if (!cmc->enabled)
+		return 0;
+
+	/*
+	 * We expect HEST to provide a list of MC banks that report errors
+	 * in firmware first mode. Otherwise, return non-zero value to
+	 * indicate that we are done parsing HEST.
+	 */
+	if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks)
+		return 1;
+
+	printk(XENLOG_INFO HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
+
+	mc_bank = (const struct acpi_hest_ia_error_bank *)(cmc + 1);
+	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+		mce_disable_bank(mc_bank->bank_number);
+#else
+# define acpi_disable_cmcff 1
+#endif
+
+	return 1;
+}
+
+void __init acpi_hest_init(void)
+{
+	acpi_status status;
+	acpi_physical_address hest_addr;
+	acpi_native_uint hest_len;
+
+	if (acpi_disabled)
+		return;
+
+	if (hest_disable) {
+		printk(XENLOG_INFO HEST_PFX "Table parsing disabled.\n");
+		return;
+	}
+
+	status = acpi_get_table_phys(ACPI_SIG_HEST, 0, &hest_addr, &hest_len);
+	if (status == AE_NOT_FOUND)
+		goto err;
+	if (ACPI_FAILURE(status)) {
+		printk(XENLOG_ERR HEST_PFX "Failed to get table, %s\n",
+		       acpi_format_exception(status));
+		goto err;
+	}
+	map_pages_to_xen((unsigned long)__va(hest_addr), PFN_DOWN(hest_addr),
+			 PFN_UP(hest_addr + hest_len) - PFN_DOWN(hest_addr),
+			 PAGE_HYPERVISOR);
+	hest_tab = __va(hest_addr);
+
+	if (!acpi_disable_cmcff)
+		apei_hest_parse(hest_parse_cmc, NULL);
+
+	printk(XENLOG_INFO HEST_PFX "Table parsing has been initialized\n");
+	return;
+err:
+	hest_disable = 1;
+}
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1069,6 +1069,106 @@ void __hwdom_init setup_hwdom_pci_device
     spin_unlock(&pcidevs_lock);
 }
 
+#ifdef CONFIG_ACPI
+#include <acpi/acpi.h>
+#include <acpi/apei.h>
+
+static int hest_match_pci(const struct acpi_hest_aer_common *p,
+                          const struct pci_dev *pdev)
+{
+    return ACPI_HEST_SEGMENT(p->bus) == pdev->seg &&
+           ACPI_HEST_BUS(p->bus)     == pdev->bus &&
+           p->device                 == PCI_SLOT(pdev->devfn) &&
+           p->function               == PCI_FUNC(pdev->devfn);
+}
+
+static bool_t hest_match_type(const struct acpi_hest_header *hest_hdr,
+                              const struct pci_dev *pdev)
+{
+    unsigned int pos = pci_find_cap_offset(pdev->seg, pdev->bus,
+                                           PCI_SLOT(pdev->devfn),
+                                           PCI_FUNC(pdev->devfn),
+                                           PCI_CAP_ID_EXP);
+    u8 pcie = MASK_EXTR(pci_conf_read16(pdev->seg, pdev->bus,
+                                        PCI_SLOT(pdev->devfn),
+                                        PCI_FUNC(pdev->devfn),
+                                        pos + PCI_EXP_FLAGS),
+                        PCI_EXP_FLAGS_TYPE);
+
+    switch ( hest_hdr->type )
+    {
+    case ACPI_HEST_TYPE_AER_ROOT_PORT:
+        return pcie == PCI_EXP_TYPE_ROOT_PORT;
+    case ACPI_HEST_TYPE_AER_ENDPOINT:
+        return pcie == PCI_EXP_TYPE_ENDPOINT;
+    case ACPI_HEST_TYPE_AER_BRIDGE:
+        return pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CLASS_DEVICE) ==
+               PCI_CLASS_BRIDGE_PCI;
+    }
+
+    return 0;
+}
+
+struct aer_hest_parse_info {
+    const struct pci_dev *pdev;
+    bool_t firmware_first;
+};
+
+static bool_t hest_source_is_pcie_aer(const struct acpi_hest_header *hest_hdr)
+{
+    if ( hest_hdr->type == ACPI_HEST_TYPE_AER_ROOT_PORT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_ENDPOINT ||
+         hest_hdr->type == ACPI_HEST_TYPE_AER_BRIDGE )
+        return 1;
+    return 0;
+}
+
+static int aer_hest_parse(const struct acpi_hest_header *hest_hdr, void *data)
+{
+    struct aer_hest_parse_info *info = data;
+    const struct acpi_hest_aer_common *p;
+    bool_t ff;
+
+    if ( !hest_source_is_pcie_aer(hest_hdr) )
+        return 0;
+
+    p = (const struct acpi_hest_aer_common *)(hest_hdr + 1);
+    ff = !!(p->flags & ACPI_HEST_FIRMWARE_FIRST);
+
+    /*
+     * If no specific device is supplied, determine whether
+     * FIRMWARE_FIRST is set for *any* PCIe device.
+     */
+    if ( !info->pdev )
+    {
+        info->firmware_first |= ff;
+        return 0;
+    }
+
+    /* Otherwise, check the specific device */
+    if ( p->flags & ACPI_HEST_GLOBAL ?
+         hest_match_type(hest_hdr, info->pdev) :
+         hest_match_pci(p, info->pdev) )
+    {
+        info->firmware_first = ff;
+        return 1;
+    }
+
+    return 0;
+}
+
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *pdev)
+{
+    struct aer_hest_parse_info info = { .pdev = pdev };
+
+    return pci_find_cap_offset(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                               PCI_FUNC(pdev->devfn), PCI_CAP_ID_EXP) &&
+           apei_hest_parse(aer_hest_parse, &info) >= 0 &&
+           info.firmware_first;
+}
+#endif
+
 static int _dump_pci_devices(struct pci_seg *pseg, void *arg)
 {
     struct pci_dev *pdev;
--- a/xen/drivers/passthrough/vtd/quirks.c
+++ b/xen/drivers/passthrough/vtd/quirks.c
@@ -386,9 +386,11 @@ void pci_vtd_quirk(const struct pci_dev 
     int dev = PCI_SLOT(pdev->devfn);
     int func = PCI_FUNC(pdev->devfn);
     int pos;
-    u32 val;
+    bool_t ff;
+    u32 val, val2;
     u64 bar;
     paddr_t pa;
+    const char *action;
 
     if ( pci_conf_read16(seg, bus, dev, func, PCI_VENDOR_ID) !=
          PCI_VENDOR_ID_INTEL )
@@ -438,7 +440,10 @@ void pci_vtd_quirk(const struct pci_dev 
                 pos = pci_find_next_ext_capability(seg, bus, pdev->devfn, pos,
                                                    PCI_EXT_CAP_ID_VNDR);
             }
+            ff = 0;
         }
+        else
+            ff = pcie_aer_get_firmware_first(pdev);
         if ( !pos )
         {
             printk(XENLOG_WARNING "%04x:%02x:%02x.%u without AER capability?\n",
@@ -447,18 +452,26 @@ void pci_vtd_quirk(const struct pci_dev 
         }
 
         val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
-                         val | PCI_ERR_UNC_UNSUP);
-        val = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
-        pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
-                         val | PCI_ERR_COR_ADV_NFAT);
+        val2 = pci_conf_read32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK);
+        if ( (val & PCI_ERR_UNC_UNSUP) && (val2 & PCI_ERR_COR_ADV_NFAT) )
+            action = "Found masked";
+        else if ( !ff )
+        {
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_UNCOR_MASK,
+                             val | PCI_ERR_UNC_UNSUP);
+            pci_conf_write32(seg, bus, dev, func, pos + PCI_ERR_COR_MASK,
+                             val2 | PCI_ERR_COR_ADV_NFAT);
+            action = "Masked";
+        }
+        else
+            action = "Must not mask";
 
         /* XPUNCERRMSK Send Completion with Unsupported Request */
         val = pci_conf_read32(seg, bus, dev, func, 0x20c);
         pci_conf_write32(seg, bus, dev, func, 0x20c, val | (1 << 4));
 
-        printk(XENLOG_INFO "Masked UR signaling on %04x:%02x:%02x.%u\n",
-               seg, bus, dev, func);
+        printk(XENLOG_INFO "%s UR signaling on %04x:%02x:%02x.%u\n",
+               action, seg, bus, dev, func);
         break;
 
     case 0x100: case 0x104: case 0x108: /* Sandybridge */
--- a/xen/include/acpi/actbl1.h
+++ b/xen/include/acpi/actbl1.h
@@ -445,6 +445,14 @@ struct acpi_hest_aer_common {
 #define ACPI_HEST_FIRMWARE_FIRST        (1)
 #define ACPI_HEST_GLOBAL                (1<<1)
 
+/*
+ * Macros to access the bus/segment numbers in Bus field above:
+ *  Bus number is encoded in bits 7:0
+ *  Segment number is encoded in bits 23:8
+ */
+#define ACPI_HEST_BUS(bus)              ((bus) & 0xFF)
+#define ACPI_HEST_SEGMENT(bus)          (((bus) >> 8) & 0xFFFF)
+
 /* Hardware Error Notification */
 
 struct acpi_hest_notify {
--- a/xen/include/acpi/apei.h
+++ b/xen/include/acpi/apei.h
@@ -12,6 +12,9 @@
 
 #define FIX_APEI_RANGE_MAX 64
 
+typedef int (*apei_hest_func_t)(const struct acpi_hest_header *, void *);
+int apei_hest_parse(apei_hest_func_t, void *);
+
 int erst_write(const struct cper_record_header *record);
 ssize_t erst_get_record_count(void);
 int erst_get_next_record_id(u64 *record_id);
--- a/xen/include/xen/acpi.h
+++ b/xen/include/xen/acpi.h
@@ -61,6 +61,7 @@ int acpi_boot_init (void);
 int acpi_boot_table_init (void);
 int acpi_numa_init (void);
 int erst_init(void);
+void acpi_hest_init(void);
 
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_table_handler handler);
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -144,6 +144,8 @@ int pci_find_next_ext_capability(int seg
 const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus,
                       unsigned int *dev, unsigned int *func);
 
+bool_t pcie_aer_get_firmware_first(const struct pci_dev *);
+
 struct pirq;
 int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);
 void msixtbl_pt_unregister(struct domain *, struct pirq *);

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 RFC] VT-d: honor firmware-first mode in XSA-59 workaround code
  2014-05-26 10:19 ` [PATCH v2 " Jan Beulich
@ 2014-06-03 14:38   ` Malcolm Crossley
  0 siblings, 0 replies; 18+ messages in thread
From: Malcolm Crossley @ 2014-06-03 14:38 UTC (permalink / raw)
  To: Jan Beulich, Andrew Cooper
  Cc: Yang Z Zhang, xen-devel, Kevin Tian, Donald D Dugger

On 26/05/14 11:19, Jan Beulich wrote:
> When firmware-first mode is being indicated by firmware, we shouldn't
> be modifying AER registers - these are considered to be owned by
> firmware in that case. Violating this is being reported to result in
> SMI storms. While circumventing the workaround means re-exposing
> affected hosts to the XSA-59 issues, this in any event seems better
> than not booting at all. Respective messages are being issued to the
> log, so the situation can be diagnosed.
> 
> The basic building blocks were taken from Linux 3.15-rc. Note that
> this includes a block of code enclosed in #ifdef CONFIG_X86_MCE - we
> don't define that symbol, and that code also wouldn't build without
> suitable machine check side code added; that should happen eventually,
> but isn't the subject of this change.
> 
> Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reported-by: Malcolm Crossley <malcolm.crossley@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Tested-by: Malcolm Crossley <malcolm.crossley@citrix.com>

> ---
> v2: Only check firmware-first for actual AER (i.e. ignore it for host
>     bridges in DMI mode). Use MASK_EXTR() instead of open coding it.
>     Re-word message from "cannot" to "must not".

I tested this patch on a Dell R810 (BIOS version 2.7.4); it successfully
detected firmware-first mode and did not mask device 0000:00:00.0.

The other root ports already had the relevant AER errors masked.

(XEN) Must not mask UR signaling on 0000:00:00.0
(XEN) PCI add device 0000:00:00.0
(XEN) Found masked UR signaling on 0000:00:01.0
(XEN) PCI add device 0000:00:01.0
(XEN) Found masked UR signaling on 0000:00:02.0
(XEN) PCI add device 0000:00:02.0
(XEN) Found masked UR signaling on 0000:00:03.0
(XEN) PCI add device 0000:00:03.0
(XEN) Found masked UR signaling on 0000:00:05.0
(XEN) PCI add device 0000:00:05.0
(XEN) Found masked UR signaling on 0000:00:07.0
(XEN) PCI add device 0000:00:07.0
(XEN) Found masked UR signaling on 0000:00:09.0
(XEN) PCI add device 0000:00:09.0
(XEN) Masked VT-d error signaling on 0000:00:14.0

lspci -n output for Dell R810:

00:00.0 0600: 8086:3407 (rev 22)
00:01.0 0604: 8086:3408 (rev 22)
00:02.0 0604: 8086:3409 (rev 22)
00:03.0 0604: 8086:340a (rev 22)
00:05.0 0604: 8086:340c (rev 22)
00:07.0 0604: 8086:340e (rev 22)
00:09.0 0604: 8086:3410 (rev 22)
00:14.0 0800: 8086:342e (rev 22)
00:14.1 0800: 8086:3422 (rev 22)
00:14.2 0800: 8086:3423 (rev 22)

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2014-06-03 14:39 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-05-21 16:09 [PATCH RFC] VT-d: honor firmware-first mode in XSA-59 workaround code Jan Beulich
2014-05-21 16:33 ` Andrew Cooper
2014-05-22  7:13   ` Jan Beulich
2014-05-22  9:34     ` Andrew Cooper
2014-05-22 10:06       ` Jan Beulich
2014-05-22 10:19         ` Andrew Cooper
2014-05-22 10:33           ` Jan Beulich
2014-05-22 14:19             ` Andrew Cooper
2014-05-23  1:03               ` Zhang, Yang Z
2014-05-23  6:13                 ` Jan Beulich
2014-05-23  6:40                   ` Zhang, Yang Z
2014-05-23  2:32 ` Zhang, Yang Z
2014-05-23  6:22   ` Jan Beulich
2014-05-23  6:46     ` Zhang, Yang Z
2014-05-23  7:15       ` Jan Beulich
2014-05-26  0:39         ` Zhang, Yang Z
2014-05-26 10:19 ` [PATCH v2 " Jan Beulich
2014-06-03 14:38   ` Malcolm Crossley

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).