LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 16/22] ppc/eeh: do reset based on PE
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch implements reset based on PE instead of eeh device. Also,
the functions used to retrieve the reset type, either hot or fundamental
reset, have been reworked a little bit. More specifically, it's
implemented based on the eeh device traverse function.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/ppc-pci.h   |    2 +-
 arch/powerpc/platforms/pseries/eeh.c |   91 +++++++++++++---------------------
 2 files changed, 35 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 5e34b10..2a80f08 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -53,7 +53,7 @@ void pci_addr_cache_remove_device(struct pci_dev *dev);
 struct pci_dev *pci_addr_cache_get_device(unsigned long addr);
 void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
-int eeh_reset_pe(struct eeh_dev *);
+int eeh_reset_pe(struct eeh_pe *);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 4572361..56a022b 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -455,17 +455,24 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
  */
 int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
 {
-	struct device_node *dn = pci_device_to_OF_node(dev);
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+	struct eeh_pe *pe = edev->pe;
+
+	if (!pe) {
+		pr_err("%s: No PE found on PCI device %s\n",
+			__func__, pci_name(dev));
+		return -EINVAL;
+	}
 
 	switch (state) {
 	case pcie_deassert_reset:
-		eeh_ops->reset(dn, EEH_RESET_DEACTIVATE);
+		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
 		break;
 	case pcie_hot_reset:
-		eeh_ops->reset(dn, EEH_RESET_HOT);
+		eeh_ops->reset(pe, EEH_RESET_HOT);
 		break;
 	case pcie_warm_reset:
-		eeh_ops->reset(dn, EEH_RESET_FUNDAMENTAL);
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
 		break;
 	default:
 		return -EINVAL;
@@ -475,66 +482,37 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 }
 
 /**
- * __eeh_set_pe_freset - Check the required reset for child devices
- * @parent: parent device
- * @freset: return value
- *
- * Each device might have its preferred reset type: fundamental or
- * hot reset. The routine is used to collect the information from
- * the child devices so that they could be reset accordingly.
- */
-void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset)
-{
-	struct device_node *dn;
-
-	for_each_child_of_node(parent, dn) {
-		if (of_node_to_eeh_dev(dn)) {
-			struct pci_dev *dev = of_node_to_eeh_dev(dn)->pdev;
-
-			if (dev && dev->driver)
-				*freset |= dev->needs_freset;
-
-			__eeh_set_pe_freset(dn, freset);
-		}
-	}
-}
-
-/**
- * eeh_set_pe_freset - Check the required reset for the indicated device and its children
- * @dn: parent device
- * @freset: return value
+ * eeh_set_pe_freset - Check the required reset for the indicated device
+ * @data: EEH device
+ * @flag: return value
  *
  * Each device might have its preferred reset type: fundamental or
  * hot reset. The routine is used to collected the information for
  * the indicated device and its children so that the bunch of the
  * devices could be reset properly.
  */
-void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset)
+static void *eeh_set_dev_freset(void *data, void *flag)
 {
 	struct pci_dev *dev;
-	dn = eeh_find_device_pe(dn);
-
-	/* Back up one, since config addrs might be shared */
-	if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent))
-		dn = dn->parent;
+	unsigned int *freset = (unsigned int *)flag;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
 
-	dev = of_node_to_eeh_dev(dn)->pdev;
+	dev = eeh_dev_to_pci_dev(edev);
 	if (dev)
 		*freset |= dev->needs_freset;
 
-	__eeh_set_pe_freset(dn, freset);
+	return NULL;
 }
 
 /**
  * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second
- * @edev: pci device node to be reset.
+ * @pe: EEH PE
  *
  * Assert the PCI #RST line for 1/4 second.
  */
-static void eeh_reset_pe_once(struct eeh_dev *edev)
+static void eeh_reset_pe_once(struct eeh_pe *pe)
 {
 	unsigned int freset = 0;
-	struct device_node *dn = eeh_dev_to_of_node(edev);
 
 	/* Determine type of EEH reset required for
 	 * Partitionable Endpoint, a hot-reset (1)
@@ -542,12 +520,12 @@ static void eeh_reset_pe_once(struct eeh_dev *edev)
 	 * A fundamental reset required by any device under
 	 * Partitionable Endpoint trumps hot-reset.
   	 */
-	eeh_set_pe_freset(dn, &freset);
+	eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
 
 	if (freset)
-		eeh_ops->reset(dn, EEH_RESET_FUNDAMENTAL);
+		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
 	else
-		eeh_ops->reset(dn, EEH_RESET_HOT);
+		eeh_ops->reset(pe, EEH_RESET_HOT);
 
 	/* The PCI bus requires that the reset be held high for at least
 	 * a 100 milliseconds. We wait a bit longer 'just in case'.
@@ -559,9 +537,9 @@ static void eeh_reset_pe_once(struct eeh_dev *edev)
 	 * pci slot reset line is dropped. Make sure we don't miss
 	 * these, and clear the flag now.
 	 */
-	eeh_clear_slot(dn, EEH_MODE_ISOLATED);
+	eeh_pe_state_clear(pe, EEH_MODE_ISOLATED);
 
-	eeh_ops->reset(dn, EEH_RESET_DEACTIVATE);
+	eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
 
 	/* After a PCI slot has been reset, the PCI Express spec requires
 	 * a 1.5 second idle time for the bus to stabilize, before starting
@@ -573,32 +551,31 @@ static void eeh_reset_pe_once(struct eeh_dev *edev)
 
 /**
  * eeh_reset_pe - Reset the indicated PE
- * @edev: PCI device associated EEH device
+ * @pe: EEH PE
  *
  * This routine should be called to reset indicated device, including
  * PE. A PE might include multiple PCI devices and sometimes PCI bridges
  * might be involved as well.
  */
-int eeh_reset_pe(struct eeh_dev *edev)
+int eeh_reset_pe(struct eeh_pe *pe)
 {
 	int i, rc;
-	struct device_node *dn = eeh_dev_to_of_node(edev);
 
 	/* Take three shots at resetting the bus */
 	for (i=0; i<3; i++) {
-		eeh_reset_pe_once(edev);
+		eeh_reset_pe_once(pe);
 
-		rc = eeh_ops->wait_state(dn, PCI_BUS_RESET_WAIT_MSEC);
+		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
 		if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE))
 			return 0;
 
 		if (rc < 0) {
-			printk(KERN_ERR "EEH: unrecoverable slot failure %s\n",
-			       dn->full_name);
+			pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x",
+				__func__, pe->phb->global_number, pe->addr);
 			return -1;
 		}
-		printk(KERN_ERR "EEH: bus reset %d failed on slot %s, rc=%d\n",
-		       i+1, dn->full_name, rc);
+		pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n",
+			i+1, pe->phb->global_number, pe->addr, rc);
 	}
 
 	return -1;
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 12/22] ppc/eeh: trace error based on PE from beginning
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

There are 2 conditions that trigger EEH error detection: an invalid value
returned from reading I/O space or from reading config space. In each case,
the function eeh_dn_check_failure() will be called to initialize an EEH
event and put it into the poll for further processing.

The patch changes the function a little bit so that the EEH error
will be traced based on PE instead of EEH device. Also, the function
eeh_find_device_pe() has been removed since the eeh device now traces
its PE through struct eeh_dev::pe.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/ppc-pci.h   |    1 -
 arch/powerpc/platforms/pseries/eeh.c |   51 +++++++++++++--------------------
 arch/powerpc/platforms/pseries/msi.c |    6 +++-
 3 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index c7e5bd6..3e301b1 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -59,7 +59,6 @@ int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_pe_state_mark(struct eeh_pe *pe, int state);
 void eeh_pe_state_clear(struct eeh_pe *pe, int state);
-struct device_node *eeh_find_device_pe(struct device_node *dn);
 
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 9c623c2..f210160 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -264,21 +264,6 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 }
 
 /**
- * eeh_find_device_pe - Retrieve the PE for the given device
- * @dn: device node
- *
- * Return the PE under which this device lies
- */
-struct device_node *eeh_find_device_pe(struct device_node *dn)
-{
-	while (dn->parent && of_node_to_eeh_dev(dn->parent) &&
-	       (of_node_to_eeh_dev(dn->parent)->mode & EEH_MODE_SUPPORTED)) {
-		dn = dn->parent;
-	}
-	return dn;
-}
-
-/**
  * eeh_dn_check_failure - Check if all 1's data is due to EEH slot freeze
  * @dn: device node
  * @dev: pci device, if known
@@ -297,6 +282,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 {
 	int ret;
 	unsigned long flags;
+	struct eeh_pe *pe;
 	struct eeh_dev *edev;
 	int rc = 0;
 	const char *location;
@@ -306,23 +292,26 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	if (!eeh_subsystem_enabled)
 		return 0;
 
-	if (!dn) {
+	if (dn) {
+		edev = of_node_to_eeh_dev(dn);
+	} else if (dev) {
+		edev = pci_dev_to_eeh_dev(dev);
+		dn = pci_device_to_OF_node(dev);
+	} else {
 		eeh_stats.no_dn++;
 		return 0;
 	}
-	dn = eeh_find_device_pe(dn);
-	edev = of_node_to_eeh_dev(dn);
+	pe = edev->pe;
 
 	/* Access to IO BARs might get this far and still not want checking. */
-	if (!(edev->mode & EEH_MODE_SUPPORTED) ||
-	    edev->mode & EEH_MODE_NOCHECK) {
+	if (!pe) {
 		eeh_stats.ignored_check++;
-		pr_debug("EEH: Ignored check (%x) for %s %s\n",
-			edev->mode, eeh_pci_name(dev), dn->full_name);
+		pr_debug("EEH: Ignored check for %s %s\n",
+			eeh_pci_name(dev), dn->full_name);
 		return 0;
 	}
 
-	if (!edev->config_addr && !edev->pe_config_addr) {
+	if (!pe->addr && !pe->config_addr) {
 		eeh_stats.no_cfg_addr++;
 		return 0;
 	}
@@ -335,13 +324,13 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	 */
 	raw_spin_lock_irqsave(&confirm_error_lock, flags);
 	rc = 1;
-	if (edev->mode & EEH_MODE_ISOLATED) {
-		edev->check_count++;
-		if (edev->check_count % EEH_MAX_FAILS == 0) {
+	if (pe->state & EEH_PE_ISOLATED) {
+		pe->check_count++;
+		if (pe->check_count % EEH_MAX_FAILS == 0) {
 			location = of_get_property(dn, "ibm,loc-code", NULL);
 			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
 				"location=%s driver=%s pci addr=%s\n",
-				edev->check_count, location,
+				pe->check_count, location,
 				eeh_driver_name(dev), eeh_pci_name(dev));
 			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
 				eeh_driver_name(dev));
@@ -357,7 +346,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	 * function zero of a multi-function device.
 	 * In any case they must share a common PHB.
 	 */
-	ret = eeh_ops->get_state(dn, NULL);
+	ret = eeh_ops->get_state(pe, NULL);
 
 	/* Note that config-io to empty slots may fail;
 	 * they are empty when they don't have children.
@@ -370,7 +359,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
 	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
 		eeh_stats.false_positives++;
-		edev->false_positives ++;
+		pe->false_positives++;
 		rc = 0;
 		goto dn_unlock;
 	}
@@ -381,10 +370,10 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	 * with other functions on this device, and functions under
 	 * bridges.
 	 */
-	eeh_mark_slot(dn, EEH_MODE_ISOLATED);
+	eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
 	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
 
-	eeh_send_failure_event(edev);
+	eeh_send_failure_event(pe);
 
 	/* Most EEH events are due to device driver bugs.  Having
 	 * a stack trace will help the device-driver authors figure
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 109fdb7..c8534fa 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -210,6 +210,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
 static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 {
 	struct device_node *dn;
+	struct eeh_dev *edev;
 
 	/* Found our PE and assume 8 at that point. */
 
@@ -217,7 +218,10 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 	if (!dn)
 		return NULL;
 
-	dn = eeh_find_device_pe(dn);
+	/* Get the top level device in the PE */
+	edev = of_node_to_eeh_dev(dn);
+	edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list);
+	dn = eeh_dev_to_of_node(edev);
 	if (!dn)
 		return NULL;
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 13/22] ppc/eeh: eeh options based on PE
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

Originally, all the EEH options were implemented based on OF node.
That explicitly breaks the rule that the operation target is a PE
rather than a device. Therefore, the patch makes all the operations
based on PE instead of device.

Unfortunately, the backend for config space has to be kept as original
because it doesn't depend on PE actually.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h               |   14 ++--
 arch/powerpc/platforms/pseries/eeh.c         |   13 ++-
 arch/powerpc/platforms/pseries/eeh_pseries.c |  133 +++++++++++---------------
 3 files changed, 74 insertions(+), 86 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index f86a85f..5e45a1c 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -136,13 +136,13 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 struct eeh_ops {
 	char *name;
 	int (*init)(void);
-	int (*set_option)(struct device_node *dn, int option);
-	int (*get_pe_addr)(struct device_node *dn);
-	int (*get_state)(struct device_node *dn, int *state);
-	int (*reset)(struct device_node *dn, int option);
-	int (*wait_state)(struct device_node *dn, int max_wait);
-	int (*get_log)(struct device_node *dn, int severity, char *drv_log, unsigned long len);
-	int (*configure_bridge)(struct device_node *dn);
+	int (*set_option)(struct eeh_pe *pe, int option);
+	int (*get_pe_addr)(struct eeh_pe *pe);
+	int (*get_state)(struct eeh_pe *pe, int *state);
+	int (*reset)(struct eeh_pe *pe, int option);
+	int (*wait_state)(struct eeh_pe *pe, int max_wait);
+	int (*get_log)(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len);
+	int (*configure_bridge)(struct eeh_pe *pe);
 	int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
 	int (*write_config)(struct device_node *dn, int where, int size, u32 val);
 };
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index f210160..3c8658e 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -729,6 +729,7 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 	const u32 *regs;
 	int enable;
 	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+	struct eeh_pe pe;
 
 	edev->class_code = 0;
 	edev->mode = 0;
@@ -755,9 +756,14 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 	 */
 	regs = of_get_property(dn, "reg", NULL);
 	if (regs) {
+		/* Initialize the fake PE */
+		memset(&pe, 0, sizeof(struct eeh_pe));
+		pe.phb = edev->phb;
+		pe.config_addr = regs[0];
+
 		/* First register entry is addr (00BBSS00)  */
 		/* Try to enable eeh */
-		ret = eeh_ops->set_option(dn, EEH_OPT_ENABLE);
+		ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
 
 		enable = 0;
 		if (ret == 0) {
@@ -766,14 +772,15 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 			/* If the newer, better, ibm,get-config-addr-info is supported, 
 			 * then use that instead.
 			 */
-			edev->pe_config_addr = eeh_ops->get_pe_addr(dn);
+			edev->pe_config_addr = eeh_ops->get_pe_addr(&pe);
+			pe.addr = edev->pe_config_addr;
 
 			/* Some older systems (Power4) allow the
 			 * ibm,set-eeh-option call to succeed even on nodes
 			 * where EEH is not supported. Verify support
 			 * explicitly.
 			 */
-			ret = eeh_ops->get_state(dn, NULL);
+			ret = eeh_ops->get_state(&pe, NULL);
 			if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
 				enable = 1;
 		}
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index cf6d6cc..fdeef77 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -134,22 +134,18 @@ static int pseries_eeh_init(void)
 
 /**
  * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable
- * @dn: device node
+ * @pe: EEH PE
  * @option: operation to be issued
  *
  * The function is used to control the EEH functionality globally.
  * Currently, following options are support according to PAPR:
  * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
  */
-static int pseries_eeh_set_option(struct device_node *dn, int option)
+static int pseries_eeh_set_option(struct eeh_pe *pe, int option)
 {
 	int ret = 0;
-	struct eeh_dev *edev;
-	const u32 *reg;
 	int config_addr;
 
-	edev = of_node_to_eeh_dev(dn);
-
 	/*
 	 * When we're enabling or disabling EEH functioality on
 	 * the particular PE, the PE config address is possibly
@@ -159,15 +155,11 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
 	switch (option) {
 	case EEH_OPT_DISABLE:
 	case EEH_OPT_ENABLE:
-		reg = of_get_property(dn, "reg", NULL);
-		config_addr = reg[0];
-		break;
-
 	case EEH_OPT_THAW_MMIO:
 	case EEH_OPT_THAW_DMA:
-		config_addr = edev->config_addr;
-		if (edev->pe_config_addr)
-			config_addr = edev->pe_config_addr;
+		config_addr = pe->config_addr;
+		if (pe->addr)
+			config_addr = pe->addr;
 		break;
 
 	default:
@@ -177,15 +169,15 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
 	}
 
 	ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
-			config_addr, BUID_HI(edev->phb->buid),
-			BUID_LO(edev->phb->buid), option);
+			config_addr, BUID_HI(pe->phb->buid),
+			BUID_LO(pe->phb->buid), option);
 
 	return ret;
 }
 
 /**
  * pseries_eeh_get_pe_addr - Retrieve PE address
- * @dn: device node
+ * @pe: EEH PE
  *
  * Retrieve the assocated PE address. Actually, there're 2 RTAS
  * function calls dedicated for the purpose. We need implement
@@ -196,14 +188,11 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
  * It's notable that zero'ed return value means invalid PE config
  * address.
  */
-static int pseries_eeh_get_pe_addr(struct device_node *dn)
+static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
 {
-	struct eeh_dev *edev;
 	int ret = 0;
 	int rets[3];
 
-	edev = of_node_to_eeh_dev(dn);
-
 	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
 		/*
 		 * First of all, we need to make sure there has one PE
@@ -211,18 +200,18 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
 		 * meaningless.
 		 */
 		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-				edev->config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid), 1);
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 1);
 		if (ret || (rets[0] == 0))
 			return 0;
 
 		/* Retrieve the associated PE config address */
 		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-				edev->config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid), 0);
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 0);
 		if (ret) {
-			pr_warning("%s: Failed to get PE address for %s\n",
-				__func__, dn->full_name);
+			pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n",
+				__func__, pe->phb->global_number, pe->config_addr);
 			return 0;
 		}
 
@@ -231,11 +220,11 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
 
 	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
-				edev->config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid), 0);
+				pe->config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), 0);
 		if (ret) {
-			pr_warning("%s: Failed to get PE address for %s\n",
-				__func__, dn->full_name);
+			pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n",
+				__func__, pe->phb->global_number, pe->config_addr);
 			return 0;
 		}
 
@@ -247,7 +236,7 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
 
 /**
  * pseries_eeh_get_state - Retrieve PE state
- * @dn: PE associated device node
+ * @pe: EEH PE
  * @state: return value
  *
  * Retrieve the state of the specified PE. On RTAS compliant
@@ -258,30 +247,28 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
  * RTAS calls for the purpose, we need to try the new one and back
  * to the old one if the new one couldn't work properly.
  */
-static int pseries_eeh_get_state(struct device_node *dn, int *state)
+static int pseries_eeh_get_state(struct eeh_pe *pe, int *state)
 {
-	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 	int rets[4];
 	int result;
 
 	/* Figure out PE config address if possible */
-	edev = of_node_to_eeh_dev(dn);
-	config_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		config_addr = edev->pe_config_addr;
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
 
 	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid));
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
 	} else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
 		/* Fake PE unavailable info */
 		rets[2] = 0;
 		ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid));
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
 	} else {
 		return EEH_STATE_NOT_SUPPORT;
 	}
@@ -333,34 +320,32 @@ static int pseries_eeh_get_state(struct device_node *dn, int *state)
 
 /**
  * pseries_eeh_reset - Reset the specified PE
- * @dn: PE associated device node
+ * @pe: EEH PE
  * @option: reset option
  *
  * Reset the specified PE
  */
-static int pseries_eeh_reset(struct device_node *dn, int option)
+static int pseries_eeh_reset(struct eeh_pe *pe, int option)
 {
-	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 
 	/* Figure out PE address */
-	edev = of_node_to_eeh_dev(dn);
-	config_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		config_addr = edev->pe_config_addr;
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
 
 	/* Reset PE through RTAS call */
 	ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
-			config_addr, BUID_HI(edev->phb->buid),
-			BUID_LO(edev->phb->buid), option);
+			config_addr, BUID_HI(pe->phb->buid),
+			BUID_LO(pe->phb->buid), option);
 
 	/* If fundamental-reset not supported, try hot-reset */
 	if (option == EEH_RESET_FUNDAMENTAL &&
 	    ret == -8) {
 		ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid), EEH_RESET_HOT);
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid), EEH_RESET_HOT);
 	}
 
 	return ret;
@@ -368,13 +353,13 @@ static int pseries_eeh_reset(struct device_node *dn, int option)
 
 /**
  * pseries_eeh_wait_state - Wait for PE state
- * @dn: PE associated device node
+ * @pe: EEH PE
  * @max_wait: maximal period in microsecond
  *
  * Wait for the state of associated PE. It might take some time
  * to retrieve the PE's state.
  */
-static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
+static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait)
 {
 	int ret;
 	int mwait;
@@ -391,7 +376,7 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
 #define EEH_STATE_MAX_WAIT_TIME	(300 * 1000)
 
 	while (1) {
-		ret = pseries_eeh_get_state(dn, &mwait);
+		ret = pseries_eeh_get_state(pe, &mwait);
 
 		/*
 		 * If the PE's state is temporarily unavailable,
@@ -426,7 +411,7 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
 
 /**
  * pseries_eeh_get_log - Retrieve error log
- * @dn: device node
+ * @pe: EEH PE
  * @severity: temporary or permanent error log
  * @drv_log: driver log to be combined with retrieved error log
  * @len: length of driver log
@@ -435,24 +420,22 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
  * Actually, the error will be retrieved through the dedicated
  * RTAS call.
  */
-static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_log, unsigned long len)
+static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len)
 {
-	struct eeh_dev *edev;
 	int config_addr;
 	unsigned long flags;
 	int ret;
 
-	edev = of_node_to_eeh_dev(dn);
 	spin_lock_irqsave(&slot_errbuf_lock, flags);
 	memset(slot_errbuf, 0, eeh_error_buf_size);
 
 	/* Figure out the PE address */
-	config_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		config_addr = edev->pe_config_addr;
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
 
 	ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr,
-			BUID_HI(edev->phb->buid), BUID_LO(edev->phb->buid),
+			BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid),
 			virt_to_phys(drv_log), len,
 			virt_to_phys(slot_errbuf), eeh_error_buf_size,
 			severity);
@@ -465,40 +448,38 @@ static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_l
 
 /**
  * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
- * @dn: PE associated device node
+ * @pe: EEH PE
  *
  * The function will be called to reconfigure the bridges included
  * in the specified PE so that the mulfunctional PE would be recovered
  * again.
  */
-static int pseries_eeh_configure_bridge(struct device_node *dn)
+static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
 {
-	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 
 	/* Figure out the PE address */
-	edev = of_node_to_eeh_dev(dn);
-	config_addr = edev->config_addr;
-	if (edev->pe_config_addr)
-		config_addr = edev->pe_config_addr;
+	config_addr = pe->config_addr;
+	if (pe->addr)
+		config_addr = pe->addr;
 
 	/* Use new configure-pe function, if supported */
 	if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid));
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
 	} else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_configure_bridge, 3, 1, NULL,
-				config_addr, BUID_HI(edev->phb->buid),
-				BUID_LO(edev->phb->buid));
+				config_addr, BUID_HI(pe->phb->buid),
+				BUID_LO(pe->phb->buid));
 	} else {
 		return -EFAULT;
 	}
 
 	if (ret)
-		pr_warning("%s: Unable to configure bridge %d for %s\n",
-			__func__, ret, dn->full_name);
+		pr_warning("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n",
+			__func__, pe->phb->global_number, pe->addr, ret);
 
 	return ret;
 }
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 11/22] ppc/eeh: trace EEH state based on PE
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

Since we've introduced a dedicated struct to trace individual PEs,
it's reasonable to trace their state through that dedicated struct
instead of using "eeh_dev".

The patch implements state tracing based on PE. It's notable
that the PE state will be applied to the specified PE as well as
its child PEs. That complies with the rule that a problematic parent
PE will prevent its child PEs from working properly.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h          |    3 +
 arch/powerpc/include/asm/ppc-pci.h      |    4 +-
 arch/powerpc/platforms/pseries/eeh.c    |  102 -------------------------------
 arch/powerpc/platforms/pseries/eeh_pe.c |   79 ++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 104 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 250ae27..f86a85f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -67,6 +67,9 @@ struct eeh_pe {
 	struct list_head child;		/* Child PEs			*/
 };
 
+#define eeh_pe_for_each_dev(pe, edev) \
+		list_for_each_entry(edev, &pe->edevs, list)
+
 /*
  * The struct is used to trace EEH state for the associated
  * PCI device node or PCI device. In future, it might
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 80fa704..c7e5bd6 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -57,8 +57,8 @@ int eeh_reset_pe(struct eeh_dev *);
 void eeh_restore_bars(struct eeh_dev *);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
-void eeh_mark_slot(struct device_node *dn, int mode_flag);
-void eeh_clear_slot(struct device_node *dn, int mode_flag);
+void eeh_pe_state_mark(struct eeh_pe *pe, int state);
+void eeh_pe_state_clear(struct eeh_pe *pe, int state);
 struct device_node *eeh_find_device_pe(struct device_node *dn);
 
 void eeh_sysfs_add_device(struct pci_dev *pdev);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index b60863b..9c623c2 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -279,108 +279,6 @@ struct device_node *eeh_find_device_pe(struct device_node *dn)
 }
 
 /**
- * __eeh_mark_slot - Mark all child devices as failed
- * @parent: parent device
- * @mode_flag: failure flag
- *
- * Mark all devices that are children of this device as failed.
- * Mark the device driver too, so that it can see the failure
- * immediately; this is critical, since some drivers poll
- * status registers in interrupts ... If a driver is polling,
- * and the slot is frozen, then the driver can deadlock in
- * an interrupt context, which is bad.
- */
-static void __eeh_mark_slot(struct device_node *parent, int mode_flag)
-{
-	struct device_node *dn;
-
-	for_each_child_of_node(parent, dn) {
-		if (of_node_to_eeh_dev(dn)) {
-			/* Mark the pci device driver too */
-			struct pci_dev *dev = of_node_to_eeh_dev(dn)->pdev;
-
-			of_node_to_eeh_dev(dn)->mode |= mode_flag;
-
-			if (dev && dev->driver)
-				dev->error_state = pci_channel_io_frozen;
-
-			__eeh_mark_slot(dn, mode_flag);
-		}
-	}
-}
-
-/**
- * eeh_mark_slot - Mark the indicated device and its children as failed
- * @dn: parent device
- * @mode_flag: failure flag
- *
- * Mark the indicated device and its child devices as failed.
- * The device drivers are marked as failed as well.
- */
-void eeh_mark_slot(struct device_node *dn, int mode_flag)
-{
-	struct pci_dev *dev;
-	dn = eeh_find_device_pe(dn);
-
-	/* Back up one, since config addrs might be shared */
-	if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent))
-		dn = dn->parent;
-
-	of_node_to_eeh_dev(dn)->mode |= mode_flag;
-
-	/* Mark the pci device too */
-	dev = of_node_to_eeh_dev(dn)->pdev;
-	if (dev)
-		dev->error_state = pci_channel_io_frozen;
-
-	__eeh_mark_slot(dn, mode_flag);
-}
-
-/**
- * __eeh_clear_slot - Clear failure flag for the child devices
- * @parent: parent device
- * @mode_flag: flag to be cleared
- *
- * Clear failure flag for the child devices.
- */
-static void __eeh_clear_slot(struct device_node *parent, int mode_flag)
-{
-	struct device_node *dn;
-
-	for_each_child_of_node(parent, dn) {
-		if (of_node_to_eeh_dev(dn)) {
-			of_node_to_eeh_dev(dn)->mode &= ~mode_flag;
-			of_node_to_eeh_dev(dn)->check_count = 0;
-			__eeh_clear_slot(dn, mode_flag);
-		}
-	}
-}
-
-/**
- * eeh_clear_slot - Clear failure flag for the indicated device and its children
- * @dn: parent device
- * @mode_flag: flag to be cleared
- *
- * Clear failure flag for the indicated device and its children.
- */
-void eeh_clear_slot(struct device_node *dn, int mode_flag)
-{
-	unsigned long flags;
-	raw_spin_lock_irqsave(&confirm_error_lock, flags);
-	
-	dn = eeh_find_device_pe(dn);
-	
-	/* Back up one, since config addrs might be shared */
-	if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent))
-		dn = dn->parent;
-
-	of_node_to_eeh_dev(dn)->mode &= ~mode_flag;
-	of_node_to_eeh_dev(dn)->check_count = 0;
-	__eeh_clear_slot(dn, mode_flag);
-	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
-}
-
-/**
  * eeh_dn_check_failure - Check if all 1's data is due to EEH slot freeze
  * @dn: device node
  * @dev: pci device, if known
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
index 0629ae5..6e33f03 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c
@@ -388,3 +388,82 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 
 	return 0;
 }
+
+/**
+ * __eeh_pe_state_mark - Mark the state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to mark the indicated state for the given
+ * PE. Also, the associated PCI devices will be put into IO frozen
+ * state as well.
+ */
+static void *__eeh_pe_state_mark(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	int state = *((int *)flag);
+	struct eeh_dev *tmp;
+	struct pci_dev *pdev;
+
+	/*
+	 * Mark the PE with the indicated state. Also,
+	 * the associated PCI device will be put into
+	 * I/O frozen state to avoid I/O accesses from
+	 * the PCI device driver.
+	 */
+	pe->state |= state;
+	eeh_pe_for_each_dev(pe, tmp) {
+		pdev = eeh_dev_to_pci_dev(tmp);
+		if (pdev)
+			pdev->error_state = pci_channel_io_frozen;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_state_mark - Mark specified state for PE and its associated device
+ * @pe: EEH PE
+ * @state: state to be marked
+ *
+ * An EEH error affects the given PE and its child PEs, so the state is
+ * marked on all of them and their PCI devices are put into frozen state.
+ */
+void eeh_pe_state_mark(struct eeh_pe *pe, int state)
+{
+	eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
+}
+
+/**
+ * __eeh_pe_state_clear - Clear state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to clear the indicated state from the
+ * given PE. Besides, we also clear the check count of the PE
+ * as well.
+ */
+static void *__eeh_pe_state_clear(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	int state = *((int *)flag);
+
+	pe->state &= ~state;
+	pe->check_count = 0;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_state_clear - Clear state for the PE and its children
+ * @pe: PE
+ * @state: state to be cleared
+ *
+ * When the PE and its children has been recovered from error,
+ * we need clear the error state for that. The function is used
+ * for the purpose.
+ */
+void eeh_pe_state_clear(struct eeh_pe *pe, int state)
+{
+	eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
+}
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 09/22] ppc/eeh: remove PE at appropriate time
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

During PCI hotplug and EEH recovery, the PE hierarchy tree might be
changed due to PCI topology changes. At a later point, when the
PCI device is added, the PE will be created dynamically again.

The patch introduces a new function to remove an EEH device from the
associated PE. That can also cause the parent PE to be removed
from the PE tree if the parent PE has neither valid EEH devices
nor child PEs.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h          |    1 +
 arch/powerpc/platforms/pseries/eeh.c    |    1 +
 arch/powerpc/platforms/pseries/eeh_pe.c |   47 +++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 6b13790..250ae27 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -167,6 +167,7 @@ static inline void eeh_unlock(void)
 typedef void *(*eeh_traverse_func)(void *data, void *flag);
 int __devinit eeh_phb_pe_create(struct pci_controller *phb);
 int eeh_add_to_parent_pe(struct eeh_dev *edev);
+int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
 
 void * __devinit eeh_dev_init(struct device_node *dn, void *data);
 void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 8f214906..b60863b 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -1156,6 +1156,7 @@ static void eeh_remove_device(struct pci_dev *dev)
 	dev->dev.archdata.edev = NULL;
 	pci_dev_put(dev);
 
+	eeh_rmv_from_parent_pe(edev);
 	pci_addr_cache_remove_device(dev);
 	eeh_sysfs_remove_device(dev);
 }
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
index 08b62de..0629ae5 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c
@@ -341,3 +341,50 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 
 	return 0;
 }
+
+/**
+ * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE
+ * @edev: EEH device
+ *
+ * Unlink the EEH device from its PE, then walk up towards the PHB PE
+ * and free every non-PHB PE that is left without EEH devices and
+ * child PEs. Returns 0 on success, or -EEXIST if the EEH device has
+ * no associated PE.
+ */
+int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
+{
+	struct eeh_pe *pe, *parent;
+
+	if (!edev->pe) {
+		pr_warning("%s: No PE found for EEH device %s\n",
+			__func__, edev->dn->full_name);
+		return -EEXIST;
+	}
+
+	/* Detach the EEH device from its PE */
+	pe = edev->pe;
+	edev->pe = NULL;
+	list_del(&edev->list);
+
+	/*
+	 * Starting from the PE the device belonged to, free
+	 * each PE that has neither EEH devices nor child PEs.
+	 * The parent is sampled before the PE may be freed,
+	 * and the walk stops at the PHB PE, which is kept.
+	 */
+	while (1) {
+		parent = pe->parent;
+		if (pe->type == EEH_PE_PHB)
+			break;
+
+		if (list_empty(&pe->edevs) &&
+		    list_empty(&pe->child_list)) {
+			list_del(&pe->child);
+			kfree(pe);
+		}
+
+		pe = parent;
+	}
+
+	return 0;
+}
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 07/22] ppc/eeh: Search PE based on requirement
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch implements searching PE based on the following
requirements:

 * Search PE according to PE address, which is traditional
   PE address that is composed of PCI bus/device/function
   number, or unified PE address assigned by firmware or
   platform.
 * Search parent PE according to the given EEH device. It's
   useful when creating new PE and put it into right position.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h          |    1 +
 arch/powerpc/platforms/pseries/eeh_pe.c |  143 +++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7b9c7d6..1cc1388 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -164,6 +164,7 @@ static inline void eeh_unlock(void)
  */
 #define EEH_MAX_ALLOWED_FREEZES 5
 
+typedef void *(*eeh_traverse_func)(void *data, void *flag);
 int __devinit eeh_phb_pe_create(struct pci_controller *phb);
 
 void * __devinit eeh_dev_init(struct device_node *dn, void *data);
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
index 535788e..6b53fb8 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c
@@ -118,3 +118,146 @@ static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
 
 	return NULL;
 }
+
+/**
+ * eeh_pe_next - Retrieve the next PE in the tree
+ * @pe: current PE
+ * @root: root PE
+ *
+ * The function is used to retrieve the next PE in the
+ * hierarchy PE tree.
+ */
+static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe,
+				  struct eeh_pe *root)
+{
+	struct list_head *next = pe->child_list.next;
+
+	if (next == &pe->child_list) {
+		while (1) {
+			if (pe == root)
+				return NULL;
+			next = pe->child.next;
+			if (next != &pe->parent->child_list)
+				break;
+			pe = pe->parent;
+		}
+	}
+
+	return list_entry(next, struct eeh_pe, child);
+}
+
+/**
+ * eeh_pe_traverse - Traverse PEs in the specified PHB
+ * @root: root PE
+ * @fn: callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the specified PE and its
+ * child PEs. The traversing is to be terminated once the
+ * callback returns something other than NULL, or no more PEs
+ * to be traversed.
+ */
+static void *eeh_pe_traverse(struct eeh_pe *root,
+			eeh_traverse_func fn, void *flag)
+{
+	struct eeh_pe *pe;
+	void *ret;
+
+	for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+		ret = fn(pe, flag);
+		if (ret) return ret;
+	}
+
+	return NULL;
+}
+
+/**
+ * __eeh_pe_get - Check the PE address
+ * @data: EEH PE
+ * @flag: EEH device
+ *
+ * For one particular PE, it can be identified by PE address
+ * or traditional BDF address. BDF address is composed of
+ * Bus/Device/Function number. Returns the PE if either address
+ * of the EEH device referred to by flag matches, otherwise NULL.
+ */
+static void *__eeh_pe_get(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	struct eeh_dev *edev = (struct eeh_dev *)flag;
+
+	/* PHB PEs are never matched by device address */
+	if (pe->type == EEH_PE_PHB)
+		return NULL;
+
+	/* We prefer PE address */
+	if (edev->pe_config_addr &&
+	   (edev->pe_config_addr == pe->addr))
+		return pe;
+
+	/* Try BDF address, guarded by the BDF address itself */
+	if (edev->config_addr &&
+	   (edev->config_addr == pe->config_addr))
+		return pe;
+
+	return NULL;
+}
+
+/**
+ * eeh_pe_get - Search PE based on the given address
+ * @edev: EEH device
+ *
+ * Search the corresponding PE based on the specified address which
+ * is included in the eeh device. The function is used to check if
+ * the associated PE has been created against the PE address. It's
+ * notable that the PE address has 2 format: traditional PE address
+ * which is composed of PCI bus/device/function number, or unified
+ * PE address.
+ */
+static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+{
+	struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
+	struct eeh_pe *pe;
+
+	eeh_lock();
+	pe = eeh_pe_traverse(root, __eeh_pe_get, edev);
+	eeh_unlock();
+
+	return pe;
+}
+
+/**
+ * eeh_pe_get_parent - Retrieve the parent PE
+ * @edev: EEH device
+ *
+ * The whole PEs existing in the system are organized as hierarchy
+ * tree. The function is used to retrieve the parent PE according
+ * to the parent EEH device.
+ */
+static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
+{
+	struct device_node *dn;
+	struct eeh_dev *parent;
+
+	/*
+	 * It might have the case for the indirect parent
+	 * EEH device already having associated PE, but
+	 * the direct parent EEH device doesn't have yet.
+	 */
+	dn = edev->dn->parent;
+	while (dn) {
+		/* We're poking out of PCI territory */
+		if (!PCI_DN(dn)) return NULL;
+
+		parent = of_node_to_eeh_dev(dn);
+		/* We're poking out of PCI territory */
+		if (!parent) return NULL;
+
+		if (parent->pe)
+			return parent->pe;
+
+		dn = dn->parent;
+	}
+
+	return NULL;
+}
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 08/22] ppc/eeh: create PEs duing EEH initialization
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch creates PEs and associates the newly created PEs with
their parents/siblings as well as EEH devices. It becomes more
straightforward to trace EEH errors and recover from them accordingly.

Once the EEH functionality on one PCI IOA has been enabled, we
try to create a PE for it. If there's an existing PE to which
the current PCI IOA should be attached, the existing PE will be
converted from "device" type to "bus" type accordingly.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h          |    1 +
 arch/powerpc/platforms/pseries/eeh.c    |    6 ++
 arch/powerpc/platforms/pseries/eeh_pe.c |   80 +++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 1cc1388..6b13790 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -166,6 +166,7 @@ static inline void eeh_unlock(void)
 
 typedef void *(*eeh_traverse_func)(void *data, void *flag);
 int __devinit eeh_phb_pe_create(struct pci_controller *phb);
+int eeh_add_to_parent_pe(struct eeh_dev *edev);
 
 void * __devinit eeh_dev_init(struct device_node *dn, void *data);
 void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 0ba7e3b..8f214906 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -895,6 +895,8 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 			eeh_subsystem_enabled = 1;
 			edev->mode |= EEH_MODE_SUPPORTED;
 
+			eeh_add_to_parent_pe(edev);
+
 			pr_debug("EEH: %s: eeh enabled, config=%x pe_config=%x\n",
 				 dn->full_name, edev->config_addr,
 				 edev->pe_config_addr);
@@ -908,6 +910,10 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 				/* Parent supports EEH. */
 				edev->mode |= EEH_MODE_SUPPORTED;
 				edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr;
+				edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr;
+
+				eeh_add_to_parent_pe(edev);
+
 				return NULL;
 			}
 		}
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
index 6b53fb8..08b62de 100644
--- a/arch/powerpc/platforms/pseries/eeh_pe.c
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c
@@ -261,3 +261,83 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
 
 	return NULL;
 }
+
+/**
+ * eeh_add_to_parent_pe - Add EEH device to parent PE
+ * @edev: EEH device
+ *
+ * Add EEH device to the parent PE. If the parent PE already
+ * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
+ * we have to create new PE to hold the EEH device and the new
+ * PE will be linked to its parent PE as well.
+ */
+int eeh_add_to_parent_pe(struct eeh_dev *edev)
+{
+	struct eeh_pe *pe, *parent;
+
+	/*
+	 * Search the PE has been existing or not according
+	 * to the PE address. If that has been existing, the
+	 * PE should be composed of PCI bus and its subordinate
+	 * components.
+	 */
+	pe = eeh_pe_get(edev);
+	if (pe) {
+		if (!edev->pe_config_addr) {
+			pr_err("%s: PE with addr 0x%x already exists\n",
+				__func__, edev->config_addr);
+			return -EEXIST;
+		}
+
+		/* Mark the PE as type of PCI bus */
+		pe->type = EEH_PE_BUS;
+		edev->pe = pe;
+
+		/* Put the edev to PE */
+		list_add_tail(&edev->list, &pe->edevs);
+		pr_debug("EEH: Add %s to Bus PE#%x\n",
+			edev->dn->full_name, pe->addr);
+
+		return 0;
+	}
+
+	/* Create a new EEH PE */
+	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+	if (!pe) {
+		pr_err("%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+	pe->addr	= edev->pe_config_addr;
+	pe->config_addr	= edev->config_addr;
+
+	/*
+	 * Put the new EEH PE into hierarchy tree. If the parent
+	 * can't be found, the newly created PE will be attached
+	 * to PHB directly. Otherwise, we have to associate the
+	 * PE with its parent.
+	 */
+	parent = eeh_pe_get_parent(edev);
+	if (!parent) {
+		parent = eeh_phb_pe_get(edev->phb);
+		if (!parent) {
+			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
+				__func__, edev->phb->global_number);
+			edev->pe = NULL;
+			kfree(pe);
+			return -EEXIST;
+		}
+	}
+	pe->parent = parent;
+
+	/*
+	 * Put the newly created PE into the child list and
+	 * link the EEH device accordingly.
+	 */
+	list_add_tail(&pe->child, &parent->child_list);
+	list_add_tail(&edev->list, &pe->edevs);
+	edev->pe = pe;
+	pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+		edev->dn->full_name, pe->addr, pe->parent->addr);
+
+	return 0;
+}
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 06/22] ppc/eeh: Create PEs for PHBs
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

For one particular PE, it's only meaningful in the ancestor PHB
domain. Therefore, each PHB should have its own PE hierarchy tree
to trace those PEs created against the PHB.

The patch creates PEs for the PHBs and puts those PEs into the
global linked list traced by "eeh_phb_pe". That linked list of PEs
forms the first level of the overall PE hierarchy tree across the
system.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h           |    2 +
 arch/powerpc/platforms/pseries/Makefile  |    5 +-
 arch/powerpc/platforms/pseries/eeh_dev.c |    4 +
 arch/powerpc/platforms/pseries/eeh_pe.c  |  120 ++++++++++++++++++++++++++++++
 4 files changed, 129 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/eeh_pe.c

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 248b3d9..7b9c7d6 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -164,6 +164,8 @@ static inline void eeh_unlock(void)
  */
 #define EEH_MAX_ALLOWED_FREEZES 5
 
+int __devinit eeh_phb_pe_create(struct pci_controller *phb);
+
 void * __devinit eeh_dev_init(struct device_node *dn, void *data);
 void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb);
 int __init eeh_ops_register(struct eeh_ops *ops);
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index c222189..890622b 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -6,8 +6,9 @@ obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
 			   firmware.o power.o dlpar.o mobility.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_SCANLOG)	+= scanlog.o
-obj-$(CONFIG_EEH)	+= eeh.o eeh_dev.o eeh_cache.o eeh_driver.o \
-			   eeh_event.o eeh_sysfs.o eeh_pseries.o
+obj-$(CONFIG_EEH)	+= eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
+			   eeh_driver.o eeh_event.o eeh_sysfs.o \
+			   eeh_pseries.o
 obj-$(CONFIG_KEXEC)	+= kexec.o
 obj-$(CONFIG_PCI)	+= pci.o pci_dlpar.o
 obj-$(CONFIG_PSERIES_MSI)	+= msi.o
diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/platforms/pseries/eeh_dev.c
index a0cee3a..6644234 100644
--- a/arch/powerpc/platforms/pseries/eeh_dev.c
+++ b/arch/powerpc/platforms/pseries/eeh_dev.c
@@ -65,6 +65,7 @@ void * __devinit eeh_dev_init(struct device_node *dn, void *data)
 	PCI_DN(dn)->edev = edev;
 	edev->dn  = dn;
 	edev->phb = phb;
+	INIT_LIST_HEAD(&edev->list);
 
 	return NULL;
 }
@@ -80,6 +81,9 @@ void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb)
 {
 	struct device_node *dn = phb->dn;
 
+	/* EEH PE for PHB */
+	eeh_phb_pe_create(phb);
+
 	/* EEH device for PHB */
 	eeh_dev_init(dn, phb);
 
diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c
new file mode 100644
index 0000000..535788e
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_pe.c
@@ -0,0 +1,120 @@
+/*
+ * The file intends to implement PE based on the information from
+ * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device.
+ * All the PEs should be organized as hierarchy tree. The first level
+ * of the tree will be associated to existing PHBs since the particular
+ * PE is only meaningful in one PHB domain.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+static LIST_HEAD(eeh_phb_pe);
+
+/**
+ * eeh_pe_alloc - Allocate PE
+ * @phb: PCI controller
+ * @type: PE type
+ *
+ * Allocate a zeroed PE of the given type; returns NULL on failure.
+ */
+static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
+{
+	struct eeh_pe *pe;
+
+	/* Allocate the PE, zero-filled */
+	pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL);
+	if (!pe) return NULL; 
+
+	/* Record type and PHB, and start with empty lists */
+	pe->type = type;
+	pe->phb = phb;
+	INIT_LIST_HEAD(&pe->child_list);
+	INIT_LIST_HEAD(&pe->child);
+	INIT_LIST_HEAD(&pe->edevs);
+
+	return pe;
+}
+
+/**
+ * eeh_phb_pe_create - Create PHB PE 
+ * @phb: PCI controller
+ *
+ * The function should be called while the PHB is detected during
+ * system boot or PCI hotplug in order to create PHB PE.
+ */
+int __devinit eeh_phb_pe_create(struct pci_controller *phb)
+{
+	struct eeh_pe *pe;
+
+	/* Allocate PHB PE */
+	pe = eeh_pe_alloc(phb, EEH_PE_PHB);
+	if (!pe) {
+		pr_err("%s: out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+ 
+	/* Put it into the list */
+	eeh_lock();
+	list_add_tail(&pe->child, &eeh_phb_pe);
+	eeh_unlock();
+
+	pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
+
+	return 0;
+}
+
+/**
+ * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB
+ * @phb: PCI controller
+ *
+ * The overall PEs form hierarchy tree. The first layer of the
+ * hierarchy tree is composed of PHB PEs. The function is used
+ * to retrieve the corresponding PHB PE according to the given PHB.
+ */
+static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
+{
+	struct eeh_pe *pe;
+
+	eeh_lock();
+
+	list_for_each_entry(pe, &eeh_phb_pe, child) {
+		/*
+		 * Actually, we needn't check the type since
+		 * the PE for PHB has been determined when that
+		 * was created.
+		 */
+		if (pe->type == EEH_PE_PHB &&
+		    pe->phb == phb) {
+			eeh_unlock();
+			return pe;
+		}
+	}
+
+	eeh_unlock();
+
+	return NULL;
+}
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 05/22] ppc/eeh: introduce global mutex
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch introduces global mutex for EEH so that the core data
structures can be protected by that. Also, 2 inline functions
are exported for that: eeh_lock() and eeh_unlock().

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h       |   15 +++++++++++++++
 arch/powerpc/platforms/pseries/eeh.c |    3 +++
 2 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index f77b6d7..248b3d9 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -146,6 +146,17 @@ struct eeh_ops {
 
 extern struct eeh_ops *eeh_ops;
 extern int eeh_subsystem_enabled;
+extern struct mutex eeh_mutex;
+
+static inline void eeh_lock(void)
+{
+	mutex_lock(&eeh_mutex);
+}
+
+static inline void eeh_unlock(void)
+{
+	mutex_unlock(&eeh_mutex);
+}
 
 /*
  * Max number of EEH freezes allowed before we consider the device
@@ -206,6 +217,10 @@ static inline void eeh_add_device_tree_early(struct device_node *dn) { }
 static inline void eeh_add_device_tree_late(struct pci_bus *bus) { }
 
 static inline void eeh_remove_bus_device(struct pci_dev *dev) { }
+
+static inline void eeh_lock(void) { }
+static inline void eeh_unlock(void) { }
+
 #define EEH_POSSIBLE_ERROR(val, type) (0)
 #define EEH_IO_ERROR_VALUE(size) (-1UL)
 #endif /* CONFIG_EEH */
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index e819448..0ba7e3b 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -92,6 +92,9 @@ struct eeh_ops *eeh_ops = NULL;
 int eeh_subsystem_enabled;
 EXPORT_SYMBOL(eeh_subsystem_enabled);
 
+/* Global EEH mutex */
+DEFINE_MUTEX(eeh_mutex);
+
 /* Lock to avoid races due to multiple reports of an error */
 static DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 04/22] ppc/eeh: Introduce eeh_pe struct
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

As defined in PAPR 2.4, a Partitionable Endpoint (PE) is an I/O subtree
that can be treated as a unit for the purposes of partitioning and error
recovery. Therefore, the eeh core should be aware of PEs. With the eeh_pe
struct, we can support PEs explicitly. Furthermore, it makes the data much
more centralized. Another important reason is for the eeh core to support
multiple platforms. Some of them, like pSeries, figure out PEs through
OF nodes while others, like powernv, have to do that through the PCI
bus/device tree. With explicit PE support, the eeh core will be implemented
based on the centralized data, and platform dependent implementations
figure the PEs out in whatever way is feasible for them.

When the struct is designed, following factors are taken in account:
  * Reflecting the relationships of PEs. PE might have parent
    as well children.
  * Reflecting the association of PE and (eeh) devices.
  * PEs have PHB boundary.
  * PE should have unique address assigned in the corresponding
    PHB domain.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h |   38 ++++++++++++++++++++++++++++++++++++++
 1 files changed, 38 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 06dedff..f77b6d7 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -32,6 +32,42 @@ struct device_node;
 #ifdef CONFIG_EEH
 
 /*
+ * The struct is used to trace PE related EEH functionality.
+ * In theory, one instance of the struct is created for each
+ * particular PE. By nature, PEs correlate to each other, so
+ * the struct has to reflect that hierarchy in order to easily
+ * pick up the affected PEs when one particular PE has EEH
+ * errors.
+ *
+ * Also, one particular PE might be composed of a PCI device, or a
+ * PCI bus and its subordinate components. The struct needs to carry
+ * that information as well. Furthermore, one particular PE is only
+ * meaningful in the corresponding PHB. Therefore, the root PEs should
+ * be created against existing PHBs in one-to-one fashion.
+ */
+#define EEH_PE_PHB	1	/* PHB PE    */
+#define EEH_PE_DEVICE 	2	/* Device PE */
+#define EEH_PE_BUS	3	/* Bus PE    */
+
+#define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
+#define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
+
+struct eeh_pe {
+	int type;			/* PE type: PHB/Bus/Device	*/
+	int state;			/* PE EEH dependent mode	*/
+	int config_addr;		/* Traditional PCI address	*/
+	int addr;			/* PE configuration address	*/
+	struct pci_controller *phb;	/* Associated PHB		*/
+	int check_count;		/* Times of ignored error	*/
+	int freeze_count;		/* Times of froze up		*/
+	int false_positives;		/* Times of reported #ff's	*/
+	struct eeh_pe *parent;		/* Parent PE			*/
+	struct list_head child_list;	/* Link PE to the child list	*/
+	struct list_head edevs;		/* Link list of EEH devices	*/
+	struct list_head child;		/* Child PEs			*/
+};
+
+/*
  * The struct is used to trace EEH state for the associated
  * PCI device node or PCI device. In future, it might
  * represent PE as well so that the EEH device to form
@@ -53,6 +89,8 @@ struct eeh_dev {
 	int freeze_count;		/* Times of froze up		*/
 	int false_positives;		/* Times of reported #ff's	*/
 	u32 config_space[16];		/* Saved PCI config space	*/
+	struct eeh_pe *pe;		/* Associated PE		*/
+	struct list_head list;		/* Form link list in the PE	*/
 	struct pci_controller *phb;	/* Associated PHB		*/
 	struct device_node *dn;		/* Associated device node	*/
 	struct pci_dev *pdev;		/* Associated PCI device	*/
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 03/22] ppc/eeh: more logs for EEH initialization
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

The patch adds more logs to EEH initialization functions for
debugging purpose. Also, the machine type (pSeries) is checked
in the platform initialization to assure it's the correct platform
to invoke it.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/eeh_dev.c     |    2 ++
 arch/powerpc/platforms/pseries/eeh_pseries.c |   14 +++++++++++++-
 2 files changed, 15 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/platforms/pseries/eeh_dev.c
index 8e3443b..a0cee3a 100644
--- a/arch/powerpc/platforms/pseries/eeh_dev.c
+++ b/arch/powerpc/platforms/pseries/eeh_dev.c
@@ -100,6 +100,8 @@ static int __init eeh_dev_phb_init(void)
 	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
 		eeh_dev_phb_init_dynamic(phb);
 
+	pr_info("EEH: devices created\n");
+
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 5e2805a..cf6d6cc 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -561,7 +561,19 @@ static struct eeh_ops pseries_eeh_ops = {
  */
 static int __init eeh_pseries_init(void)
 {
-	return eeh_ops_register(&pseries_eeh_ops);
+	int ret = -EINVAL;
+
+	if (!machine_is(pseries))
+		return ret;
+
+	ret = eeh_ops_register(&pseries_eeh_ops);
+	if (!ret)
+		pr_info("EEH: pSeries platform initialized\n");
+	else
+		pr_info("EEH: pSeries platform initialization failure (%d)\n",
+			ret);
+
+	return ret;
 }
 
 early_initcall(eeh_pseries_init);
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 02/22] ppc/eeh: use slab to allocate eeh devices
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan
In-Reply-To: <1347093863-6319-1-git-send-email-shangw@linux.vnet.ibm.com>

The EEH initialization functions have been postponed until slab/slub
are ready. So we use slab/slub to allocate the memory chunks for newly
created EEH devices. That would save lots of memory.

The patch also does cleanup to replace "kmalloc" with "kzalloc" so
that we needn't clear the allocated memory chunk explicitly.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/eeh_cache.c |    2 +-
 arch/powerpc/platforms/pseries/eeh_dev.c   |    2 +-
 arch/powerpc/platforms/pseries/eeh_event.c |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
index e5ae1c6..f50b717 100644
--- a/arch/powerpc/platforms/pseries/eeh_cache.c
+++ b/arch/powerpc/platforms/pseries/eeh_cache.c
@@ -151,7 +151,7 @@ pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
 			return piar;
 		}
 	}
-	piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+	piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
 	if (!piar)
 		return NULL;
 
diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/platforms/pseries/eeh_dev.c
index ab68c59..8e3443b 100644
--- a/arch/powerpc/platforms/pseries/eeh_dev.c
+++ b/arch/powerpc/platforms/pseries/eeh_dev.c
@@ -55,7 +55,7 @@ void * __devinit eeh_dev_init(struct device_node *dn, void *data)
 	struct eeh_dev *edev;
 
 	/* Allocate EEH device */
-	edev = zalloc_maybe_bootmem(sizeof(*edev), GFP_KERNEL);
+	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
 	if (!edev) {
 		pr_warning("%s: out of memory\n", __func__);
 		return NULL;
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index fb50631..6132772 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -139,7 +139,7 @@ int eeh_send_failure_event(struct eeh_dev *edev)
 		printk(KERN_ERR "EEH: PCI location = %s\n", location);
 		return 1;
 	}
-	event = kmalloc(sizeof(*event), GFP_ATOMIC);
+	event = kzalloc(sizeof(*event), GFP_ATOMIC);
 	if (event == NULL) {
 		printk(KERN_ERR "EEH: out of memory, event not handled\n");
 		return 1;
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 00/22 V4] powerpc/eeh: PE support
From: Gavin Shan @ 2012-09-08  8:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan

The series of patches address explicit PE support as well as probe type
support. For explicit PE support, struct eeh_pe has been introduced.
While designing the struct, following factors have been taken into
account.

   * A particular PE might be composed of a single PCI device, or of
     multiple PCI devices and their derived child PCI devices (e.g.
     behind PCIe bridges). The PE struct includes a linked list that refers
     to the included PCI devices. The linked list of devices reflects the
     top-to-bottom fashion of the PCI subtree. That is to say, the first device
     in the linked list should be the topmost element in the PCI subtree which
     is being managed by the PE.
   * PEs correlate to each other, so the existing PEs have to form hierarchy
     levels. Some fields in the PE struct (e.g. parent/child/sibling)
     have been introduced for that purpose.
   * A PE is only meaningful within its PHB domain.

In addition, the mechanisms used to restore memory BARs and to report errors
have been reworked based on PE. The eeh cache has been reworked a little bit
based on Ben's suggestion to trace the eeh device.

In order for explicit probe support, either OF node or pci device, global
variable and some inline functions are introduced. For pSeries platform, it's
going to support OF node probe and figure out PEs from the corresponding OF
nodes. In contrast, powernv platform has to use pci device probe type since
the PEs are being constructed at PHB fixup time.

The series of patches have been verified on Firebird-L machine using "errinjct"
utility. Here's the command used for that.

errinjct eeh -v -f 0 -p U78AE.001.WZS00M9-P1-C18-L1-T2 -a 0x0 -m 0x0

V3 -> V4:
	* The V4 patches were built on 3.6.RC4 as V3 did.
	* Changelog changes according to Ben's comments. More specifically,
	  replace FDT with device tree or similar terminology.
	* Print return value when failing to register platform dependent EEH
	  operations in eeh_pseries_init().
	* Introduce function eeh_pe_alloc() to allocate instance of EEH PE
	  and initialize its link lists. The PE type and corresponding PHB
	  are also assigned during PE creation time.
	* Change eeh_phb_pe_create() to use eeh_pe_alloc().
	* Introduce function eeh_add_to_parent_pe() to replace the original
	  function eeh_pe_create().
	* Change pr_info() to pr_debug() while associating EEH device with its
	  parent PE to reduce output from system console.
	* Rename eeh_pe_remove() to eeh_rmv_from_parent_pe().
	* Change pr_err() to pr_warning() when we can't find the parent PE for
	  the given EEH device in eeh_rmv_from_parent_pe().
	* Fix the expression to check if the given PE is PHB sensitive PE in
	  function eeh_rmv_from_parent_pe().
	* Rename EEH_PROBE_MODE_FDT to EEH_PROBE_MODE_DEVTREE.
	* Rename function eeh_probe_mode_fdt() to eeh_probe_mode_devtree().
	* Cleanup on function names for EEH cache so that they have prefix "eeh"
	  and more short. Besides, the printk() has been replaced with pr_warning()
	  or pr_debug().
V2 -> V3:
	* Rebase to 3.6.RC4.
V1 -> V2:
	* Rebase to 3.5.RC4.
	* Use the link list to trace the relationships of PEs, PE and eeh
	  devices according to Ram's suggestion.
	* Simplify the PE traversal function according to Ram's example.
	* Move EEH initialization around according to Ben's suggestion so
	  that we can do memory allocation through slab.
	* Use kzalloc() to allocate memory chunks for PE and eeh devices.
	* More booting messages for EEH initialization functions.
	* Introduce global EEH mutex to protect the PEs and eeh devices.
	* Added functions to support PE removal.
	* Comments cleanup
	* Change on the comparison of PE or BDF (Bus/Device/Function)
	  address so that code looks more readable.

-----

arch/powerpc/include/asm/eeh.h               |  136 +++++--
arch/powerpc/include/asm/eeh_event.h         |    6 +-
arch/powerpc/include/asm/pci-bridge.h        |    2 +
arch/powerpc/include/asm/ppc-pci.h           |   20 +-
arch/powerpc/kernel/rtas_pci.c               |    5 +-
arch/powerpc/platforms/pseries/Makefile      |    5 +-
arch/powerpc/platforms/pseries/eeh.c         |  531 +++++------------------
arch/powerpc/platforms/pseries/eeh_cache.c   |   57 ++--
arch/powerpc/platforms/pseries/eeh_dev.c     |   14 +-
arch/powerpc/platforms/pseries/eeh_driver.c  |  235 +++++------
arch/powerpc/platforms/pseries/eeh_event.c   |   54 +--
arch/powerpc/platforms/pseries/eeh_pe.c      |  591 ++++++++++++++++++++++++++
arch/powerpc/platforms/pseries/eeh_pseries.c |  247 ++++++++----
arch/powerpc/platforms/pseries/eeh_sysfs.c   |    9 -
arch/powerpc/platforms/pseries/msi.c         |    6 +-
arch/powerpc/platforms/pseries/pci.c         |    2 +-
arch/powerpc/platforms/pseries/setup.c       |    2 -
17 files changed, 1154 insertions(+), 768 deletions(-)
create mode 100644 arch/powerpc/platforms/pseries/eeh_pe.c

Thanks,
Gavin

^ permalink raw reply

* [PATCH] powerpc: Initialise paca.data_offset with poison
From: Michael Ellerman @ 2012-09-08  1:31 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Anton Blanchard

It's possible for the cpu_possible_mask to change between the time we
initialise the pacas and the time we setup per_cpu areas.

Obviously impossible cpus shouldn't ever be running, but stranger things
have happened. So be paranoid and initialise data_offset with a poison
value in case we don't set it up later.

Based on a patch from Anton Blanchard.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
---
 arch/powerpc/kernel/paca.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index fbe1a12..cd6da85 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -142,6 +142,7 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
 	new_paca->hw_cpu_id = 0xffff;
 	new_paca->kexec_state = KEXEC_STATE_NONE;
 	new_paca->__current = &init_task;
+	new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL;
 #ifdef CONFIG_PPC_STD_MMU_64
 	new_paca->slb_shadow_ptr = &slb_shadow[cpu];
 #endif /* CONFIG_PPC_STD_MMU_64 */
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH 2/2] powerpc/e6500: TLB miss handler with hardware tablewalk support
From: Nishanth Aravamudan @ 2012-09-08  0:55 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Scott Wood, linuxppc-dev
In-Reply-To: <1347061804.2385.58.camel@pasglop>

On 08.09.2012 [09:50:04 +1000], Benjamin Herrenschmidt wrote:
<snip>
> BTW. On another note, can you pickup Ananth series for larger address

I think you mean Aneesh here? Just to help Scott find the thread.

Thanks,
Nish

> space (minus the one patch that breaks the BookE build, it shouldn't
> matter) and see if there's any runtime issue on BookE 64 ? (And whether
> the larger address space actually works for you too, using something
> like high up mmap tests)
> 
> Cheers,
> Ben.
> 
> 
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
> 

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/e6500: TLB miss handler with hardware tablewalk support
From: Benjamin Herrenschmidt @ 2012-09-07 23:50 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev
In-Reply-To: <504A7CBC.80803@freescale.com>


> That would be one more cache line that misses need... and the threads
> share cache, so there's no ping-pong.

Ok, keep it that way then.

> >> After all that, do you have some bullets left for the HW designers ?
> 
> They seem to not care much about making our lives easier, only how bad
> the benchmarks will be without it -- and they seem to think TLB miss
> performance is no longer important since we won't take them as often
> with hardware tablewalk.  I suspect they'll be regretting that when they
> see workloads that thrash TLB1's ability to hold 2MiB indirect pages.
> Then it'll probably be "why can't you use larger page tables?" :-P

Didn't you simulate ?

> >>> +tlb_miss_common_e6500:
> >>> +	/*
> >>> +	 * Search if we already have an indirect entry for that virtual
> >>> +	 * address, and if we do, bail out.
> >>> +	 *
> >>> +	 * MAS6:IND should be already set based on MAS4
> >>> +	 */
> >>> +	addi	r10,r11,PERCORE_TLB_LOCK
> >>> +1:	lbarx	r15,0,r10
> >>> +	cmpdi	r15,0
> >>> +	bne	2f
> >>> +	li	r15,1
> >>> +	stbcx.	r15,0,r10
> >>
> >> No need for barriers here ?
> 
> I don't think so.  We're not guarding memory accesses, just the
> tlbsx+tlbwe.  At least on FSL cores those instructions have enough
> internal sync that isync shouldn't be needed (according to the core
> manual tlbsx, tlbwe, and stbcx. all have presync and postsync, so
> nothing else should be able to run at the same time).  And this is
> FSL-specific code. :-)

Sadly...

> >>>  #endif /* CONFIG_PPC64 */
> >>> @@ -377,7 +382,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
> >>>  {
> >>>  	int tsize = mmu_psize_defs[mmu_pte_psize].enc;
> >>>  
> >>> -	if (book3e_htw_enabled) {
> >>> +	if (book3e_htw_mode) {
> >>
> >> Make it if (boot3e_htw_enabled != PPC_HTW_NONE)
> 
> Seems a little verbose, but OK.
> 
> Same with things like this, I guess:
> 	book3e_htw_mode ? "enabled" : "not supported"

Well, it's no longer a boolean so ...

BTW. On another note, can you pickup Ananth series for larger address
space (minus the one patch that breaks the BookE build, it shouldn't
matter) and see if there's any runtime issue on BookE 64 ? (And whether
the larger address space actually works for you too, using something
like high up mmap tests)

Cheers,
Ben.

^ permalink raw reply

* Re: [PATCH] powerpc/booke-64: fix tlbsrx. path in bolted tlb handler
From: Scott Wood @ 2012-09-07 23:03 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <504A4C43.3040206@freescale.com>

On 09/07/2012 02:34 PM, Scott Wood wrote:
> On 09/06/2012 11:23 PM, Benjamin Herrenschmidt wrote:
>> On Tue, 2012-06-12 at 17:02 -0500, Scott Wood wrote:
>>> It was branching to the cleanup part of the non-bolted handler,
>>> which would have been bad if there were any chips with tlbsrx.
>>> that use the bolted handler.
>>
>> Still relevant ? It doesn't apply anymore :-)

It's still relevant -- I'll respin.

-Scott

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/e6500: TLB miss handler with hardware tablewalk support
From: Scott Wood @ 2012-09-07 23:01 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev
In-Reply-To: <504A4C3D.3040709@freescale.com>

On 09/07/2012 02:34 PM, Scott Wood wrote:
> On 09/06/2012 11:41 PM, Benjamin Herrenschmidt wrote:
>>> diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
>>> index eeabcdb..3072aa0 100644
>>> --- a/arch/powerpc/include/asm/mmu-book3e.h
>>> +++ b/arch/powerpc/include/asm/mmu-book3e.h
>>> @@ -264,8 +264,21 @@ extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
>>>  extern int mmu_linear_psize;
>>>  extern int mmu_vmemmap_psize;
>>>  
>>> +struct book3e_tlb_per_core {
>>> +	/* For software way selection, as on Freescale TLB1 */
>>> +	u8 esel_next, esel_max, esel_first;
>>> +
>>> +	/* Per-core spinlock for e6500 TLB handlers (no tlbsrx.) */
>>> +	u8 lock;
>>> +};
>>
>> I'm no fan of the name ... tlb_core_data ?

tlb_core_data is fine with me.

>> Probably don't even need the book3e prefix really.

Right, it's already in a book3e file.

>>>  #if defined(CONFIG_PPC_STD_MMU_64)
>>>  /* 64-bit classic hash table MMU */
>>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>>> index daf813f..4e18bb5 100644
>>> --- a/arch/powerpc/include/asm/paca.h
>>> +++ b/arch/powerpc/include/asm/paca.h
>>> @@ -108,6 +108,12 @@ struct paca_struct {
>>>  	/* Keep pgd in the same cacheline as the start of extlb */
>>>  	pgd_t *pgd __attribute__((aligned(0x80))); /* Current PGD */
>>>  	pgd_t *kernel_pgd;		/* Kernel PGD */
>>> +
>>> +	struct book3e_tlb_per_core tlb_per_core;
>>> +
>>> +	/* Points to the tlb_per_core of the first thread on this core. */
>>> +	struct book3e_tlb_per_core *tlb_per_core_ptr;
>>> +
>>
>> That's gross. Can't you allocate them elsewhere and then populate the
>> PACA pointers ?

That would be one more cache line that misses need... and the threads
share cache, so there's no ping-pong.

>>> @@ -142,6 +173,8 @@ static void check_smt_enabled(void)
>>>  			of_node_put(dn);
>>>  		}
>>>  	}
>>> +
>>> +	setup_tlb_per_core();
>>>  }
>>
>> I'd rather you move that to the caller

OK.

>>> +/*
>>> + * TLB miss handling for e6500 and derivatives, using hardware tablewalk.
>>> + *
>>> + * Linear mapping is bolted: no virtual page table or nested TLB misses
>>> + * Indirect entries in TLB1, hardware loads resulting direct entries
>>> + *    into TLB0
>>> + * No HES or NV hint on TLB1, so we need to do software round-robin
>>> + * No tlbsrx. so we need a spinlock, and we have to deal
>>> + *    with MAS-damage caused by tlbsx
>>
>> Ouch ... so for every indirect entry you have to take a lock, backup the
>> MAS, do a tlbsx, restore the MAS, insert the entry and drop the lock ?

Pretty much (only a couple of the MASes need to be restored).

>> After all that, do you have some bullets left for the HW designers ?

They seem to not care much about making our lives easier, only how bad
the benchmarks will be without it -- and they seem to think TLB miss
performance is no longer important since we won't take them as often
with hardware tablewalk.  I suspect they'll be regretting that when they
see workloads that thrash TLB1's ability to hold 2MiB indirect pages.
Then it'll probably be "why can't you use larger page tables?" :-P

>>> +tlb_miss_common_e6500:
>>> +	/*
>>> +	 * Search if we already have an indirect entry for that virtual
>>> +	 * address, and if we do, bail out.
>>> +	 *
>>> +	 * MAS6:IND should be already set based on MAS4
>>> +	 */
>>> +	addi	r10,r11,PERCORE_TLB_LOCK
>>> +1:	lbarx	r15,0,r10
>>> +	cmpdi	r15,0
>>> +	bne	2f
>>> +	li	r15,1
>>> +	stbcx.	r15,0,r10
>>
>> No need for barriers here ?

I don't think so.  We're not guarding memory accesses, just the
tlbsx+tlbwe.  At least on FSL cores those instructions have enough
internal sync that isync shouldn't be needed (according to the core
manual tlbsx, tlbwe, and stbcx. all have presync and postsync, so
nothing else should be able to run at the same time).  And this is
FSL-specific code. :-)

>>>  #endif /* CONFIG_PPC64 */
>>> @@ -377,7 +382,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
>>>  {
>>>  	int tsize = mmu_psize_defs[mmu_pte_psize].enc;
>>>  
>>> -	if (book3e_htw_enabled) {
>>> +	if (book3e_htw_mode) {
>>
>> Make it if (boot3e_htw_enabled != PPC_HTW_NONE)

Seems a little verbose, but OK.

Same with things like this, I guess:
	book3e_htw_mode ? "enabled" : "not supported"

-Scott

^ permalink raw reply

* Re: [PATCH v2] PCI: use dev->irq instead of dev->pin to enable non MSI/INTx interrupt
From: Bjorn Helgaas @ 2012-09-07 22:08 UTC (permalink / raw)
  To: Zang Roy-R61911
  Cc: Liu Shengzhou-B36685, Wood Scott-B07421,
	linux-pci@vger.kernel.org, akpm@linux-foundation.org,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <3E027F8168735B46AC006B1D0C7BB002079D6153@039-SN2MPN1-012.039d.mgd.msft.net>

On Mon, Aug 6, 2012 at 8:45 PM, Zang Roy-R61911 <r61911@freescale.com> wrote:
>
>
>> -----Original Message-----
>> From: Linuxppc-dev [mailto:linuxppc-dev-bounces+tie-
>> fei.zang=freescale.com@lists.ozlabs.org] On Behalf Of Liu Shengzhou-B36685
>> Sent: Thursday, July 26, 2012 11:45 AM
>> To: bhelgaas@google.com; linux-pci@vger.kernel.org; akpm@linux-
>> foundation.org
>> Cc: Wood Scott-B07421; linuxppc-dev@lists.ozlabs.org; Liu Shengzhou-B36685
>> Subject: RE: [PATCH v2] PCI: use dev->irq instead of dev->pin to enable non
>> MSI/INTx interrupt
>>
>> Hello,
>>
>> A gentle reminder!
>> Any comments are appreciated.
>
> Who can help to review and pick up this patch?

I merged this to a staging branch and will merge it to my "next"
branch as soon as it gets a build/smoke test by Fengguang.  Thanks!

Bjorn

^ permalink raw reply

* Re: [PATCH V3 1/3] drivers/char/tpm: Add new device driver to support IBM vTPM
From: Kent Yoder @ 2012-09-07 17:38 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-kernel, Ashley Lai, linux-security-module, tpmdd-devel,
	adlai, James Morris, rcj, linuxppc-dev
In-Reply-To: <1346879505.19098.3.camel@pasglop>

> >   James did accept my pull request, so these are already in
> > security-next...
> 
> For the driver itself, it's not a big issue (though I did find issues
> while reviewing it so it will need another round of updates). For the
> code that changes arch/powerpc, especially prom_init.c, that stuff must
> at the very least be acked by me (or the acting powerpc person if I'm
> away) if it's going to go via a different tree.

  Sorry about that.  Hopefully there won't be any changes there and we
can amend with your ack.

  As for the driver updates, I'd hate to see everyone else's code in the
pull request get delayed yet again.  James, will it be ok to apply the
update on top of security-next?

Thanks,
Kent

> Cheers,
> Ben.
> 

^ permalink raw reply

* Re: [PATCH] scsi/ibmvscsi: /sys/class/scsi_host/hostX/config doesn't show any information
From: Robert Jennings @ 2012-09-07 16:33 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: olaf, Linda Xie, Brian J King, linux-scsi, James E.J. Bottomley,
	linuxppc-dev
In-Reply-To: <1343611985.21647.25.camel@pasglop>

On Sun, Jul 29, 2012 at 8:33 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> scsi/ibmvscsi: Fix host config length field overflow
>
> The length field in the host config packet is only 16-bit long, so
> passing it 0x10000 (64K which is our standard PAGE_SIZE) doesn't
> work and result in an empty config from the server.
>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> CC: <stable@vger.kernel.org>

James, can this be added to your for-next branch so that we can
also get this to the stable trees?  Thanks.

Acked-by: Robert Jennings <rcj@linux.vnet.ibm.com>

> ---
>
> diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
> index 3a6c474..337e8b3 100644
> --- a/drivers/scsi/ibmvscsi/ibmvscsi.c
> +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
> @@ -1541,6 +1541,9 @@ static int ibmvscsi_do_host_config(struct ibmvscsi_host_data *hostdata,
>
>         host_config = &evt_struct->iu.mad.host_config;
>
> +       /* The transport length field is only 16-bit */
> +       length = min(0xffff, length);
> +
>         /* Set up a lun reset SRP command */
>         memset(host_config, 0x00, sizeof(*host_config));
>         host_config->common.type = VIOSRP_HOST_CONFIG_TYPE;
>
>

^ permalink raw reply

* Re: [PATCH] scsi/ibmvscsi: add module alias for ibmvscsic
From: Robert Jennings @ 2012-09-07 16:16 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linuxppc-dev, olaf, Brian J King, James E.J. Bottomley,
	linux-scsi
In-Reply-To: <1343611946.21647.23.camel@pasglop>

On Sun, Jul 29, 2012 at 8:32 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Wed, 2012-07-18 at 18:49 +0200, olaf@aepfle.de wrote:
>> From: Olaf Hering <olaf@aepfle.de>
>>
>> The driver is named ibmvscsic, but at runtime its name is advertised as
>> ibmvscsi. For this reason mkinitrd won't pick up the driver properly.
>> Reported by IBM during SLES11 beta testing:
>>
>> https://bugzilla.novell.com/show_bug.cgi?id=459933
>> LTC50724
>
> So while this would work, I do wonder however whether we could instead
> fix it by simplifying the whole thing as follow since iSeries is now
> gone and so we don't need split backends anymore:
>
> scsi/ibmvscsi: Remove backend abstraction
>
> Now that the iSeries code is gone the backend abstraction
> in this driver is no longer necessary, which allows us to
> consolidate the driver in one file.
>
> The side effect is that the module name is now ibmvscsi.ko
> which matches the driver hotplug name and fixes auto-load
> issues.
>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.

I've given this a quick test and I prefer this cleanup; it solves the
initial problem nicely.

James, please consider pulling this patch.  Thanks.

Acked-by: Robert Jennings <rcj@linux.vnet.ibm.com>

^ permalink raw reply

* Re: [PATCH][v3] sata_fsl: add workaround for data length mismatch on freescale V2 controller
From: Kumar Gala @ 2012-09-07 12:37 UTC (permalink / raw)
  To: Shaohui Xie; +Cc: linux-ide, jgarzik, linuxppc-dev, Anju Bhartiya, linux-kernel
In-Reply-To: <1347012095-4404-1-git-send-email-Shaohui.Xie@freescale.com>


On Sep 7, 2012, at 5:01 AM, Shaohui Xie wrote:

> The freescale V2 SATA controller checks if the received data length matches
> the programmed length 'ttl', if not, it assumes that this is an error.
> In ATAPI, the 'ttl' is based on max allocation length and not the actual
> data transfer length, controller will raise 'DLM' (Data length Mismatch)
> error bit in Hstatus register. Along with 'DLM', DE (Device error) and
> FE (fatal Error) bits are also set in Hstatus register, 'E' (Internal Error)
> bit is set in Serror register and CE (Command Error) and DE (Device error)
> registers have the corresponding bit set. In this condition, we need to
> clear errors in following way: in the service routine, based on 'DLM' flag,
> HCONTROL[27] operation clears Hstatus, CE and DE registers, clear Serror
> register.
>=20
> Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
> Signed-off-by: Anju Bhartiya <Anju.Bhartiya@freescale.com>
> ---
> changes for v3:
> 1. not using uppercase for variable names;
> 2. remove unnecessary parens;
>=20
> changes for v2:
> 1. remove the using of quirk;
> 2. wrap errata codes in condition;
>=20
> drivers/ata/sata_fsl.c |   39 +++++++++++++++++++++++++++++++++++----
> 1 files changed, 35 insertions(+), 4 deletions(-)
>=20
> diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
> index d6577b9..9fbab68 100644
> --- a/drivers/ata/sata_fsl.c
> +++ b/drivers/ata/sata_fsl.c
> @@ -143,6 +143,7 @@ enum {
> 	    FATAL_ERR_CRC_ERR_RX |
> 	    FATAL_ERR_FIFO_OVRFL_TX | FATAL_ERR_FIFO_OVRFL_RX,
>=20
> +	INT_ON_DATA_LENGTH_MISMATCH =3D (1 << 12),
> 	INT_ON_FATAL_ERR =3D (1 << 5),
> 	INT_ON_PHYRDY_CHG =3D (1 << 4),
>=20
> @@ -1181,25 +1182,55 @@ static void sata_fsl_host_intr(struct ata_port =
*ap)
> 	u32 hstatus, done_mask =3D 0;
> 	struct ata_queued_cmd *qc;
> 	u32 SError;
> +	u32 tag;
> +	u32 status_mask =3D INT_ON_ERROR;
>=20
> 	hstatus =3D ioread32(hcr_base + HSTATUS);
>=20
> 	sata_fsl_scr_read(&ap->link, SCR_ERROR, &SError);
>=20
> +	/* Read command completed register */
> +	done_mask =3D ioread32(hcr_base + CC);
> +
> +	/* Workaround for data length mismatch errata */
> +	if (unlikely(hstatus & INT_ON_DATA_LENGTH_MISMATCH)) {
> +		for (tag =3D 0; tag < ATA_MAX_QUEUE; tag++) {
> +			qc =3D ata_qc_from_tag(ap, tag);
> +			if (qc && ata_is_atapi(qc->tf.protocol)) {
> +				u32 hcontrol;
> +#define HCONTROL_CLEAR_ERROR	(1 << 27)

Shouldn't this #define be part of the enum that the other
HCONTROL_ bits/flags are part of?

> +				/* Set HControl[27] to clear error =
registers */
> +				hcontrol =3D ioread32(hcr_base + =
HCONTROL);
> +				iowrite32(hcontrol | =
HCONTROL_CLEAR_ERROR,
> +						hcr_base + HCONTROL);
> +
> +				/* Clear HControl[27] */
> +				iowrite32(hcontrol & =
~HCONTROL_CLEAR_ERROR,
> +						hcr_base + HCONTROL);
> +
> +				/* Clear SError[E] bit */
> +				sata_fsl_scr_write(&ap->link, SCR_ERROR,
> +						SError);
> +
> +				/* Ignore fatal error and device error =
*/
> +				status_mask &=3D =
~(INT_ON_SINGL_DEVICE_ERR
> +						| INT_ON_FATAL_ERR);
> +				break;
> +			}
> +		}
> +	}
> +
> 	if (unlikely(SError & 0xFFFF0000)) {
> 		DPRINTK("serror @host_intr : 0x%x\n", SError);
> 		sata_fsl_error_intr(ap);
> 	}
>=20
> -	if (unlikely(hstatus & INT_ON_ERROR)) {
> +	if (unlikely(hstatus & status_mask)) {
> 		DPRINTK("error interrupt!!\n");
> 		sata_fsl_error_intr(ap);
> 		return;
> 	}
>=20
> -	/* Read command completed register */
> -	done_mask =3D ioread32(hcr_base + CC);
> -
> 	VPRINTK("Status of all queues :\n");
> 	VPRINTK("done_mask/CC =3D 0x%x, CA =3D 0x%x, =
CE=3D0x%x,CQ=3D0x%x,apqa=3D0x%x\n",
> 		done_mask,
> --=20
> 1.6.4
>=20
>=20
> --
> To unsubscribe from this list: send the line "unsubscribe =
linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* Re: [PATCH -V8 0/11] arch/powerpc: Add 64TB support to ppc64
From: Aneesh Kumar K.V @ 2012-09-07 11:54 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, paulus
In-Reply-To: <1346982235.2385.33.camel@pasglop>

Benjamin Herrenschmidt <benh@kernel.crashing.org> writes:

> On Thu, 2012-09-06 at 20:59 +0530, Aneesh Kumar K.V wrote:
>> Hi,
>> 
>> This patchset include patches for supporting 64TB with ppc64. I haven't booted
>> this on hardware with 64TB memory yet. But they boot fine on real hardware with
>> less memory. Changes extend VSID bits to 38 bits for a 256MB segment
>> and 26 bits for 1TB segments.
>
> Your series breaks the embedded 64-bit build. You seem to be hard wiring
> dependencies on slice stuff all over 64-bit stuff regardless of the MMU
> type or the value of CONFIG_MM_SLICES.
>
> Also all these:
>
>> +/* 4 bits per slice and we have one slice per 1TB */
>> +#if 0 /* We can't directly include pgtable.h hence this hack */
>> +#define SLICE_ARRAY_SIZE  (PGTABLE_RANGE >> 41)
>> +#else
>> +/* Right now we only support 64TB */
>> +#define SLICE_ARRAY_SIZE  32
>> +#endif
>
> Things are just too horrible. Find a different way of doing it, if
> necessary create a new range define somewhere, whatever but don't leave
> that crap as-is, it's too wrong.
>
> Dropping the series for now.
>

You can drop the patch [PATCH -V8 07/11] arch/powerpc: Make some of the PGTABLE_RANGE dependency explicit
from the series. The above two problems are introduced by that patch and
as such can be looked up as a cleanup. I can rework the patch later. You
should be able to apply series without any conflicts even if you drop
that patch.

-aneesh

^ permalink raw reply

* [PATCH][v3] sata_fsl: add workaround for data length mismatch on freescale V2 controller
From: Shaohui Xie @ 2012-09-07 10:01 UTC (permalink / raw)
  To: jgarzik, linux-ide; +Cc: linuxppc-dev, linux-kernel, Anju Bhartiya, Shaohui Xie

The freescale V2 SATA controller checks if the received data length matches
the programmed length 'ttl', if not, it assumes that this is an error.
In ATAPI, the 'ttl' is based on max allocation length and not the actual
data transfer length, controller will raise 'DLM' (Data length Mismatch)
error bit in Hstatus register. Along with 'DLM', DE (Device error) and
FE (fatal Error) bits are also set in Hstatus register, 'E' (Internal Error)
bit is set in Serror register and CE (Command Error) and DE (Device error)
registers have the corresponding bit set. In this condition, we need to
clear errors in following way: in the service routine, based on 'DLM' flag,
HCONTROL[27] operation clears Hstatus, CE and DE registers, clear Serror
register.

Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
Signed-off-by: Anju Bhartiya <Anju.Bhartiya@freescale.com>
---
changes for v3:
1. not using uppercase for variable names;
2. remove unnecessary parens;

changes for v2:
1. remove the using of quirk;
2. wrap errata codes in condition;

 drivers/ata/sata_fsl.c |   39 +++++++++++++++++++++++++++++++++++----
 1 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index d6577b9..9fbab68 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -143,6 +143,7 @@ enum {
 	    FATAL_ERR_CRC_ERR_RX |
 	    FATAL_ERR_FIFO_OVRFL_TX | FATAL_ERR_FIFO_OVRFL_RX,
 
+	INT_ON_DATA_LENGTH_MISMATCH = (1 << 12),
 	INT_ON_FATAL_ERR = (1 << 5),
 	INT_ON_PHYRDY_CHG = (1 << 4),
 
@@ -1181,25 +1182,55 @@ static void sata_fsl_host_intr(struct ata_port *ap)
 	u32 hstatus, done_mask = 0;
 	struct ata_queued_cmd *qc;
 	u32 SError;
+	u32 tag;
+	u32 status_mask = INT_ON_ERROR;
 
 	hstatus = ioread32(hcr_base + HSTATUS);
 
 	sata_fsl_scr_read(&ap->link, SCR_ERROR, &SError);
 
+	/* Read command completed register */
+	done_mask = ioread32(hcr_base + CC);
+
+	/* Workaround for data length mismatch errata */
+	if (unlikely(hstatus & INT_ON_DATA_LENGTH_MISMATCH)) {
+		for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
+			qc = ata_qc_from_tag(ap, tag);
+			if (qc && ata_is_atapi(qc->tf.protocol)) {
+				u32 hcontrol;
+#define HCONTROL_CLEAR_ERROR	(1 << 27)
+				/* Set HControl[27] to clear error registers */
+				hcontrol = ioread32(hcr_base + HCONTROL);
+				iowrite32(hcontrol | HCONTROL_CLEAR_ERROR,
+						hcr_base + HCONTROL);
+
+				/* Clear HControl[27] */
+				iowrite32(hcontrol & ~HCONTROL_CLEAR_ERROR,
+						hcr_base + HCONTROL);
+
+				/* Clear SError[E] bit */
+				sata_fsl_scr_write(&ap->link, SCR_ERROR,
+						SError);
+
+				/* Ignore fatal error and device error */
+				status_mask &= ~(INT_ON_SINGL_DEVICE_ERR
+						| INT_ON_FATAL_ERR);
+				break;
+			}
+		}
+	}
+
 	if (unlikely(SError & 0xFFFF0000)) {
 		DPRINTK("serror @host_intr : 0x%x\n", SError);
 		sata_fsl_error_intr(ap);
 	}
 
-	if (unlikely(hstatus & INT_ON_ERROR)) {
+	if (unlikely(hstatus & status_mask)) {
 		DPRINTK("error interrupt!!\n");
 		sata_fsl_error_intr(ap);
 		return;
 	}
 
-	/* Read command completed register */
-	done_mask = ioread32(hcr_base + CC);
-
 	VPRINTK("Status of all queues :\n");
 	VPRINTK("done_mask/CC = 0x%x, CA = 0x%x, CE=0x%x,CQ=0x%x,apqa=0x%x\n",
 		done_mask,
-- 
1.6.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox