[PATCH 1/5] powerpc/eeh: Remove EEH_PE_PHB

linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/5] powerpc/eeh: Remove EEH_PE_PHB_DEAD
@ 2014-02-21 11:53 Gavin Shan
  2014-02-21 11:53 ` [PATCH 2/5] powerpc/powernv: Remove PNV_EEH_STATE_REMOVED Gavin Shan
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Gavin Shan @ 2014-02-21 11:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan

The PE state (for eeh_pe instance) EEH_PE_PHB_DEAD is duplicate to
EEH_PE_ISOLATED. Originally, those PHB (PHB PE) with EEH_PE_PHB_DEAD
would be removed from the system. However, it's safe to replace
that with EEH_PE_ISOLATED.

The patch also clear EEH_PE_RECOVERING after fenced PHB has been handled,
either failure or success. It makes the PHB PE state consistent with:

	PHB functions normally		  NONE
	PHB has been removed		  EEH_PE_ISOLATED
	PHB fenced, recovery in progress  EEH_PE_ISOLATED | RECOVERING

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h   |    1 -
 arch/powerpc/kernel/eeh.c        |   10 ++--------
 arch/powerpc/kernel/eeh_driver.c |   10 +++++-----
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index d4dd41f..a61b06f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -53,7 +53,6 @@ struct device_node;
 
 #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
 #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
-#define EEH_PE_PHB_DEAD		(1 << 2)	/* Dead PHB		*/
 
 #define EEH_PE_KEEP		(1 << 8)	/* Keep PE on hotplug	*/
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index e7b76a6..f167676 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -232,7 +232,6 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
 {
 	size_t loglen = 0;
 	struct eeh_dev *edev, *tmp;
-	bool valid_cfg_log = true;
 
 	/*
 	 * When the PHB is fenced or dead, it's pointless to collect
@@ -240,12 +239,7 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
 	 * 0xFF's. For ER, we still retrieve the data from the PCI
 	 * config space.
 	 */
-	if (eeh_probe_mode_dev() &&
-	    (pe->type & EEH_PE_PHB) &&
-	    (pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)))
-		valid_cfg_log = false;
-
-	if (valid_cfg_log) {
+	if (!(pe->type & EEH_PE_PHB)) {
 		eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
 		eeh_ops->configure_bridge(pe);
 		eeh_pe_restore_bars(pe);
@@ -309,7 +303,7 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
 
 	/* If the PHB has been in problematic state */
 	eeh_serialize_lock(&flags);
-	if (phb_pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)) {
+	if (phb_pe->state & EEH_PE_ISOLATED) {
 		ret = 0;
 		goto out;
 	}
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7bb30dc..5f50f9c 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -661,8 +661,7 @@ static void eeh_handle_special_event(void)
 				phb_pe = eeh_phb_pe_get(hose);
 				if (!phb_pe) continue;
 
-				eeh_pe_state_mark(phb_pe,
-					EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+				eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
 			}
 
 			eeh_serialize_unlock(flags);
@@ -678,8 +677,7 @@ static void eeh_handle_special_event(void)
 			eeh_remove_event(pe);
 
 			if (rc == EEH_NEXT_ERR_DEAD_PHB)
-				eeh_pe_state_mark(pe,
-					EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+				eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
 			else
 				eeh_pe_state_mark(pe,
 					EEH_PE_ISOLATED | EEH_PE_RECOVERING);
@@ -703,12 +701,14 @@ static void eeh_handle_special_event(void)
 		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
 		    rc == EEH_NEXT_ERR_FENCED_PHB) {
 			eeh_handle_normal_event(pe);
+			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
 		} else {
 			pci_lock_rescan_remove();
 			list_for_each_entry(hose, &hose_list, list_node) {
 				phb_pe = eeh_phb_pe_get(hose);
 				if (!phb_pe ||
-				    !(phb_pe->state & EEH_PE_PHB_DEAD))
+				    !(phb_pe->state & EEH_PE_ISOLATED) ||
+				    (phb_pe->state & EEH_PE_RECOVERING))
 					continue;
 
 				/* Notify all devices to be down */
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/5] powerpc/powernv: Remove PNV_EEH_STATE_REMOVED
  2014-02-21 11:53 [PATCH 1/5] powerpc/eeh: Remove EEH_PE_PHB_DEAD Gavin Shan
@ 2014-02-21 11:53 ` Gavin Shan
  2014-02-21 11:53 ` [PATCH 3/5] powerpc/powernv: Cleanup on PNV_EEH_STATE_ENABLED Gavin Shan
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 12+ messages in thread
From: Gavin Shan @ 2014-02-21 11:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan

The PHB state PNV_EEH_STATE_REMOVED maintained in pnv_phb isn't
so useful any more and it's duplicated to EEH_PE_ISOLATED. The
patch replaces PNV_EEH_STATE_REMOVED with EEH_PE_ISOLATED.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-ioda.c |   56 ++++++++---------------------
 arch/powerpc/platforms/powernv/pci.h      |    1 -
 2 files changed, 15 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index f514743..0d1d424 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -662,22 +662,6 @@ static void ioda_eeh_phb_diag(struct pci_controller *hose)
 	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
 }
 
-static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
-			       struct eeh_pe **pe)
-{
-	struct eeh_pe *phb_pe;
-
-	phb_pe = eeh_phb_pe_get(hose);
-	if (!phb_pe) {
-		pr_warning("%s Can't find PE for PHB#%d\n",
-			   __func__, hose->global_number);
-		return -EEXIST;
-	}
-
-	*pe = phb_pe;
-	return 0;
-}
-
 static int ioda_eeh_get_pe(struct pci_controller *hose,
 			   u16 pe_no, struct eeh_pe **pe)
 {
@@ -685,7 +669,8 @@ static int ioda_eeh_get_pe(struct pci_controller *hose,
 	struct eeh_dev dev;
 
 	/* Find the PHB PE */
-	if (ioda_eeh_get_phb_pe(hose, &phb_pe))
+	phb_pe = eeh_phb_pe_get(hose);
+	if (!phb_pe)
 		return -EEXIST;
 
 	/* Find the PE according to PE# */
@@ -713,6 +698,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
+	struct eeh_pe *phb_pe;
 	u64 frozen_pe_no;
 	u16 err_type, severity;
 	long rc;
@@ -729,10 +715,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 	list_for_each_entry(hose, &hose_list, list_node) {
 		/*
 		 * If the subordinate PCI buses of the PHB has been
-		 * removed, we needn't take care of it any more.
+		 * removed or is exactly under error recovery, we
+		 * needn't take care of it any more.
 		 */
 		phb = hose->private_data;
-		if (phb->eeh_state & PNV_EEH_STATE_REMOVED)
+		phb_pe = eeh_phb_pe_get(hose);
+		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
 			continue;
 
 		rc = opal_pci_next_error(phb->opal_id,
@@ -765,12 +753,6 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		switch (err_type) {
 		case OPAL_EEH_IOC_ERROR:
 			if (severity == OPAL_EEH_SEV_IOC_DEAD) {
-				list_for_each_entry(hose, &hose_list,
-						    list_node) {
-					phb = hose->private_data;
-					phb->eeh_state |= PNV_EEH_STATE_REMOVED;
-				}
-
 				pr_err("EEH: dead IOC detected\n");
 				ret = EEH_NEXT_ERR_DEAD_IOC;
 			} else if (severity == OPAL_EEH_SEV_INF) {
@@ -783,17 +765,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			break;
 		case OPAL_EEH_PHB_ERROR:
 			if (severity == OPAL_EEH_SEV_PHB_DEAD) {
-				if (ioda_eeh_get_phb_pe(hose, pe))
-					break;
-
+				*pe = phb_pe;
 				pr_err("EEH: dead PHB#%x detected\n",
 					hose->global_number);
-				phb->eeh_state |= PNV_EEH_STATE_REMOVED;
 				ret = EEH_NEXT_ERR_DEAD_PHB;
 			} else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
-				if (ioda_eeh_get_phb_pe(hose, pe))
-					break;
-
+				*pe = phb_pe;
 				pr_err("EEH: fenced PHB#%x detected\n",
 					hose->global_number);
 				ret = EEH_NEXT_ERR_FENCED_PHB;
@@ -813,15 +790,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			 * fenced PHB so that it can be recovered.
 			 */
 			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) {
-				if (!ioda_eeh_get_phb_pe(hose, pe)) {
-					pr_err("EEH: Escalated fenced PHB#%x "
-					       "detected for PE#%llx\n",
-						hose->global_number,
-						frozen_pe_no);
-					ret = EEH_NEXT_ERR_FENCED_PHB;
-				} else {
-					ret = EEH_NEXT_ERR_NONE;
-				}
+				*pe = phb_pe;
+				pr_err("EEH: Escalated fenced PHB#%x "
+				       "detected for PE#%llx\n",
+					hose->global_number,
+					frozen_pe_no);
+				ret = EEH_NEXT_ERR_FENCED_PHB;
 			} else {
 				pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
 					(*pe)->addr, (*pe)->phb->global_number);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 13f1942..dbeba3d 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -81,7 +81,6 @@ struct pnv_eeh_ops {
 };
 
 #define PNV_EEH_STATE_ENABLED	(1 << 0)	/* EEH enabled	*/
-#define PNV_EEH_STATE_REMOVED	(1 << 1)	/* PHB removed	*/
 
 #endif /* CONFIG_EEH */
 
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/5] powerpc/powernv: Cleanup on PNV_EEH_STATE_ENABLED
  2014-02-21 11:53 [PATCH 1/5] powerpc/eeh: Remove EEH_PE_PHB_DEAD Gavin Shan
  2014-02-21 11:53 ` [PATCH 2/5] powerpc/powernv: Remove PNV_EEH_STATE_REMOVED Gavin Shan
@ 2014-02-21 11:53 ` Gavin Shan
  2014-02-21 19:56   ` Benjamin Herrenschmidt
  2014-02-21 11:53 ` [PATCH 4/5] powerpc/powernv: Cache PHB diag-data Gavin Shan
  2014-02-21 11:53 ` [PATCH 5/5] powerpc/powernv: Make PHB diag-data output short Gavin Shan
  3 siblings, 1 reply; 12+ messages in thread
From: Gavin Shan @ 2014-02-21 11:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan

The flag PNV_EEH_STATE_ENABLED is put into pnv_phb::eeh_state, which
is protected by CONFIG_EEH. We needn't that. Instead, we can have
pnv_phb::flags and maintain all flags there, which is the purpose
of the patch.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-ioda.c |    2 +-
 arch/powerpc/platforms/powernv/pci.c      |    8 ++------
 arch/powerpc/platforms/powernv/pci.h      |    7 +++----
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 0d1d424..04b4710 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -153,7 +153,7 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
 	}
 #endif
 
-	phb->eeh_state |= PNV_EEH_STATE_ENABLED;
+	phb->flags |= PNV_PHB_FLAG_EEH;
 
 	return 0;
 }
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index b555ebc..437c37d 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -396,7 +396,7 @@ int pnv_pci_cfg_read(struct device_node *dn,
 	if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
 		return PCIBIOS_SUCCESSFUL;
 
-	if (phb->eeh_state & PNV_EEH_STATE_ENABLED) {
+	if (phb->flags & PNV_PHB_FLAG_EEH) {
 		if (*val == EEH_IO_ERROR_VALUE(size) &&
 		    eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
 			return PCIBIOS_DEVICE_NOT_FOUND;
@@ -434,12 +434,8 @@ int pnv_pci_cfg_write(struct device_node *dn,
 	}
 
 	/* Check if the PHB got frozen due to an error (no response) */
-#ifdef CONFIG_EEH
-	if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED))
+	if (!(phb->flags & PNV_PHB_FLAG_EEH))
 		pnv_pci_config_check_eeh(phb, dn);
-#else
-	pnv_pci_config_check_eeh(phb, dn);
-#endif
 
 	return PCIBIOS_SUCCESSFUL;
 }
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index dbeba3d..adeb3c4 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -79,24 +79,23 @@ struct pnv_eeh_ops {
 	int (*configure_bridge)(struct eeh_pe *pe);
 	int (*next_error)(struct eeh_pe **pe);
 };
-
-#define PNV_EEH_STATE_ENABLED	(1 << 0)	/* EEH enabled	*/
-
 #endif /* CONFIG_EEH */
 
+#define PNV_PHB_FLAG_EEH	(1 << 0)
+
 struct pnv_phb {
 	struct pci_controller	*hose;
 	enum pnv_phb_type	type;
 	enum pnv_phb_model	model;
 	u64			hub_id;
 	u64			opal_id;
+	int			flags;
 	void __iomem		*regs;
 	int			initialized;
 	spinlock_t		lock;
 
 #ifdef CONFIG_EEH
 	struct pnv_eeh_ops	*eeh_ops;
-	int			eeh_state;
 #endif
 
 #ifdef CONFIG_DEBUG_FS
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/5] powerpc/powernv: Cleanup on PNV_EEH_STATE_ENABLED
  2014-02-21 11:53 ` [PATCH 3/5] powerpc/powernv: Cleanup on PNV_EEH_STATE_ENABLED Gavin Shan
@ 2014-02-21 19:56   ` Benjamin Herrenschmidt
  2014-02-23  2:12     ` Gavin Shan
  0 siblings, 1 reply; 12+ messages in thread
From: Benjamin Herrenschmidt @ 2014-02-21 19:56 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev

On Fri, 2014-02-21 at 19:53 +0800, Gavin Shan wrote:
> The flag PNV_EEH_STATE_ENABLED is put into pnv_phb::eeh_state, which
> is protected by CONFIG_EEH. We needn't that. Instead, we can have
> pnv_phb::flags and maintain all flags there, which is the purpose
> of the patch.

Can you explain a bit more why we want to create a new flag set
that didn't exist before ? This adds confusion so we need a very
good reason... Do we need to know about the enable state of EEH
even when CNFIG_EEH is not set ?

Cheers,
Ben.

> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/eeh-ioda.c |    2 +-
>  arch/powerpc/platforms/powernv/pci.c      |    8 ++------
>  arch/powerpc/platforms/powernv/pci.h      |    7 +++----
>  3 files changed, 6 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
> index 0d1d424..04b4710 100644
> --- a/arch/powerpc/platforms/powernv/eeh-ioda.c
> +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
> @@ -153,7 +153,7 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
>  	}
>  #endif
>  
> -	phb->eeh_state |= PNV_EEH_STATE_ENABLED;
> +	phb->flags |= PNV_PHB_FLAG_EEH;
>  
>  	return 0;
>  }
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index b555ebc..437c37d 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -396,7 +396,7 @@ int pnv_pci_cfg_read(struct device_node *dn,
>  	if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
>  		return PCIBIOS_SUCCESSFUL;
>  
> -	if (phb->eeh_state & PNV_EEH_STATE_ENABLED) {
> +	if (phb->flags & PNV_PHB_FLAG_EEH) {
>  		if (*val == EEH_IO_ERROR_VALUE(size) &&
>  		    eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
>  			return PCIBIOS_DEVICE_NOT_FOUND;
> @@ -434,12 +434,8 @@ int pnv_pci_cfg_write(struct device_node *dn,
>  	}
>  
>  	/* Check if the PHB got frozen due to an error (no response) */
> -#ifdef CONFIG_EEH
> -	if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED))
> +	if (!(phb->flags & PNV_PHB_FLAG_EEH))
>  		pnv_pci_config_check_eeh(phb, dn);
> -#else
> -	pnv_pci_config_check_eeh(phb, dn);
> -#endif
>  
>  	return PCIBIOS_SUCCESSFUL;
>  }
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index dbeba3d..adeb3c4 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -79,24 +79,23 @@ struct pnv_eeh_ops {
>  	int (*configure_bridge)(struct eeh_pe *pe);
>  	int (*next_error)(struct eeh_pe **pe);
>  };
> -
> -#define PNV_EEH_STATE_ENABLED	(1 << 0)	/* EEH enabled	*/
> -
>  #endif /* CONFIG_EEH */
>  
> +#define PNV_PHB_FLAG_EEH	(1 << 0)
> +
>  struct pnv_phb {
>  	struct pci_controller	*hose;
>  	enum pnv_phb_type	type;
>  	enum pnv_phb_model	model;
>  	u64			hub_id;
>  	u64			opal_id;
> +	int			flags;
>  	void __iomem		*regs;
>  	int			initialized;
>  	spinlock_t		lock;
>  
>  #ifdef CONFIG_EEH
>  	struct pnv_eeh_ops	*eeh_ops;
> -	int			eeh_state;
>  #endif
>  
>  #ifdef CONFIG_DEBUG_FS

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/5] powerpc/powernv: Cleanup on PNV_EEH_STATE_ENABLED
  2014-02-21 19:56   ` Benjamin Herrenschmidt
@ 2014-02-23  2:12     ` Gavin Shan
  0 siblings, 0 replies; 12+ messages in thread
From: Gavin Shan @ 2014-02-23  2:12 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Gavin Shan

On Sat, Feb 22, 2014 at 06:56:49AM +1100, Benjamin Herrenschmidt wrote:
>On Fri, 2014-02-21 at 19:53 +0800, Gavin Shan wrote:
>> The flag PNV_EEH_STATE_ENABLED is put into pnv_phb::eeh_state, which
>> is protected by CONFIG_EEH. We needn't that. Instead, we can have
>> pnv_phb::flags and maintain all flags there, which is the purpose
>> of the patch.
>
>Can you explain a bit more why we want to create a new flag set
>that didn't exist before ? This adds confusion so we need a very
>good reason... Do we need to know about the enable state of EEH
>even when CNFIG_EEH is not set ?
>

The commit log was a bit confusing. We didn't create a new flag
here and I just renamed PNV_EEH_STATE_ENABLED to PNV_PHB_FLAG_EEH.
I'll say more in the commit log about it in next revision.

The flag is needed even we have CONFIG_EEH set because we need
switch to EEH, instead of detecting frozen PE and clearing it
in PCI config accessors after EEH is initialized and loaded :-)

Thanks,
Gavin

>> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/platforms/powernv/eeh-ioda.c |    2 +-
>>  arch/powerpc/platforms/powernv/pci.c      |    8 ++------
>>  arch/powerpc/platforms/powernv/pci.h      |    7 +++----
>>  3 files changed, 6 insertions(+), 11 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
>> index 0d1d424..04b4710 100644
>> --- a/arch/powerpc/platforms/powernv/eeh-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
>> @@ -153,7 +153,7 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
>>  	}
>>  #endif
>>  
>> -	phb->eeh_state |= PNV_EEH_STATE_ENABLED;
>> +	phb->flags |= PNV_PHB_FLAG_EEH;
>>  
>>  	return 0;
>>  }
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index b555ebc..437c37d 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -396,7 +396,7 @@ int pnv_pci_cfg_read(struct device_node *dn,
>>  	if (phb_pe && (phb_pe->state & EEH_PE_ISOLATED))
>>  		return PCIBIOS_SUCCESSFUL;
>>  
>> -	if (phb->eeh_state & PNV_EEH_STATE_ENABLED) {
>> +	if (phb->flags & PNV_PHB_FLAG_EEH) {
>>  		if (*val == EEH_IO_ERROR_VALUE(size) &&
>>  		    eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
>>  			return PCIBIOS_DEVICE_NOT_FOUND;
>> @@ -434,12 +434,8 @@ int pnv_pci_cfg_write(struct device_node *dn,
>>  	}
>>  
>>  	/* Check if the PHB got frozen due to an error (no response) */
>> -#ifdef CONFIG_EEH
>> -	if (!(phb->eeh_state & PNV_EEH_STATE_ENABLED))
>> +	if (!(phb->flags & PNV_PHB_FLAG_EEH))
>>  		pnv_pci_config_check_eeh(phb, dn);
>> -#else
>> -	pnv_pci_config_check_eeh(phb, dn);
>> -#endif
>>  
>>  	return PCIBIOS_SUCCESSFUL;
>>  }
>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>> index dbeba3d..adeb3c4 100644
>> --- a/arch/powerpc/platforms/powernv/pci.h
>> +++ b/arch/powerpc/platforms/powernv/pci.h
>> @@ -79,24 +79,23 @@ struct pnv_eeh_ops {
>>  	int (*configure_bridge)(struct eeh_pe *pe);
>>  	int (*next_error)(struct eeh_pe **pe);
>>  };
>> -
>> -#define PNV_EEH_STATE_ENABLED	(1 << 0)	/* EEH enabled	*/
>> -
>>  #endif /* CONFIG_EEH */
>>  
>> +#define PNV_PHB_FLAG_EEH	(1 << 0)
>> +
>>  struct pnv_phb {
>>  	struct pci_controller	*hose;
>>  	enum pnv_phb_type	type;
>>  	enum pnv_phb_model	model;
>>  	u64			hub_id;
>>  	u64			opal_id;
>> +	int			flags;
>>  	void __iomem		*regs;
>>  	int			initialized;
>>  	spinlock_t		lock;
>>  
>>  #ifdef CONFIG_EEH
>>  	struct pnv_eeh_ops	*eeh_ops;
>> -	int			eeh_state;
>>  #endif
>>  
>>  #ifdef CONFIG_DEBUG_FS
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 4/5] powerpc/powernv: Cache PHB diag-data
  2014-02-21 11:53 [PATCH 1/5] powerpc/eeh: Remove EEH_PE_PHB_DEAD Gavin Shan
  2014-02-21 11:53 ` [PATCH 2/5] powerpc/powernv: Remove PNV_EEH_STATE_REMOVED Gavin Shan
  2014-02-21 11:53 ` [PATCH 3/5] powerpc/powernv: Cleanup on PNV_EEH_STATE_ENABLED Gavin Shan
@ 2014-02-21 11:53 ` Gavin Shan
  2014-02-21 20:01   ` Benjamin Herrenschmidt
  2014-02-21 11:53 ` [PATCH 5/5] powerpc/powernv: Make PHB diag-data output short Gavin Shan
  3 siblings, 1 reply; 12+ messages in thread
From: Gavin Shan @ 2014-02-21 11:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan

EEH core tries to recover from fenced PHB or frozen PE. Unfortunately,
the log isn't consistent with the site because enabling IO path for
the frozen PE before collecting log would ruin the site. The patch
solves the problem to cache the PHB diag-data in advance with the
help of additional flag PNV_PHB_FLAG_DIAG to pnv_phb::flags.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-ioda.c |   65 ++++++++++++++++++-----------
 arch/powerpc/platforms/powernv/pci.c      |   21 ++++++----
 arch/powerpc/platforms/powernv/pci.h      |    1 +
 3 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 04b4710..3ed8d22 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -114,6 +114,27 @@ DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get,
 			ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
 #endif /* CONFIG_DEBUG_FS */
 
+static void ioda_eeh_phb_diag(struct pci_controller *hose)
+{
+	struct pnv_phb *phb = hose->private_data;
+	unsigned long flags;
+	long rc;
+
+	spin_lock_irqsave(&phb->lock, flags);
+
+	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
+					 PNV_PCI_DIAG_BUF_SIZE);
+	if (rc == OPAL_SUCCESS) {
+		phb->flags |= PNV_PHB_FLAG_DIAG;
+	} else {
+		pr_warn("%s: Can't get diag-data for PHB#%x (%ld)\n",
+			__func__, hose->global_number, rc);
+		phb->flags &= ~PNV_PHB_FLAG_DIAG;
+	}
+
+	spin_unlock_irqrestore(&phb->lock, flags);
+}
+
 /**
  * ioda_eeh_post_init - Chip dependent post initialization
  * @hose: PCI controller
@@ -272,6 +293,8 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
 			result |= EEH_STATE_DMA_ACTIVE;
 			result |= EEH_STATE_MMIO_ENABLED;
 			result |= EEH_STATE_DMA_ENABLED;
+		} else {
+			ioda_eeh_phb_diag(hose);
 		}
 
 		return result;
@@ -541,24 +564,13 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
 static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
 			    char *drv_log, unsigned long len)
 {
-	s64 ret;
+	struct pnv_phb *phb = pe->phb->private_data;
 	unsigned long flags;
-	struct pci_controller *hose = pe->phb;
-	struct pnv_phb *phb = hose->private_data;
 
 	spin_lock_irqsave(&phb->lock, flags);
 
-	ret = opal_pci_get_phb_diag_data2(phb->opal_id,
-			phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
-	if (ret) {
-		spin_unlock_irqrestore(&phb->lock, flags);
-		pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
-			   __func__, hose->global_number, pe->addr, ret);
-		return -EIO;
-	}
-
-	/* The PHB diag-data is always indicative */
-	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
+	pnv_pci_dump_phb_diag_data(pe->phb, phb->diag.blob);
+	phb->flags &= ~PNV_PHB_FLAG_DIAG;
 
 	spin_unlock_irqrestore(&phb->lock, flags);
 
@@ -646,19 +658,11 @@ static void ioda_eeh_hub_diag(struct pci_controller *hose)
 	}
 }
 
-static void ioda_eeh_phb_diag(struct pci_controller *hose)
+static void ioda_eeh_phb_diag_dump(struct pci_controller *hose)
 {
 	struct pnv_phb *phb = hose->private_data;
-	long rc;
-
-	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
-					 PNV_PCI_DIAG_BUF_SIZE);
-	if (rc != OPAL_SUCCESS) {
-		pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
-			    __func__, hose->global_number, rc);
-		return;
-	}
 
+	ioda_eeh_phb_diag(hose);
 	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
 }
 
@@ -778,7 +782,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 				pr_info("EEH: PHB#%x informative error "
 					"detected\n",
 					hose->global_number);
-				ioda_eeh_phb_diag(hose);
+				ioda_eeh_phb_diag_dump(hose);
 				ret = EEH_NEXT_ERR_NONE;
 			}
 
@@ -809,6 +813,17 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		}
 
 		/*
+		 * EEH core will try recover from fenced PHB or
+		 * frozen PE. In the time for frozen PE, EEH core
+		 * enable IO path for that before collecting logs,
+		 * but it ruins the site. So we have to cache the
+		 * log in advance here.
+		 */
+		if (ret == EEH_NEXT_ERR_FROZEN_PE ||
+		    ret == EEH_NEXT_ERR_FENCED_PHB)
+			ioda_eeh_phb_diag(hose);
+
+		/*
 		 * If we have no errors on the specific PHB or only
 		 * informative error there, we continue poking it.
 		 * Otherwise, we need actions to be taken by upper
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 437c37d..67b2254 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -259,11 +259,15 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
 				unsigned char *log_buff)
 {
+	struct pnv_phb *phb = hose->private_data;
 	struct OpalIoPhbErrorCommon *common;
 
 	if (!hose || !log_buff)
 		return;
 
+	if (!(phb->flags & PNV_PHB_FLAG_DIAG))
+		return;
+
 	common = (struct OpalIoPhbErrorCommon *)log_buff;
 	switch (common->ioType) {
 	case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
@@ -281,13 +285,19 @@ void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
 static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
 {
 	unsigned long flags, rc;
-	int has_diag;
+	bool has_diag = false;
 
 	spin_lock_irqsave(&phb->lock, flags);
 
-	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
-					 PNV_PCI_DIAG_BUF_SIZE);
-	has_diag = (rc == OPAL_SUCCESS);
+	if (!(phb->flags & PNV_PHB_FLAG_DIAG)) {
+		rc = opal_pci_get_phb_diag_data2(phb->opal_id,
+						 phb->diag.blob,
+						 PNV_PCI_DIAG_BUF_SIZE);
+		if (rc == OPAL_SUCCESS) {
+			phb->flags |= PNV_PHB_FLAG_DIAG;
+			has_diag = true;
+		}
+	}
 
 	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
 				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
@@ -303,9 +313,6 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
 		 */
 		if (has_diag)
 			pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob);
-		else
-			pr_warning("PCI %d: No diag data available\n",
-				   phb->hose->global_number);
 	}
 
 	spin_unlock_irqrestore(&phb->lock, flags);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index adeb3c4..153af9a 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -82,6 +82,7 @@ struct pnv_eeh_ops {
 #endif /* CONFIG_EEH */
 
 #define PNV_PHB_FLAG_EEH	(1 << 0)
+#define PNV_PHB_FLAG_DIAG	(1 << 1)
 
 struct pnv_phb {
 	struct pci_controller	*hose;
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 4/5] powerpc/powernv: Cache PHB diag-data
  2014-02-21 11:53 ` [PATCH 4/5] powerpc/powernv: Cache PHB diag-data Gavin Shan
@ 2014-02-21 20:01   ` Benjamin Herrenschmidt
  2014-02-23  4:52     ` Gavin Shan
  0 siblings, 1 reply; 12+ messages in thread
From: Benjamin Herrenschmidt @ 2014-02-21 20:01 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev

On Fri, 2014-02-21 at 19:53 +0800, Gavin Shan wrote:
> EEH core tries to recover from fenced PHB or frozen PE. Unfortunately,
> the log isn't consistent with the site because enabling IO path for
> the frozen PE before collecting log would ruin the site. The patch
> solves the problem to cache the PHB diag-data in advance with the
> help of additional flag PNV_PHB_FLAG_DIAG to pnv_phb::flags.

Ok, so correct me if I'm wrong, but you are

  - Collecting the diag data in get_state, as a sort of side effect
(what happens if get_state is called multiple times ?)

  - Dumping it later on

Any reason why we can't instead dump it immediately ? Also do we have a
clean trigger for when we detect an error condition ? It can either be
the result of an interrupt or a driver called get_state following an
ffffffff. Are both path eventually going into the same function to
handle a "new" error condition ? That's where I would both collect and
dump the EEH state..

Cheers,
Ben.

> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/eeh-ioda.c |   65 ++++++++++++++++++-----------
>  arch/powerpc/platforms/powernv/pci.c      |   21 ++++++----
>  arch/powerpc/platforms/powernv/pci.h      |    1 +
>  3 files changed, 55 insertions(+), 32 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
> index 04b4710..3ed8d22 100644
> --- a/arch/powerpc/platforms/powernv/eeh-ioda.c
> +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
> @@ -114,6 +114,27 @@ DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get,
>  			ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
>  #endif /* CONFIG_DEBUG_FS */
>  
> +static void ioda_eeh_phb_diag(struct pci_controller *hose)
> +{
> +	struct pnv_phb *phb = hose->private_data;
> +	unsigned long flags;
> +	long rc;
> +
> +	spin_lock_irqsave(&phb->lock, flags);
> +
> +	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
> +					 PNV_PCI_DIAG_BUF_SIZE);
> +	if (rc == OPAL_SUCCESS) {
> +		phb->flags |= PNV_PHB_FLAG_DIAG;
> +	} else {
> +		pr_warn("%s: Can't get diag-data for PHB#%x (%ld)\n",
> +			__func__, hose->global_number, rc);
> +		phb->flags &= ~PNV_PHB_FLAG_DIAG;
> +	}
> +
> +	spin_unlock_irqrestore(&phb->lock, flags);
> +}
> +
>  /**
>   * ioda_eeh_post_init - Chip dependent post initialization
>   * @hose: PCI controller
> @@ -272,6 +293,8 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
>  			result |= EEH_STATE_DMA_ACTIVE;
>  			result |= EEH_STATE_MMIO_ENABLED;
>  			result |= EEH_STATE_DMA_ENABLED;
> +		} else {
> +			ioda_eeh_phb_diag(hose);
>  		}
>  
>  		return result;
> @@ -541,24 +564,13 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
>  static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
>  			    char *drv_log, unsigned long len)
>  {
> -	s64 ret;
> +	struct pnv_phb *phb = pe->phb->private_data;
>  	unsigned long flags;
> -	struct pci_controller *hose = pe->phb;
> -	struct pnv_phb *phb = hose->private_data;
>  
>  	spin_lock_irqsave(&phb->lock, flags);
>  
> -	ret = opal_pci_get_phb_diag_data2(phb->opal_id,
> -			phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
> -	if (ret) {
> -		spin_unlock_irqrestore(&phb->lock, flags);
> -		pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
> -			   __func__, hose->global_number, pe->addr, ret);
> -		return -EIO;
> -	}
> -
> -	/* The PHB diag-data is always indicative */
> -	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
> +	pnv_pci_dump_phb_diag_data(pe->phb, phb->diag.blob);
> +	phb->flags &= ~PNV_PHB_FLAG_DIAG;
>  
>  	spin_unlock_irqrestore(&phb->lock, flags);
>  
> @@ -646,19 +658,11 @@ static void ioda_eeh_hub_diag(struct pci_controller *hose)
>  	}
>  }
>  
> -static void ioda_eeh_phb_diag(struct pci_controller *hose)
> +static void ioda_eeh_phb_diag_dump(struct pci_controller *hose)
>  {
>  	struct pnv_phb *phb = hose->private_data;
> -	long rc;
> -
> -	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
> -					 PNV_PCI_DIAG_BUF_SIZE);
> -	if (rc != OPAL_SUCCESS) {
> -		pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
> -			    __func__, hose->global_number, rc);
> -		return;
> -	}
>  
> +	ioda_eeh_phb_diag(hose);
>  	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
>  }
>  
> @@ -778,7 +782,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
>  				pr_info("EEH: PHB#%x informative error "
>  					"detected\n",
>  					hose->global_number);
> -				ioda_eeh_phb_diag(hose);
> +				ioda_eeh_phb_diag_dump(hose);
>  				ret = EEH_NEXT_ERR_NONE;
>  			}
>  
> @@ -809,6 +813,17 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
>  		}
>  
>  		/*
> +		 * EEH core will try recover from fenced PHB or
> +		 * frozen PE. In the time for frozen PE, EEH core
> +		 * enable IO path for that before collecting logs,
> +		 * but it ruins the site. So we have to cache the
> +		 * log in advance here.
> +		 */
> +		if (ret == EEH_NEXT_ERR_FROZEN_PE ||
> +		    ret == EEH_NEXT_ERR_FENCED_PHB)
> +			ioda_eeh_phb_diag(hose);
> +
> +		/*
>  		 * If we have no errors on the specific PHB or only
>  		 * informative error there, we continue poking it.
>  		 * Otherwise, we need actions to be taken by upper
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 437c37d..67b2254 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -259,11 +259,15 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
>  void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
>  				unsigned char *log_buff)
>  {
> +	struct pnv_phb *phb = hose->private_data;
>  	struct OpalIoPhbErrorCommon *common;
>  
>  	if (!hose || !log_buff)
>  		return;
>  
> +	if (!(phb->flags & PNV_PHB_FLAG_DIAG))
> +		return;
> +
>  	common = (struct OpalIoPhbErrorCommon *)log_buff;
>  	switch (common->ioType) {
>  	case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
> @@ -281,13 +285,19 @@ void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
>  static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
>  {
>  	unsigned long flags, rc;
> -	int has_diag;
> +	bool has_diag = false;
>  
>  	spin_lock_irqsave(&phb->lock, flags);
>  
> -	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
> -					 PNV_PCI_DIAG_BUF_SIZE);
> -	has_diag = (rc == OPAL_SUCCESS);
> +	if (!(phb->flags & PNV_PHB_FLAG_DIAG)) {
> +		rc = opal_pci_get_phb_diag_data2(phb->opal_id,
> +						 phb->diag.blob,
> +						 PNV_PCI_DIAG_BUF_SIZE);
> +		if (rc == OPAL_SUCCESS) {
> +			phb->flags |= PNV_PHB_FLAG_DIAG;
> +			has_diag = true;
> +		}
> +	}
>  
>  	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
>  				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
> @@ -303,9 +313,6 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
>  		 */
>  		if (has_diag)
>  			pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob);
> -		else
> -			pr_warning("PCI %d: No diag data available\n",
> -				   phb->hose->global_number);
>  	}
>  
>  	spin_unlock_irqrestore(&phb->lock, flags);
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index adeb3c4..153af9a 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -82,6 +82,7 @@ struct pnv_eeh_ops {
>  #endif /* CONFIG_EEH */
>  
>  #define PNV_PHB_FLAG_EEH	(1 << 0)
> +#define PNV_PHB_FLAG_DIAG	(1 << 1)
>  
>  struct pnv_phb {
>  	struct pci_controller	*hose;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 4/5] powerpc/powernv: Cache PHB diag-data
  2014-02-21 20:01   ` Benjamin Herrenschmidt
@ 2014-02-23  4:52     ` Gavin Shan
  0 siblings, 0 replies; 12+ messages in thread
From: Gavin Shan @ 2014-02-23  4:52 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Gavin Shan

On Sat, Feb 22, 2014 at 07:01:27AM +1100, Benjamin Herrenschmidt wrote:
>On Fri, 2014-02-21 at 19:53 +0800, Gavin Shan wrote:
>> EEH core tries to recover from fenced PHB or frozen PE. Unfortunately,
>> the log isn't consistent with the site because enabling IO path for
>> the frozen PE before collecting log would ruin the site. The patch
>> solves the problem to cache the PHB diag-data in advance with the
>> help of additional flag PNV_PHB_FLAG_DIAG to pnv_phb::flags.
>
>Ok, so correct me if I'm wrong, but you are
>
>  - Collecting the diag data in get_state, as a sort of side effect
>(what happens if get_state is called multiple times ?)
>
>  - Dumping it later on
>

Yeah, the patch would have some problems when get_state gets called
for multiple times: the log could be much more than what we expected
in case that frozen PE#1 detected and in progress to handle it (we
don't dump it yet), frozen PE#2 detected. The log would include all
information for frozen PE#1 and PE#2 and it's not expected.

Another case is get_state called for multiple times on frozen PE#1
and we can check EEH_PE_ISOLATED to avoid diag-data over-writting.

I'm thinking of a new mechanism (please refer to the reply below).

>Any reason why we can't instead dump it immediately ? Also do we have a
>clean trigger for when we detect an error condition ? It can either be
>the result of an interrupt or a driver called get_state following an
>ffffffff. Are both path eventually going into the same function to
>handle a "new" error condition ? That's where I would both collect and
>dump the EEH state..
>

The reason I don't want dump it immediately is that I would keep
struct pnv_eeh_ops::get_log() to dump diag-data to guest in the
future.

The problem is that we have only one PHB diag-data instance in
struct pnv_phb::diag.blob[]. I'm thinking of to have each PE
to have diag-data for itself and the things would look like
followings. Ben, please comment :-)

- Extend "struct eeh_pe" to have a platform pointer (void *data).
  And we still collect diag-data in get_state() or next_error(),
  which will be dumped in pnv_eeh_ops->get_log(). The disadvantage
  could be lots of memory (extra 8KB usually) consumed by each PE
  instance.
- For PCI config accessors and informative (also dead PHB, dead
  P7IOC), we still use pnv_phb::diag.blob[].

Thanks,
Gavin

>> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/platforms/powernv/eeh-ioda.c |   65 ++++++++++++++++++-----------
>>  arch/powerpc/platforms/powernv/pci.c      |   21 ++++++----
>>  arch/powerpc/platforms/powernv/pci.h      |    1 +
>>  3 files changed, 55 insertions(+), 32 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
>> index 04b4710..3ed8d22 100644
>> --- a/arch/powerpc/platforms/powernv/eeh-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
>> @@ -114,6 +114,27 @@ DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get,
>>  			ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
>>  #endif /* CONFIG_DEBUG_FS */
>>  
>> +static void ioda_eeh_phb_diag(struct pci_controller *hose)
>> +{
>> +	struct pnv_phb *phb = hose->private_data;
>> +	unsigned long flags;
>> +	long rc;
>> +
>> +	spin_lock_irqsave(&phb->lock, flags);
>> +
>> +	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
>> +					 PNV_PCI_DIAG_BUF_SIZE);
>> +	if (rc == OPAL_SUCCESS) {
>> +		phb->flags |= PNV_PHB_FLAG_DIAG;
>> +	} else {
>> +		pr_warn("%s: Can't get diag-data for PHB#%x (%ld)\n",
>> +			__func__, hose->global_number, rc);
>> +		phb->flags &= ~PNV_PHB_FLAG_DIAG;
>> +	}
>> +
>> +	spin_unlock_irqrestore(&phb->lock, flags);
>> +}
>> +
>>  /**
>>   * ioda_eeh_post_init - Chip dependent post initialization
>>   * @hose: PCI controller
>> @@ -272,6 +293,8 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
>>  			result |= EEH_STATE_DMA_ACTIVE;
>>  			result |= EEH_STATE_MMIO_ENABLED;
>>  			result |= EEH_STATE_DMA_ENABLED;
>> +		} else {
>> +			ioda_eeh_phb_diag(hose);
>>  		}
>>  
>>  		return result;
>> @@ -541,24 +564,13 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
>>  static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
>>  			    char *drv_log, unsigned long len)
>>  {
>> -	s64 ret;
>> +	struct pnv_phb *phb = pe->phb->private_data;
>>  	unsigned long flags;
>> -	struct pci_controller *hose = pe->phb;
>> -	struct pnv_phb *phb = hose->private_data;
>>  
>>  	spin_lock_irqsave(&phb->lock, flags);
>>  
>> -	ret = opal_pci_get_phb_diag_data2(phb->opal_id,
>> -			phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
>> -	if (ret) {
>> -		spin_unlock_irqrestore(&phb->lock, flags);
>> -		pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
>> -			   __func__, hose->global_number, pe->addr, ret);
>> -		return -EIO;
>> -	}
>> -
>> -	/* The PHB diag-data is always indicative */
>> -	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
>> +	pnv_pci_dump_phb_diag_data(pe->phb, phb->diag.blob);
>> +	phb->flags &= ~PNV_PHB_FLAG_DIAG;
>>  
>>  	spin_unlock_irqrestore(&phb->lock, flags);
>>  
>> @@ -646,19 +658,11 @@ static void ioda_eeh_hub_diag(struct pci_controller *hose)
>>  	}
>>  }
>>  
>> -static void ioda_eeh_phb_diag(struct pci_controller *hose)
>> +static void ioda_eeh_phb_diag_dump(struct pci_controller *hose)
>>  {
>>  	struct pnv_phb *phb = hose->private_data;
>> -	long rc;
>> -
>> -	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
>> -					 PNV_PCI_DIAG_BUF_SIZE);
>> -	if (rc != OPAL_SUCCESS) {
>> -		pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
>> -			    __func__, hose->global_number, rc);
>> -		return;
>> -	}
>>  
>> +	ioda_eeh_phb_diag(hose);
>>  	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
>>  }
>>  
>> @@ -778,7 +782,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
>>  				pr_info("EEH: PHB#%x informative error "
>>  					"detected\n",
>>  					hose->global_number);
>> -				ioda_eeh_phb_diag(hose);
>> +				ioda_eeh_phb_diag_dump(hose);
>>  				ret = EEH_NEXT_ERR_NONE;
>>  			}
>>  
>> @@ -809,6 +813,17 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
>>  		}
>>  
>>  		/*
>> +		 * EEH core will try recover from fenced PHB or
>> +		 * frozen PE. In the time for frozen PE, EEH core
>> +		 * enable IO path for that before collecting logs,
>> +		 * but it ruins the site. So we have to cache the
>> +		 * log in advance here.
>> +		 */
>> +		if (ret == EEH_NEXT_ERR_FROZEN_PE ||
>> +		    ret == EEH_NEXT_ERR_FENCED_PHB)
>> +			ioda_eeh_phb_diag(hose);
>> +
>> +		/*
>>  		 * If we have no errors on the specific PHB or only
>>  		 * informative error there, we continue poking it.
>>  		 * Otherwise, we need actions to be taken by upper
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index 437c37d..67b2254 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -259,11 +259,15 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
>>  void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
>>  				unsigned char *log_buff)
>>  {
>> +	struct pnv_phb *phb = hose->private_data;
>>  	struct OpalIoPhbErrorCommon *common;
>>  
>>  	if (!hose || !log_buff)
>>  		return;
>>  
>> +	if (!(phb->flags & PNV_PHB_FLAG_DIAG))
>> +		return;
>> +
>>  	common = (struct OpalIoPhbErrorCommon *)log_buff;
>>  	switch (common->ioType) {
>>  	case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
>> @@ -281,13 +285,19 @@ void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
>>  static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
>>  {
>>  	unsigned long flags, rc;
>> -	int has_diag;
>> +	bool has_diag = false;
>>  
>>  	spin_lock_irqsave(&phb->lock, flags);
>>  
>> -	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
>> -					 PNV_PCI_DIAG_BUF_SIZE);
>> -	has_diag = (rc == OPAL_SUCCESS);
>> +	if (!(phb->flags & PNV_PHB_FLAG_DIAG)) {
>> +		rc = opal_pci_get_phb_diag_data2(phb->opal_id,
>> +						 phb->diag.blob,
>> +						 PNV_PCI_DIAG_BUF_SIZE);
>> +		if (rc == OPAL_SUCCESS) {
>> +			phb->flags |= PNV_PHB_FLAG_DIAG;
>> +			has_diag = true;
>> +		}
>> +	}
>>  
>>  	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
>>  				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>> @@ -303,9 +313,6 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
>>  		 */
>>  		if (has_diag)
>>  			pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob);
>> -		else
>> -			pr_warning("PCI %d: No diag data available\n",
>> -				   phb->hose->global_number);
>>  	}
>>  
>>  	spin_unlock_irqrestore(&phb->lock, flags);
>> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
>> index adeb3c4..153af9a 100644
>> --- a/arch/powerpc/platforms/powernv/pci.h
>> +++ b/arch/powerpc/platforms/powernv/pci.h
>> @@ -82,6 +82,7 @@ struct pnv_eeh_ops {
>>  #endif /* CONFIG_EEH */
>>  
>>  #define PNV_PHB_FLAG_EEH	(1 << 0)
>> +#define PNV_PHB_FLAG_DIAG	(1 << 1)
>>  
>>  struct pnv_phb {
>>  	struct pci_controller	*hose;
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 5/5] powerpc/powernv: Make PHB diag-data output short
  2014-02-21 11:53 [PATCH 1/5] powerpc/eeh: Remove EEH_PE_PHB_DEAD Gavin Shan
                   ` (2 preceding siblings ...)
  2014-02-21 11:53 ` [PATCH 4/5] powerpc/powernv: Cache PHB diag-data Gavin Shan
@ 2014-02-21 11:53 ` Gavin Shan
  2014-02-21 20:05   ` Benjamin Herrenschmidt
  3 siblings, 1 reply; 12+ messages in thread
From: Gavin Shan @ 2014-02-21 11:53 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan

According to Ben's suggestion, the patch makes the PHB diag-data
dump looks a bit short by printing multiple values in one line
and outputing "-" for zero fields.

After the patch applied, the PHB diag-data dump looks like:

PHB3 PHB#3 Diag-data (Version: 1)

  brdgCtl:     00000002
  UtlSts:      - - -
  RootSts:     0000000f 00400000 b0830008 00100147 00002000
  RootErrSts:  - - -
  RootErrLog:  - - - -
  RootErrLog1: - - -
  nFir:        - 0030006e00000000 -
  PhbSts:      0000001c00000000 -
  Lem:         0000000000100000 42498e327f502eae -
  PhbErr:      - - - -
  OutErr:      - - - -
  InAErr:      8000000000000000 8000000000000000 0402030000000000 -
  InBErr:      - - - -
  PE[  8] A/B: 8480002b00000000 8000000000000000

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/pci.c |  238 ++++++++++++++++++++--------------
 1 file changed, 143 insertions(+), 95 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 67b2254..a5f236a 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -124,67 +124,103 @@ static void pnv_teardown_msi_irqs(struct pci_dev *pdev)
 }
 #endif /* CONFIG_PCI_MSI */
 
+static char *pnv_pci_diag_field(char *buf, int fmt, u64 val64)
+{
+	u32 val32 = (u32)val64;
+
+	memset(buf, 0, 24);
+	switch (fmt) {
+	case 8:
+		if (val32)
+			sprintf(buf, "%08x", val32);
+		else
+			sprintf(buf, "%s", "-");
+		break;
+	case 16:
+		if (val64)
+			sprintf(buf, "%016llx", val64);
+		else
+			sprintf(buf, "%s", "-");
+		break;
+	default:
+		sprintf(buf, "%s", "-");
+	}
+
+	return buf;
+}
+
 static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
 					 struct OpalIoPhbErrorCommon *common)
 {
 	struct OpalIoP7IOCPhbErrorData *data;
+	char buf[120];
 	int i;
 
 	data = (struct OpalIoP7IOCPhbErrorData *)common;
 	pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n\n",
 		hose->global_number, common->version);
 
-	pr_info("  brdgCtl:              %08x\n", data->brdgCtl);
-
-	pr_info("  portStatusReg:        %08x\n", data->portStatusReg);
-	pr_info("  rootCmplxStatus:      %08x\n", data->rootCmplxStatus);
-	pr_info("  busAgentStatus:       %08x\n", data->busAgentStatus);
-
-	pr_info("  deviceStatus:         %08x\n", data->deviceStatus);
-	pr_info("  slotStatus:           %08x\n", data->slotStatus);
-	pr_info("  linkStatus:           %08x\n", data->linkStatus);
-	pr_info("  devCmdStatus:         %08x\n", data->devCmdStatus);
-	pr_info("  devSecStatus:         %08x\n", data->devSecStatus);
-
-	pr_info("  rootErrorStatus:      %08x\n", data->rootErrorStatus);
-	pr_info("  uncorrErrorStatus:    %08x\n", data->uncorrErrorStatus);
-	pr_info("  corrErrorStatus:      %08x\n", data->corrErrorStatus);
-	pr_info("  tlpHdr1:              %08x\n", data->tlpHdr1);
-	pr_info("  tlpHdr2:              %08x\n", data->tlpHdr2);
-	pr_info("  tlpHdr3:              %08x\n", data->tlpHdr3);
-	pr_info("  tlpHdr4:              %08x\n", data->tlpHdr4);
-	pr_info("  sourceId:             %08x\n", data->sourceId);
-	pr_info("  errorClass:           %016llx\n", data->errorClass);
-	pr_info("  correlator:           %016llx\n", data->correlator);
-	pr_info("  p7iocPlssr:           %016llx\n", data->p7iocPlssr);
-	pr_info("  p7iocCsr:             %016llx\n", data->p7iocCsr);
-	pr_info("  lemFir:               %016llx\n", data->lemFir);
-	pr_info("  lemErrorMask:         %016llx\n", data->lemErrorMask);
-	pr_info("  lemWOF:               %016llx\n", data->lemWOF);
-	pr_info("  phbErrorStatus:       %016llx\n", data->phbErrorStatus);
-	pr_info("  phbFirstErrorStatus:  %016llx\n", data->phbFirstErrorStatus);
-	pr_info("  phbErrorLog0:         %016llx\n", data->phbErrorLog0);
-	pr_info("  phbErrorLog1:         %016llx\n", data->phbErrorLog1);
-	pr_info("  mmioErrorStatus:      %016llx\n", data->mmioErrorStatus);
-	pr_info("  mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
-	pr_info("  mmioErrorLog0:        %016llx\n", data->mmioErrorLog0);
-	pr_info("  mmioErrorLog1:        %016llx\n", data->mmioErrorLog1);
-	pr_info("  dma0ErrorStatus:      %016llx\n", data->dma0ErrorStatus);
-	pr_info("  dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
-	pr_info("  dma0ErrorLog0:        %016llx\n", data->dma0ErrorLog0);
-	pr_info("  dma0ErrorLog1:        %016llx\n", data->dma0ErrorLog1);
-	pr_info("  dma1ErrorStatus:      %016llx\n", data->dma1ErrorStatus);
-	pr_info("  dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
-	pr_info("  dma1ErrorLog0:        %016llx\n", data->dma1ErrorLog0);
-	pr_info("  dma1ErrorLog1:        %016llx\n", data->dma1ErrorLog1);
+	pr_info("  brdgCtl:     %s\n",
+		pnv_pci_diag_field(&buf[0], 8, data->brdgCtl));
+	pr_info("  UtlSts:      %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      8, data->portStatusReg),
+		pnv_pci_diag_field(&buf[1 * 24], 8, data->rootCmplxStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 8, data->busAgentStatus));
+	pr_info("  RootSts:     %s %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      8, data->deviceStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 8, data->slotStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 8, data->linkStatus),
+		pnv_pci_diag_field(&buf[3 * 24], 8, data->devCmdStatus),
+		pnv_pci_diag_field(&buf[4 * 24], 8, data->devSecStatus));
+	pr_info("  RootErrSts:  %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      8, data->rootErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 8, data->uncorrErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 8, data->corrErrorStatus));
+	pr_info("  RootErrLog:  %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      8, data->tlpHdr1),
+		pnv_pci_diag_field(&buf[1 * 24], 8, data->tlpHdr2),
+		pnv_pci_diag_field(&buf[2 * 24], 8, data->tlpHdr3),
+		pnv_pci_diag_field(&buf[3 * 24], 8, data->tlpHdr4));
+	pr_info("  RootErrLog1: %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],       8, data->sourceId),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->errorClass),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->correlator));
+	pr_info("  PhbSts:      %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->p7iocPlssr),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->p7iocCsr));
+	pr_info("  Lem:         %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->lemFir),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->lemErrorMask),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->lemWOF));
+	pr_info("  PhbErr:      %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->phbErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbFirstErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->phbErrorLog0),
+		pnv_pci_diag_field(&buf[3 * 24], 16, data->phbErrorLog1));
+	pr_info("  OutErr:      %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->mmioErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->mmioFirstErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->mmioErrorLog0),
+		pnv_pci_diag_field(&buf[3 * 24], 16, data->mmioErrorLog1));
+	pr_info("  InAErr:      %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->dma0ErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma0FirstErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma0ErrorLog0),
+		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma0ErrorLog1));
+	pr_info("  InBErr:      %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->dma1ErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma1FirstErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma1ErrorLog0),
+		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma1ErrorLog1));
 
 	for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
 		if ((data->pestA[i] >> 63) == 0 &&
 		    (data->pestB[i] >> 63) == 0)
 			continue;
 
-		pr_info("  PE[%3d] PESTA:        %016llx\n", i, data->pestA[i]);
-		pr_info("          PESTB:        %016llx\n", data->pestB[i]);
+		pr_info("  PE[%3d] A/B: %s %s\n",
+			i, pnv_pci_diag_field(&buf[0], 16, data->pestA[i]),
+			pnv_pci_diag_field(&buf[1 * 24], 16, data->pestB[i]));
 	}
 }
 
@@ -192,67 +228,79 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
 					struct OpalIoPhbErrorCommon *common)
 {
 	struct OpalIoPhb3ErrorData *data;
-	int i;
+	char buf[120];
+	int i = 0;
 
+	memset(buf, 0, 120);
 	data = (struct OpalIoPhb3ErrorData*)common;
 	pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n\n",
 		hose->global_number, common->version);
 
-	pr_info("  brdgCtl:              %08x\n", data->brdgCtl);
-
-	pr_info("  portStatusReg:        %08x\n", data->portStatusReg);
-	pr_info("  rootCmplxStatus:      %08x\n", data->rootCmplxStatus);
-	pr_info("  busAgentStatus:       %08x\n", data->busAgentStatus);
-
-	pr_info("  deviceStatus:         %08x\n", data->deviceStatus);
-	pr_info("  slotStatus:           %08x\n", data->slotStatus);
-	pr_info("  linkStatus:           %08x\n", data->linkStatus);
-	pr_info("  devCmdStatus:         %08x\n", data->devCmdStatus);
-	pr_info("  devSecStatus:         %08x\n", data->devSecStatus);
-
-	pr_info("  rootErrorStatus:      %08x\n", data->rootErrorStatus);
-	pr_info("  uncorrErrorStatus:    %08x\n", data->uncorrErrorStatus);
-	pr_info("  corrErrorStatus:      %08x\n", data->corrErrorStatus);
-	pr_info("  tlpHdr1:              %08x\n", data->tlpHdr1);
-	pr_info("  tlpHdr2:              %08x\n", data->tlpHdr2);
-	pr_info("  tlpHdr3:              %08x\n", data->tlpHdr3);
-	pr_info("  tlpHdr4:              %08x\n", data->tlpHdr4);
-	pr_info("  sourceId:             %08x\n", data->sourceId);
-	pr_info("  errorClass:           %016llx\n", data->errorClass);
-	pr_info("  correlator:           %016llx\n", data->correlator);
-
-	pr_info("  nFir:                 %016llx\n", data->nFir);
-	pr_info("  nFirMask:             %016llx\n", data->nFirMask);
-	pr_info("  nFirWOF:              %016llx\n", data->nFirWOF);
-	pr_info("  PhbPlssr:             %016llx\n", data->phbPlssr);
-	pr_info("  PhbCsr:               %016llx\n", data->phbCsr);
-	pr_info("  lemFir:               %016llx\n", data->lemFir);
-	pr_info("  lemErrorMask:         %016llx\n", data->lemErrorMask);
-	pr_info("  lemWOF:               %016llx\n", data->lemWOF);
-	pr_info("  phbErrorStatus:       %016llx\n", data->phbErrorStatus);
-	pr_info("  phbFirstErrorStatus:  %016llx\n", data->phbFirstErrorStatus);
-	pr_info("  phbErrorLog0:         %016llx\n", data->phbErrorLog0);
-	pr_info("  phbErrorLog1:         %016llx\n", data->phbErrorLog1);
-	pr_info("  mmioErrorStatus:      %016llx\n", data->mmioErrorStatus);
-	pr_info("  mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
-	pr_info("  mmioErrorLog0:        %016llx\n", data->mmioErrorLog0);
-	pr_info("  mmioErrorLog1:        %016llx\n", data->mmioErrorLog1);
-	pr_info("  dma0ErrorStatus:      %016llx\n", data->dma0ErrorStatus);
-	pr_info("  dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
-	pr_info("  dma0ErrorLog0:        %016llx\n", data->dma0ErrorLog0);
-	pr_info("  dma0ErrorLog1:        %016llx\n", data->dma0ErrorLog1);
-	pr_info("  dma1ErrorStatus:      %016llx\n", data->dma1ErrorStatus);
-	pr_info("  dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
-	pr_info("  dma1ErrorLog0:        %016llx\n", data->dma1ErrorLog0);
-	pr_info("  dma1ErrorLog1:        %016llx\n", data->dma1ErrorLog1);
+	pr_info("  brdgCtl:     %s\n",
+		pnv_pci_diag_field(&buf[0], 8, data->brdgCtl));
+	pr_info("  UtlSts:      %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      8, data->portStatusReg),
+		pnv_pci_diag_field(&buf[1 * 24], 8, data->rootCmplxStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 8, data->busAgentStatus));
+	pr_info("  RootSts:     %s %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      8, data->deviceStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 8, data->slotStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 8, data->linkStatus),
+		pnv_pci_diag_field(&buf[3 * 24], 8, data->devCmdStatus),
+		pnv_pci_diag_field(&buf[4 * 24], 8, data->devSecStatus));
+	pr_info("  RootErrSts:  %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      8, data->rootErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 8, data->uncorrErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 8, data->corrErrorStatus));
+	pr_info("  RootErrLog:  %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      8, data->tlpHdr1),
+		pnv_pci_diag_field(&buf[1 * 24], 8, data->tlpHdr2),
+		pnv_pci_diag_field(&buf[2 * 24], 8, data->tlpHdr3),
+		pnv_pci_diag_field(&buf[3 * 24], 8, data->tlpHdr4));
+	pr_info("  RootErrLog1: %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],       8, data->sourceId),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->errorClass),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->correlator));
+	pr_info("  nFir:        %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->nFir),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->nFirMask),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->nFirWOF));
+	pr_info("  PhbSts:      %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->phbPlssr),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbCsr));
+	pr_info("  Lem:         %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->lemFir),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->lemErrorMask),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->lemWOF));
+	pr_info("  PhbErr:      %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->phbErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbFirstErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->phbErrorLog0),
+		pnv_pci_diag_field(&buf[3 * 24], 16, data->phbErrorLog1));
+	pr_info("  OutErr:      %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->mmioErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->mmioFirstErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->mmioErrorLog0),
+		pnv_pci_diag_field(&buf[3 * 24], 16, data->mmioErrorLog1));
+	pr_info("  InAErr:      %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->dma0ErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma0FirstErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma0ErrorLog0),
+		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma0ErrorLog1));
+	pr_info("  InBErr:      %s %s %s %s\n",
+		pnv_pci_diag_field(&buf[0],      16, data->dma1ErrorStatus),
+		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma1FirstErrorStatus),
+		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma1ErrorLog0),
+		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma1ErrorLog1));
 
 	for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
 		if ((data->pestA[i] >> 63) == 0 &&
 		    (data->pestB[i] >> 63) == 0)
 			continue;
 
-		pr_info("  PE[%3d] PESTA:        %016llx\n", i, data->pestA[i]);
-		pr_info("          PESTB:        %016llx\n", data->pestB[i]);
+		pr_info("  PE[%3d] A/B: %s %s\n",
+			i, pnv_pci_diag_field(&buf[0], 16, data->pestA[i]),
+			pnv_pci_diag_field(&buf[1 * 24], 16, data->pestB[i]));
 	}
 }
 
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] powerpc/powernv: Make PHB diag-data output short
  2014-02-21 11:53 ` [PATCH 5/5] powerpc/powernv: Make PHB diag-data output short Gavin Shan
@ 2014-02-21 20:05   ` Benjamin Herrenschmidt
  2014-02-23  4:55     ` Gavin Shan
  0 siblings, 1 reply; 12+ messages in thread
From: Benjamin Herrenschmidt @ 2014-02-21 20:05 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev

On Fri, 2014-02-21 at 19:53 +0800, Gavin Shan wrote:
> According to Ben's suggestion, the patch makes the PHB diag-data
> dump looks a bit short by printing multiple values in one line
> and outputing "-" for zero fields.
> 
> After the patch applied, the PHB diag-data dump looks like:

Actually, I wouldn't do that "-" thing, I would leave zeros as
zeros but I would remove lines that have all zeros.

Additionally, we might want to consider what if we can get rid
of more fields for INF, or maybe even not dump them by default
and just count them (should we have counters in sysfs ?)

One thing I'm tempted to do is turn the full logs into actual
error logs (sent to FSP) and only display a "analyzed" version
in the kernel, something that decodes the PEST for example
and indicates if it's an DMA or MMIO error, the address, etc...

Cheers,
Ben.

> PHB3 PHB#3 Diag-data (Version: 1)
> 
>   brdgCtl:     00000002
>   UtlSts:      - - -
>   RootSts:     0000000f 00400000 b0830008 00100147 00002000
>   RootErrSts:  - - -
>   RootErrLog:  - - - -
>   RootErrLog1: - - -
>   nFir:        - 0030006e00000000 -
>   PhbSts:      0000001c00000000 -
>   Lem:         0000000000100000 42498e327f502eae -
>   PhbErr:      - - - -
>   OutErr:      - - - -
>   InAErr:      8000000000000000 8000000000000000 0402030000000000 -
>   InBErr:      - - - -
>   PE[  8] A/B: 8480002b00000000 8000000000000000
> 
> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/pci.c |  238 ++++++++++++++++++++--------------
>  1 file changed, 143 insertions(+), 95 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 67b2254..a5f236a 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -124,67 +124,103 @@ static void pnv_teardown_msi_irqs(struct pci_dev *pdev)
>  }
>  #endif /* CONFIG_PCI_MSI */
>  
> +static char *pnv_pci_diag_field(char *buf, int fmt, u64 val64)
> +{
> +	u32 val32 = (u32)val64;
> +
> +	memset(buf, 0, 24);
> +	switch (fmt) {
> +	case 8:
> +		if (val32)
> +			sprintf(buf, "%08x", val32);
> +		else
> +			sprintf(buf, "%s", "-");
> +		break;
> +	case 16:
> +		if (val64)
> +			sprintf(buf, "%016llx", val64);
> +		else
> +			sprintf(buf, "%s", "-");
> +		break;
> +	default:
> +		sprintf(buf, "%s", "-");
> +	}
> +
> +	return buf;
> +}
> +
>  static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
>  					 struct OpalIoPhbErrorCommon *common)
>  {
>  	struct OpalIoP7IOCPhbErrorData *data;
> +	char buf[120];
>  	int i;
>  
>  	data = (struct OpalIoP7IOCPhbErrorData *)common;
>  	pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n\n",
>  		hose->global_number, common->version);
>  
> -	pr_info("  brdgCtl:              %08x\n", data->brdgCtl);
> -
> -	pr_info("  portStatusReg:        %08x\n", data->portStatusReg);
> -	pr_info("  rootCmplxStatus:      %08x\n", data->rootCmplxStatus);
> -	pr_info("  busAgentStatus:       %08x\n", data->busAgentStatus);
> -
> -	pr_info("  deviceStatus:         %08x\n", data->deviceStatus);
> -	pr_info("  slotStatus:           %08x\n", data->slotStatus);
> -	pr_info("  linkStatus:           %08x\n", data->linkStatus);
> -	pr_info("  devCmdStatus:         %08x\n", data->devCmdStatus);
> -	pr_info("  devSecStatus:         %08x\n", data->devSecStatus);
> -
> -	pr_info("  rootErrorStatus:      %08x\n", data->rootErrorStatus);
> -	pr_info("  uncorrErrorStatus:    %08x\n", data->uncorrErrorStatus);
> -	pr_info("  corrErrorStatus:      %08x\n", data->corrErrorStatus);
> -	pr_info("  tlpHdr1:              %08x\n", data->tlpHdr1);
> -	pr_info("  tlpHdr2:              %08x\n", data->tlpHdr2);
> -	pr_info("  tlpHdr3:              %08x\n", data->tlpHdr3);
> -	pr_info("  tlpHdr4:              %08x\n", data->tlpHdr4);
> -	pr_info("  sourceId:             %08x\n", data->sourceId);
> -	pr_info("  errorClass:           %016llx\n", data->errorClass);
> -	pr_info("  correlator:           %016llx\n", data->correlator);
> -	pr_info("  p7iocPlssr:           %016llx\n", data->p7iocPlssr);
> -	pr_info("  p7iocCsr:             %016llx\n", data->p7iocCsr);
> -	pr_info("  lemFir:               %016llx\n", data->lemFir);
> -	pr_info("  lemErrorMask:         %016llx\n", data->lemErrorMask);
> -	pr_info("  lemWOF:               %016llx\n", data->lemWOF);
> -	pr_info("  phbErrorStatus:       %016llx\n", data->phbErrorStatus);
> -	pr_info("  phbFirstErrorStatus:  %016llx\n", data->phbFirstErrorStatus);
> -	pr_info("  phbErrorLog0:         %016llx\n", data->phbErrorLog0);
> -	pr_info("  phbErrorLog1:         %016llx\n", data->phbErrorLog1);
> -	pr_info("  mmioErrorStatus:      %016llx\n", data->mmioErrorStatus);
> -	pr_info("  mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
> -	pr_info("  mmioErrorLog0:        %016llx\n", data->mmioErrorLog0);
> -	pr_info("  mmioErrorLog1:        %016llx\n", data->mmioErrorLog1);
> -	pr_info("  dma0ErrorStatus:      %016llx\n", data->dma0ErrorStatus);
> -	pr_info("  dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
> -	pr_info("  dma0ErrorLog0:        %016llx\n", data->dma0ErrorLog0);
> -	pr_info("  dma0ErrorLog1:        %016llx\n", data->dma0ErrorLog1);
> -	pr_info("  dma1ErrorStatus:      %016llx\n", data->dma1ErrorStatus);
> -	pr_info("  dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
> -	pr_info("  dma1ErrorLog0:        %016llx\n", data->dma1ErrorLog0);
> -	pr_info("  dma1ErrorLog1:        %016llx\n", data->dma1ErrorLog1);
> +	pr_info("  brdgCtl:     %s\n",
> +		pnv_pci_diag_field(&buf[0], 8, data->brdgCtl));
> +	pr_info("  UtlSts:      %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      8, data->portStatusReg),
> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->rootCmplxStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->busAgentStatus));
> +	pr_info("  RootSts:     %s %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      8, data->deviceStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->slotStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->linkStatus),
> +		pnv_pci_diag_field(&buf[3 * 24], 8, data->devCmdStatus),
> +		pnv_pci_diag_field(&buf[4 * 24], 8, data->devSecStatus));
> +	pr_info("  RootErrSts:  %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      8, data->rootErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->uncorrErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->corrErrorStatus));
> +	pr_info("  RootErrLog:  %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      8, data->tlpHdr1),
> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->tlpHdr2),
> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->tlpHdr3),
> +		pnv_pci_diag_field(&buf[3 * 24], 8, data->tlpHdr4));
> +	pr_info("  RootErrLog1: %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],       8, data->sourceId),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->errorClass),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->correlator));
> +	pr_info("  PhbSts:      %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->p7iocPlssr),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->p7iocCsr));
> +	pr_info("  Lem:         %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->lemFir),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->lemErrorMask),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->lemWOF));
> +	pr_info("  PhbErr:      %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->phbErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbFirstErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->phbErrorLog0),
> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->phbErrorLog1));
> +	pr_info("  OutErr:      %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->mmioErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->mmioFirstErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->mmioErrorLog0),
> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->mmioErrorLog1));
> +	pr_info("  InAErr:      %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->dma0ErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma0FirstErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma0ErrorLog0),
> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma0ErrorLog1));
> +	pr_info("  InBErr:      %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->dma1ErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma1FirstErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma1ErrorLog0),
> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma1ErrorLog1));
>  
>  	for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
>  		if ((data->pestA[i] >> 63) == 0 &&
>  		    (data->pestB[i] >> 63) == 0)
>  			continue;
>  
> -		pr_info("  PE[%3d] PESTA:        %016llx\n", i, data->pestA[i]);
> -		pr_info("          PESTB:        %016llx\n", data->pestB[i]);
> +		pr_info("  PE[%3d] A/B: %s %s\n",
> +			i, pnv_pci_diag_field(&buf[0], 16, data->pestA[i]),
> +			pnv_pci_diag_field(&buf[1 * 24], 16, data->pestB[i]));
>  	}
>  }
>  
> @@ -192,67 +228,79 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
>  					struct OpalIoPhbErrorCommon *common)
>  {
>  	struct OpalIoPhb3ErrorData *data;
> -	int i;
> +	char buf[120];
> +	int i = 0;
>  
> +	memset(buf, 0, 120);
>  	data = (struct OpalIoPhb3ErrorData*)common;
>  	pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n\n",
>  		hose->global_number, common->version);
>  
> -	pr_info("  brdgCtl:              %08x\n", data->brdgCtl);
> -
> -	pr_info("  portStatusReg:        %08x\n", data->portStatusReg);
> -	pr_info("  rootCmplxStatus:      %08x\n", data->rootCmplxStatus);
> -	pr_info("  busAgentStatus:       %08x\n", data->busAgentStatus);
> -
> -	pr_info("  deviceStatus:         %08x\n", data->deviceStatus);
> -	pr_info("  slotStatus:           %08x\n", data->slotStatus);
> -	pr_info("  linkStatus:           %08x\n", data->linkStatus);
> -	pr_info("  devCmdStatus:         %08x\n", data->devCmdStatus);
> -	pr_info("  devSecStatus:         %08x\n", data->devSecStatus);
> -
> -	pr_info("  rootErrorStatus:      %08x\n", data->rootErrorStatus);
> -	pr_info("  uncorrErrorStatus:    %08x\n", data->uncorrErrorStatus);
> -	pr_info("  corrErrorStatus:      %08x\n", data->corrErrorStatus);
> -	pr_info("  tlpHdr1:              %08x\n", data->tlpHdr1);
> -	pr_info("  tlpHdr2:              %08x\n", data->tlpHdr2);
> -	pr_info("  tlpHdr3:              %08x\n", data->tlpHdr3);
> -	pr_info("  tlpHdr4:              %08x\n", data->tlpHdr4);
> -	pr_info("  sourceId:             %08x\n", data->sourceId);
> -	pr_info("  errorClass:           %016llx\n", data->errorClass);
> -	pr_info("  correlator:           %016llx\n", data->correlator);
> -
> -	pr_info("  nFir:                 %016llx\n", data->nFir);
> -	pr_info("  nFirMask:             %016llx\n", data->nFirMask);
> -	pr_info("  nFirWOF:              %016llx\n", data->nFirWOF);
> -	pr_info("  PhbPlssr:             %016llx\n", data->phbPlssr);
> -	pr_info("  PhbCsr:               %016llx\n", data->phbCsr);
> -	pr_info("  lemFir:               %016llx\n", data->lemFir);
> -	pr_info("  lemErrorMask:         %016llx\n", data->lemErrorMask);
> -	pr_info("  lemWOF:               %016llx\n", data->lemWOF);
> -	pr_info("  phbErrorStatus:       %016llx\n", data->phbErrorStatus);
> -	pr_info("  phbFirstErrorStatus:  %016llx\n", data->phbFirstErrorStatus);
> -	pr_info("  phbErrorLog0:         %016llx\n", data->phbErrorLog0);
> -	pr_info("  phbErrorLog1:         %016llx\n", data->phbErrorLog1);
> -	pr_info("  mmioErrorStatus:      %016llx\n", data->mmioErrorStatus);
> -	pr_info("  mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
> -	pr_info("  mmioErrorLog0:        %016llx\n", data->mmioErrorLog0);
> -	pr_info("  mmioErrorLog1:        %016llx\n", data->mmioErrorLog1);
> -	pr_info("  dma0ErrorStatus:      %016llx\n", data->dma0ErrorStatus);
> -	pr_info("  dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
> -	pr_info("  dma0ErrorLog0:        %016llx\n", data->dma0ErrorLog0);
> -	pr_info("  dma0ErrorLog1:        %016llx\n", data->dma0ErrorLog1);
> -	pr_info("  dma1ErrorStatus:      %016llx\n", data->dma1ErrorStatus);
> -	pr_info("  dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
> -	pr_info("  dma1ErrorLog0:        %016llx\n", data->dma1ErrorLog0);
> -	pr_info("  dma1ErrorLog1:        %016llx\n", data->dma1ErrorLog1);
> +	pr_info("  brdgCtl:     %s\n",
> +		pnv_pci_diag_field(&buf[0], 8, data->brdgCtl));
> +	pr_info("  UtlSts:      %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      8, data->portStatusReg),
> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->rootCmplxStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->busAgentStatus));
> +	pr_info("  RootSts:     %s %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      8, data->deviceStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->slotStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->linkStatus),
> +		pnv_pci_diag_field(&buf[3 * 24], 8, data->devCmdStatus),
> +		pnv_pci_diag_field(&buf[4 * 24], 8, data->devSecStatus));
> +	pr_info("  RootErrSts:  %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      8, data->rootErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->uncorrErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->corrErrorStatus));
> +	pr_info("  RootErrLog:  %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      8, data->tlpHdr1),
> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->tlpHdr2),
> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->tlpHdr3),
> +		pnv_pci_diag_field(&buf[3 * 24], 8, data->tlpHdr4));
> +	pr_info("  RootErrLog1: %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],       8, data->sourceId),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->errorClass),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->correlator));
> +	pr_info("  nFir:        %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->nFir),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->nFirMask),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->nFirWOF));
> +	pr_info("  PhbSts:      %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->phbPlssr),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbCsr));
> +	pr_info("  Lem:         %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->lemFir),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->lemErrorMask),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->lemWOF));
> +	pr_info("  PhbErr:      %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->phbErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbFirstErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->phbErrorLog0),
> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->phbErrorLog1));
> +	pr_info("  OutErr:      %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->mmioErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->mmioFirstErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->mmioErrorLog0),
> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->mmioErrorLog1));
> +	pr_info("  InAErr:      %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->dma0ErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma0FirstErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma0ErrorLog0),
> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma0ErrorLog1));
> +	pr_info("  InBErr:      %s %s %s %s\n",
> +		pnv_pci_diag_field(&buf[0],      16, data->dma1ErrorStatus),
> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma1FirstErrorStatus),
> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma1ErrorLog0),
> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma1ErrorLog1));
>  
>  	for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
>  		if ((data->pestA[i] >> 63) == 0 &&
>  		    (data->pestB[i] >> 63) == 0)
>  			continue;
>  
> -		pr_info("  PE[%3d] PESTA:        %016llx\n", i, data->pestA[i]);
> -		pr_info("          PESTB:        %016llx\n", data->pestB[i]);
> +		pr_info("  PE[%3d] A/B: %s %s\n",
> +			i, pnv_pci_diag_field(&buf[0], 16, data->pestA[i]),
> +			pnv_pci_diag_field(&buf[1 * 24], 16, data->pestB[i]));
>  	}
>  }
>  

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] powerpc/powernv: Make PHB diag-data output short
  2014-02-21 20:05   ` Benjamin Herrenschmidt
@ 2014-02-23  4:55     ` Gavin Shan
  0 siblings, 0 replies; 12+ messages in thread
From: Gavin Shan @ 2014-02-23  4:55 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev, Gavin Shan

On Sat, Feb 22, 2014 at 07:05:15AM +1100, Benjamin Herrenschmidt wrote:
>On Fri, 2014-02-21 at 19:53 +0800, Gavin Shan wrote:
>> According to Ben's suggestion, the patch makes the PHB diag-data
>> dump looks a bit short by printing multiple values in one line
>> and outputing "-" for zero fields.
>> 
>> After the patch applied, the PHB diag-data dump looks like:
>
>Actually, I wouldn't do that "-" thing, I would leave zeros as
>zeros but I would remove lines that have all zeros.
>

Ok. I'll change it in next revision :-)

>Additionally, we might want to consider what if we can get rid
>of more fields for INF, or maybe even not dump them by default
>and just count them (should we have counters in sysfs ?)
>

Yes, I'll remove dumping for INF and have a sysfs entry for the
INF counter, which would be separate patch in next revision.

>One thing I'm tempted to do is turn the full logs into actual
>error logs (sent to FSP) and only display a "analyzed" version
>in the kernel, something that decodes the PEST for example
>and indicates if it's an DMA or MMIO error, the address, etc...
>

Ok. I'll try to do it in next revision :-)

Thanks,
Gavin

>> PHB3 PHB#3 Diag-data (Version: 1)
>> 
>>   brdgCtl:     00000002
>>   UtlSts:      - - -
>>   RootSts:     0000000f 00400000 b0830008 00100147 00002000
>>   RootErrSts:  - - -
>>   RootErrLog:  - - - -
>>   RootErrLog1: - - -
>>   nFir:        - 0030006e00000000 -
>>   PhbSts:      0000001c00000000 -
>>   Lem:         0000000000100000 42498e327f502eae -
>>   PhbErr:      - - - -
>>   OutErr:      - - - -
>>   InAErr:      8000000000000000 8000000000000000 0402030000000000 -
>>   InBErr:      - - - -
>>   PE[  8] A/B: 8480002b00000000 8000000000000000
>> 
>> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/platforms/powernv/pci.c |  238 ++++++++++++++++++++--------------
>>  1 file changed, 143 insertions(+), 95 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index 67b2254..a5f236a 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -124,67 +124,103 @@ static void pnv_teardown_msi_irqs(struct pci_dev *pdev)
>>  }
>>  #endif /* CONFIG_PCI_MSI */
>>  
>> +static char *pnv_pci_diag_field(char *buf, int fmt, u64 val64)
>> +{
>> +	u32 val32 = (u32)val64;
>> +
>> +	memset(buf, 0, 24);
>> +	switch (fmt) {
>> +	case 8:
>> +		if (val32)
>> +			sprintf(buf, "%08x", val32);
>> +		else
>> +			sprintf(buf, "%s", "-");
>> +		break;
>> +	case 16:
>> +		if (val64)
>> +			sprintf(buf, "%016llx", val64);
>> +		else
>> +			sprintf(buf, "%s", "-");
>> +		break;
>> +	default:
>> +		sprintf(buf, "%s", "-");
>> +	}
>> +
>> +	return buf;
>> +}
>> +
>>  static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
>>  					 struct OpalIoPhbErrorCommon *common)
>>  {
>>  	struct OpalIoP7IOCPhbErrorData *data;
>> +	char buf[120];
>>  	int i;
>>  
>>  	data = (struct OpalIoP7IOCPhbErrorData *)common;
>>  	pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n\n",
>>  		hose->global_number, common->version);
>>  
>> -	pr_info("  brdgCtl:              %08x\n", data->brdgCtl);
>> -
>> -	pr_info("  portStatusReg:        %08x\n", data->portStatusReg);
>> -	pr_info("  rootCmplxStatus:      %08x\n", data->rootCmplxStatus);
>> -	pr_info("  busAgentStatus:       %08x\n", data->busAgentStatus);
>> -
>> -	pr_info("  deviceStatus:         %08x\n", data->deviceStatus);
>> -	pr_info("  slotStatus:           %08x\n", data->slotStatus);
>> -	pr_info("  linkStatus:           %08x\n", data->linkStatus);
>> -	pr_info("  devCmdStatus:         %08x\n", data->devCmdStatus);
>> -	pr_info("  devSecStatus:         %08x\n", data->devSecStatus);
>> -
>> -	pr_info("  rootErrorStatus:      %08x\n", data->rootErrorStatus);
>> -	pr_info("  uncorrErrorStatus:    %08x\n", data->uncorrErrorStatus);
>> -	pr_info("  corrErrorStatus:      %08x\n", data->corrErrorStatus);
>> -	pr_info("  tlpHdr1:              %08x\n", data->tlpHdr1);
>> -	pr_info("  tlpHdr2:              %08x\n", data->tlpHdr2);
>> -	pr_info("  tlpHdr3:              %08x\n", data->tlpHdr3);
>> -	pr_info("  tlpHdr4:              %08x\n", data->tlpHdr4);
>> -	pr_info("  sourceId:             %08x\n", data->sourceId);
>> -	pr_info("  errorClass:           %016llx\n", data->errorClass);
>> -	pr_info("  correlator:           %016llx\n", data->correlator);
>> -	pr_info("  p7iocPlssr:           %016llx\n", data->p7iocPlssr);
>> -	pr_info("  p7iocCsr:             %016llx\n", data->p7iocCsr);
>> -	pr_info("  lemFir:               %016llx\n", data->lemFir);
>> -	pr_info("  lemErrorMask:         %016llx\n", data->lemErrorMask);
>> -	pr_info("  lemWOF:               %016llx\n", data->lemWOF);
>> -	pr_info("  phbErrorStatus:       %016llx\n", data->phbErrorStatus);
>> -	pr_info("  phbFirstErrorStatus:  %016llx\n", data->phbFirstErrorStatus);
>> -	pr_info("  phbErrorLog0:         %016llx\n", data->phbErrorLog0);
>> -	pr_info("  phbErrorLog1:         %016llx\n", data->phbErrorLog1);
>> -	pr_info("  mmioErrorStatus:      %016llx\n", data->mmioErrorStatus);
>> -	pr_info("  mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
>> -	pr_info("  mmioErrorLog0:        %016llx\n", data->mmioErrorLog0);
>> -	pr_info("  mmioErrorLog1:        %016llx\n", data->mmioErrorLog1);
>> -	pr_info("  dma0ErrorStatus:      %016llx\n", data->dma0ErrorStatus);
>> -	pr_info("  dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
>> -	pr_info("  dma0ErrorLog0:        %016llx\n", data->dma0ErrorLog0);
>> -	pr_info("  dma0ErrorLog1:        %016llx\n", data->dma0ErrorLog1);
>> -	pr_info("  dma1ErrorStatus:      %016llx\n", data->dma1ErrorStatus);
>> -	pr_info("  dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
>> -	pr_info("  dma1ErrorLog0:        %016llx\n", data->dma1ErrorLog0);
>> -	pr_info("  dma1ErrorLog1:        %016llx\n", data->dma1ErrorLog1);
>> +	pr_info("  brdgCtl:     %s\n",
>> +		pnv_pci_diag_field(&buf[0], 8, data->brdgCtl));
>> +	pr_info("  UtlSts:      %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      8, data->portStatusReg),
>> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->rootCmplxStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->busAgentStatus));
>> +	pr_info("  RootSts:     %s %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      8, data->deviceStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->slotStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->linkStatus),
>> +		pnv_pci_diag_field(&buf[3 * 24], 8, data->devCmdStatus),
>> +		pnv_pci_diag_field(&buf[4 * 24], 8, data->devSecStatus));
>> +	pr_info("  RootErrSts:  %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      8, data->rootErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->uncorrErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->corrErrorStatus));
>> +	pr_info("  RootErrLog:  %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      8, data->tlpHdr1),
>> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->tlpHdr2),
>> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->tlpHdr3),
>> +		pnv_pci_diag_field(&buf[3 * 24], 8, data->tlpHdr4));
>> +	pr_info("  RootErrLog1: %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],       8, data->sourceId),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->errorClass),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->correlator));
>> +	pr_info("  PhbSts:      %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->p7iocPlssr),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->p7iocCsr));
>> +	pr_info("  Lem:         %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->lemFir),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->lemErrorMask),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->lemWOF));
>> +	pr_info("  PhbErr:      %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->phbErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbFirstErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->phbErrorLog0),
>> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->phbErrorLog1));
>> +	pr_info("  OutErr:      %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->mmioErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->mmioFirstErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->mmioErrorLog0),
>> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->mmioErrorLog1));
>> +	pr_info("  InAErr:      %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->dma0ErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma0FirstErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma0ErrorLog0),
>> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma0ErrorLog1));
>> +	pr_info("  InBErr:      %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->dma1ErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma1FirstErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma1ErrorLog0),
>> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma1ErrorLog1));
>>  
>>  	for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
>>  		if ((data->pestA[i] >> 63) == 0 &&
>>  		    (data->pestB[i] >> 63) == 0)
>>  			continue;
>>  
>> -		pr_info("  PE[%3d] PESTA:        %016llx\n", i, data->pestA[i]);
>> -		pr_info("          PESTB:        %016llx\n", data->pestB[i]);
>> +		pr_info("  PE[%3d] A/B: %s %s\n",
>> +			i, pnv_pci_diag_field(&buf[0], 16, data->pestA[i]),
>> +			pnv_pci_diag_field(&buf[1 * 24], 16, data->pestB[i]));
>>  	}
>>  }
>>  
>> @@ -192,67 +228,79 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
>>  					struct OpalIoPhbErrorCommon *common)
>>  {
>>  	struct OpalIoPhb3ErrorData *data;
>> -	int i;
>> +	char buf[120];
>> +	int i = 0;
>>  
>> +	memset(buf, 0, 120);
>>  	data = (struct OpalIoPhb3ErrorData*)common;
>>  	pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n\n",
>>  		hose->global_number, common->version);
>>  
>> -	pr_info("  brdgCtl:              %08x\n", data->brdgCtl);
>> -
>> -	pr_info("  portStatusReg:        %08x\n", data->portStatusReg);
>> -	pr_info("  rootCmplxStatus:      %08x\n", data->rootCmplxStatus);
>> -	pr_info("  busAgentStatus:       %08x\n", data->busAgentStatus);
>> -
>> -	pr_info("  deviceStatus:         %08x\n", data->deviceStatus);
>> -	pr_info("  slotStatus:           %08x\n", data->slotStatus);
>> -	pr_info("  linkStatus:           %08x\n", data->linkStatus);
>> -	pr_info("  devCmdStatus:         %08x\n", data->devCmdStatus);
>> -	pr_info("  devSecStatus:         %08x\n", data->devSecStatus);
>> -
>> -	pr_info("  rootErrorStatus:      %08x\n", data->rootErrorStatus);
>> -	pr_info("  uncorrErrorStatus:    %08x\n", data->uncorrErrorStatus);
>> -	pr_info("  corrErrorStatus:      %08x\n", data->corrErrorStatus);
>> -	pr_info("  tlpHdr1:              %08x\n", data->tlpHdr1);
>> -	pr_info("  tlpHdr2:              %08x\n", data->tlpHdr2);
>> -	pr_info("  tlpHdr3:              %08x\n", data->tlpHdr3);
>> -	pr_info("  tlpHdr4:              %08x\n", data->tlpHdr4);
>> -	pr_info("  sourceId:             %08x\n", data->sourceId);
>> -	pr_info("  errorClass:           %016llx\n", data->errorClass);
>> -	pr_info("  correlator:           %016llx\n", data->correlator);
>> -
>> -	pr_info("  nFir:                 %016llx\n", data->nFir);
>> -	pr_info("  nFirMask:             %016llx\n", data->nFirMask);
>> -	pr_info("  nFirWOF:              %016llx\n", data->nFirWOF);
>> -	pr_info("  PhbPlssr:             %016llx\n", data->phbPlssr);
>> -	pr_info("  PhbCsr:               %016llx\n", data->phbCsr);
>> -	pr_info("  lemFir:               %016llx\n", data->lemFir);
>> -	pr_info("  lemErrorMask:         %016llx\n", data->lemErrorMask);
>> -	pr_info("  lemWOF:               %016llx\n", data->lemWOF);
>> -	pr_info("  phbErrorStatus:       %016llx\n", data->phbErrorStatus);
>> -	pr_info("  phbFirstErrorStatus:  %016llx\n", data->phbFirstErrorStatus);
>> -	pr_info("  phbErrorLog0:         %016llx\n", data->phbErrorLog0);
>> -	pr_info("  phbErrorLog1:         %016llx\n", data->phbErrorLog1);
>> -	pr_info("  mmioErrorStatus:      %016llx\n", data->mmioErrorStatus);
>> -	pr_info("  mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
>> -	pr_info("  mmioErrorLog0:        %016llx\n", data->mmioErrorLog0);
>> -	pr_info("  mmioErrorLog1:        %016llx\n", data->mmioErrorLog1);
>> -	pr_info("  dma0ErrorStatus:      %016llx\n", data->dma0ErrorStatus);
>> -	pr_info("  dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
>> -	pr_info("  dma0ErrorLog0:        %016llx\n", data->dma0ErrorLog0);
>> -	pr_info("  dma0ErrorLog1:        %016llx\n", data->dma0ErrorLog1);
>> -	pr_info("  dma1ErrorStatus:      %016llx\n", data->dma1ErrorStatus);
>> -	pr_info("  dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
>> -	pr_info("  dma1ErrorLog0:        %016llx\n", data->dma1ErrorLog0);
>> -	pr_info("  dma1ErrorLog1:        %016llx\n", data->dma1ErrorLog1);
>> +	pr_info("  brdgCtl:     %s\n",
>> +		pnv_pci_diag_field(&buf[0], 8, data->brdgCtl));
>> +	pr_info("  UtlSts:      %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      8, data->portStatusReg),
>> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->rootCmplxStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->busAgentStatus));
>> +	pr_info("  RootSts:     %s %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      8, data->deviceStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->slotStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->linkStatus),
>> +		pnv_pci_diag_field(&buf[3 * 24], 8, data->devCmdStatus),
>> +		pnv_pci_diag_field(&buf[4 * 24], 8, data->devSecStatus));
>> +	pr_info("  RootErrSts:  %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      8, data->rootErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->uncorrErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->corrErrorStatus));
>> +	pr_info("  RootErrLog:  %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      8, data->tlpHdr1),
>> +		pnv_pci_diag_field(&buf[1 * 24], 8, data->tlpHdr2),
>> +		pnv_pci_diag_field(&buf[2 * 24], 8, data->tlpHdr3),
>> +		pnv_pci_diag_field(&buf[3 * 24], 8, data->tlpHdr4));
>> +	pr_info("  RootErrLog1: %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],       8, data->sourceId),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->errorClass),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->correlator));
>> +	pr_info("  nFir:        %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->nFir),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->nFirMask),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->nFirWOF));
>> +	pr_info("  PhbSts:      %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->phbPlssr),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbCsr));
>> +	pr_info("  Lem:         %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->lemFir),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->lemErrorMask),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->lemWOF));
>> +	pr_info("  PhbErr:      %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->phbErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->phbFirstErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->phbErrorLog0),
>> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->phbErrorLog1));
>> +	pr_info("  OutErr:      %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->mmioErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->mmioFirstErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->mmioErrorLog0),
>> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->mmioErrorLog1));
>> +	pr_info("  InAErr:      %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->dma0ErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma0FirstErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma0ErrorLog0),
>> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma0ErrorLog1));
>> +	pr_info("  InBErr:      %s %s %s %s\n",
>> +		pnv_pci_diag_field(&buf[0],      16, data->dma1ErrorStatus),
>> +		pnv_pci_diag_field(&buf[1 * 24], 16, data->dma1FirstErrorStatus),
>> +		pnv_pci_diag_field(&buf[2 * 24], 16, data->dma1ErrorLog0),
>> +		pnv_pci_diag_field(&buf[3 * 24], 16, data->dma1ErrorLog1));
>>  
>>  	for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
>>  		if ((data->pestA[i] >> 63) == 0 &&
>>  		    (data->pestB[i] >> 63) == 0)
>>  			continue;
>>  
>> -		pr_info("  PE[%3d] PESTA:        %016llx\n", i, data->pestA[i]);
>> -		pr_info("          PESTB:        %016llx\n", data->pestB[i]);
>> +		pr_info("  PE[%3d] A/B: %s %s\n",
>> +			i, pnv_pci_diag_field(&buf[0], 16, data->pestA[i]),
>> +			pnv_pci_diag_field(&buf[1 * 24], 16, data->pestB[i]));
>>  	}
>>  }
>>  
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v3 0/5] EEH improvement
@ 2014-02-25  7:28 Gavin Shan
  2014-02-25  7:28 ` [PATCH 2/5] powerpc/powernv: Remove PNV_EEH_STATE_REMOVED Gavin Shan
  0 siblings, 1 reply; 12+ messages in thread
From: Gavin Shan @ 2014-02-25  7:28 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan

The series of patches intends to improve reliability of EEH on PowerNV
platform. First all, we have had multiple duplicate states (flags) for
PHB and PE, so we remove those duplicate states to simplify the code.
Besides, we had corrupted PHB diag-data for case of frozen PE. In order
to solve the problem, we introduce eeh_ops->event() and notifications
are sent from EEH core to (PowerNV) platform on creating or destroying
PE instance so that we can allocate or free PHB diag-data backend. Then
we cache the PHB diag-data on the first call to eeh_ops->get_state()
and dump it afterwards, which helps to get correct PHB diag-data.

With the patchset applied, we never dump PHB diag-data for INF errors.
Instead, we just maintain statistics in /proc/powerpc/eeh_inf_err. Also,
we changed the PHB diag-data dump format for a bit to have multiple
fields per line and omits the line with all zero'd fields as Ben suggested.

v2 -> v3:
	* We don't cache the PHB diag-data, instead we just grab and
	  dump PHB diag-data on the first catch-up to avoid broken
	  PHB diag-data.
v1 -> v2:
	* Amending commit logs
	* Support eeh_ops->event() and maintain PHB diag-data on basis
	  of PE instance
	* When dumping PHB diag-data, to replace "-" with "00000000" and
	  omit the line if the fields of it are all zeros.

---

arch/powerpc/include/asm/eeh.h            |    1 -
arch/powerpc/kernel/eeh.c                 |   10 +---
arch/powerpc/kernel/eeh_driver.c          |   10 ++--
arch/powerpc/platforms/powernv/eeh-ioda.c |  137 ++++++++++++++++++++--------------------------
arch/powerpc/platforms/powernv/pci.c      |  228 ++++++++++++++++++++++++++++++++++++++++++---------------------------
arch/powerpc/platforms/powernv/pci.h      |    8 +--
6 files changed, 195 insertions(+), 199 deletions(-)

Thanks,
Gavin

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/5] powerpc/powernv: Remove PNV_EEH_STATE_REMOVED
  2014-02-25  7:28 [PATCH v3 0/5] EEH improvement Gavin Shan
@ 2014-02-25  7:28 ` Gavin Shan
  0 siblings, 0 replies; 12+ messages in thread
From: Gavin Shan @ 2014-02-25  7:28 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gavin Shan

The PHB state PNV_EEH_STATE_REMOVED maintained in pnv_phb isn't
so useful any more and it's duplicated to EEH_PE_ISOLATED. The
patch replaces PNV_EEH_STATE_REMOVED with EEH_PE_ISOLATED.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-ioda.c |   56 ++++++++---------------------
 arch/powerpc/platforms/powernv/pci.h      |    1 -
 2 files changed, 15 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index f514743..0d1d424 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -662,22 +662,6 @@ static void ioda_eeh_phb_diag(struct pci_controller *hose)
 	pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
 }
 
-static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
-			       struct eeh_pe **pe)
-{
-	struct eeh_pe *phb_pe;
-
-	phb_pe = eeh_phb_pe_get(hose);
-	if (!phb_pe) {
-		pr_warning("%s Can't find PE for PHB#%d\n",
-			   __func__, hose->global_number);
-		return -EEXIST;
-	}
-
-	*pe = phb_pe;
-	return 0;
-}
-
 static int ioda_eeh_get_pe(struct pci_controller *hose,
 			   u16 pe_no, struct eeh_pe **pe)
 {
@@ -685,7 +669,8 @@ static int ioda_eeh_get_pe(struct pci_controller *hose,
 	struct eeh_dev dev;
 
 	/* Find the PHB PE */
-	if (ioda_eeh_get_phb_pe(hose, &phb_pe))
+	phb_pe = eeh_phb_pe_get(hose);
+	if (!phb_pe)
 		return -EEXIST;
 
 	/* Find the PE according to PE# */
@@ -713,6 +698,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
+	struct eeh_pe *phb_pe;
 	u64 frozen_pe_no;
 	u16 err_type, severity;
 	long rc;
@@ -729,10 +715,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 	list_for_each_entry(hose, &hose_list, list_node) {
 		/*
 		 * If the subordinate PCI buses of the PHB has been
-		 * removed, we needn't take care of it any more.
+		 * removed or is exactly under error recovery, we
+		 * needn't take care of it any more.
 		 */
 		phb = hose->private_data;
-		if (phb->eeh_state & PNV_EEH_STATE_REMOVED)
+		phb_pe = eeh_phb_pe_get(hose);
+		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
 			continue;
 
 		rc = opal_pci_next_error(phb->opal_id,
@@ -765,12 +753,6 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		switch (err_type) {
 		case OPAL_EEH_IOC_ERROR:
 			if (severity == OPAL_EEH_SEV_IOC_DEAD) {
-				list_for_each_entry(hose, &hose_list,
-						    list_node) {
-					phb = hose->private_data;
-					phb->eeh_state |= PNV_EEH_STATE_REMOVED;
-				}
-
 				pr_err("EEH: dead IOC detected\n");
 				ret = EEH_NEXT_ERR_DEAD_IOC;
 			} else if (severity == OPAL_EEH_SEV_INF) {
@@ -783,17 +765,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			break;
 		case OPAL_EEH_PHB_ERROR:
 			if (severity == OPAL_EEH_SEV_PHB_DEAD) {
-				if (ioda_eeh_get_phb_pe(hose, pe))
-					break;
-
+				*pe = phb_pe;
 				pr_err("EEH: dead PHB#%x detected\n",
 					hose->global_number);
-				phb->eeh_state |= PNV_EEH_STATE_REMOVED;
 				ret = EEH_NEXT_ERR_DEAD_PHB;
 			} else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
-				if (ioda_eeh_get_phb_pe(hose, pe))
-					break;
-
+				*pe = phb_pe;
 				pr_err("EEH: fenced PHB#%x detected\n",
 					hose->global_number);
 				ret = EEH_NEXT_ERR_FENCED_PHB;
@@ -813,15 +790,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			 * fenced PHB so that it can be recovered.
 			 */
 			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) {
-				if (!ioda_eeh_get_phb_pe(hose, pe)) {
-					pr_err("EEH: Escalated fenced PHB#%x "
-					       "detected for PE#%llx\n",
-						hose->global_number,
-						frozen_pe_no);
-					ret = EEH_NEXT_ERR_FENCED_PHB;
-				} else {
-					ret = EEH_NEXT_ERR_NONE;
-				}
+				*pe = phb_pe;
+				pr_err("EEH: Escalated fenced PHB#%x "
+				       "detected for PE#%llx\n",
+					hose->global_number,
+					frozen_pe_no);
+				ret = EEH_NEXT_ERR_FENCED_PHB;
 			} else {
 				pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
 					(*pe)->addr, (*pe)->phb->global_number);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index cde1694..6870f60 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -83,7 +83,6 @@ struct pnv_eeh_ops {
 };
 
 #define PNV_EEH_STATE_ENABLED	(1 << 0)	/* EEH enabled	*/
-#define PNV_EEH_STATE_REMOVED	(1 << 1)	/* PHB removed	*/
 
 #endif /* CONFIG_EEH */
 
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-02-25  7:28 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-02-21 11:53 [PATCH 1/5] powerpc/eeh: Remove EEH_PE_PHB_DEAD Gavin Shan
2014-02-21 11:53 ` [PATCH 2/5] powerpc/powernv: Remove PNV_EEH_STATE_REMOVED Gavin Shan
2014-02-21 11:53 ` [PATCH 3/5] powerpc/powernv: Cleanup on PNV_EEH_STATE_ENABLED Gavin Shan
2014-02-21 19:56   ` Benjamin Herrenschmidt
2014-02-23  2:12     ` Gavin Shan
2014-02-21 11:53 ` [PATCH 4/5] powerpc/powernv: Cache PHB diag-data Gavin Shan
2014-02-21 20:01   ` Benjamin Herrenschmidt
2014-02-23  4:52     ` Gavin Shan
2014-02-21 11:53 ` [PATCH 5/5] powerpc/powernv: Make PHB diag-data output short Gavin Shan
2014-02-21 20:05   ` Benjamin Herrenschmidt
2014-02-23  4:55     ` Gavin Shan
  -- strict thread matches above, loose matches on Subject: below --
2014-02-25  7:28 [PATCH v3 0/5] EEH improvement Gavin Shan
2014-02-25  7:28 ` [PATCH 2/5] powerpc/powernv: Remove PNV_EEH_STATE_REMOVED Gavin Shan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).