linux-ide.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFT] sata_promise: decode and report error reasons
@ 2007-03-01  1:58 Mikael Pettersson
  2007-03-05  4:35 ` Tejun Heo
  0 siblings, 1 reply; 6+ messages in thread
From: Mikael Pettersson @ 2007-03-01  1:58 UTC (permalink / raw)
  To: linux-ide

This is a preliminary patch to add error reason decoding
and reporting to sata_promise. It's fairly simplistic but
should log all error info the controller is able to provide.

Works for me, but my test rig hardly ever throws any errors.
Testing by those that experience frequent errors would be
most welcome.

libata experts may want to check if my mapping to libata
error codes and eh actions is OK.

/Mikael

--- linux-2.6.21-rc2/drivers/ata/sata_promise.c.~1~	2007-02-28 13:32:46.000000000 +0100
+++ linux-2.6.21-rc2/drivers/ata/sata_promise.c	2007-03-01 01:52:35.000000000 +0100
@@ -70,8 +70,37 @@ enum {
 	PDC_TBG_MODE		= 0x41C, /* TBG mode (not SATAII) */
 	PDC_SLEW_CTL		= 0x470, /* slew rate control reg (not SATAII) */
 
-	PDC_ERR_MASK		= (1<<19) | (1<<20) | (1<<21) | (1<<22) |
-				  (1<<8) | (1<<9) | (1<<10),
+	/* PDC_GLOBAL_CTL bit definitions */
+	PDC_PH_ERR		= (1 << 8),
+	PDC_SH_ERR		= (1 << 9),
+	PDC_DH_ERR		= (1 << 10),
+	PDC2_HTO_ERR		= (1 << 12),
+	PDC2_ATA_HBA_ERR	= (1 << 13),
+	PDC2_ATA_DMA_CNT_ERR	= (1 << 14),
+	PDC_OVERRUN_ERR		= (1 << 19),
+	PDC_UNDERRUN_ERR	= (1 << 20),
+	PDC_DRIVE_ERR		= (1 << 21),
+	PDC_PCI_SYS_ERR		= (1 << 22),
+	PDC1_PCI_PARITY_ERR	= (1 << 23),	/* 1st gen only */
+	PDC1_ERR_MASK		= PDC1_PCI_PARITY_ERR,
+	PDC2_ERR_MASK		= PDC2_HTO_ERR | PDC2_ATA_HBA_ERR | PDC2_ATA_DMA_CNT_ERR,
+	PDC_ERR_MASK		= (PDC_PH_ERR | PDC_SH_ERR | PDC_DH_ERR | PDC_OVERRUN_ERR
+				   | PDC_UNDERRUN_ERR | PDC_DRIVE_ERR | PDC_PCI_SYS_ERR
+				   | PDC2_ERR_MASK | PDC1_ERR_MASK),
+
+	/* Promise-specific SError bit definitions */
+	PDC_SERR_COMWAKE	= (1 << 18),
+	PDC_SERR_10BTO8B	= (1 << 19),
+	PDC_SERR_DISPARITY	= (1 << 20),
+	PDC_SERR_CRC		= (1 << 21),
+	PDC_SERR_HANDSHAKE	= (1 << 22),
+	PDC_SERR_LINK_SEQ	= (1 << 23),
+	PDC2_SERR_STATE_TRANS	= (1 << 24),
+	PDC_SERR_FIS_TYPE	= (1 << 25),
+	PDC2_SERR_MASK		= PDC2_SERR_STATE_TRANS,
+	PDC_SERR_MASK		= (PDC_SERR_COMWAKE | PDC_SERR_10BTO8B | PDC_SERR_DISPARITY
+				   | PDC_SERR_CRC | PDC_SERR_HANDSHAKE | PDC_SERR_LINK_SEQ
+				   | PDC_SERR_FIS_TYPE | PDC2_SERR_MASK),
 
 	board_2037x		= 0,	/* FastTrak S150 TX2plus */
 	board_20319		= 1,	/* FastTrak S150 TX4 */
@@ -590,17 +619,67 @@ static void pdc_post_internal_cmd(struct
 		pdc_reset_port(ap);
 }
 
+static void pdc_error_intr(struct ata_port *ap, struct ata_queued_cmd *qc, u32 port_status)
+{
+	struct pdc_host_priv *hp = ap->host->private_data;
+	struct ata_eh_info *ehi = &ap->eh_info;
+	unsigned int err_mask = 0, action = 0;
+	u32 serror;
+
+	ata_ehi_clear_desc(ehi);
+
+	serror = 0;
+	if (sata_scr_valid(ap)) {
+		serror = pdc_sata_scr_read(ap, SCR_ERROR);
+		if (!(hp->flags & PDC_FLAG_GEN_II))
+			serror &= ~PDC2_SERR_MASK;
+	}
+
+	printk("%s: port_status 0x%08x serror 0x%08x\n", __FUNCTION__, port_status, serror);
+
+	ata_ehi_push_desc(ehi, "port_status 0x%08x", port_status);
+
+	if (serror & PDC_SERR_MASK) {
+		err_mask |= AC_ERR_ATA_BUS;
+		ata_ehi_push_desc(ehi, ", serror 0x%08x", serror);
+	}
+	if (port_status & PDC_DRIVE_ERR)
+		err_mask |= AC_ERR_DEV;
+	if (port_status & PDC2_HTO_ERR)
+		err_mask |= AC_ERR_TIMEOUT;
+	if (port_status & (PDC_UNDERRUN_ERR | PDC_OVERRUN_ERR | PDC2_ATA_DMA_CNT_ERR
+			   | PDC2_ATA_HBA_ERR))
+		err_mask |= AC_ERR_ATA_BUS;
+	if (port_status & (PDC_PH_ERR | PDC_SH_ERR | PDC_DH_ERR | PDC_PCI_SYS_ERR
+			   | PDC1_PCI_PARITY_ERR))
+		err_mask |= AC_ERR_HOST_BUS;
+
+	action |= ATA_EH_SOFTRESET;
+
+	ehi->serror |= serror;
+	ehi->action |= action;
+
+	qc->err_mask |= err_mask;
+
+	ata_port_freeze(ap);
+}
+
 static inline unsigned int pdc_host_intr( struct ata_port *ap,
                                           struct ata_queued_cmd *qc)
 {
 	unsigned int handled = 0;
-	u32 tmp;
-	void __iomem *mmio = ap->ioaddr.cmd_addr + PDC_GLOBAL_CTL;
+	void __iomem *port_mmio = ap->ioaddr.cmd_addr;
+	struct pdc_host_priv *hp = ap->host->private_data;
+	u32 port_status;
 
-	tmp = readl(mmio);
-	if (tmp & PDC_ERR_MASK) {
-		qc->err_mask |= AC_ERR_DEV;
-		pdc_reset_port(ap);
+	port_status = readl(port_mmio + PDC_GLOBAL_CTL);
+	if (hp->flags & PDC_FLAG_GEN_II)
+		port_status &= ~PDC1_ERR_MASK;
+	else
+		port_status &= ~PDC2_ERR_MASK;
+	if (unlikely(port_status & PDC_ERR_MASK)) {
+		pdc_error_intr(ap, qc, port_status);
+		return 1;
 	}
 
 	switch (qc->tf.protocol) {

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFT] sata_promise: decode and report error reasons
  2007-03-01  1:58 [RFT] sata_promise: decode and report error reasons Mikael Pettersson
@ 2007-03-05  4:35 ` Tejun Heo
  2007-03-07 19:53   ` Mikael Pettersson
  0 siblings, 1 reply; 6+ messages in thread
From: Tejun Heo @ 2007-03-05  4:35 UTC (permalink / raw)
  To: Mikael Pettersson; +Cc: linux-ide

Hello, Mikael.

Mikael Pettersson wrote:
> +static void pdc_error_intr(struct ata_port *ap, struct ata_queued_cmd *qc, u32 port_status)
> +{
> +	struct pdc_host_priv *hp = ap->host->private_data;
> +	struct ata_eh_info *ehi = &ap->eh_info;
> +	unsigned int err_mask = 0, action = 0;
> +	u32 serror;
> +
> +	ata_ehi_clear_desc(ehi);
> +
> +	serror = 0;
> +	if (sata_scr_valid(ap)) {
> +		serror = pdc_sata_scr_read(ap, SCR_ERROR);
> +		if (!(hp->flags & PDC_FLAG_GEN_II))
> +			serror &= ~PDC2_SERR_MASK;
> +	}
> +
> +	printk("%s: port_status 0x%08x serror 0x%08x\n", __FUNCTION__, port_status, serror);
> +
> +	ata_ehi_push_desc(ehi, "port_status 0x%08x", port_status);
> +
> +	if (serror & PDC_SERR_MASK) {
> +		err_mask |= AC_ERR_ATA_BUS;

1. It might be that decoding PDC specific bits is unnecessary if it sets
the standard bits correctly.  Also, SError bits above bit16 are
diagnostic bits and don't necessarily indicate error condition.

2. PDC_SERR_FIS_TYPE is more close to AC_ERR_HSM.

> +		ata_ehi_push_desc(ehi, ", serror 0x%08x", serror);
> +	}
> +	if (port_status & PDC_DRIVE_ERR)
> +		err_mask |= AC_ERR_DEV;
> +	if (port_status & PDC2_HTO_ERR)
> +		err_mask |= AC_ERR_TIMEOUT;

What does HTO mean?  Host time out?  Until now, AC_ERR_TIMEOUT has been
used to indicate that driver side timeout has expired and I'd like to
keep it that way.

> +	if (port_status & (PDC_UNDERRUN_ERR | PDC_OVERRUN_ERR | PDC2_ATA_DMA_CNT_ERR
> +			   | PDC2_ATA_HBA_ERR))
> +		err_mask |= AC_ERR_ATA_BUS;

AC_ERR_ATA_BUS indicates transmission errors on the ATA bus.  AC_ERR_HSM
(host state machine/protocol violation), AC_ERR_HOST_BUS (host/PCI BUS
error) or AC_ERR_SYSTEM (other system errors) seems more appropriate for
the above error conditions.

> +	if (port_status & (PDC_PH_ERR | PDC_SH_ERR | PDC_DH_ERR | PDC_PCI_SYS_ERR
> +			   | PDC1_PCI_PARITY_ERR))
> +		err_mask |= AC_ERR_HOST_BUS;
> +
> +	action |= ATA_EH_SOFTRESET;
> +
> +	ehi->serror |= serror;
> +	ehi->action |= action;
> +
> +	qc->err_mask |= err_mask;
> +
> +	ata_port_freeze(ap);
> +}
> +

Thanks for working on sata_promise.  :-)

-- 
tejun

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFT] sata_promise: decode and report error reasons
  2007-03-05  4:35 ` Tejun Heo
@ 2007-03-07 19:53   ` Mikael Pettersson
  2007-03-07 22:45     ` Jeff Garzik
  2007-03-08  2:44     ` Tejun Heo
  0 siblings, 2 replies; 6+ messages in thread
From: Mikael Pettersson @ 2007-03-07 19:53 UTC (permalink / raw)
  To: Tejun Heo; +Cc: linux-ide

Tejun,

Tejun Heo writes:
 > Hello, Mikael.
 > 
 > Mikael Pettersson wrote:
 > > +static void pdc_error_intr(struct ata_port *ap, struct ata_queued_cmd *qc, u32 port_status)
 > > +{
 > > +	struct pdc_host_priv *hp = ap->host->private_data;
 > > +	struct ata_eh_info *ehi = &ap->eh_info;
 > > +	unsigned int err_mask = 0, action = 0;
 > > +	u32 serror;
 > > +
 > > +	ata_ehi_clear_desc(ehi);
 > > +
 > > +	serror = 0;
 > > +	if (sata_scr_valid(ap)) {
 > > +		serror = pdc_sata_scr_read(ap, SCR_ERROR);
 > > +		if (!(hp->flags & PDC_FLAG_GEN_II))
 > > +			serror &= ~PDC2_SERR_MASK;
 > > +	}
 > > +
 > > +	printk("%s: port_status 0x%08x serror 0x%08x\n", __FUNCTION__, port_status, serror);
 > > +
 > > +	ata_ehi_push_desc(ehi, "port_status 0x%08x", port_status);
 > > +
 > > +	if (serror & PDC_SERR_MASK) {
 > > +		err_mask |= AC_ERR_ATA_BUS;
 > 
 > 1. It might be that decoding PDC specific bits is unnecessary if it sets
 > the standard bits correctly.  Also, SError bits above bit16 are
 > diagnostic bits and don't necessarily indicate error condition.

Which SErrror bits are standard?
It is true that some of the SError bits are diagnostic rather than
actual error indicators, as Promise's driver only checks a subset
of them. I'll fix that.

 > 2. PDC_SERR_FIS_TYPE is more close to AC_ERR_HSM.

FIS_TYPE is described as reception of a FIS with a good CRC but
unrecognised type field. I can make it AC_ERR_HSM if that's more
appropriate.

 > > +		ata_ehi_push_desc(ehi, ", serror 0x%08x", serror);
 > > +	}
 > > +	if (port_status & PDC_DRIVE_ERR)
 > > +		err_mask |= AC_ERR_DEV;
 > > +	if (port_status & PDC2_HTO_ERR)
 > > +		err_mask |= AC_ERR_TIMEOUT;
 > 
 > What does HTO mean?  Host time out?  Until now, AC_ERR_TIMEOUT has been
 > used to indicate that driver side timeout has expired and I'd like to
 > keep it that way.

Yes, HTO is "host bus timeout" which is described as the host bus being
busy more than 256 clock (I guess PCI) cycles for an ATA I/O transfer.

If not AC_ERR_TIMEOUT, then what? AC_ERR_HOST_BUS?

 > > +	if (port_status & (PDC_UNDERRUN_ERR | PDC_OVERRUN_ERR | PDC2_ATA_DMA_CNT_ERR
 > > +			   | PDC2_ATA_HBA_ERR))
 > > +		err_mask |= AC_ERR_ATA_BUS;
 > 
 > AC_ERR_ATA_BUS indicates transmission errors on the ATA bus.  AC_ERR_HSM
 > (host state machine/protocol violation), AC_ERR_HOST_BUS (host/PCI BUS
 > error) or AC_ERR_SYSTEM (other system errors) seems more appropriate for
 > the above error conditions.

UNDERRUN and OVERRUN occur when DMA S/G byte count differs from what the
device accepts or delivers as checked when the device asserts INTRQ.
I can make them AC_ERR_HSM instead. (HOST_BUS or SYSTEM seem inappropriate.)

ATA_HBA_ERR is any FIS transmission error on SATA interface. AC_ERR_ATA_BUS
seems appropriate for that one.

ATA_DMA_CNT_ERR is when a DMA FIS data size differs from total DMA S/G size.
I think AC_ERR_ATA_BUS is the correct choice for this one too.

I will add more explanatory text to the error bit definitions, and
perhaps also a table-driven error logger (a bit like sata_sil24).

Thanks for the review.

/Mikael

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFT] sata_promise: decode and report error reasons
  2007-03-07 19:53   ` Mikael Pettersson
@ 2007-03-07 22:45     ` Jeff Garzik
  2007-03-08 10:09       ` Mikael Pettersson
  2007-03-08  2:44     ` Tejun Heo
  1 sibling, 1 reply; 6+ messages in thread
From: Jeff Garzik @ 2007-03-07 22:45 UTC (permalink / raw)
  To: Mikael Pettersson; +Cc: Tejun Heo, linux-ide

Mikael Pettersson wrote:
>  > 2. PDC_SERR_FIS_TYPE is more close to AC_ERR_HSM.
> 
> FIS_TYPE is described as reception of a FIS with a good CRC but
> unrecognised type field. I can make it AC_ERR_HSM if that's more
> appropriate.

AC_ERR_HSM is how other drivers currently handle "unknown FIS received" 
condition.

AC_ERR_HSM is generally used for any sort of protocol violation. 
Successful reception of an unexpected FIS would be a reasonable 
candidate for that, though perhaps for SATA it's easiest to just note 
and ignore unknown FIS's.


>  > > +		ata_ehi_push_desc(ehi, ", serror 0x%08x", serror);
>  > > +	}
>  > > +	if (port_status & PDC_DRIVE_ERR)
>  > > +		err_mask |= AC_ERR_DEV;
>  > > +	if (port_status & PDC2_HTO_ERR)
>  > > +		err_mask |= AC_ERR_TIMEOUT;
>  > 
>  > What does HTO mean?  Host time out?  Until now, AC_ERR_TIMEOUT has been
>  > used to indicate that driver side timeout has expired and I'd like to
>  > keep it that way.
> 
> Yes, HTO is "host bus timeout" which is described as the host bus being
> busy more than 256 clock (I guess PCI) cycles for an ATA I/O transfer.
> 
> If not AC_ERR_TIMEOUT, then what? AC_ERR_HOST_BUS?

AC_ERR_HOST_BUS sounds applicable, yes.


>  > > +	if (port_status & (PDC_UNDERRUN_ERR | PDC_OVERRUN_ERR | PDC2_ATA_DMA_CNT_ERR
>  > > +			   | PDC2_ATA_HBA_ERR))
>  > > +		err_mask |= AC_ERR_ATA_BUS;
>  > 
>  > AC_ERR_ATA_BUS indicates transmission errors on the ATA bus.  AC_ERR_HSM
>  > (host state machine/protocol violation), AC_ERR_HOST_BUS (host/PCI BUS
>  > error) or AC_ERR_SYSTEM (other system errors) seems more appropriate for
>  > the above error conditions.
> 
> UNDERRUN and OVERRUN occur when DMA S/G byte count differs from what the
> device accepts or delivers as checked when the device asserts INTRQ.
> I can make them AC_ERR_HSM instead. (HOST_BUS or SYSTEM seem inappropriate.)

Overrun/underrun are typically programmer errors, something you should 
never see in the field is the driver is working properly.  AC_ERR_HSM is 
probably the closest mapping to such a condition, though perhaps 
AC_ERR_DRIVER_BUG would be more clear :)


> ATA_HBA_ERR is any FIS transmission error on SATA interface. AC_ERR_ATA_BUS
> seems appropriate for that one.

Yep.


> ATA_DMA_CNT_ERR is when a DMA FIS data size differs from total DMA S/G size.
> I think AC_ERR_ATA_BUS is the correct choice for this one too.

Where is this in the Promise docs, so that I can take a closer look?

This condition sounds like overrun/underrun, something that would not 
occur outside of a driver bug?


> I will add more explanatory text to the error bit definitions, and
> perhaps also a table-driven error logger (a bit like sata_sil24).

SiI 3124 makes it a bit easier, by actually returning error codes 
(rather than bits scattered about, like all other hardware).  But yes, 
that's a reasonable approach if it makes the code more clean.

	Jeff



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFT] sata_promise: decode and report error reasons
  2007-03-07 19:53   ` Mikael Pettersson
  2007-03-07 22:45     ` Jeff Garzik
@ 2007-03-08  2:44     ` Tejun Heo
  1 sibling, 0 replies; 6+ messages in thread
From: Tejun Heo @ 2007-03-08  2:44 UTC (permalink / raw)
  To: Mikael Pettersson; +Cc: linux-ide

Hello,

Jeff clarified most things.  Just few more things.

Mikael Pettersson wrote:
> Which SErrror bits are standard?
> It is true that some of the SError bits are diagnostic rather than
> actual error indicators, as Promise's driver only checks a subset
> of them. I'll fix that.

Pretty much most of them.  Just take a look at SError section in
serialATA spec.  libata EH decodes most of error and some of diagnostic
bits in ata_eh_analyze_serror().

> ATA_DMA_CNT_ERR is when a DMA FIS data size differs from total DMA S/G size.
> I think AC_ERR_ATA_BUS is the correct choice for this one too.

AC_ERR_ATA_BUS is a bit special in that if it happens repeatedly it will
trigger transfer speed adjustment pretty quickly.  The error should be
set iff the error is a transmission error on the wire which is likely to
be fixed by slowing down the transfer rate.  Maybe it should have been
named AC_ERR_ATA_TRANSMISSION or something.

As Jeff pointed out, AC_ERR_HSM is usually used when we can receive the
FIS correctly but it doesn't make sense.  This is usually caused by
driver error or seriously brain damaged firmware.  :-)

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFT] sata_promise: decode and report error reasons
  2007-03-07 22:45     ` Jeff Garzik
@ 2007-03-08 10:09       ` Mikael Pettersson
  0 siblings, 0 replies; 6+ messages in thread
From: Mikael Pettersson @ 2007-03-08 10:09 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Tejun Heo, linux-ide

Jeff Garzik writes:
 > > ATA_DMA_CNT_ERR is when a DMA FIS data size differs from total DMA S/G size.
 > > I think AC_ERR_ATA_BUS is the correct choice for this one too.
 > 
 > Where is this in the Promise docs, so that I can take a closer look?

Pages 38-39 in the PDC205xx development guide (my primary source),
page 32 in the PDC2037x guide, and pages 35-36 in the PDC20319 guide.

 > This condition sounds like overrun/underrun, something that would not 
 > occur outside of a driver bug?

I got overruns (or was it underruns?) when running a SATA300 TX2plus
with a 3Gbps disk in an UltraSPARC box, which were "cured" by hacking
SControl to limit speeds to 1.5Gbps. These issues did not occur when
the same card/cable/disk combo sat in an x86 PC, which makes me think
that the chip's x86 BIOS did some initialisation that never occurred
on the UltraSPARC box. I intend to investigate that this evening.

/Mikael

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2007-03-08 10:09 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-01  1:58 [RFT] sata_promise: decode and report error reasons Mikael Pettersson
2007-03-05  4:35 ` Tejun Heo
2007-03-07 19:53   ` Mikael Pettersson
2007-03-07 22:45     ` Jeff Garzik
2007-03-08 10:09       ` Mikael Pettersson
2007-03-08  2:44     ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).