[U-Boot] [PATCH] Davinci: SPI performance enhancements

public inbox for u-boot@lists.denx.de
 help / color / mirror / Atom feed

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
@ 2010-06-21  9:27 Nick Thompson
  2010-06-21 14:41 ` Paulraj, Sandeep
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Nick Thompson @ 2010-06-21  9:27 UTC (permalink / raw)
  To: u-boot

The following restructuring and optimisations increase the SPI
read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):

Remove continual revaluation of driver state from the core of the
copy loop. State can not change during the copy loop, so it is
possible to move these evaluations to before the copy loop.

Cost is more code space as loop variants are required for each set
of possible configurations. The loops are simpler however, so the
extra is only 128bytes on da830 with CONFIG_SPI_HALF_DUPLEX
defined.

Unrolling the first copy loop iteration allows the TX buffer to be
pre-loaded reducing SPI clock starvation.

Unrolling the last copy loop iteration removes testing for the
final loop iteration every time round the loop.

Using the RX buffer empty flag as a transfer throttle allows the
assumption that it is always safe to write to the TX buffer, so
polling of TX buffer full flag can be removed.

Signed-off-by: Nick Thompson <nick.thompson@ge.com>
---
da850 and da830 are similar devices. The SPI module is common to
both, but da850 uses DDR and da830 uses SDRAM. The EVM's might
not actually be comparable, but they appear to be at least similar.

The speed was tested with a 8MiB transfer from SPI FLASH using:

sf read 0xc0008000 0 0x800000

 drivers/spi/davinci_spi.c |  195 +++++++++++++++++++++++++++++---------------
 1 files changed, 128 insertions(+), 67 deletions(-)

diff --git a/drivers/spi/davinci_spi.c b/drivers/spi/davinci_spi.c
index 08f837b..4518ecb 100644
--- a/drivers/spi/davinci_spi.c
+++ b/drivers/spi/davinci_spi.c
@@ -66,7 +66,7 @@ void spi_free_slave(struct spi_slave *slave)
 int spi_claim_bus(struct spi_slave *slave)
 {
 	struct davinci_spi_slave *ds = to_davinci_spi(slave);
-	unsigned int scalar, data1_reg_val = 0;
+	unsigned int scalar;
 
 	/* Enable the SPI hardware */
 	writel(SPIGCR0_SPIRST_MASK, &ds->regs->gcr0);
@@ -93,11 +93,6 @@ int spi_claim_bus(struct spi_slave *slave)
 	writel(8 | (scalar << SPIFMT_PRESCALE_SHIFT) |
 		(1 << SPIFMT_PHASE_SHIFT), &ds->regs->fmt0);
 
-	/* hold cs active at end of transfer until explicitly de-asserted */
-	data1_reg_val = (1 << SPIDAT1_CSHOLD_SHIFT) |
-			(slave->cs << SPIDAT1_CSNR_SHIFT);
-	writel(data1_reg_val, &ds->regs->dat1);
-
 	/*
 	 * Including a minor delay. No science here. Should be good even with
 	 * no delay
@@ -113,8 +108,7 @@ int spi_claim_bus(struct spi_slave *slave)
 	writel(0, &ds->regs->lvl);
 
 	/* enable SPI */
-	writel((readl(&ds->regs->gcr1) |
-		SPIGCR1_SPIENA_MASK), &ds->regs->gcr1);
+	writel((readl(&ds->regs->gcr1) | SPIGCR1_SPIENA_MASK), &ds->regs->gcr1);
 
 	return 0;
 }
@@ -127,14 +121,125 @@ void spi_release_bus(struct spi_slave *slave)
 	writel(SPIGCR0_SPIRST_MASK, &ds->regs->gcr0);
 }
 
+/*
+ * This functions needs to act like a macro to avoid pipeline reloads in the
+ * loops below. Use always_inline. This gains us about 160KiB/s and the bloat
+ * appears to be zero bytes (da830).
+ */
+__attribute__((always_inline))
+static inline u32 davinci_spi_xfer_data(struct davinci_spi_slave *ds, u32 data)
+{
+	u32	buf_reg_val;
+
+	/* send out data */
+	writel(data, &ds->regs->dat1);
+
+	/* wait for the data to clock in/out */
+	while ((buf_reg_val = readl(&ds->regs->buf)) & SPIBUF_RXEMPTY_MASK)
+		;
+
+	return buf_reg_val;
+}
+
+static int davinci_spi_read(struct spi_slave *slave, unsigned int len,
+			    u8 *rxp, unsigned long flags)
+{
+	struct davinci_spi_slave *ds = to_davinci_spi(slave);
+	unsigned int data1_reg_val;
+
+	/* enable CS hold, CS[n] and clear the data bits */
+	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
+			 (slave->cs << SPIDAT1_CSNR_SHIFT));
+
+	/* wait till TXFULL is deasserted */
+	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
+		;
+
+	/* preload the TX buffer to avoid clock starvation */
+	writel(data1_reg_val, &ds->regs->dat1);
+
+	/* keep reading 1 byte until only 1 byte left */
+	while ((len--) > 1)
+		*rxp++ = davinci_spi_xfer_data(ds, data1_reg_val);
+
+	/* clear CS hold when we reach the end */
+	if (flags & SPI_XFER_END)
+		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
+
+	/* read the last byte */
+	*rxp = davinci_spi_xfer_data(ds, data1_reg_val);
+
+	return 0;
+}
+
+static int davinci_spi_write(struct spi_slave *slave, unsigned int len,
+		const u8 *txp, unsigned long flags)
+{
+	struct davinci_spi_slave *ds = to_davinci_spi(slave);
+	unsigned int data1_reg_val;
+
+	/* enable CS hold and clear the data bits */
+	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
+			 (slave->cs << SPIDAT1_CSNR_SHIFT));
+
+	/* wait till TXFULL is deasserted */
+	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
+		;
+
+	/* preload the TX buffer to avoid clock starvation */
+	if (len > 2) {
+		writel(data1_reg_val | *txp++, &ds->regs->dat1);
+		len--;
+	}
+
+	/* keep writing 1 byte until only 1 byte left */
+	while ((len--) > 1)
+		davinci_spi_xfer_data(ds, data1_reg_val | *txp++);
+
+	/* clear CS hold when we reach the end */
+	if (flags & SPI_XFER_END)
+		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
+
+	/* write the last byte */
+	davinci_spi_xfer_data(ds, data1_reg_val | *txp);
+
+	return 0;
+}
+
+#ifndef CONFIG_SPI_HALF_DUPLEX
+static int davinci_spi_read_write(struct spi_slave *slave, unsigned int len,
+				  u8 *rxp, const u8 *txp, unsigned long flags)
+{
+	struct davinci_spi_slave *ds = to_davinci_spi(slave);
+	unsigned int data1_reg_val;
+
+	/* enable CS hold and clear the data bits */
+	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
+			 (slave->cs << SPIDAT1_CSNR_SHIFT));
+
+	/* wait till TXFULL is deasserted */
+	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
+		;
+
+	/* keep reading and writing 1 byte until only 1 byte left */
+	while ((len--) > 1)
+		*rxp++ = davinci_spi_xfer_data(ds, data1_reg_val | *txp++);
+
+	/* clear CS hold when we reach the end */
+	if (flags & SPI_XFER_END)
+		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
+
+	/* read and write the last byte */
+	*rxp = davinci_spi_xfer_data(ds, data1_reg_val | *txp);
+
+	return 0;
+}
+#endif
+
 int spi_xfer(struct spi_slave *slave, unsigned int bitlen,
 		const void *dout, void *din, unsigned long flags)
 {
-	struct davinci_spi_slave *ds = to_davinci_spi(slave);
-	unsigned int	len, data1_reg_val = readl(&ds->regs->dat1);
-	unsigned int	i_cnt = 0, o_cnt = 0, buf_reg_val;
-	const u8	*txp = dout; /* dout can be NULL for read operation */
-	u8		*rxp = din;  /* din can be NULL for write operation */
+	unsigned int len;
 
 	if (bitlen == 0)
 		/* Finish any previously submitted transfers */
@@ -154,63 +259,19 @@ int spi_xfer(struct spi_slave *slave, unsigned int bitlen,
 
 	len = bitlen / 8;
 
-	/* do an empty read to clear the current contents */
-	readl(&ds->regs->buf);
-
-	/* keep writing and reading 1 byte until done */
-	while ((i_cnt < len) || (o_cnt < len)) {
-		/* read RX buffer and flags */
-		buf_reg_val = readl(&ds->regs->buf);
-
-		/* if data is available */
-		if ((i_cnt < len) &&
-			(buf_reg_val & SPIBUF_RXEMPTY_MASK) == 0) {
-			/*
-			 * If there is no read buffer simply
-			 * ignore the read character
-			 */
-			if (rxp)
-				*rxp++ = buf_reg_val & 0xFF;
-			/* increment read words count */
-			i_cnt++;
-		}
-
-		/*
-		 * if the tx buffer is empty and there
-		 * is still data to transmit
-		 */
-		if ((o_cnt < len) &&
-			((buf_reg_val & SPIBUF_TXFULL_MASK) == 0)) {
-			/* write the data */
-			data1_reg_val &= ~0xFFFF;
-			if (txp)
-				data1_reg_val |= *txp++;
-			/*
-			 * Write to DAT1 is required to keep
-			 * the serial transfer going.
-			 * We just terminate when we reach the end.
-			 */
-			if ((o_cnt == (len - 1)) && (flags & SPI_XFER_END)) {
-				/* clear CS hold */
-				writel(data1_reg_val &
-						~(1 << SPIDAT1_CSHOLD_SHIFT),
-						&ds->regs->dat1);
-			} else {
-				/* enable CS hold and write TX register */
-				data1_reg_val |= ((1 << SPIDAT1_CSHOLD_SHIFT) |
-					(slave->cs << SPIDAT1_CSNR_SHIFT));
-				writel(data1_reg_val, &ds->regs->dat1);
-			}
-			/* increment written words count */
-			o_cnt++;
-		}
-	}
-	return 0;
+	if (!dout)
+		return davinci_spi_read(slave, len, din, flags);
+	else if (!din)
+		return davinci_spi_write(slave, len, dout, flags);
+#ifndef CONFIG_SPI_HALF_DUPLEX
+	else
+		return davinci_spi_read_write(slave, len, din, dout, flags);
+#endif
 
 out:
 	if (flags & SPI_XFER_END) {
-		writel(data1_reg_val &
-			~(1 << SPIDAT1_CSHOLD_SHIFT), &ds->regs->dat1);
+		u8 dummy = 0;
+		davinci_spi_write(slave, 1, &dummy, flags);
 	}
 	return 0;
 }
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-21  9:27 [U-Boot] [PATCH] Davinci: SPI performance enhancements Nick Thompson
@ 2010-06-21 14:41 ` Paulraj, Sandeep
  2010-06-21 14:57   ` Nick Thompson
  2010-06-21 15:21   ` Nick Thompson
  2010-06-21 18:38 ` Delio Brignoli
  2010-06-22 15:27 ` Paulraj, Sandeep
  2 siblings, 2 replies; 11+ messages in thread
From: Paulraj, Sandeep @ 2010-06-21 14:41 UTC (permalink / raw)
  To: u-boot



> 
> The following restructuring and optimisations increase the SPI
> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):
> 
> Remove continual revaluation of driver state from the core of the
> copy loop. State can not change during the copy loop, so it is
> possible to move these evaluations to before the copy loop.
> 
> Cost is more code space as loop variants are required for each set
> of possible configurations. The loops are simpler however, so the
> extra is only 128bytes on da830 with CONFIG_SPI_HALF_DUPLEX
> defined.
> 
> Unrolling the first copy loop iteration allows the TX buffer to be
> pre-loaded reducing SPI clock starvation.
> 
> Unrolling the last copy loop iteration removes testing for the
> final loop iteration every time round the loop.
> 
> Using the RX buffer empty flag as a transfer throttle allows the
> assumption that it is always safe to write to the TX buffer, so
> polling of TX buffer full flag can be removed.
> 
> Signed-off-by: Nick Thompson <nick.thompson@ge.com>
> ---
> da850 and da830 are similar devices. The SPI module is common to
> both, but da850 uses DDR and da830 uses SDRAM. The EVM's might
> not actually be comparable, but they appear to be at least similar.
> 
> The speed was tested with a 8MiB transfer from SPI FLASH using:
> 
> sf read 0xc0008000 0 0x800000
> 
>  drivers/spi/davinci_spi.c |  195 +++++++++++++++++++++++++++++-----------

This patch does not apply against Wolfgang's next.
The patch should be against u-boot/next.


> ----
>  1 files changed, 128 insertions(+), 67 deletions(-)
> 
> diff --git a/drivers/spi/davinci_spi.c b/drivers/spi/davinci_spi.c
> index 08f837b..4518ecb 100644
> --- a/drivers/spi/davinci_spi.c
> +++ b/drivers/spi/davinci_spi.c
> @@ -66,7 +66,7 @@ void spi_free_slave(struct spi_slave *slave)
>  int spi_claim_bus(struct spi_slave *slave)
>  {
>  	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> -	unsigned int scalar, data1_reg_val = 0;
> +	unsigned int scalar;
> 
>  	/* Enable the SPI hardware */
>  	writel(SPIGCR0_SPIRST_MASK, &ds->regs->gcr0);
> @@ -93,11 +93,6 @@ int spi_claim_bus(struct spi_slave *slave)
>  	writel(8 | (scalar << SPIFMT_PRESCALE_SHIFT) |
>  		(1 << SPIFMT_PHASE_SHIFT), &ds->regs->fmt0);
> 
> -	/* hold cs active at end of transfer until explicitly de-asserted */
> -	data1_reg_val = (1 << SPIDAT1_CSHOLD_SHIFT) |
> -			(slave->cs << SPIDAT1_CSNR_SHIFT);
> -	writel(data1_reg_val, &ds->regs->dat1);
> -
>  	/*
>  	 * Including a minor delay. No science here. Should be good even
> with
>  	 * no delay
> @@ -113,8 +108,7 @@ int spi_claim_bus(struct spi_slave *slave)
>  	writel(0, &ds->regs->lvl);
> 
>  	/* enable SPI */
> -	writel((readl(&ds->regs->gcr1) |
> -		SPIGCR1_SPIENA_MASK), &ds->regs->gcr1);
> +	writel((readl(&ds->regs->gcr1) | SPIGCR1_SPIENA_MASK), &ds->regs-
> >gcr1);
> 
>  	return 0;
>  }
> @@ -127,14 +121,125 @@ void spi_release_bus(struct spi_slave *slave)
>  	writel(SPIGCR0_SPIRST_MASK, &ds->regs->gcr0);
>  }
> 
> +/*
> + * This functions needs to act like a macro to avoid pipeline reloads in
> the
> + * loops below. Use always_inline. This gains us about 160KiB/s and the
> bloat
> + * appears to be zero bytes (da830).
> + */
> +__attribute__((always_inline))
> +static inline u32 davinci_spi_xfer_data(struct davinci_spi_slave *ds, u32
> data)
> +{
> +	u32	buf_reg_val;
> +
> +	/* send out data */
> +	writel(data, &ds->regs->dat1);
> +
> +	/* wait for the data to clock in/out */
> +	while ((buf_reg_val = readl(&ds->regs->buf)) & SPIBUF_RXEMPTY_MASK)
> +		;
> +
> +	return buf_reg_val;
> +}
> +
> +static int davinci_spi_read(struct spi_slave *slave, unsigned int len,
> +			    u8 *rxp, unsigned long flags)
> +{
> +	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> +	unsigned int data1_reg_val;
> +
> +	/* enable CS hold, CS[n] and clear the data bits */
> +	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
> +			 (slave->cs << SPIDAT1_CSNR_SHIFT));
> +
> +	/* wait till TXFULL is deasserted */
> +	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
> +		;
> +
> +	/* preload the TX buffer to avoid clock starvation */
> +	writel(data1_reg_val, &ds->regs->dat1);
> +
> +	/* keep reading 1 byte until only 1 byte left */
> +	while ((len--) > 1)
> +		*rxp++ = davinci_spi_xfer_data(ds, data1_reg_val);
> +
> +	/* clear CS hold when we reach the end */
> +	if (flags & SPI_XFER_END)
> +		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
> +
> +	/* read the last byte */
> +	*rxp = davinci_spi_xfer_data(ds, data1_reg_val);
> +
> +	return 0;
> +}
> +
> +static int davinci_spi_write(struct spi_slave *slave, unsigned int len,
> +		const u8 *txp, unsigned long flags)
> +{
> +	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> +	unsigned int data1_reg_val;
> +
> +	/* enable CS hold and clear the data bits */
> +	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
> +			 (slave->cs << SPIDAT1_CSNR_SHIFT));
> +
> +	/* wait till TXFULL is deasserted */
> +	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
> +		;
> +
> +	/* preload the TX buffer to avoid clock starvation */
> +	if (len > 2) {
> +		writel(data1_reg_val | *txp++, &ds->regs->dat1);
> +		len--;
> +	}
> +
> +	/* keep writing 1 byte until only 1 byte left */
> +	while ((len--) > 1)
> +		davinci_spi_xfer_data(ds, data1_reg_val | *txp++);
> +
> +	/* clear CS hold when we reach the end */
> +	if (flags & SPI_XFER_END)
> +		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
> +
> +	/* write the last byte */
> +	davinci_spi_xfer_data(ds, data1_reg_val | *txp);
> +
> +	return 0;
> +}
> +
> +#ifndef CONFIG_SPI_HALF_DUPLEX
> +static int davinci_spi_read_write(struct spi_slave *slave, unsigned int
> len,
> +				  u8 *rxp, const u8 *txp, unsigned long flags)
> +{
> +	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> +	unsigned int data1_reg_val;
> +
> +	/* enable CS hold and clear the data bits */
> +	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
> +			 (slave->cs << SPIDAT1_CSNR_SHIFT));
> +
> +	/* wait till TXFULL is deasserted */
> +	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
> +		;
> +
> +	/* keep reading and writing 1 byte until only 1 byte left */
> +	while ((len--) > 1)
> +		*rxp++ = davinci_spi_xfer_data(ds, data1_reg_val | *txp++);
> +
> +	/* clear CS hold when we reach the end */
> +	if (flags & SPI_XFER_END)
> +		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
> +
> +	/* read and write the last byte */
> +	*rxp = davinci_spi_xfer_data(ds, data1_reg_val | *txp);
> +
> +	return 0;
> +}
> +#endif
> +
>  int spi_xfer(struct spi_slave *slave, unsigned int bitlen,
>  		const void *dout, void *din, unsigned long flags)
>  {
> -	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> -	unsigned int	len, data1_reg_val = readl(&ds->regs->dat1);
> -	unsigned int	i_cnt = 0, o_cnt = 0, buf_reg_val;
> -	const u8	*txp = dout; /* dout can be NULL for read operation */
> -	u8		*rxp = din;  /* din can be NULL for write operation */
> +	unsigned int len;
> 
>  	if (bitlen == 0)
>  		/* Finish any previously submitted transfers */
> @@ -154,63 +259,19 @@ int spi_xfer(struct spi_slave *slave, unsigned int
> bitlen,
> 
>  	len = bitlen / 8;
> 
> -	/* do an empty read to clear the current contents */
> -	readl(&ds->regs->buf);
> -
> -	/* keep writing and reading 1 byte until done */
> -	while ((i_cnt < len) || (o_cnt < len)) {
> -		/* read RX buffer and flags */
> -		buf_reg_val = readl(&ds->regs->buf);
> -
> -		/* if data is available */
> -		if ((i_cnt < len) &&
> -			(buf_reg_val & SPIBUF_RXEMPTY_MASK) == 0) {
> -			/*
> -			 * If there is no read buffer simply
> -			 * ignore the read character
> -			 */
> -			if (rxp)
> -				*rxp++ = buf_reg_val & 0xFF;
> -			/* increment read words count */
> -			i_cnt++;
> -		}
> -
> -		/*
> -		 * if the tx buffer is empty and there
> -		 * is still data to transmit
> -		 */
> -		if ((o_cnt < len) &&
> -			((buf_reg_val & SPIBUF_TXFULL_MASK) == 0)) {
> -			/* write the data */
> -			data1_reg_val &= ~0xFFFF;
> -			if (txp)
> -				data1_reg_val |= *txp++;
> -			/*
> -			 * Write to DAT1 is required to keep
> -			 * the serial transfer going.
> -			 * We just terminate when we reach the end.
> -			 */
> -			if ((o_cnt == (len - 1)) && (flags & SPI_XFER_END)) {
> -				/* clear CS hold */
> -				writel(data1_reg_val &
> -						~(1 << SPIDAT1_CSHOLD_SHIFT),
> -						&ds->regs->dat1);
> -			} else {
> -				/* enable CS hold and write TX register */
> -				data1_reg_val |= ((1 << SPIDAT1_CSHOLD_SHIFT) |
> -					(slave->cs << SPIDAT1_CSNR_SHIFT));
> -				writel(data1_reg_val, &ds->regs->dat1);
> -			}
> -			/* increment written words count */
> -			o_cnt++;
> -		}
> -	}
> -	return 0;
> +	if (!dout)
> +		return davinci_spi_read(slave, len, din, flags);
> +	else if (!din)
> +		return davinci_spi_write(slave, len, dout, flags);
> +#ifndef CONFIG_SPI_HALF_DUPLEX
> +	else
> +		return davinci_spi_read_write(slave, len, din, dout, flags);
> +#endif
> 
>  out:
>  	if (flags & SPI_XFER_END) {
> -		writel(data1_reg_val &
> -			~(1 << SPIDAT1_CSHOLD_SHIFT), &ds->regs->dat1);
> +		u8 dummy = 0;
> +		davinci_spi_write(slave, 1, &dummy, flags);
>  	}
>  	return 0;
>  }
> --
> 1.7.0.4

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-21 14:41 ` Paulraj, Sandeep
@ 2010-06-21 14:57   ` Nick Thompson
  2010-06-21 15:21   ` Nick Thompson
  1 sibling, 0 replies; 11+ messages in thread
From: Nick Thompson @ 2010-06-21 14:57 UTC (permalink / raw)
  To: u-boot

On 21/06/10 15:41, Paulraj, Sandeep wrote:
> 
> 
>>
>> The following restructuring and optimisations increase the SPI
>> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):
>>
>> Remove continual revaluation of driver state from the core of the
>> copy loop. State can not change during the copy loop, so it is
>> possible to move these evaluations to before the copy loop.
>>
>> Cost is more code space as loop variants are required for each set
>> of possible configurations. The loops are simpler however, so the
>> extra is only 128bytes on da830 with CONFIG_SPI_HALF_DUPLEX
>> defined.
>>
>> Unrolling the first copy loop iteration allows the TX buffer to be
>> pre-loaded reducing SPI clock starvation.
>>
>> Unrolling the last copy loop iteration removes testing for the
>> final loop iteration every time round the loop.
>>
>> Using the RX buffer empty flag as a transfer throttle allows the
>> assumption that it is always safe to write to the TX buffer, so
>> polling of TX buffer full flag can be removed.
>>
>> Signed-off-by: Nick Thompson <nick.thompson@ge.com>
>> ---
>> da850 and da830 are similar devices. The SPI module is common to
>> both, but da850 uses DDR and da830 uses SDRAM. The EVM's might
>> not actually be comparable, but they appear to be at least similar.
>>
>> The speed was tested with a 8MiB transfer from SPI FLASH using:
>>
>> sf read 0xc0008000 0 0x800000
>>
>>  drivers/spi/davinci_spi.c |  195 +++++++++++++++++++++++++++++-----------
> 
> This patch does not apply against Wolfgang's next.
> The patch should be against u-boot/next.

The patch is based on u-boot as updated this morning. It applies
directly after Delio's patch, which is in u-boot/next, and the last
change to this file. u-boot/next is at the tip and is three days old.

I don't understand why it doesn't apply. What failure do you see?

$ git log drivers/spi/davinci_spi.c
commit 15c4e0802caed209ca021de25ec607658ae35720
Author: Nick Thompson <nick.thompson@ge.com>
Date:   Mon Jun 21 09:48:22 2010 +0100

    Davinci: SPI performance enhancements
    
    The following restructuring and optimisations increase the SPI

[...skip to next log...]

commit 9268236529161312c877e638a14c011fd3c883e1
Author: Delio Brignoli <dbrignoli@audioscience.com>
Date:   Mon Jun 7 17:16:13 2010 -0400

    DaVinci: Improve DaVinci SPI speed.
    
    I have updated this patch based on the comments [1] by Wolfgang Denk and
...

Nick.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-21 14:41 ` Paulraj, Sandeep
  2010-06-21 14:57   ` Nick Thompson
@ 2010-06-21 15:21   ` Nick Thompson
  1 sibling, 0 replies; 11+ messages in thread
From: Nick Thompson @ 2010-06-21 15:21 UTC (permalink / raw)
  To: u-boot

On 21/06/10 15:41, Paulraj, Sandeep wrote:
> 
> 
>>
>> The following restructuring and optimisations increase the SPI
>> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):
>>
>> Remove continual revaluation of driver state from the core of the
>> copy loop. State can not change during the copy loop, so it is
>> possible to move these evaluations to before the copy loop.
>>
>> Cost is more code space as loop variants are required for each set
>> of possible configurations. The loops are simpler however, so the
>> extra is only 128bytes on da830 with CONFIG_SPI_HALF_DUPLEX
>> defined.
>>
>> Unrolling the first copy loop iteration allows the TX buffer to be
>> pre-loaded reducing SPI clock starvation.
>>
>> Unrolling the last copy loop iteration removes testing for the
>> final loop iteration every time round the loop.
>>
>> Using the RX buffer empty flag as a transfer throttle allows the
>> assumption that it is always safe to write to the TX buffer, so
>> polling of TX buffer full flag can be removed.
>>
>> Signed-off-by: Nick Thompson <nick.thompson@ge.com>
>> ---
>> da850 and da830 are similar devices. The SPI module is common to
>> both, but da850 uses DDR and da830 uses SDRAM. The EVM's might
>> not actually be comparable, but they appear to be at least similar.
>>
>> The speed was tested with a 8MiB transfer from SPI FLASH using:
>>
>> sf read 0xc0008000 0 0x800000
>>
>>  drivers/spi/davinci_spi.c |  195 +++++++++++++++++++++++++++++-----------
> 
> This patch does not apply against Wolfgang's next.
> The patch should be against u-boot/next.

It looks like Delio's patch ended up being pulled into main via Tom's
tree and yours. On u-boot I see 9268236529161312c877e638a14c011fd3c883e1,
but the same commit has id 23911740486c59851df57521c49bfd81ce1865ec on
on u-boot-ti.

I guess this is the source of the confusion? My patch was against
u-boot, not u-boot-ti.

Nick.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-21  9:27 [U-Boot] [PATCH] Davinci: SPI performance enhancements Nick Thompson
  2010-06-21 14:41 ` Paulraj, Sandeep
@ 2010-06-21 18:38 ` Delio Brignoli
  2010-06-22  8:29   ` Nick Thompson
  2010-06-22 15:27 ` Paulraj, Sandeep
  2 siblings, 1 reply; 11+ messages in thread
From: Delio Brignoli @ 2010-06-21 18:38 UTC (permalink / raw)
  To: u-boot

Hello Nick,

On 21/06/2010, at 11:27, Nick Thompson wrote:
> The following restructuring and optimisations increase the SPI
> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):

Using this patch I get 2.21MiB/s on my L138 EVM (da850), quite 
an improvement! I would like to see how much my original patch can
be improved using some of your changes without splitting the code
to handle the three cases. I will try later this week.

[...]
> +	if (!dout)
> +		return davinci_spi_read(slave, len, din, flags);
> +	else if (!din)
> +		return davinci_spi_write(slave, len, dout, flags);
> +#ifndef CONFIG_SPI_HALF_DUPLEX
> +	else
> +		return davinci_spi_read_write(slave, len, din, dout, flags);
> +#endif

I think there should always be an else branch at the end even if 
CONFIG_SPI_HALF_DUPLEX is not defined. Something like:

#else
	flags |= SPI_XFER_END;
#endif

to terminate the transfer instead of failing silently.
In fact it should signal the error condition somehow, but 
I do not know enough about u-boot to provide an advice on this.

Thanks
--
Delio

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-21 18:38 ` Delio Brignoli
@ 2010-06-22  8:29   ` Nick Thompson
  2010-06-22  9:57     ` Delio Brignoli
  0 siblings, 1 reply; 11+ messages in thread
From: Nick Thompson @ 2010-06-22  8:29 UTC (permalink / raw)
  To: u-boot

On 21/06/10 19:38, Delio Brignoli wrote:
> Hello Nick,
> 
> On 21/06/2010, at 11:27, Nick Thompson wrote:
>> The following restructuring and optimisations increase the SPI
>> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):
> 
> Using this patch I get 2.21MiB/s on my L138 EVM (da850), quite 
> an improvement! I would like to see how much my original patch can
> be improved using some of your changes without splitting the code
> to handle the three cases. I will try later this week.

Not testing the txp and rxp pointers in the loop was a significant
gain for me and pipe-lining the TX and RX operations is going to be
a little trickier, but give it a go by all means.

> 
> [...]
>> +	if (!dout)
>> +		return davinci_spi_read(slave, len, din, flags);
>> +	else if (!din)
>> +		return davinci_spi_write(slave, len, dout, flags);
>> +#ifndef CONFIG_SPI_HALF_DUPLEX
>> +	else
>> +		return davinci_spi_read_write(slave, len, din, dout, flags);
>> +#endif
> 
> I think there should always be an else branch at the end even if 
> CONFIG_SPI_HALF_DUPLEX is not defined. Something like:
> 
> #else
> 	flags |= SPI_XFER_END;
> #endif

Hmmm, yes, you are correct. That must be added, else in the unexpected
case, the transaction will be opened and left open. I'll fix that.

> 
> to terminate the transfer instead of failing silently.
> In fact it should signal the error condition somehow, but 
> I do not know enough about u-boot to provide an advice on this.

I think maybe a printk(KERN_ERR ...) will do it. I'll add that too.

> 
> Thanks
> --
> Delio

Thanks for the review.

Nick.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-22  8:29   ` Nick Thompson
@ 2010-06-22  9:57     ` Delio Brignoli
  2010-06-22 14:27       ` Paulraj, Sandeep
  0 siblings, 1 reply; 11+ messages in thread
From: Delio Brignoli @ 2010-06-22  9:57 UTC (permalink / raw)
  To: u-boot

On 22/06/2010, at 10:29, Nick Thompson wrote:
> On 21/06/10 19:38, Delio Brignoli wrote:
>> Hello Nick,
>> 
>> On 21/06/2010, at 11:27, Nick Thompson wrote:
>>> The following restructuring and optimisations increase the SPI
>>> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):
>> 
>> Using this patch I get 2.21MiB/s on my L138 EVM (da850), quite 
>> an improvement! I would like to see how much my original patch can
>> be improved using some of your changes without splitting the code
>> to handle the three cases. I will try later this week.
> 
> Not testing the txp and rxp pointers in the loop was a significant
> gain for me and pipe-lining the TX and RX operations is going to be
> a little trickier, but give it a go by all means.

Yes, it's unlikely I will be able to get my version to work as fast as yours 
because of all the extra uncached memory access, which makes me
wonder if we shouldn't just bite the bullet and enable dcache for
da830/da850, so we can keep the code simpler.

> Thanks for the review.

Thank you for the patch.
--
Delio

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-22  9:57     ` Delio Brignoli
@ 2010-06-22 14:27       ` Paulraj, Sandeep
  2010-06-22 14:40         ` Delio Brignoli
  0 siblings, 1 reply; 11+ messages in thread
From: Paulraj, Sandeep @ 2010-06-22 14:27 UTC (permalink / raw)
  To: u-boot



> >
> > Not testing the txp and rxp pointers in the loop was a significant
> > gain for me and pipe-lining the TX and RX operations is going to be
> > a little trickier, but give it a go by all means.
> 
> Yes, it's unlikely I will be able to get my version to work as fast as
> yours
> because of all the extra uncached memory access, which makes me
> wonder if we shouldn't just bite the bullet and enable dcache for
> da830/da850, so we can keep the code simpler.
> 
> > Thanks for the review.
> 
> Thank you for the patch.


Delio, Nick,

Can we consider the discussion closed? If so I am ready to apply the patch.

Sandeep

> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-22 14:27       ` Paulraj, Sandeep
@ 2010-06-22 14:40         ` Delio Brignoli
  0 siblings, 0 replies; 11+ messages in thread
From: Delio Brignoli @ 2010-06-22 14:40 UTC (permalink / raw)
  To: u-boot

Hello Sandeep, 

Yes, please apply the patch.

Regards
--
Delio

On 22/06/2010, at 16:27, Paulraj, Sandeep wrote:

> 
> 
>>> 
>>> Not testing the txp and rxp pointers in the loop was a significant
>>> gain for me and pipe-lining the TX and RX operations is going to be
>>> a little trickier, but give it a go by all means.
>> 
>> Yes, it's unlikely I will be able to get my version to work as fast as
>> yours
>> because of all the extra uncached memory access, which makes me
>> wonder if we shouldn't just bite the bullet and enable dcache for
>> da830/da850, so we can keep the code simpler.
>> 
>>> Thanks for the review.
>> 
>> Thank you for the patch.
> 
> 
> Delio, Nick,
> 
> Can we consider the discussion closed? If so I am ready to apply the patch.
> 
> Sandeep
> 
>> 
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-21  9:27 [U-Boot] [PATCH] Davinci: SPI performance enhancements Nick Thompson
  2010-06-21 14:41 ` Paulraj, Sandeep
  2010-06-21 18:38 ` Delio Brignoli
@ 2010-06-22 15:27 ` Paulraj, Sandeep
  2010-06-23  8:14   ` Nick Thompson
  2 siblings, 1 reply; 11+ messages in thread
From: Paulraj, Sandeep @ 2010-06-22 15:27 UTC (permalink / raw)
  To: u-boot


> The following restructuring and optimisations increase the SPI
> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):
> 
> Remove continual revaluation of driver state from the core of the
> copy loop. State can not change during the copy loop, so it is
> possible to move these evaluations to before the copy loop.
> 
> Cost is more code space as loop variants are required for each set
> of possible configurations. The loops are simpler however, so the
> extra is only 128bytes on da830 with CONFIG_SPI_HALF_DUPLEX
> defined.
> 
> Unrolling the first copy loop iteration allows the TX buffer to be
> pre-loaded reducing SPI clock starvation.
> 
> Unrolling the last copy loop iteration removes testing for the
> final loop iteration every time round the loop.
> 
> Using the RX buffer empty flag as a transfer throttle allows the
> assumption that it is always safe to write to the TX buffer, so
> polling of TX buffer full flag can be removed.
> 
> Signed-off-by: Nick Thompson <nick.thompson@ge.com>
> ---
> da850 and da830 are similar devices. The SPI module is common to
> both, but da850 uses DDR and da830 uses SDRAM. The EVM's might
> not actually be comparable, but they appear to be at least similar.
> 
> The speed was tested with a 8MiB transfer from SPI FLASH using:
> 
> sf read 0xc0008000 0 0x800000
> 
>  drivers/spi/davinci_spi.c |  195 +++++++++++++++++++++++++++++-----------

Applied to u-boot-ti/next

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [U-Boot] [PATCH] Davinci: SPI performance enhancements
  2010-06-22 15:27 ` Paulraj, Sandeep
@ 2010-06-23  8:14   ` Nick Thompson
  0 siblings, 0 replies; 11+ messages in thread
From: Nick Thompson @ 2010-06-23  8:14 UTC (permalink / raw)
  To: u-boot

On 22/06/10 16:27, Paulraj, Sandeep wrote:
> 
>> The following restructuring and optimisations increase the SPI
>> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):
>>
>> Remove continual revaluation of driver state from the core of the
>> copy loop. State can not change during the copy loop, so it is
>> possible to move these evaluations to before the copy loop.
>>
>> Cost is more code space as loop variants are required for each set
>> of possible configurations. The loops are simpler however, so the
>> extra is only 128bytes on da830 with CONFIG_SPI_HALF_DUPLEX
>> defined.
>>
>> Unrolling the first copy loop iteration allows the TX buffer to be
>> pre-loaded reducing SPI clock starvation.
>>
>> Unrolling the last copy loop iteration removes testing for the
>> final loop iteration every time round the loop.
>>
>> Using the RX buffer empty flag as a transfer throttle allows the
>> assumption that it is always safe to write to the TX buffer, so
>> polling of TX buffer full flag can be removed.
>>
>> Signed-off-by: Nick Thompson <nick.thompson@ge.com>
>> ---
>> da850 and da830 are similar devices. The SPI module is common to
>> both, but da850 uses DDR and da830 uses SDRAM. The EVM's might
>> not actually be comparable, but they appear to be at least similar.
>>
>> The speed was tested with a 8MiB transfer from SPI FLASH using:
>>
>> sf read 0xc0008000 0 0x800000
>>
>>  drivers/spi/davinci_spi.c |  195 +++++++++++++++++++++++++++++-----------
> 
> Applied to u-boot-ti/next

Sandeep,

You're too quick for me :-) I was going to do a v2 patch to cover a
couple of Delio's points. They are not critical issues though, so I
think we can let the pull request go through and I will submit a new
patch once it hits u-boot/next:

On 21/06/10 19:38, Delio Brignoli wrote:
>> +	if (!dout)
>> +		return davinci_spi_read(slave, len, din, flags);
>> +	else if (!din)
>> +		return davinci_spi_write(slave, len, dout, flags);
>> +#ifndef CONFIG_SPI_HALF_DUPLEX
>> +	else
>> +		return davinci_spi_read_write(slave, len, din, dout, flags);
>> +#endif
> 
> I think there should always be an else branch at the end even if 
> CONFIG_SPI_HALF_DUPLEX is not defined. Something like:
> 
> #else
> 	flags |= SPI_XFER_END;
> #endif
> 
> to terminate the transfer instead of failing silently.
> In fact it should signal the error condition somehow, but 
> I do not know enough about u-boot to provide an advice on this.

Its a non issue if the driver is called correctly in the first place,
but we can't be sure of that.

Nick.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2010-06-23  8:14 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-06-21  9:27 [U-Boot] [PATCH] Davinci: SPI performance enhancements Nick Thompson
2010-06-21 14:41 ` Paulraj, Sandeep
2010-06-21 14:57   ` Nick Thompson
2010-06-21 15:21   ` Nick Thompson
2010-06-21 18:38 ` Delio Brignoli
2010-06-22  8:29   ` Nick Thompson
2010-06-22  9:57     ` Delio Brignoli
2010-06-22 14:27       ` Paulraj, Sandeep
2010-06-22 14:40         ` Delio Brignoli
2010-06-22 15:27 ` Paulraj, Sandeep
2010-06-23  8:14   ` Nick Thompson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox