[PATCH] arm64: optimize memcpy_{from,to}io() and memset

linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
@ 2014-07-30  6:28 Joonwoo Park
  2014-08-01  6:30 ` Joonwoo Park
  0 siblings, 1 reply; 12+ messages in thread
From: Joonwoo Park @ 2014-07-30  6:28 UTC (permalink / raw)
  To: linux-arm-kernel

Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
as much as possible with minimized barrier usage.  This simplest optimization
brings faster throughput compare to current byte-by-byte read and write with
barrier in the loop.  Code's skeleton is taken from the powerpc.

Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Acked-by: Trilok Soni <tsoni@codeaurora.org>
---
 arch/arm64/kernel/io.c | 72 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 62 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index 7d37ead..c0e3ab1 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -20,18 +20,34 @@
 #include <linux/types.h>
 #include <linux/io.h>
 
+#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
+
 /*
  * Copy data from IO memory space to "real" memory space.
  */
 void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
 {
-	unsigned char *t = to;
-	while (count) {
+	while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) {
+		*(u8 *)to = readb_relaxed(from);
+		from++;
+		to++;
 		count--;
-		*t = readb(from);
-		t++;
+	}
+
+	while (count >= 8) {
+		*(u64 *)to = readq_relaxed(from);
+		from += 8;
+		to += 8;
+		count -= 8;
+	}
+
+	while (count) {
+		*(u8 *)to = readb_relaxed(from);
 		from++;
+		to++;
+		count--;
 	}
+	__iormb();
 }
 EXPORT_SYMBOL(__memcpy_fromio);
 
@@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio);
  */
 void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
 {
-	const unsigned char *f = from;
+	void *p = (void __force *)from;
+
+	__iowmb();
+	while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) {
+		writeb_relaxed(*(volatile u8 *)from, p);
+		from++;
+		p++;
+		count--;
+	}
+
+	while (count >= 8) {
+		writeq_relaxed(*(volatile u64 *)from, p);
+		from += 8;
+		p += 8;
+		count -= 8;
+	}
+
 	while (count) {
+		writeb_relaxed(*(volatile u8 *)from, p);
+		from++;
+		p++;
 		count--;
-		writeb(*f, to);
-		f++;
-		to++;
 	}
 }
 EXPORT_SYMBOL(__memcpy_toio);
@@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio);
  */
 void __memset_io(volatile void __iomem *dst, int c, size_t count)
 {
+	void *p = (void __force *)dst;
+	u64 qc = c;
+
+	qc |= qc << 8;
+	qc |= qc << 16;
+	qc |= qc << 32;
+
+	__iowmb();
+	while (count && !IO_CHECK_ALIGN(p, 8)) {
+		writeb_relaxed(c, p);
+		p++;
+		count--;
+	}
+
+	while (count >= 8) {
+		writeq_relaxed(c, p);
+		p += 8;
+		count -= 8;
+	}
+
 	while (count) {
+		writeb_relaxed(c, p);
+		p++;
 		count--;
-		writeb(c, dst);
-		dst++;
 	}
 }
 EXPORT_SYMBOL(__memset_io);
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-07-30  6:28 [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() Joonwoo Park
@ 2014-08-01  6:30 ` Joonwoo Park
  2014-08-01  8:32   ` Will Deacon
  2014-10-03 16:31   ` Catalin Marinas
  0 siblings, 2 replies; 12+ messages in thread
From: Joonwoo Park @ 2014-08-01  6:30 UTC (permalink / raw)
  To: linux-arm-kernel

+ Catalin, Will

Thanks,
Joonwoo
On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
> as much as possible with minimized barrier usage.  This simplest optimization
> brings faster throughput compare to current byte-by-byte read and write with
> barrier in the loop.  Code's skeleton is taken from the powerpc.
> 
> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> Acked-by: Trilok Soni <tsoni@codeaurora.org>
> ---
>  arch/arm64/kernel/io.c | 72 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 62 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> index 7d37ead..c0e3ab1 100644
> --- a/arch/arm64/kernel/io.c
> +++ b/arch/arm64/kernel/io.c
> @@ -20,18 +20,34 @@
>  #include <linux/types.h>
>  #include <linux/io.h>
>  
> +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
> +
>  /*
>   * Copy data from IO memory space to "real" memory space.
>   */
>  void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
>  {
> -	unsigned char *t = to;
> -	while (count) {
> +	while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) {
> +		*(u8 *)to = readb_relaxed(from);
> +		from++;
> +		to++;
>  		count--;
> -		*t = readb(from);
> -		t++;
> +	}
> +
> +	while (count >= 8) {
> +		*(u64 *)to = readq_relaxed(from);
> +		from += 8;
> +		to += 8;
> +		count -= 8;
> +	}
> +
> +	while (count) {
> +		*(u8 *)to = readb_relaxed(from);
>  		from++;
> +		to++;
> +		count--;
>  	}
> +	__iormb();
>  }
>  EXPORT_SYMBOL(__memcpy_fromio);
>  
> @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio);
>   */
>  void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
>  {
> -	const unsigned char *f = from;
> +	void *p = (void __force *)from;
> +
> +	__iowmb();
> +	while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) {
> +		writeb_relaxed(*(volatile u8 *)from, p);
> +		from++;
> +		p++;
> +		count--;
> +	}
> +
> +	while (count >= 8) {
> +		writeq_relaxed(*(volatile u64 *)from, p);
> +		from += 8;
> +		p += 8;
> +		count -= 8;
> +	}
> +
>  	while (count) {
> +		writeb_relaxed(*(volatile u8 *)from, p);
> +		from++;
> +		p++;
>  		count--;
> -		writeb(*f, to);
> -		f++;
> -		to++;
>  	}
>  }
>  EXPORT_SYMBOL(__memcpy_toio);
> @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio);
>   */
>  void __memset_io(volatile void __iomem *dst, int c, size_t count)
>  {
> +	void *p = (void __force *)dst;
> +	u64 qc = c;
> +
> +	qc |= qc << 8;
> +	qc |= qc << 16;
> +	qc |= qc << 32;
> +
> +	__iowmb();
> +	while (count && !IO_CHECK_ALIGN(p, 8)) {
> +		writeb_relaxed(c, p);
> +		p++;
> +		count--;
> +	}
> +
> +	while (count >= 8) {
> +		writeq_relaxed(c, p);
> +		p += 8;
> +		count -= 8;
> +	}
> +
>  	while (count) {
> +		writeb_relaxed(c, p);
> +		p++;
>  		count--;
> -		writeb(c, dst);
> -		dst++;
>  	}
>  }
>  EXPORT_SYMBOL(__memset_io);
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> hosted by The Linux Foundation
> 

-- 
 Sent by an employee of the Qualcomm Innovation Center, Inc.
 The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-08-01  6:30 ` Joonwoo Park
@ 2014-08-01  8:32   ` Will Deacon
  2014-08-02  1:38     ` Joonwoo Park
  2014-10-03 16:31   ` Catalin Marinas
  1 sibling, 1 reply; 12+ messages in thread
From: Will Deacon @ 2014-08-01  8:32 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote:
> + Catalin, Will
> 
> Thanks,
> Joonwoo
> On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
> > as much as possible with minimized barrier usage.  This simplest optimization
> > brings faster throughput compare to current byte-by-byte read and write with
> > barrier in the loop.  Code's skeleton is taken from the powerpc.

Hmm, I've never really understood the use-case for memcpy_{to,from}io on
ARM, so getting to the bottom of that would help in reviewing this patch.

Can you point me at the drivers which are using this for ARM please? Doing a
blind byte-by-byte copy could easily cause problems with some peripherals,
so there must be an underlying assumption somewhere about how this is
supposed to work.

Will

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-08-01  8:32   ` Will Deacon
@ 2014-08-02  1:38     ` Joonwoo Park
  2014-08-04  9:57       ` Will Deacon
  0 siblings, 1 reply; 12+ messages in thread
From: Joonwoo Park @ 2014-08-02  1:38 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Aug 01, 2014 at 09:32:46AM +0100, Will Deacon wrote:
> On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote:
> > + Catalin, Will
> > 
> > Thanks,
> > Joonwoo
> > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
> > > as much as possible with minimized barrier usage.  This simplest optimization
> > > brings faster throughput compare to current byte-by-byte read and write with
> > > barrier in the loop.  Code's skeleton is taken from the powerpc.
> 
> Hmm, I've never really understood the use-case for memcpy_{to,from}io on
> ARM, so getting to the bottom of that would help in reviewing this patch.
> 
> Can you point me at the drivers which are using this for ARM please? Doing a
Sure.  This peripheral-loader.c driver now moved under drivers/soc/ so it can be used for ARM and ARM64.
https://android.googlesource.com/kernel/msm.git/+/db34f44bcba24345d26b8a4b8137cf94d70afa73/arch/arm/mach-msm/peripheral-loader.c
static int load_segment(const struct elf32_phdr *phdr, unsigned num, struct pil_device *pil)
{
<snip>
	while (count > 0) {
		int size;
		u8 __iomem *buf;
		size = min_t(size_t, IOMAP_SIZE, count);
		buf = ioremap(paddr, size);
	}
	<snip>
	memset(buf, 0, size);
<snip>
}
As you can see the function load_segment() does ioremap() followed by memset() and memcpy() which can cause unaligned multi-byte (maybe ARM64 traps 8byte unaligned access?) write to device memory.
Because of this I was fixing the driver to use memset_io() and memcpy_io() but the existing implementations were too slow compare to the one I'm proposing.

> blind byte-by-byte copy could easily cause problems with some peripherals,
> so there must be an underlying assumption somewhere about how this is
> supposed to work.
Would you mind to explain more about the problem with byte-by-byte copying you're worried about?
I thought byte-by-byte copy always safe with regard to aligned access and that's the reason existing implementation does byte-by-byte copy.
I can imagine there are some peripherals don't allow per-byte access.  But if that is the case, should they not use memset_io() and memcpy_{from,to}io() anyway?

Also I found this.  http://lists.infradead.org/pipermail/linux-arm-kernel/2014-February/234729.html
Looks like Catalin also had a similar idea with mine.

Thanks,
Joonwoo

> 
> Will

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-08-02  1:38     ` Joonwoo Park
@ 2014-08-04  9:57       ` Will Deacon
  0 siblings, 0 replies; 12+ messages in thread
From: Will Deacon @ 2014-08-04  9:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, Aug 02, 2014 at 02:38:34AM +0100, Joonwoo Park wrote:
> On Fri, Aug 01, 2014 at 09:32:46AM +0100, Will Deacon wrote:
> > On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote:
> > > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
> > > > as much as possible with minimized barrier usage.  This simplest optimization
> > > > brings faster throughput compare to current byte-by-byte read and write with
> > > > barrier in the loop.  Code's skeleton is taken from the powerpc.
> > 
> > Hmm, I've never really understood the use-case for memcpy_{to,from}io on
> > ARM, so getting to the bottom of that would help in reviewing this patch.
> > 
> > Can you point me at the drivers which are using this for ARM please? Doing a
> Sure.  This peripheral-loader.c driver now moved under drivers/soc/ so it
> can be used for ARM and ARM64.
> https://android.googlesource.com/kernel/msm.git/+/db34f44bcba24345d26b8a4b8137cf94d70afa73/arch/arm/mach-msm/peripheral-loader.c
> static int load_segment(const struct elf32_phdr *phdr, unsigned num, struct pil_device *pil)
> {
> <snip>
> 	while (count > 0) {
> 		int size;
> 		u8 __iomem *buf;
> 		size = min_t(size_t, IOMAP_SIZE, count);
> 		buf = ioremap(paddr, size);
> 	}
> 	<snip>
> 	memset(buf, 0, size);
> <snip>

Right, but that code doesn't exist in mainline afaict.

> As you can see the function load_segment() does ioremap() followed by
> memset() and memcpy() which can cause unaligned multi-byte (maybe ARM64
> traps 8byte unaligned access?) write to device memory.
> Because of this I was fixing the driver to use memset_io() and memcpy_io()
> but the existing implementations were too slow compare to the one I'm
> proposing.
> 
> > blind byte-by-byte copy could easily cause problems with some peripherals,
> > so there must be an underlying assumption somewhere about how this is
> > supposed to work.
> Would you mind to explain more about the problem with byte-by-byte copying
> you're worried about?
> I thought byte-by-byte copy always safe with regard to aligned access and
> that's the reason existing implementation does byte-by-byte copy.
> I can imagine there are some peripherals don't allow per-byte access.  But
> if that is the case, should they not use memset_io() and
> memcpy_{from,to}io() anyway?

Yes, if somebody tried to use memset_io to zero a bunch of control
registers, for example, you'd likely get a bunch of aborts because the
endpoint would give you a SLVERR for a byte-access to a word register.

It just seems like the expected usage of this function should be documented
somewhere to avoid it becoming highly dependent on the architecture.

Will

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-08-01  6:30 ` Joonwoo Park
  2014-08-01  8:32   ` Will Deacon
@ 2014-10-03 16:31   ` Catalin Marinas
  2014-10-09  2:39     ` Joonwoo Park
  1 sibling, 1 reply; 12+ messages in thread
From: Catalin Marinas @ 2014-10-03 16:31 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
> as much as possible with minimized barrier usage.  This simplest optimization
> brings faster throughput compare to current byte-by-byte read and write with
> barrier in the loop.  Code's skeleton is taken from the powerpc.
> 
> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> Acked-by: Trilok Soni <tsoni@codeaurora.org>

I thought about merging this but there are some issues. Comments below.

> diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> index 7d37ead..c0e3ab1 100644
> --- a/arch/arm64/kernel/io.c
> +++ b/arch/arm64/kernel/io.c
> @@ -20,18 +20,34 @@
>  #include <linux/types.h>
>  #include <linux/io.h>
>  
> +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)

Can you not use just IS_ALIGNED?

>  /*
>   * Copy data from IO memory space to "real" memory space.
>   */
>  void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
>  {
> -	unsigned char *t = to;
> -	while (count) {
> +	while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) {
> +		*(u8 *)to = readb_relaxed(from);

We should not use the relaxed accessors here as we don't really care
about endianness conversion. We just copy data from one place to another
without interpreting it, so __raw_read*() is sufficient.

> +		from++;
> +		to++;
>  		count--;
> -		*t = readb(from);
> -		t++;
> +	}
> +
> +	while (count >= 8) {
> +		*(u64 *)to = readq_relaxed(from);
> +		from += 8;
> +		to += 8;
> +		count -= 8;
> +	}
> +
> +	while (count) {
> +		*(u8 *)to = readb_relaxed(from);
>  		from++;
> +		to++;
> +		count--;
>  	}
> +	__iormb();

We don't need this barrier here. In the readl() implementation, it's use
is for ordering between I/O polling and DMA buffer access.

>  }
>  EXPORT_SYMBOL(__memcpy_fromio);
>  
> @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio);
>   */
>  void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
>  {
> -	const unsigned char *f = from;
> +	void *p = (void __force *)from;

Why do you need this?

> +
> +	__iowmb();

Not needed here either.

> +	while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) {
> +		writeb_relaxed(*(volatile u8 *)from, p);

Oh, so you copy from "from" to "from". Have you really tested this?

> @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio);
>   */
>  void __memset_io(volatile void __iomem *dst, int c, size_t count)
>  {
> +	void *p = (void __force *)dst;

Do you need this?

> +	u64 qc = c;
> +
> +	qc |= qc << 8;
> +	qc |= qc << 16;
> +	qc |= qc << 32;
> +
> +	__iowmb();

What's this for?

> +	while (count && !IO_CHECK_ALIGN(p, 8)) {
> +		writeb_relaxed(c, p);

Using dst here directly here should work (__raw_writeb takes the same
type as the second argument).


-- 
Catalin

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-10-03 16:31   ` Catalin Marinas
@ 2014-10-09  2:39     ` Joonwoo Park
  2014-10-09 10:16       ` Catalin Marinas
  0 siblings, 1 reply; 12+ messages in thread
From: Joonwoo Park @ 2014-10-09  2:39 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote:
> On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
> > as much as possible with minimized barrier usage.  This simplest optimization
> > brings faster throughput compare to current byte-by-byte read and write with
> > barrier in the loop.  Code's skeleton is taken from the powerpc.
> > 
> > Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
> > Acked-by: Trilok Soni <tsoni@codeaurora.org>
> 
> I thought about merging this but there are some issues. Comments below.

Thanks for reviewing.

> 
> > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> > index 7d37ead..c0e3ab1 100644
> > --- a/arch/arm64/kernel/io.c
> > +++ b/arch/arm64/kernel/io.c
> > @@ -20,18 +20,34 @@
> >  #include <linux/types.h>
> >  #include <linux/io.h>
> >  
> > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
> 
> Can you not use just IS_ALIGNED?
> 

Will do.  I would need to cast from/to with unsigned long.

> >  /*
> >   * Copy data from IO memory space to "real" memory space.
> >   */
> >  void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
> >  {
> > -	unsigned char *t = to;
> > -	while (count) {
> > +	while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) {
> > +		*(u8 *)to = readb_relaxed(from);
> 
> We should not use the relaxed accessors here as we don't really care
> about endianness conversion. We just copy data from one place to another
> without interpreting it, so __raw_read*() is sufficient.
> 

Will do.

> > +		from++;
> > +		to++;
> >  		count--;
> > -		*t = readb(from);
> > -		t++;
> > +	}
> > +
> > +	while (count >= 8) {
> > +		*(u64 *)to = readq_relaxed(from);
> > +		from += 8;
> > +		to += 8;
> > +		count -= 8;
> > +	}
> > +
> > +	while (count) {
> > +		*(u8 *)to = readb_relaxed(from);
> >  		from++;
> > +		to++;
> > +		count--;
> >  	}
> > +	__iormb();
> 
> We don't need this barrier here. In the readl() implementation, it's use
> is for ordering between I/O polling and DMA buffer access.
> 

The barriers here and down below are for accessing different devices in a row.
I thought that's what your suggestion too.
http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html

> >  }
> >  EXPORT_SYMBOL(__memcpy_fromio);
> >  
> > @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio);
> >   */
> >  void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
> >  {
> > -	const unsigned char *f = from;
> > +	void *p = (void __force *)from;
> 
> Why do you need this?
> 

Will drop this.

> > +
> > +	__iowmb();
> 
> Not needed here either.
> 
> > +	while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) {
> > +		writeb_relaxed(*(volatile u8 *)from, p);
> 
> Oh, so you copy from "from" to "from". Have you really tested this?
> 

I also found this issue with more testing after sending this patch.  Will fix.

> > @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio);
> >   */
> >  void __memset_io(volatile void __iomem *dst, int c, size_t count)
> >  {
> > +	void *p = (void __force *)dst;
> 
> Do you need this?
> 

Will drop this.

> > +	u64 qc = c;
> > +
> > +	qc |= qc << 8;
> > +	qc |= qc << 16;
> > +	qc |= qc << 32;
> > +
> > +	__iowmb();
> 
> What's this for?
> 

This barrier was for the same reason with one above for writing different devices that doesn't guarantee writing order.

> > +	while (count && !IO_CHECK_ALIGN(p, 8)) {
> > +		writeb_relaxed(c, p);
> 
> Using dst here directly here should work (__raw_writeb takes the same
> type as the second argument).
> 

Will update.

I'm quite not sure if barriers are not needed or not indeed.
The situation I'm worried about is like 'memcpy_io(device A); memcpy_io(device B);' which I think memcpy_io() needs to guarantee the order.
Please let me know.  I'll submit new version then.

Thanks,
Joonwoo

> 
> -- 
> Catalin

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-10-09  2:39     ` Joonwoo Park
@ 2014-10-09 10:16       ` Catalin Marinas
  2014-10-14  4:04         ` Joonwoo Park
  0 siblings, 1 reply; 12+ messages in thread
From: Catalin Marinas @ 2014-10-09 10:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Oct 09, 2014 at 03:39:33AM +0100, Joonwoo Park wrote:
> On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote:
> > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> > > index 7d37ead..c0e3ab1 100644
> > > --- a/arch/arm64/kernel/io.c
> > > +++ b/arch/arm64/kernel/io.c
> > > @@ -20,18 +20,34 @@
> > >  #include <linux/types.h>
> > >  #include <linux/io.h>
> > >  
> > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
> > 
> > Can you not use just IS_ALIGNED?
> 
> Will do.  I would need to cast from/to with unsigned long.

Or define IO_CHECK_ALIGN as IS_ALIGNED((unsigned long)v, a)

> > > +		from++;
> > > +		to++;
> > >  		count--;
> > > -		*t = readb(from);
> > > -		t++;
> > > +	}
> > > +
> > > +	while (count >= 8) {
> > > +		*(u64 *)to = readq_relaxed(from);
> > > +		from += 8;
> > > +		to += 8;
> > > +		count -= 8;
> > > +	}
> > > +
> > > +	while (count) {
> > > +		*(u8 *)to = readb_relaxed(from);
> > >  		from++;
> > > +		to++;
> > > +		count--;
> > >  	}
> > > +	__iormb();
> > 
> > We don't need this barrier here. In the readl() implementation, it's use
> > is for ordering between I/O polling and DMA buffer access.
> 
> The barriers here and down below are for accessing different devices in a row.
> I thought that's what your suggestion too.
> http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html

I think we should leave them out until we find a use case. I currently
don't see any (writel/readl etc. still have the barriers).

> > > +	while (count && !IO_CHECK_ALIGN(p, 8)) {
> > > +		writeb_relaxed(c, p);
> > 
> > Using dst here directly here should work (__raw_writeb takes the same
> > type as the second argument).
> 
> Will update.
> 
> I'm quite not sure if barriers are not needed or not indeed.
> The situation I'm worried about is like 'memcpy_io(device A);
> memcpy_io(device B);' which I think memcpy_io() needs to guarantee the
> order.

Without barriers, ordering between device A and B would not be
guaranteed. But do you have a scenario where this ordering actually
matters? Most case we have something like:

	memcpy_io(device A);	// mapped as Device or Normal NonCacheable
	writel(device B);	// or an I/O register of device A

Here writel() has the correct barrier.

-- 
Catalin

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-10-09 10:16       ` Catalin Marinas
@ 2014-10-14  4:04         ` Joonwoo Park
  2014-10-14  4:12           ` Joonwoo Park
  0 siblings, 1 reply; 12+ messages in thread
From: Joonwoo Park @ 2014-10-14  4:04 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Oct 09, 2014 at 11:16:14AM +0100, Catalin Marinas wrote:
> On Thu, Oct 09, 2014 at 03:39:33AM +0100, Joonwoo Park wrote:
> > On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote:
> > > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote:
> > > > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
> > > > index 7d37ead..c0e3ab1 100644
> > > > --- a/arch/arm64/kernel/io.c
> > > > +++ b/arch/arm64/kernel/io.c
> > > > @@ -20,18 +20,34 @@
> > > >  #include <linux/types.h>
> > > >  #include <linux/io.h>
> > > >  
> > > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
> > > 
> > > Can you not use just IS_ALIGNED?
> > 
> > Will do.  I would need to cast from/to with unsigned long.
> 
> Or define IO_CHECK_ALIGN as IS_ALIGNED((unsigned long)v, a)
> 
> > > > +		from++;
> > > > +		to++;
> > > >  		count--;
> > > > -		*t = readb(from);
> > > > -		t++;
> > > > +	}
> > > > +
> > > > +	while (count >= 8) {
> > > > +		*(u64 *)to = readq_relaxed(from);
> > > > +		from += 8;
> > > > +		to += 8;
> > > > +		count -= 8;
> > > > +	}
> > > > +
> > > > +	while (count) {
> > > > +		*(u8 *)to = readb_relaxed(from);
> > > >  		from++;
> > > > +		to++;
> > > > +		count--;
> > > >  	}
> > > > +	__iormb();
> > > 
> > > We don't need this barrier here. In the readl() implementation, it's use
> > > is for ordering between I/O polling and DMA buffer access.
> > 
> > The barriers here and down below are for accessing different devices in a row.
> > I thought that's what your suggestion too.
> > http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html
> 
> I think we should leave them out until we find a use case. I currently
> don't see any (writel/readl etc. still have the barriers).
> 
> > > > +	while (count && !IO_CHECK_ALIGN(p, 8)) {
> > > > +		writeb_relaxed(c, p);
> > > 
> > > Using dst here directly here should work (__raw_writeb takes the same
> > > type as the second argument).
> > 
> > Will update.
> > 
> > I'm quite not sure if barriers are not needed or not indeed.
> > The situation I'm worried about is like 'memcpy_io(device A);
> > memcpy_io(device B);' which I think memcpy_io() needs to guarantee the
> > order.
> 
> Without barriers, ordering between device A and B would not be
> guaranteed. But do you have a scenario where this ordering actually
> matters? Most case we have something like:
> 
> 	memcpy_io(device A);	// mapped as Device or Normal NonCacheable
> 	writel(device B);	// or an I/O register of device A
> 
> Here writel() has the correct barrier.

I don't have such use case that ordering matters either.  I agree that we should leave until it's needed.

Thanks,
Joonwoo

> 
> -- 
> Catalin

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-10-14  4:04         ` Joonwoo Park
@ 2014-10-14  4:12           ` Joonwoo Park
  2014-10-20 13:33             ` Catalin Marinas
  0 siblings, 1 reply; 12+ messages in thread
From: Joonwoo Park @ 2014-10-14  4:12 UTC (permalink / raw)
  To: linux-arm-kernel

>From 1cb0bb81e0d399255e679337929a8438946a4b9a Mon Sep 17 00:00:00 2001
From: Joonwoo Park <joonwoop@codeaurora.org>
Date: Mon, 28 Jul 2014 17:32:06 -0700
Subject: [PATCH v2] arm64: optimize memcpy_{from,to}io() and memset_io()

Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
as much as possible with minimized barrier usage.  This simplest
optimization brings faster throughput compare to current byte-by-byte read
and write with barrier in the loop.  Code's skeleton is taken from the
powerpc.

Reviewed-by: Trilok Soni <tsoni@codeaurora.org>
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
 Changes in v2:
 * dropped barriers.
 * Use IS_ALIGNED() macro.
 * use __raw_read{b,q} and __raw_write{b,q}.
 * fix bug in __memcpy_toio()

 arch/arm64/kernel/io.c | 66 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index 7d37ead..a47cd65 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -25,12 +25,26 @@
  */
 void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
 {
-	unsigned char *t = to;
-	while (count) {
+	while (count && (!IS_ALIGNED((unsigned long)from, 8) ||
+			 !IS_ALIGNED((unsigned long)to, 8))) {
+		*(u8 *)to = __raw_readb(from);
+		from++;
+		to++;
 		count--;
-		*t = readb(from);
-		t++;
+	}
+
+	while (count >= 8) {
+		*(u64 *)to = __raw_readq(from);
+		from += 8;
+		to += 8;
+		count -= 8;
+	}
+
+	while (count) {
+		*(u8 *)to = __raw_readb(from);
 		from++;
+		to++;
+		count--;
 	}
 }
 EXPORT_SYMBOL(__memcpy_fromio);
@@ -40,12 +54,26 @@ EXPORT_SYMBOL(__memcpy_fromio);
  */
 void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
 {
-	const unsigned char *f = from;
-	while (count) {
+	while (count && (!IS_ALIGNED((unsigned long)to, 8) ||
+			 !IS_ALIGNED((unsigned long)from, 8))) {
+		__raw_writeb(*(volatile u8 *)from, to);
+		from++;
+		to++;
 		count--;
-		writeb(*f, to);
-		f++;
+	}
+
+	while (count >= 8) {
+		__raw_writeq(*(volatile u64 *)from, to);
+		from += 8;
+		to += 8;
+		count -= 8;
+	}
+
+	while (count) {
+		__raw_writeb(*(volatile u8 *)from, to);
+		from++;
 		to++;
+		count--;
 	}
 }
 EXPORT_SYMBOL(__memcpy_toio);
@@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio);
  */
 void __memset_io(volatile void __iomem *dst, int c, size_t count)
 {
-	while (count) {
+	u64 qc = c;
+
+	qc |= qc << 8;
+	qc |= qc << 16;
+	qc |= qc << 32;
+
+	while (count && !IS_ALIGNED((unsigned long)dst, 8)) {
+		__raw_writeb(c, dst);
+		dst++;
 		count--;
-		writeb(c, dst);
+	}
+
+	while (count >= 8) {
+		__raw_writeq(qc, dst);
+		dst += 8;
+		count -= 8;
+	}
+
+	while (count) {
+		__raw_writeb(c, dst);
 		dst++;
+		count--;
 	}
 }
 EXPORT_SYMBOL(__memset_io);
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
  2014-10-14  4:12           ` Joonwoo Park
@ 2014-10-20 13:33             ` Catalin Marinas
  0 siblings, 0 replies; 12+ messages in thread
From: Catalin Marinas @ 2014-10-20 13:33 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 14, 2014 at 05:12:41AM +0100, Joonwoo Park wrote:
> From 1cb0bb81e0d399255e679337929a8438946a4b9a Mon Sep 17 00:00:00 2001
> From: Joonwoo Park <joonwoop@codeaurora.org>
> Date: Mon, 28 Jul 2014 17:32:06 -0700
> Subject: [PATCH v2] arm64: optimize memcpy_{from,to}io() and memset_io()
> 
> Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
> as much as possible with minimized barrier usage.  This simplest
> optimization brings faster throughput compare to current byte-by-byte read
> and write with barrier in the loop.  Code's skeleton is taken from the
> powerpc.
> 
> Reviewed-by: Trilok Soni <tsoni@codeaurora.org>
> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>

[...]

> @@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio);
>   */
>  void __memset_io(volatile void __iomem *dst, int c, size_t count)
>  {
> -	while (count) {
> +	u64 qc = c;

I think you should use a (u8) cast here (I'm not sure there is a
requirement for memset to only pass u8 values).

	u64 qc = (u8)c;

Otherwise the patch looks fine. Since Will is handling the next merging
window, you can add:

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io()
@ 2014-10-21  0:59 Joonwoo Park
  0 siblings, 0 replies; 12+ messages in thread
From: Joonwoo Park @ 2014-10-21  0:59 UTC (permalink / raw)
  To: linux-arm-kernel

Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit
as much as possible with minimized barrier usage.  This simplest
optimization brings faster throughput compare to current byte-by-byte read
and write with barrier in the loop.  Code's skeleton is taken from the
powerpc.

Link: http://lkml.kernel.org/p/20141020133304.GH23751 at e104818-lin.cambridge.arm.com
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Trilok Soni <tsoni@codeaurora.org>
Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
---
 arch/arm64/kernel/io.c | 66 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index 7d37ead..354be2a 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -25,12 +25,26 @@
  */
 void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
 {
-	unsigned char *t = to;
-	while (count) {
+	while (count && (!IS_ALIGNED((unsigned long)from, 8) ||
+			 !IS_ALIGNED((unsigned long)to, 8))) {
+		*(u8 *)to = __raw_readb(from);
+		from++;
+		to++;
 		count--;
-		*t = readb(from);
-		t++;
+	}
+
+	while (count >= 8) {
+		*(u64 *)to = __raw_readq(from);
+		from += 8;
+		to += 8;
+		count -= 8;
+	}
+
+	while (count) {
+		*(u8 *)to = __raw_readb(from);
 		from++;
+		to++;
+		count--;
 	}
 }
 EXPORT_SYMBOL(__memcpy_fromio);
@@ -40,12 +54,26 @@ EXPORT_SYMBOL(__memcpy_fromio);
  */
 void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
 {
-	const unsigned char *f = from;
-	while (count) {
+	while (count && (!IS_ALIGNED((unsigned long)to, 8) ||
+			 !IS_ALIGNED((unsigned long)from, 8))) {
+		__raw_writeb(*(volatile u8 *)from, to);
+		from++;
+		to++;
 		count--;
-		writeb(*f, to);
-		f++;
+	}
+
+	while (count >= 8) {
+		__raw_writeq(*(volatile u64 *)from, to);
+		from += 8;
+		to += 8;
+		count -= 8;
+	}
+
+	while (count) {
+		__raw_writeb(*(volatile u8 *)from, to);
+		from++;
 		to++;
+		count--;
 	}
 }
 EXPORT_SYMBOL(__memcpy_toio);
@@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio);
  */
 void __memset_io(volatile void __iomem *dst, int c, size_t count)
 {
-	while (count) {
+	u64 qc = (u8)c;
+
+	qc |= qc << 8;
+	qc |= qc << 16;
+	qc |= qc << 32;
+
+	while (count && !IS_ALIGNED((unsigned long)dst, 8)) {
+		__raw_writeb(c, dst);
+		dst++;
 		count--;
-		writeb(c, dst);
+	}
+
+	while (count >= 8) {
+		__raw_writeq(qc, dst);
+		dst += 8;
+		count -= 8;
+	}
+
+	while (count) {
+		__raw_writeb(c, dst);
 		dst++;
+		count--;
 	}
 }
 EXPORT_SYMBOL(__memset_io);
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-10-21  0:59 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-07-30  6:28 [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() Joonwoo Park
2014-08-01  6:30 ` Joonwoo Park
2014-08-01  8:32   ` Will Deacon
2014-08-02  1:38     ` Joonwoo Park
2014-08-04  9:57       ` Will Deacon
2014-10-03 16:31   ` Catalin Marinas
2014-10-09  2:39     ` Joonwoo Park
2014-10-09 10:16       ` Catalin Marinas
2014-10-14  4:04         ` Joonwoo Park
2014-10-14  4:12           ` Joonwoo Park
2014-10-20 13:33             ` Catalin Marinas
  -- strict thread matches above, loose matches on Subject: below --
2014-10-21  0:59 Joonwoo Park

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).