[PATCH] arm64: optimize memcpy_{from,to}io() and memset

Refresh

This website lore-kernel.gnuweeb.org/all/20140804095719.GE15117@arm.com/t/ is currently offline. Cloudflare\'s Always Online™ shows a snapshot of this web page from the Internet Archive\'s Wayback Machine. To check for the live version, click Refresh.

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() @ 2014-07-30 6:28 Joonwoo Park 2014-08-01 6:30 ` Joonwoo Park 0 siblings, 1 reply; 12+ messages in thread From: Joonwoo Park @ 2014-07-30 6:28 UTC (permalink / raw) To: linux-arm-kernel Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit as much as possible with minimized barrier usage. This simplest optimization brings faster throughput compare to current byte-by-byte read and write with barrier in the loop. Code's skeleton is taken from the powerpc. Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org> Acked-by: Trilok Soni <tsoni@codeaurora.org> --- arch/arm64/kernel/io.c | 72 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index 7d37ead..c0e3ab1 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -20,18 +20,34 @@ #include <linux/types.h> #include <linux/io.h> +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0) + /* * Copy data from IO memory space to "real" memory space. */ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) { - unsigned char *t = to; - while (count) { + while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) { + *(u8 *)to = readb_relaxed(from); + from++; + to++; count--; - *t = readb(from); - t++; + } + + while (count >= 8) { + *(u64 *)to = readq_relaxed(from); + from += 8; + to += 8; + count -= 8; + } + + while (count) { + *(u8 *)to = readb_relaxed(from); from++; + to++; + count--; } + __iormb(); } EXPORT_SYMBOL(__memcpy_fromio); @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio); */ void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) { - const unsigned char *f = from; + void *p = (void __force *)from; + + __iowmb(); + while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) { + writeb_relaxed(*(volatile u8 *)from, p); + from++; + p++; + count--; + } + + while (count >= 8) { + writeq_relaxed(*(volatile u64 *)from, p); + from += 8; + p += 8; + count -= 8; + } + while (count) { + writeb_relaxed(*(volatile u8 *)from, p); + from++; + p++; count--; - writeb(*f, to); - f++; - to++; } } EXPORT_SYMBOL(__memcpy_toio); @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio); */ void __memset_io(volatile void __iomem *dst, int c, size_t count) { + void *p = (void __force *)dst; + u64 qc = c; + + qc |= qc << 8; + qc |= qc << 16; + qc |= qc << 32; + + __iowmb(); + while (count && !IO_CHECK_ALIGN(p, 8)) { + writeb_relaxed(c, p); + p++; + count--; + } + + while (count >= 8) { + writeq_relaxed(c, p); + p += 8; + count -= 8; + } + while (count) { + writeb_relaxed(c, p); + p++; count--; - writeb(c, dst); - dst++; } } EXPORT_SYMBOL(__memset_io); -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply related [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-07-30 6:28 [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() Joonwoo Park @ 2014-08-01 6:30 ` Joonwoo Park 2014-08-01 8:32 ` Will Deacon 2014-10-03 16:31 ` Catalin Marinas 0 siblings, 2 replies; 12+ messages in thread From: Joonwoo Park @ 2014-08-01 6:30 UTC (permalink / raw) To: linux-arm-kernel + Catalin, Will Thanks, Joonwoo On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote: > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit > as much as possible with minimized barrier usage. This simplest optimization > brings faster throughput compare to current byte-by-byte read and write with > barrier in the loop. Code's skeleton is taken from the powerpc. > > Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org> > Acked-by: Trilok Soni <tsoni@codeaurora.org> > --- > arch/arm64/kernel/io.c | 72 +++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 62 insertions(+), 10 deletions(-) > > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c > index 7d37ead..c0e3ab1 100644 > --- a/arch/arm64/kernel/io.c > +++ b/arch/arm64/kernel/io.c > @@ -20,18 +20,34 @@ > #include <linux/types.h> > #include <linux/io.h> > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0) > + > /* > * Copy data from IO memory space to "real" memory space. > */ > void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) > { > - unsigned char *t = to; > - while (count) { > + while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) { > + *(u8 *)to = readb_relaxed(from); > + from++; > + to++; > count--; > - *t = readb(from); > - t++; > + } > + > + while (count >= 8) { > + *(u64 *)to = readq_relaxed(from); > + from += 8; > + to += 8; > + count -= 8; > + } > + > + while (count) { > + *(u8 *)to = readb_relaxed(from); > from++; > + to++; > + count--; > } > + __iormb(); > } > EXPORT_SYMBOL(__memcpy_fromio); > > @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio); > */ > void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) > { > - const unsigned char *f = from; > + void *p = (void __force *)from; > + > + __iowmb(); > + while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) { > + writeb_relaxed(*(volatile u8 *)from, p); > + from++; > + p++; > + count--; > + } > + > + while (count >= 8) { > + writeq_relaxed(*(volatile u64 *)from, p); > + from += 8; > + p += 8; > + count -= 8; > + } > + > while (count) { > + writeb_relaxed(*(volatile u8 *)from, p); > + from++; > + p++; > count--; > - writeb(*f, to); > - f++; > - to++; > } > } > EXPORT_SYMBOL(__memcpy_toio); > @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio); > */ > void __memset_io(volatile void __iomem *dst, int c, size_t count) > { > + void *p = (void __force *)dst; > + u64 qc = c; > + > + qc |= qc << 8; > + qc |= qc << 16; > + qc |= qc << 32; > + > + __iowmb(); > + while (count && !IO_CHECK_ALIGN(p, 8)) { > + writeb_relaxed(c, p); > + p++; > + count--; > + } > + > + while (count >= 8) { > + writeq_relaxed(c, p); > + p += 8; > + count -= 8; > + } > + > while (count) { > + writeb_relaxed(c, p); > + p++; > count--; > - writeb(c, dst); > - dst++; > } > } > EXPORT_SYMBOL(__memset_io); > -- > The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, > hosted by The Linux Foundation > -- Sent by an employee of the Qualcomm Innovation Center, Inc. The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum. ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-08-01 6:30 ` Joonwoo Park @ 2014-08-01 8:32 ` Will Deacon 2014-08-02 1:38 ` Joonwoo Park 2014-10-03 16:31 ` Catalin Marinas 1 sibling, 1 reply; 12+ messages in thread From: Will Deacon @ 2014-08-01 8:32 UTC (permalink / raw) To: linux-arm-kernel On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote: > + Catalin, Will > > Thanks, > Joonwoo > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote: > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit > > as much as possible with minimized barrier usage. This simplest optimization > > brings faster throughput compare to current byte-by-byte read and write with > > barrier in the loop. Code's skeleton is taken from the powerpc. Hmm, I've never really understood the use-case for memcpy_{to,from}io on ARM, so getting to the bottom of that would help in reviewing this patch. Can you point me at the drivers which are using this for ARM please? Doing a blind byte-by-byte copy could easily cause problems with some peripherals, so there must be an underlying assumption somewhere about how this is supposed to work. Will ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-08-01 8:32 ` Will Deacon @ 2014-08-02 1:38 ` Joonwoo Park 2014-08-04 9:57 ` Will Deacon 0 siblings, 1 reply; 12+ messages in thread From: Joonwoo Park @ 2014-08-02 1:38 UTC (permalink / raw) To: linux-arm-kernel On Fri, Aug 01, 2014 at 09:32:46AM +0100, Will Deacon wrote: > On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote: > > + Catalin, Will > > > > Thanks, > > Joonwoo > > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote: > > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit > > > as much as possible with minimized barrier usage. This simplest optimization > > > brings faster throughput compare to current byte-by-byte read and write with > > > barrier in the loop. Code's skeleton is taken from the powerpc. > > Hmm, I've never really understood the use-case for memcpy_{to,from}io on > ARM, so getting to the bottom of that would help in reviewing this patch. > > Can you point me at the drivers which are using this for ARM please? Doing a Sure. This peripheral-loader.c driver now moved under drivers/soc/ so it can be used for ARM and ARM64. https://android.googlesource.com/kernel/msm.git/+/db34f44bcba24345d26b8a4b8137cf94d70afa73/arch/arm/mach-msm/peripheral-loader.c static int load_segment(const struct elf32_phdr *phdr, unsigned num, struct pil_device *pil) { <snip> while (count > 0) { int size; u8 __iomem *buf; size = min_t(size_t, IOMAP_SIZE, count); buf = ioremap(paddr, size); } <snip> memset(buf, 0, size); <snip> } As you can see the function load_segment() does ioremap() followed by memset() and memcpy() which can cause unaligned multi-byte (maybe ARM64 traps 8byte unaligned access?) write to device memory. Because of this I was fixing the driver to use memset_io() and memcpy_io() but the existing implementations were too slow compare to the one I'm proposing. > blind byte-by-byte copy could easily cause problems with some peripherals, > so there must be an underlying assumption somewhere about how this is > supposed to work. Would you mind to explain more about the problem with byte-by-byte copying you're worried about? I thought byte-by-byte copy always safe with regard to aligned access and that's the reason existing implementation does byte-by-byte copy. I can imagine there are some peripherals don't allow per-byte access. But if that is the case, should they not use memset_io() and memcpy_{from,to}io() anyway? Also I found this. http://lists.infradead.org/pipermail/linux-arm-kernel/2014-February/234729.html Looks like Catalin also had a similar idea with mine. Thanks, Joonwoo > > Will -- The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-08-02 1:38 ` Joonwoo Park @ 2014-08-04 9:57 ` Will Deacon 0 siblings, 0 replies; 12+ messages in thread From: Will Deacon @ 2014-08-04 9:57 UTC (permalink / raw) To: linux-arm-kernel On Sat, Aug 02, 2014 at 02:38:34AM +0100, Joonwoo Park wrote: > On Fri, Aug 01, 2014 at 09:32:46AM +0100, Will Deacon wrote: > > On Fri, Aug 01, 2014 at 07:30:09AM +0100, Joonwoo Park wrote: > > > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote: > > > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit > > > > as much as possible with minimized barrier usage. This simplest optimization > > > > brings faster throughput compare to current byte-by-byte read and write with > > > > barrier in the loop. Code's skeleton is taken from the powerpc. > > > > Hmm, I've never really understood the use-case for memcpy_{to,from}io on > > ARM, so getting to the bottom of that would help in reviewing this patch. > > > > Can you point me at the drivers which are using this for ARM please? Doing a > Sure. This peripheral-loader.c driver now moved under drivers/soc/ so it > can be used for ARM and ARM64. > https://android.googlesource.com/kernel/msm.git/+/db34f44bcba24345d26b8a4b8137cf94d70afa73/arch/arm/mach-msm/peripheral-loader.c > static int load_segment(const struct elf32_phdr *phdr, unsigned num, struct pil_device *pil) > { > <snip> > while (count > 0) { > int size; > u8 __iomem *buf; > size = min_t(size_t, IOMAP_SIZE, count); > buf = ioremap(paddr, size); > } > <snip> > memset(buf, 0, size); > <snip> Right, but that code doesn't exist in mainline afaict. > As you can see the function load_segment() does ioremap() followed by > memset() and memcpy() which can cause unaligned multi-byte (maybe ARM64 > traps 8byte unaligned access?) write to device memory. > Because of this I was fixing the driver to use memset_io() and memcpy_io() > but the existing implementations were too slow compare to the one I'm > proposing. > > > blind byte-by-byte copy could easily cause problems with some peripherals, > > so there must be an underlying assumption somewhere about how this is > > supposed to work. > Would you mind to explain more about the problem with byte-by-byte copying > you're worried about? > I thought byte-by-byte copy always safe with regard to aligned access and > that's the reason existing implementation does byte-by-byte copy. > I can imagine there are some peripherals don't allow per-byte access. But > if that is the case, should they not use memset_io() and > memcpy_{from,to}io() anyway? Yes, if somebody tried to use memset_io to zero a bunch of control registers, for example, you'd likely get a bunch of aborts because the endpoint would give you a SLVERR for a byte-access to a word register. It just seems like the expected usage of this function should be documented somewhere to avoid it becoming highly dependent on the architecture. Will ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-08-01 6:30 ` Joonwoo Park 2014-08-01 8:32 ` Will Deacon @ 2014-10-03 16:31 ` Catalin Marinas 2014-10-09 2:39 ` Joonwoo Park 1 sibling, 1 reply; 12+ messages in thread From: Catalin Marinas @ 2014-10-03 16:31 UTC (permalink / raw) To: linux-arm-kernel On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote: > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit > as much as possible with minimized barrier usage. This simplest optimization > brings faster throughput compare to current byte-by-byte read and write with > barrier in the loop. Code's skeleton is taken from the powerpc. > > Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org> > Acked-by: Trilok Soni <tsoni@codeaurora.org> I thought about merging this but there are some issues. Comments below. > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c > index 7d37ead..c0e3ab1 100644 > --- a/arch/arm64/kernel/io.c > +++ b/arch/arm64/kernel/io.c > @@ -20,18 +20,34 @@ > #include <linux/types.h> > #include <linux/io.h> > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0) Can you not use just IS_ALIGNED? > /* > * Copy data from IO memory space to "real" memory space. > */ > void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) > { > - unsigned char *t = to; > - while (count) { > + while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) { > + *(u8 *)to = readb_relaxed(from); We should not use the relaxed accessors here as we don't really care about endianness conversion. We just copy data from one place to another without interpreting it, so __raw_read*() is sufficient. > + from++; > + to++; > count--; > - *t = readb(from); > - t++; > + } > + > + while (count >= 8) { > + *(u64 *)to = readq_relaxed(from); > + from += 8; > + to += 8; > + count -= 8; > + } > + > + while (count) { > + *(u8 *)to = readb_relaxed(from); > from++; > + to++; > + count--; > } > + __iormb(); We don't need this barrier here. In the readl() implementation, it's use is for ordering between I/O polling and DMA buffer access. > } > EXPORT_SYMBOL(__memcpy_fromio); > > @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio); > */ > void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) > { > - const unsigned char *f = from; > + void *p = (void __force *)from; Why do you need this? > + > + __iowmb(); Not needed here either. > + while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) { > + writeb_relaxed(*(volatile u8 *)from, p); Oh, so you copy from "from" to "from". Have you really tested this? > @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio); > */ > void __memset_io(volatile void __iomem *dst, int c, size_t count) > { > + void *p = (void __force *)dst; Do you need this? > + u64 qc = c; > + > + qc |= qc << 8; > + qc |= qc << 16; > + qc |= qc << 32; > + > + __iowmb(); What's this for? > + while (count && !IO_CHECK_ALIGN(p, 8)) { > + writeb_relaxed(c, p); Using dst here directly here should work (__raw_writeb takes the same type as the second argument). -- Catalin ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-10-03 16:31 ` Catalin Marinas @ 2014-10-09 2:39 ` Joonwoo Park 2014-10-09 10:16 ` Catalin Marinas 0 siblings, 1 reply; 12+ messages in thread From: Joonwoo Park @ 2014-10-09 2:39 UTC (permalink / raw) To: linux-arm-kernel On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote: > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote: > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit > > as much as possible with minimized barrier usage. This simplest optimization > > brings faster throughput compare to current byte-by-byte read and write with > > barrier in the loop. Code's skeleton is taken from the powerpc. > > > > Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org> > > Acked-by: Trilok Soni <tsoni@codeaurora.org> > > I thought about merging this but there are some issues. Comments below. Thanks for reviewing. > > > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c > > index 7d37ead..c0e3ab1 100644 > > --- a/arch/arm64/kernel/io.c > > +++ b/arch/arm64/kernel/io.c > > @@ -20,18 +20,34 @@ > > #include <linux/types.h> > > #include <linux/io.h> > > > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0) > > Can you not use just IS_ALIGNED? > Will do. I would need to cast from/to with unsigned long. > > /* > > * Copy data from IO memory space to "real" memory space. > > */ > > void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) > > { > > - unsigned char *t = to; > > - while (count) { > > + while (count && (!IO_CHECK_ALIGN(from, 8) || !IO_CHECK_ALIGN(to, 8))) { > > + *(u8 *)to = readb_relaxed(from); > > We should not use the relaxed accessors here as we don't really care > about endianness conversion. We just copy data from one place to another > without interpreting it, so __raw_read*() is sufficient. > Will do. > > + from++; > > + to++; > > count--; > > - *t = readb(from); > > - t++; > > + } > > + > > + while (count >= 8) { > > + *(u64 *)to = readq_relaxed(from); > > + from += 8; > > + to += 8; > > + count -= 8; > > + } > > + > > + while (count) { > > + *(u8 *)to = readb_relaxed(from); > > from++; > > + to++; > > + count--; > > } > > + __iormb(); > > We don't need this barrier here. In the readl() implementation, it's use > is for ordering between I/O polling and DMA buffer access. > The barriers here and down below are for accessing different devices in a row. I thought that's what your suggestion too. http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html > > } > > EXPORT_SYMBOL(__memcpy_fromio); > > > > @@ -40,12 +56,28 @@ EXPORT_SYMBOL(__memcpy_fromio); > > */ > > void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) > > { > > - const unsigned char *f = from; > > + void *p = (void __force *)from; > > Why do you need this? > Will drop this. > > + > > + __iowmb(); > > Not needed here either. > > > + while (count && (!IO_CHECK_ALIGN(p, 8) || !IO_CHECK_ALIGN(from, 8))) { > > + writeb_relaxed(*(volatile u8 *)from, p); > > Oh, so you copy from "from" to "from". Have you really tested this? > I also found this issue with more testing after sending this patch. Will fix. > > @@ -55,10 +87,30 @@ EXPORT_SYMBOL(__memcpy_toio); > > */ > > void __memset_io(volatile void __iomem *dst, int c, size_t count) > > { > > + void *p = (void __force *)dst; > > Do you need this? > Will drop this. > > + u64 qc = c; > > + > > + qc |= qc << 8; > > + qc |= qc << 16; > > + qc |= qc << 32; > > + > > + __iowmb(); > > What's this for? > This barrier was for the same reason with one above for writing different devices that doesn't guarantee writing order. > > + while (count && !IO_CHECK_ALIGN(p, 8)) { > > + writeb_relaxed(c, p); > > Using dst here directly here should work (__raw_writeb takes the same > type as the second argument). > Will update. I'm quite not sure if barriers are not needed or not indeed. The situation I'm worried about is like 'memcpy_io(device A); memcpy_io(device B);' which I think memcpy_io() needs to guarantee the order. Please let me know. I'll submit new version then. Thanks, Joonwoo > > -- > Catalin -- The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-10-09 2:39 ` Joonwoo Park @ 2014-10-09 10:16 ` Catalin Marinas 2014-10-14 4:04 ` Joonwoo Park 0 siblings, 1 reply; 12+ messages in thread From: Catalin Marinas @ 2014-10-09 10:16 UTC (permalink / raw) To: linux-arm-kernel On Thu, Oct 09, 2014 at 03:39:33AM +0100, Joonwoo Park wrote: > On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote: > > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote: > > > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c > > > index 7d37ead..c0e3ab1 100644 > > > --- a/arch/arm64/kernel/io.c > > > +++ b/arch/arm64/kernel/io.c > > > @@ -20,18 +20,34 @@ > > > #include <linux/types.h> > > > #include <linux/io.h> > > > > > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0) > > > > Can you not use just IS_ALIGNED? > > Will do. I would need to cast from/to with unsigned long. Or define IO_CHECK_ALIGN as IS_ALIGNED((unsigned long)v, a) > > > + from++; > > > + to++; > > > count--; > > > - *t = readb(from); > > > - t++; > > > + } > > > + > > > + while (count >= 8) { > > > + *(u64 *)to = readq_relaxed(from); > > > + from += 8; > > > + to += 8; > > > + count -= 8; > > > + } > > > + > > > + while (count) { > > > + *(u8 *)to = readb_relaxed(from); > > > from++; > > > + to++; > > > + count--; > > > } > > > + __iormb(); > > > > We don't need this barrier here. In the readl() implementation, it's use > > is for ordering between I/O polling and DMA buffer access. > > The barriers here and down below are for accessing different devices in a row. > I thought that's what your suggestion too. > http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html I think we should leave them out until we find a use case. I currently don't see any (writel/readl etc. still have the barriers). > > > + while (count && !IO_CHECK_ALIGN(p, 8)) { > > > + writeb_relaxed(c, p); > > > > Using dst here directly here should work (__raw_writeb takes the same > > type as the second argument). > > Will update. > > I'm quite not sure if barriers are not needed or not indeed. > The situation I'm worried about is like 'memcpy_io(device A); > memcpy_io(device B);' which I think memcpy_io() needs to guarantee the > order. Without barriers, ordering between device A and B would not be guaranteed. But do you have a scenario where this ordering actually matters? Most case we have something like: memcpy_io(device A); // mapped as Device or Normal NonCacheable writel(device B); // or an I/O register of device A Here writel() has the correct barrier. -- Catalin ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-10-09 10:16 ` Catalin Marinas @ 2014-10-14 4:04 ` Joonwoo Park 2014-10-14 4:12 ` Joonwoo Park 0 siblings, 1 reply; 12+ messages in thread From: Joonwoo Park @ 2014-10-14 4:04 UTC (permalink / raw) To: linux-arm-kernel On Thu, Oct 09, 2014 at 11:16:14AM +0100, Catalin Marinas wrote: > On Thu, Oct 09, 2014 at 03:39:33AM +0100, Joonwoo Park wrote: > > On Fri, Oct 03, 2014 at 05:31:42PM +0100, Catalin Marinas wrote: > > > On Tue, Jul 29, 2014 at 11:28:26PM -0700, Joonwoo Park wrote: > > > > diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c > > > > index 7d37ead..c0e3ab1 100644 > > > > --- a/arch/arm64/kernel/io.c > > > > +++ b/arch/arm64/kernel/io.c > > > > @@ -20,18 +20,34 @@ > > > > #include <linux/types.h> > > > > #include <linux/io.h> > > > > > > > > +#define IO_CHECK_ALIGN(v, a) ((((unsigned long)(v)) & ((a) - 1)) == 0) > > > > > > Can you not use just IS_ALIGNED? > > > > Will do. I would need to cast from/to with unsigned long. > > Or define IO_CHECK_ALIGN as IS_ALIGNED((unsigned long)v, a) > > > > > + from++; > > > > + to++; > > > > count--; > > > > - *t = readb(from); > > > > - t++; > > > > + } > > > > + > > > > + while (count >= 8) { > > > > + *(u64 *)to = readq_relaxed(from); > > > > + from += 8; > > > > + to += 8; > > > > + count -= 8; > > > > + } > > > > + > > > > + while (count) { > > > > + *(u8 *)to = readb_relaxed(from); > > > > from++; > > > > + to++; > > > > + count--; > > > > } > > > > + __iormb(); > > > > > > We don't need this barrier here. In the readl() implementation, it's use > > > is for ordering between I/O polling and DMA buffer access. > > > > The barriers here and down below are for accessing different devices in a row. > > I thought that's what your suggestion too. > > http://lists.infradead.org/pipermail/linux-arm-kernel/2012-September/123178.html > > I think we should leave them out until we find a use case. I currently > don't see any (writel/readl etc. still have the barriers). > > > > > + while (count && !IO_CHECK_ALIGN(p, 8)) { > > > > + writeb_relaxed(c, p); > > > > > > Using dst here directly here should work (__raw_writeb takes the same > > > type as the second argument). > > > > Will update. > > > > I'm quite not sure if barriers are not needed or not indeed. > > The situation I'm worried about is like 'memcpy_io(device A); > > memcpy_io(device B);' which I think memcpy_io() needs to guarantee the > > order. > > Without barriers, ordering between device A and B would not be > guaranteed. But do you have a scenario where this ordering actually > matters? Most case we have something like: > > memcpy_io(device A); // mapped as Device or Normal NonCacheable > writel(device B); // or an I/O register of device A > > Here writel() has the correct barrier. I don't have such use case that ordering matters either. I agree that we should leave until it's needed. Thanks, Joonwoo > > -- > Catalin -- The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-10-14 4:04 ` Joonwoo Park @ 2014-10-14 4:12 ` Joonwoo Park 2014-10-20 13:33 ` Catalin Marinas 0 siblings, 1 reply; 12+ messages in thread From: Joonwoo Park @ 2014-10-14 4:12 UTC (permalink / raw) To: linux-arm-kernel >From 1cb0bb81e0d399255e679337929a8438946a4b9a Mon Sep 17 00:00:00 2001 From: Joonwoo Park <joonwoop@codeaurora.org> Date: Mon, 28 Jul 2014 17:32:06 -0700 Subject: [PATCH v2] arm64: optimize memcpy_{from,to}io() and memset_io() Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit as much as possible with minimized barrier usage. This simplest optimization brings faster throughput compare to current byte-by-byte read and write with barrier in the loop. Code's skeleton is taken from the powerpc. Reviewed-by: Trilok Soni <tsoni@codeaurora.org> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org> --- Changes in v2: * dropped barriers. * Use IS_ALIGNED() macro. * use __raw_read{b,q} and __raw_write{b,q}. * fix bug in __memcpy_toio() arch/arm64/kernel/io.c | 66 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index 7d37ead..a47cd65 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -25,12 +25,26 @@ */ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) { - unsigned char *t = to; - while (count) { + while (count && (!IS_ALIGNED((unsigned long)from, 8) || + !IS_ALIGNED((unsigned long)to, 8))) { + *(u8 *)to = __raw_readb(from); + from++; + to++; count--; - *t = readb(from); - t++; + } + + while (count >= 8) { + *(u64 *)to = __raw_readq(from); + from += 8; + to += 8; + count -= 8; + } + + while (count) { + *(u8 *)to = __raw_readb(from); from++; + to++; + count--; } } EXPORT_SYMBOL(__memcpy_fromio); @@ -40,12 +54,26 @@ EXPORT_SYMBOL(__memcpy_fromio); */ void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) { - const unsigned char *f = from; - while (count) { + while (count && (!IS_ALIGNED((unsigned long)to, 8) || + !IS_ALIGNED((unsigned long)from, 8))) { + __raw_writeb(*(volatile u8 *)from, to); + from++; + to++; count--; - writeb(*f, to); - f++; + } + + while (count >= 8) { + __raw_writeq(*(volatile u64 *)from, to); + from += 8; + to += 8; + count -= 8; + } + + while (count) { + __raw_writeb(*(volatile u8 *)from, to); + from++; to++; + count--; } } EXPORT_SYMBOL(__memcpy_toio); @@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio); */ void __memset_io(volatile void __iomem *dst, int c, size_t count) { - while (count) { + u64 qc = c; + + qc |= qc << 8; + qc |= qc << 16; + qc |= qc << 32; + + while (count && !IS_ALIGNED((unsigned long)dst, 8)) { + __raw_writeb(c, dst); + dst++; count--; - writeb(c, dst); + } + + while (count >= 8) { + __raw_writeq(qc, dst); + dst += 8; + count -= 8; + } + + while (count) { + __raw_writeb(c, dst); dst++; + count--; } } EXPORT_SYMBOL(__memset_io); -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply related [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() 2014-10-14 4:12 ` Joonwoo Park @ 2014-10-20 13:33 ` Catalin Marinas 0 siblings, 0 replies; 12+ messages in thread From: Catalin Marinas @ 2014-10-20 13:33 UTC (permalink / raw) To: linux-arm-kernel On Tue, Oct 14, 2014 at 05:12:41AM +0100, Joonwoo Park wrote: > From 1cb0bb81e0d399255e679337929a8438946a4b9a Mon Sep 17 00:00:00 2001 > From: Joonwoo Park <joonwoop@codeaurora.org> > Date: Mon, 28 Jul 2014 17:32:06 -0700 > Subject: [PATCH v2] arm64: optimize memcpy_{from,to}io() and memset_io() > > Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit > as much as possible with minimized barrier usage. This simplest > optimization brings faster throughput compare to current byte-by-byte read > and write with barrier in the loop. Code's skeleton is taken from the > powerpc. > > Reviewed-by: Trilok Soni <tsoni@codeaurora.org> > Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org> [...] > @@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio); > */ > void __memset_io(volatile void __iomem *dst, int c, size_t count) > { > - while (count) { > + u64 qc = c; I think you should use a (u8) cast here (I'm not sure there is a requirement for memset to only pass u8 values). u64 qc = (u8)c; Otherwise the patch looks fine. Since Will is handling the next merging window, you can add: Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> ^ permalink raw reply [flat|nested] 12+ messages in thread

* [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() @ 2014-10-21 0:59 Joonwoo Park 0 siblings, 0 replies; 12+ messages in thread From: Joonwoo Park @ 2014-10-21 0:59 UTC (permalink / raw) To: linux-arm-kernel Optimize memcpy_{from,to}io() and memset_io() by transferring in 64 bit as much as possible with minimized barrier usage. This simplest optimization brings faster throughput compare to current byte-by-byte read and write with barrier in the loop. Code's skeleton is taken from the powerpc. Link: http://lkml.kernel.org/p/20141020133304.GH23751 at e104818-lin.cambridge.arm.com Reviewed-by: Catalin Marinas <catalin.marinas@arm.com> Reviewed-by: Trilok Soni <tsoni@codeaurora.org> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org> --- arch/arm64/kernel/io.c | 66 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index 7d37ead..354be2a 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -25,12 +25,26 @@ */ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) { - unsigned char *t = to; - while (count) { + while (count && (!IS_ALIGNED((unsigned long)from, 8) || + !IS_ALIGNED((unsigned long)to, 8))) { + *(u8 *)to = __raw_readb(from); + from++; + to++; count--; - *t = readb(from); - t++; + } + + while (count >= 8) { + *(u64 *)to = __raw_readq(from); + from += 8; + to += 8; + count -= 8; + } + + while (count) { + *(u8 *)to = __raw_readb(from); from++; + to++; + count--; } } EXPORT_SYMBOL(__memcpy_fromio); @@ -40,12 +54,26 @@ EXPORT_SYMBOL(__memcpy_fromio); */ void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count) { - const unsigned char *f = from; - while (count) { + while (count && (!IS_ALIGNED((unsigned long)to, 8) || + !IS_ALIGNED((unsigned long)from, 8))) { + __raw_writeb(*(volatile u8 *)from, to); + from++; + to++; count--; - writeb(*f, to); - f++; + } + + while (count >= 8) { + __raw_writeq(*(volatile u64 *)from, to); + from += 8; + to += 8; + count -= 8; + } + + while (count) { + __raw_writeb(*(volatile u8 *)from, to); + from++; to++; + count--; } } EXPORT_SYMBOL(__memcpy_toio); @@ -55,10 +83,28 @@ EXPORT_SYMBOL(__memcpy_toio); */ void __memset_io(volatile void __iomem *dst, int c, size_t count) { - while (count) { + u64 qc = (u8)c; + + qc |= qc << 8; + qc |= qc << 16; + qc |= qc << 32; + + while (count && !IS_ALIGNED((unsigned long)dst, 8)) { + __raw_writeb(c, dst); + dst++; count--; - writeb(c, dst); + } + + while (count >= 8) { + __raw_writeq(qc, dst); + dst += 8; + count -= 8; + } + + while (count) { + __raw_writeb(c, dst); dst++; + count--; } } EXPORT_SYMBOL(__memset_io); -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by The Linux Foundation ^ permalink raw reply related [flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-10-21 0:59 UTC | newest] Thread overview: 12+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2014-07-30 6:28 [PATCH] arm64: optimize memcpy_{from,to}io() and memset_io() Joonwoo Park 2014-08-01 6:30 ` Joonwoo Park 2014-08-01 8:32 ` Will Deacon 2014-08-02 1:38 ` Joonwoo Park 2014-08-04 9:57 ` Will Deacon 2014-10-03 16:31 ` Catalin Marinas 2014-10-09 2:39 ` Joonwoo Park 2014-10-09 10:16 ` Catalin Marinas 2014-10-14 4:04 ` Joonwoo Park 2014-10-14 4:12 ` Joonwoo Park 2014-10-20 13:33 ` Catalin Marinas -- strict thread matches above, loose matches on Subject: below -- 2014-10-21 0:59 Joonwoo Park