* [patch 1/4] x86: optimize memcpy_flushcache
2018-05-19 5:25 [patch 0/4] dm-writecache patches Mikulas Patocka
@ 2018-05-19 5:25 ` Mikulas Patocka
2018-05-19 14:21 ` Dan Williams
0 siblings, 1 reply; 3+ messages in thread
From: Mikulas Patocka @ 2018-05-19 5:25 UTC (permalink / raw)
To: Mikulas Patocka, Mike Snitzer, Dan Williams; +Cc: dm-devel
[-- Attachment #1: memcpy_flushcache-optimization.patch --]
[-- Type: text/plain, Size: 2651 bytes --]
I use memcpy_flushcache in my persistent memory driver for metadata
updates, and it turns out that the overhead of memcpy_flushcache causes a 2%
performance degradation compared to the "movnti" instruction explicitly coded
using inline assembler.
This patch recognizes memcpy_flushcache calls with constant short length
and turns them into inline assembler - so that I don't have to use inline
assembler in the driver.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
arch/x86/include/asm/string_64.h | 20 +++++++++++++++++++-
arch/x86/lib/usercopy_64.c | 4 ++--
2 files changed, 21 insertions(+), 3 deletions(-)
Index: linux-2.6/arch/x86/include/asm/string_64.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/string_64.h 2018-05-18 21:21:15.000000000 +0200
+++ linux-2.6/arch/x86/include/asm/string_64.h 2018-05-18 21:21:15.000000000 +0200
@@ -147,7 +147,25 @@ memcpy_mcsafe(void *dst, const void *src
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+ if (__builtin_constant_p(cnt)) {
+ switch (cnt) {
+ case 4:
+ asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+ return;
+ case 8:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ return;
+ case 16:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+ return;
+ }
+ }
+ __memcpy_flushcache(dst, src, cnt);
+}
#endif
#endif /* __KERNEL__ */
Index: linux-2.6/arch/x86/lib/usercopy_64.c
===================================================================
--- linux-2.6.orig/arch/x86/lib/usercopy_64.c 2018-05-18 21:21:15.000000000 +0200
+++ linux-2.6/arch/x86/lib/usercopy_64.c 2018-05-18 22:09:49.000000000 +0200
@@ -133,7 +133,7 @@ long __copy_user_flushcache(void *dst, c
return rc;
}
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
{
unsigned long dest = (unsigned long) _dst;
unsigned long source = (unsigned long) _src;
@@ -196,7 +196,7 @@ void memcpy_flushcache(void *_dst, const
clean_cache_range((void *) dest, size);
}
}
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len)
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [patch 1/4] x86: optimize memcpy_flushcache
2018-05-19 5:25 ` [patch 1/4] x86: optimize memcpy_flushcache Mikulas Patocka
@ 2018-05-19 14:21 ` Dan Williams
0 siblings, 0 replies; 3+ messages in thread
From: Dan Williams @ 2018-05-19 14:21 UTC (permalink / raw)
To: Mikulas Patocka
Cc: Mike Snitzer, X86 ML, Ingo Molnar, device-mapper development,
Thomas Gleixner
[ add x86 folks for their review / ack ]
On Fri, May 18, 2018 at 10:25 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
> I use memcpy_flushcache in my persistent memory driver for metadata
> updates, and it turns out that the overhead of memcpy_flushcache causes a 2%
> performance degradation compared to the "movnti" instruction explicitly coded
> using inline assembler.
>
> This patch recognizes memcpy_flushcache calls with constant short length
> and turns them into inline assembler - so that I don't have to use inline
> assembler in the driver.
>
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>
> ---
> arch/x86/include/asm/string_64.h | 20 +++++++++++++++++++-
> arch/x86/lib/usercopy_64.c | 4 ++--
> 2 files changed, 21 insertions(+), 3 deletions(-)
>
> Index: linux-2.6/arch/x86/include/asm/string_64.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/string_64.h 2018-05-18 21:21:15.000000000 +0200
> +++ linux-2.6/arch/x86/include/asm/string_64.h 2018-05-18 21:21:15.000000000 +0200
> @@ -147,7 +147,25 @@ memcpy_mcsafe(void *dst, const void *src
>
> #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
> #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
> -void memcpy_flushcache(void *dst, const void *src, size_t cnt);
> +void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
> +static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
> +{
> + if (__builtin_constant_p(cnt)) {
> + switch (cnt) {
> + case 4:
> + asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
> + return;
> + case 8:
> + asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
> + return;
> + case 16:
> + asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
> + asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
> + return;
> + }
> + }
> + __memcpy_flushcache(dst, src, cnt);
> +}
> #endif
>
> #endif /* __KERNEL__ */
> Index: linux-2.6/arch/x86/lib/usercopy_64.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/lib/usercopy_64.c 2018-05-18 21:21:15.000000000 +0200
> +++ linux-2.6/arch/x86/lib/usercopy_64.c 2018-05-18 22:09:49.000000000 +0200
> @@ -133,7 +133,7 @@ long __copy_user_flushcache(void *dst, c
> return rc;
> }
>
> -void memcpy_flushcache(void *_dst, const void *_src, size_t size)
> +void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
> {
> unsigned long dest = (unsigned long) _dst;
> unsigned long source = (unsigned long) _src;
> @@ -196,7 +196,7 @@ void memcpy_flushcache(void *_dst, const
> clean_cache_range((void *) dest, size);
> }
> }
> -EXPORT_SYMBOL_GPL(memcpy_flushcache);
> +EXPORT_SYMBOL_GPL(__memcpy_flushcache);
>
> void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
> size_t len)
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* [PATCH 1/4] x86: optimize memcpy_flushcache
@ 2018-05-30 13:16 Mikulas Patocka
0 siblings, 0 replies; 3+ messages in thread
From: Mikulas Patocka @ 2018-05-30 13:16 UTC (permalink / raw)
To: Mikulas Patocka, Mike Snitzer, Dan Williams; +Cc: dm-devel
[-- Attachment #1: memcpy_flushcache-optimization.patch --]
[-- Type: text/plain, Size: 2651 bytes --]
I use memcpy_flushcache in my persistent memory driver for metadata
updates, and it turns out that the overhead of memcpy_flushcache causes a 2%
performance degradation compared to the "movnti" instruction explicitly coded
using inline assembler.
This patch recognizes memcpy_flushcache calls with constant short length
and turns them into inline assembler - so that I don't have to use inline
assembler in the driver.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
arch/x86/include/asm/string_64.h | 20 +++++++++++++++++++-
arch/x86/lib/usercopy_64.c | 4 ++--
2 files changed, 21 insertions(+), 3 deletions(-)
Index: linux-2.6/arch/x86/include/asm/string_64.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/string_64.h 2018-05-30 14:25:51.000000000 +0200
+++ linux-2.6/arch/x86/include/asm/string_64.h 2018-05-30 14:25:51.000000000 +0200
@@ -147,7 +147,25 @@ memcpy_mcsafe(void *dst, const void *src
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+ if (__builtin_constant_p(cnt)) {
+ switch (cnt) {
+ case 4:
+ asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+ return;
+ case 8:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ return;
+ case 16:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+ return;
+ }
+ }
+ __memcpy_flushcache(dst, src, cnt);
+}
#endif
#endif /* __KERNEL__ */
Index: linux-2.6/arch/x86/lib/usercopy_64.c
===================================================================
--- linux-2.6.orig/arch/x86/lib/usercopy_64.c 2018-05-30 14:25:51.000000000 +0200
+++ linux-2.6/arch/x86/lib/usercopy_64.c 2018-05-30 14:25:51.000000000 +0200
@@ -133,7 +133,7 @@ long __copy_user_flushcache(void *dst, c
return rc;
}
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
{
unsigned long dest = (unsigned long) _dst;
unsigned long source = (unsigned long) _src;
@@ -196,7 +196,7 @@ void memcpy_flushcache(void *_dst, const
clean_cache_range((void *) dest, size);
}
}
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len)
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2018-05-30 13:16 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-30 13:16 [PATCH 1/4] x86: optimize memcpy_flushcache Mikulas Patocka
-- strict thread matches above, loose matches on Subject: below --
2018-05-19 5:25 [patch 0/4] dm-writecache patches Mikulas Patocka
2018-05-19 5:25 ` [patch 1/4] x86: optimize memcpy_flushcache Mikulas Patocka
2018-05-19 14:21 ` Dan Williams
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.