From mboxrd@z Thu Jan 1 00:00:00 1970 From: Mike Snitzer Subject: [PATCH v2] x86: optimize memcpy_flushcache Date: Thu, 24 May 2018 14:20:15 -0400 Message-ID: <20180524182013.GA59755@redhat.com> References: <20180519052503.325953342@debian.vm> <20180519052631.730455475@debian.vm> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: Content-Disposition: inline In-Reply-To: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: dm-devel-bounces@redhat.com Errors-To: dm-devel-bounces@redhat.com To: Ingo Molnar , Thomas Gleixner Cc: Dan Williams , X86 ML , Mikulas Patocka , device-mapper development List-Id: dm-devel.ids [v2: revised header, reformatted asm, reduced indent in switch statement. Ingo or Thomas: please review and consider picking this up for 4.18] From: Mikulas Patocka Subject: [PATCH v2] x86: optimize memcpy_flushcache In the context of short, constant-length stores to persistent memory, memcpy_flushcache suffers from a 2% performance degradation compared to explicitly using the "movnti" instruction. Optimize 4-, 8-, and 16-byte memcpy_flushcache calls to explicitly use the movnti instruction with inline assembly. 
Signed-off-by: Mikulas Patocka Reviewed-by: Dan Williams Signed-off-by: Mike Snitzer --- arch/x86/include/asm/string_64.h | 28 +++++++++++++++++++++++++++- arch/x86/lib/usercopy_64.c | 4 ++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 533f74c300c2..aaba83478cdc 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -147,7 +147,33 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt) #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 -void memcpy_flushcache(void *dst, const void *src, size_t cnt); +void __memcpy_flushcache(void *dst, const void *src, size_t cnt); +static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + if (__builtin_constant_p(cnt)) { + switch (cnt) { + case 4: + asm volatile("movntil %1, %0" + : "=m" (*(u32 *)dst) + : "r" (*(u32 *)src)); + return; + case 8: + asm volatile("movntiq %1, %0" + : "=m" (*(u64 *)dst) + : "r" (*(u64 *)src)); + return; + case 16: + asm volatile("movntiq %1, %0" + : "=m" (*(u64 *)dst) + : "r" (*(u64 *)src)); + asm volatile("movntiq %1, %0" + : "=m" (*(u64 *)(dst + 8)) + : "r" (*(u64 *)(src + 8))); + return; + } + } + __memcpy_flushcache(dst, src, cnt); +} #endif #endif /* __KERNEL__ */ diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 75d3776123cc..26f515aa3529 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -133,7 +133,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) return rc; } -void memcpy_flushcache(void *_dst, const void *_src, size_t size) +void __memcpy_flushcache(void *_dst, const void *_src, size_t size) { unsigned long dest = (unsigned long) _dst; unsigned long source = (unsigned long) _src; @@ -196,7 +196,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size) clean_cache_range((void *) dest, size); } } 
-EXPORT_SYMBOL_GPL(memcpy_flushcache); +EXPORT_SYMBOL_GPL(__memcpy_flushcache); void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len) -- 2.15.0