* [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
@ 2014-12-04 13:08 Hannes Frederic Sowa
  2014-12-04 15:56 ` Herbert Xu
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Hannes Frederic Sowa @ 2014-12-04 13:08 UTC (permalink / raw)
  To: netdev; +Cc: Herbert Xu, Jay Vosburgh, Thomas Graf, Daniel Borkmann,
	Eric Dumazet

By default the arch_fast_hash hashing function pointers are initialized
to jhash(2). If a CPU with SSE4.2 is detected during boot-up, they are
updated to the CRC32 variants. This dispatching scheme incurs a function
pointer lookup and an indirect call for every hashing operation.

To keep the number of clobbered registers small, the hashing primitives
are implemented in assembler. This makes it easier to do the dispatch
via alternative_call.
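
For illustration, the call-site difference looks roughly like this
(sketch only, condensed from the removed lib/hash.c and the new
asm/hash.h below):

	/* before: a pointer load plus an indirect call per hash */
	u32 arch_fast_hash(const void *data, u32 len, u32 seed)
	{
		return arch_hash_ops.hash(data, len, seed);
	}

	/* after: a direct call emitted at build time; the alternatives
	 * patching at boot redirects it to __sse42_crc32 on CPUs with
	 * SSE4.2 and to __jhash_trampoline otherwise */
	static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
	{
		u32 hash;

		alternative_call(__jhash_trampoline, __sse42_crc32,
				 X86_FEATURE_XMM4_2,
				 ASM_OUTPUT2("=a" (hash), "=D" (data),
					     "=S" (len), "=d" (seed)),
				 "1" (data), "2" (len), "3" (seed)
				 : "memory", "cc");

		return hash;
	}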

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Jay Vosburgh <jay.vosburgh@canonical.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Daniel Borkmann <dborkman@redhat.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 arch/x86/include/asm/hash.h      |  53 ++++++++++-
 arch/x86/kernel/i386_ksyms_32.c  |   6 ++
 arch/x86/kernel/x8664_ksyms_64.c |   6 ++
 arch/x86/lib/Makefile            |   2 +-
 arch/x86/lib/arch_hash.S         | 192 +++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/hash.c              |  92 -------------------
 arch/x86/lib/jhash.c             |   6 ++
 include/asm-generic/hash.h       |  18 +++-
 include/linux/hash.h             |  34 -------
 lib/Makefile                     |   2 +-
 lib/hash.c                       |  39 --------
 net/openvswitch/flow_table.c     |   2 +-
 12 files changed, 280 insertions(+), 172 deletions(-)
 create mode 100644 arch/x86/lib/arch_hash.S
 delete mode 100644 arch/x86/lib/hash.c
 create mode 100644 arch/x86/lib/jhash.c
 delete mode 100644 lib/hash.c

diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h
index e8c58f8..620081b 100644
--- a/arch/x86/include/asm/hash.h
+++ b/arch/x86/include/asm/hash.h
@@ -1,7 +1,56 @@
 #ifndef _ASM_X86_HASH_H
 #define _ASM_X86_HASH_H
 
-struct fast_hash_ops;
-extern void setup_arch_fast_hash(struct fast_hash_ops *ops);
+#include <linux/cpufeature.h>
+#include <asm/alternative.h>
+
+#include <linux/jhash.h>
+
+#ifdef CONFIG_AS_CRC32
+
+u32 __jhash_trampoline(const void *data, u32 len, u32 seed);
+u32 __sse42_crc32(const void *data, u32 len, u32 seed);
+
+#ifdef CONFIG_X86_64
+
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	u32 hash;
+
+	alternative_call(__jhash_trampoline, __sse42_crc32, X86_FEATURE_XMM4_2,
+			 ASM_OUTPUT2("=a" (hash), "=D" (data), "=S" (len),
+				     "=d" (seed)),
+			 "1" (data), "2" (len), "3" (seed)
+			 : "memory", "cc");
+
+	return hash;
+}
+
+#else /* CONFIG_X86_64 */
+
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	u32 hash;
+
+	alternative_call(__jhash_trampoline, __sse42_crc32, X86_FEATURE_XMM4_2,
+			 ASM_OUTPUT2("=a" (hash), "=d" (len), "=c" (seed)),
+			 "0" (data), "1" (len), "2" (seed)
+			 : "memory", "cc");
+
+	return hash;
+}
+
+#endif /* CONFIG_X86_64 */
+
+#else /* CONFIG_AS_CRC32 */
+
+u32 __jhash(const void *data, u32 len, u32 seed);
+
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
+{
+	return __jhash(data, len, seed);
+}
+
+#endif
 
 #endif /* _ASM_X86_HASH_H */
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 05fd74f..afb98da 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,4 +1,5 @@
 #include <linux/module.h>
+#include <linux/hash.h>
 
 #include <asm/checksum.h>
 #include <asm/pgtable.h>
@@ -38,6 +39,11 @@ EXPORT_SYMBOL(strstr);
 EXPORT_SYMBOL(csum_partial);
 EXPORT_SYMBOL(empty_zero_page);
 
+#ifdef CONFIG_AS_CRC32
+EXPORT_SYMBOL(__sse42_crc32);
+EXPORT_SYMBOL(__jhash_trampoline);
+#endif
+
 #ifdef CONFIG_PREEMPT
 EXPORT_SYMBOL(___preempt_schedule);
 #ifdef CONFIG_CONTEXT_TRACKING
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 0406819..1094c13 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -3,6 +3,7 @@
 
 #include <linux/module.h>
 #include <linux/smp.h>
+#include <linux/hash.h>
 
 #include <net/checksum.h>
 
@@ -42,6 +43,11 @@ EXPORT_SYMBOL(clear_page);
 
 EXPORT_SYMBOL(csum_partial);
 
+#ifdef CONFIG_AS_CRC32
+EXPORT_SYMBOL(__sse42_crc32);
+EXPORT_SYMBOL(__jhash_trampoline);
+#endif
+
 /*
  * Export string functions. We normally rely on gcc builtin for most of these,
  * but gcc sometimes decides not to inline them.
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index db92793..168bbef 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -23,7 +23,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o hash.o
+obj-y += msr.o msr-reg.o msr-reg-export.o jhash.o arch_hash.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/arch_hash.S b/arch/x86/lib/arch_hash.S
new file mode 100644
index 0000000..ff526a4
--- /dev/null
+++ b/arch/x86/lib/arch_hash.S
@@ -0,0 +1,192 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+
+#ifdef CONFIG_AS_CRC32
+
+#ifdef CONFIG_X86_64
+
+ENTRY(__jhash_trampoline)
+	CFI_STARTPROC
+
+	pushq_cfi %rcx
+	pushq_cfi %r8
+	pushq_cfi %r9
+	pushq_cfi %r10
+	pushq_cfi %r11
+
+	call __jhash
+
+	popq_cfi %r11
+	popq_cfi %r10
+	popq_cfi %r9
+	popq_cfi %r8
+	popq_cfi %rcx
+
+	retq
+
+	CFI_ENDPROC
+ENDPROC(__jhash_trampoline)
+
+ENTRY(__sse42_crc32)
+	CFI_STARTPROC
+
+	movq %rdx, %rax
+	cmpq $0x40, %rsi
+	jb .Lcrc_32bytes
+	subq $0x40, %rsi
+
+.Lcrc_64bytes:
+	subq $0x40, %rsi
+	crc32q 0*8(%rdi), %rax
+	crc32q 1*8(%rdi), %rax
+	crc32q 2*8(%rdi), %rax
+	crc32q 3*8(%rdi), %rax
+	crc32q 4*8(%rdi), %rax
+	crc32q 5*8(%rdi), %rax
+	crc32q 6*8(%rdi), %rax
+	crc32q 7*8(%rdi), %rax
+	leaq   8*8(%rdi), %rdi
+	jae .Lcrc_64bytes
+	addq $0x40, %rsi
+
+.Lcrc_32bytes:
+	cmpq $0x20, %rsi
+	jb .Lcrc_16bytes
+
+	subq $0x20, %rsi
+	crc32q 0*8(%rdi), %rax
+	crc32q 1*8(%rdi), %rax
+	crc32q 2*8(%rdi), %rax
+	crc32q 3*8(%rdi), %rax
+	leaq   4*8(%rdi), %rdi
+
+.Lcrc_16bytes:
+	cmpq $0x10, %rsi
+	jb .Lcrc_8bytes
+
+	subq $0x10, %rsi
+	crc32q 0*8(%rdi), %rax
+	crc32q 1*8(%rdi), %rax
+	leaq   2*8(%rdi), %rdi
+
+.Lcrc_8bytes:
+	cmpq $0x8, %rsi
+	jb .Lcrc_4bytes
+
+	subq $0x8, %rsi
+	crc32q (%rdi), %rax
+	leaq 1*8(%rdi), %rdi
+
+.Lcrc_4bytes:
+	cmpq $0x4, %rsi
+	jb .Lcrc_2bytes
+
+	subq $0x4, %rsi
+	crc32l (%rdi), %eax
+	leaq   1*4(%rdi), %rdi
+
+.Lcrc_2bytes:
+	cmpq $0x2, %rsi
+	jb .Lcrc_1bytes
+
+	subq $0x2, %rsi
+	crc32w (%rdi), %eax
+	leaq 1*2(%rdi), %rdi
+
+.Lcrc_1bytes:
+	cmpq $0x1, %rsi
+	jb .Lend
+
+	crc32b (%rdi), %eax
+.Lend:
+	retq
+	CFI_ENDPROC
+ENDPROC(__sse42_crc32)
+
+#else /* CONFIG_X86_32 */
+
+ENTRY(__jhash_trampoline)
+	CFI_STARTPROC
+
+	call __jhash
+
+	retl
+	CFI_ENDPROC
+ENDPROC(__jhash_trampoline)
+
+ENTRY(__sse42_crc32)
+	CFI_STARTPROC
+
+	xchgl %eax,%ecx
+	xchgl %edx,%ecx
+
+	cmpl $0x20, %ecx
+	jb .Lcrc_16bytes
+	subl $0x20, %ecx
+
+.Lcrc_32bytes:
+	subl $0x20, %ecx
+	crc32l 0*4(%edx), %eax
+	crc32l 1*4(%edx), %eax
+	crc32l 2*4(%edx), %eax
+	crc32l 3*4(%edx), %eax
+	crc32l 4*4(%edx), %eax
+	crc32l 5*4(%edx), %eax
+	crc32l 6*4(%edx), %eax
+	crc32l 7*4(%edx), %eax
+	leal   8*4(%edx), %edx
+	jae .Lcrc_32bytes
+	addl $0x20, %ecx
+
+.Lcrc_16bytes:
+	cmpl $0x10, %ecx
+	jb .Lcrc_8bytes
+
+	subl $0x10, %ecx
+	crc32l 0*4(%edx), %eax
+	crc32l 1*4(%edx), %eax
+	crc32l 2*4(%edx), %eax
+	crc32l 3*4(%edx), %eax
+	leal   4*4(%edx), %edx
+
+.Lcrc_8bytes:
+	cmpl $0x8, %ecx
+	jb .Lcrc_4bytes
+
+	subl $0x8, %ecx
+	crc32l 0*4(%edx), %eax
+	crc32l 1*4(%edx), %eax
+	leal   2*4(%edx), %edx
+
+.Lcrc_4bytes:
+	cmpl $0x4, %ecx
+	jb .Lcrc_2bytes
+
+	subl $0x4, %ecx
+	crc32l 0*4(%edx), %eax
+	leal   1*4(%edx), %edx
+
+.Lcrc_2bytes:
+	cmpl $0x2, %ecx
+	jb .Lcrc_1bytes
+
+	subl $0x2, %ecx
+	crc32w (%edx), %eax
+	leal   1*2(%edx), %edx
+
+.Lcrc_1bytes:
+	cmpl $0x1, %ecx
+	jb .Lend
+
+	crc32b (%edx), %eax
+
+.Lend:
+	retl
+
+	CFI_ENDPROC
+ENDPROC(__sse42_crc32)
+
+#endif
+
+#endif /* CONFIG_AS_CRC32 */
diff --git a/arch/x86/lib/hash.c b/arch/x86/lib/hash.c
deleted file mode 100644
index ff4fa51..0000000
--- a/arch/x86/lib/hash.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Some portions derived from code covered by the following notice:
- *
- * Copyright (c) 2010-2013 Intel Corporation. All rights reserved.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in
- *     the documentation and/or other materials provided with the
- *     distribution.
- *   * Neither the name of Intel Corporation nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/hash.h>
-#include <linux/init.h>
-
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include <asm/hash.h>
-
-static inline u32 crc32_u32(u32 crc, u32 val)
-{
-#ifdef CONFIG_AS_CRC32
-	asm ("crc32l %1,%0\n" : "+r" (crc) : "rm" (val));
-#else
-	asm (".byte 0xf2, 0x0f, 0x38, 0xf1, 0xc1" : "+a" (crc) : "c" (val));
-#endif
-	return crc;
-}
-
-static u32 intel_crc4_2_hash(const void *data, u32 len, u32 seed)
-{
-	const u32 *p32 = (const u32 *) data;
-	u32 i, tmp = 0;
-
-	for (i = 0; i < len / 4; i++)
-		seed = crc32_u32(seed, *p32++);
-
-	switch (len & 3) {
-	case 3:
-		tmp |= *((const u8 *) p32 + 2) << 16;
-		/* fallthrough */
-	case 2:
-		tmp |= *((const u8 *) p32 + 1) << 8;
-		/* fallthrough */
-	case 1:
-		tmp |= *((const u8 *) p32);
-		seed = crc32_u32(seed, tmp);
-		break;
-	}
-
-	return seed;
-}
-
-static u32 intel_crc4_2_hash2(const u32 *data, u32 len, u32 seed)
-{
-	const u32 *p32 = (const u32 *) data;
-	u32 i;
-
-	for (i = 0; i < len; i++)
-		seed = crc32_u32(seed, *p32++);
-
-	return seed;
-}
-
-void __init setup_arch_fast_hash(struct fast_hash_ops *ops)
-{
-	if (cpu_has_xmm4_2) {
-		ops->hash  = intel_crc4_2_hash;
-		ops->hash2 = intel_crc4_2_hash2;
-	}
-}
diff --git a/arch/x86/lib/jhash.c b/arch/x86/lib/jhash.c
new file mode 100644
index 0000000..ab4b408
--- /dev/null
+++ b/arch/x86/lib/jhash.c
@@ -0,0 +1,6 @@
+#include <linux/jhash.h>
+
+u32 __jhash(const void *data, u32 len, u32 seed)
+{
+	return jhash(data, len, seed);
+}
diff --git a/include/asm-generic/hash.h b/include/asm-generic/hash.h
index b631284..07b8892 100644
--- a/include/asm-generic/hash.h
+++ b/include/asm-generic/hash.h
@@ -1,9 +1,23 @@
 #ifndef __ASM_GENERIC_HASH_H
 #define __ASM_GENERIC_HASH_H
 
-struct fast_hash_ops;
-static inline void setup_arch_fast_hash(struct fast_hash_ops *ops)
+#include <linux/jhash.h>
+
+/**
+ *	arch_fast_hash - Calculates a hash over a given buffer that can have
+ *			 arbitrary size. This function will eventually use an
+ *			 architecture-optimized hashing implementation if
+ *			 available, and trades off distribution for speed.
+ *
+ *	@data: buffer to hash
+ *	@len: length of buffer in bytes
+ *	@seed: start seed
+ *
+ *	Returns 32bit hash.
+ */
+static inline u32 arch_fast_hash(const void *data, u32 len, u32 seed)
 {
+	return jhash(data, len, seed);
 }
 
 #endif /* __ASM_GENERIC_HASH_H */
diff --git a/include/linux/hash.h b/include/linux/hash.h
index d0494c3..6e8fb02 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -84,38 +84,4 @@ static inline u32 hash32_ptr(const void *ptr)
 	return (u32)val;
 }
 
-struct fast_hash_ops {
-	u32 (*hash)(const void *data, u32 len, u32 seed);
-	u32 (*hash2)(const u32 *data, u32 len, u32 seed);
-};
-
-/**
- *	arch_fast_hash - Caclulates a hash over a given buffer that can have
- *			 arbitrary size. This function will eventually use an
- *			 architecture-optimized hashing implementation if
- *			 available, and trades off distribution for speed.
- *
- *	@data: buffer to hash
- *	@len: length of buffer in bytes
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-extern u32 arch_fast_hash(const void *data, u32 len, u32 seed);
-
-/**
- *	arch_fast_hash2 - Caclulates a hash over a given buffer that has a
- *			  size that is of a multiple of 32bit words. This
- *			  function will eventually use an architecture-
- *			  optimized hashing implementation if available,
- *			  and trades off distribution for speed.
- *
- *	@data: buffer to hash (must be 32bit padded)
- *	@len: number of 32bit words
- *	@seed: start seed
- *
- *	Returns 32bit hash.
- */
-extern u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed);
-
 #endif /* _LINUX_HASH_H */
diff --git a/lib/Makefile b/lib/Makefile
index 0211d2b..4b9baa4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
 	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o percpu_ida.o hash.o rhashtable.o reciprocal_div.o
+	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
diff --git a/lib/hash.c b/lib/hash.c
deleted file mode 100644
index fea973f..0000000
--- a/lib/hash.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* General purpose hashing library
- *
- * That's a start of a kernel hashing library, which can be extended
- * with further algorithms in future. arch_fast_hash{2,}() will
- * eventually resolve to an architecture optimized implementation.
- *
- * Copyright 2013 Francesco Fusco <ffusco@redhat.com>
- * Copyright 2013 Daniel Borkmann <dborkman@redhat.com>
- * Copyright 2013 Thomas Graf <tgraf@redhat.com>
- * Licensed under the GNU General Public License, version 2.0 (GPLv2)
- */
-
-#include <linux/jhash.h>
-#include <linux/hash.h>
-#include <linux/cache.h>
-
-static struct fast_hash_ops arch_hash_ops __read_mostly = {
-	.hash  = jhash,
-	.hash2 = jhash2,
-};
-
-u32 arch_fast_hash(const void *data, u32 len, u32 seed)
-{
-	return arch_hash_ops.hash(data, len, seed);
-}
-EXPORT_SYMBOL_GPL(arch_fast_hash);
-
-u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
-{
-	return arch_hash_ops.hash2(data, len, seed);
-}
-EXPORT_SYMBOL_GPL(arch_fast_hash2);
-
-static int __init hashlib_init(void)
-{
-	setup_arch_fast_hash(&arch_hash_ops);
-	return 0;
-}
-early_initcall(hashlib_init);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index e0a7fef..79bc65d 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -366,7 +366,7 @@ static u32 flow_hash(const struct sw_flow_key *key, int key_start,
 	/* Make sure number of hash bytes are multiple of u32. */
 	BUILD_BUG_ON(sizeof(long) % sizeof(u32));
 
-	return arch_fast_hash2(hash_key, hash_u32s, 0);
+	return arch_fast_hash(hash_key, hash_u32s, 0);
 }
 
 static int flow_key_start(const struct sw_flow_key *key)
-- 
1.9.3

* Re: [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
  2014-12-04 13:08 [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm Hannes Frederic Sowa
@ 2014-12-04 15:56 ` Herbert Xu
  2014-12-04 16:37   ` Hannes Frederic Sowa
  2014-12-04 19:27 ` Jay Vosburgh
  2014-12-09 19:39 ` David Miller
  2 siblings, 1 reply; 6+ messages in thread
From: Herbert Xu @ 2014-12-04 15:56 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: netdev, Jay Vosburgh, Thomas Graf, Daniel Borkmann, Eric Dumazet,
	Linux Kernel Mailing List

On Thu, Dec 04, 2014 at 02:08:50PM +0100, Hannes Frederic Sowa wrote:
> By default the arch_fast_hash hashing function pointers are initialized
> to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
> updated to the CRC32 ones. This dispatching scheme incurs a function
> pointer lookup and indirect call for every hashing operation.

Just curious, is jhash actually faster than generic C CRC32 on
common platforms?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

* Re: [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
  2014-12-04 15:56 ` Herbert Xu
@ 2014-12-04 16:37   ` Hannes Frederic Sowa
  0 siblings, 0 replies; 6+ messages in thread
From: Hannes Frederic Sowa @ 2014-12-04 16:37 UTC (permalink / raw)
  To: Herbert Xu
  Cc: netdev, Jay Vosburgh, Thomas Graf, Daniel Borkmann, Eric Dumazet,
	Linux Kernel Mailing List

On Do, 2014-12-04 at 23:56 +0800, Herbert Xu wrote:
> On Thu, Dec 04, 2014 at 02:08:50PM +0100, Hannes Frederic Sowa wrote:
> > By default the arch_fast_hash hashing function pointers are initialized
> > to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
> > updated to the CRC32 ones. This dispatching scheme incurs a function
> > pointer lookup and indirect call for every hashing operation.
> 
> Just curious, is jhash actually faster than generic C CRC32 on
> common platforms?

Yes, jhash always beats crc32 in software on x86_64 and ia32.
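
A quick-and-dirty way to check that on a given machine is a throwaway
test module along these lines (sketch only, not from the tree; buffer
size and loop count are arbitrary):

	#include <linux/module.h>
	#include <linux/jhash.h>
	#include <linux/crc32.h>
	#include <linux/ktime.h>
	#include <linux/random.h>

	#define BUF_LEN	40		/* roughly an OVS flow key */
	#define LOOPS	(1 << 20)

	static u8 buf[BUF_LEN];

	static int __init hashbench_init(void)
	{
		u64 t0, t1, t2;
		u32 h = 0;
		int i;

		get_random_bytes(buf, sizeof(buf));

		t0 = ktime_get_ns();
		for (i = 0; i < LOOPS; i++)
			h ^= jhash(buf, sizeof(buf), i);
		t1 = ktime_get_ns();
		for (i = 0; i < LOOPS; i++)
			h ^= crc32_le(i, buf, sizeof(buf));	/* generic C CRC32 */
		t2 = ktime_get_ns();

		pr_info("jhash: %llu ns, crc32_le: %llu ns (h=%x)\n",
			t1 - t0, t2 - t1, h);
		return 0;
	}

	static void __exit hashbench_exit(void)
	{
	}

	module_init(hashbench_init);
	module_exit(hashbench_exit);
	MODULE_LICENSE("GPL");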

Bye,
Hannes

* Re: [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
  2014-12-04 13:08 [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm Hannes Frederic Sowa
  2014-12-04 15:56 ` Herbert Xu
@ 2014-12-04 19:27 ` Jay Vosburgh
  2014-12-04 19:32   ` Hannes Frederic Sowa
  2014-12-09 19:39 ` David Miller
  2 siblings, 1 reply; 6+ messages in thread
From: Jay Vosburgh @ 2014-12-04 19:27 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: netdev, Herbert Xu, Thomas Graf, Daniel Borkmann, Eric Dumazet

Hannes Frederic Sowa <hannes@stressinduktion.org> wrote:

>By default the arch_fast_hash hashing function pointers are initialized
>to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
>updated to the CRC32 ones. This dispatching scheme incurs a function
>pointer lookup and indirect call for every hashing operation.
>
>To keep the number of clobbered registers short the hashing primitives
>are implemented in assembler. This makes it easier to do the dispatch
>by alternative_call.

	I have tested this on the same system that panicked with the
original (now reverted) implementation (commit e5a2c8999576 "fast_hash:
avoid indirect function calls"), and it functions correctly and does not
panic.

	I looked at the disassembly, and, as a data point, on a
non-SSE4.2 system the generated code is not as efficient as that from
Hannes' original test patch, found here:

http://comments.gmane.org/gmane.linux.network/338430

	which produced code as follows:

0xffffffffa00b6bb9 <ovs_flow_tbl_insert+0xb9>:  mov    %r15,0x348(%r14)
0xffffffffa00b6bc0 <ovs_flow_tbl_insert+0xc0>:  movzwl 0x28(%r15),%ecx
0xffffffffa00b6bc5 <ovs_flow_tbl_insert+0xc5>:  movzwl 0x2a(%r15),%esi
0xffffffffa00b6bca <ovs_flow_tbl_insert+0xca>:  movzwl %cx,%eax
0xffffffffa00b6bcd <ovs_flow_tbl_insert+0xcd>:  sub    %ecx,%esi
0xffffffffa00b6bcf <ovs_flow_tbl_insert+0xcf>:  lea    0x38(%r14,%rax,1),%rdi
0xffffffffa00b6bd4 <ovs_flow_tbl_insert+0xd4>:  sar    $0x2,%esi
0xffffffffa00b6bd7 <ovs_flow_tbl_insert+0xd7>:  callq  0xffffffff813a7810 <__jhash2>
0xffffffffa00b6bdc <ovs_flow_tbl_insert+0xdc>:  mov    %eax,0x30(%r14)
0xffffffffa00b6be0 <ovs_flow_tbl_insert+0xe0>:  mov    (%rbx),%r13
0xffffffffa00b6be3 <ovs_flow_tbl_insert+0xe3>:  mov    %r14,%rsi
0xffffffffa00b6be6 <ovs_flow_tbl_insert+0xe6>:  mov    %r13,%rdi
0xffffffffa00b6be9 <ovs_flow_tbl_insert+0xe9>:  callq  0xffffffffa00b61a0 <table_instance_insert>

	This patch's code ends up as follows:

0xffffffffa01b5a57 <ovs_flow_tbl_insert+0xb7>:	mov    %r15,0x348(%rcx)
0xffffffffa01b5a5e <ovs_flow_tbl_insert+0xbe>:	movzwl 0x28(%r15),%eax
0xffffffffa01b5a63 <ovs_flow_tbl_insert+0xc3>:	movzwl 0x2a(%r15),%esi
0xffffffffa01b5a68 <ovs_flow_tbl_insert+0xc8>:	movzwl %ax,%edx
0xffffffffa01b5a6b <ovs_flow_tbl_insert+0xcb>:	sub    %eax,%esi
0xffffffffa01b5a6d <ovs_flow_tbl_insert+0xcd>:	lea    0x38(%rcx,%rdx,1),%rdi
0xffffffffa01b5a72 <ovs_flow_tbl_insert+0xd2>:	xor    %edx,%edx
0xffffffffa01b5a74 <ovs_flow_tbl_insert+0xd4>:	sar    $0x2,%esi
0xffffffffa01b5a77 <ovs_flow_tbl_insert+0xd7>:	callq  0xffffffff813ae9f0 <__jhash_trampoline>
0xffffffffa01b5a7c <ovs_flow_tbl_insert+0xdc>:	mov    %eax,0x30(%rcx)
0xffffffffa01b5a7f <ovs_flow_tbl_insert+0xdf>:	mov    (%rbx),%r13
0xffffffffa01b5a82 <ovs_flow_tbl_insert+0xe2>:	mov    %rcx,%rsi
0xffffffffa01b5a85 <ovs_flow_tbl_insert+0xe5>:	mov    %r13,%rdi
0xffffffffa01b5a88 <ovs_flow_tbl_insert+0xe8>:	callq  0xffffffffa01b5030 <table_instance_insert>

0xffffffff813ae9f0 <__jhash_trampoline>:	push   %rcx
0xffffffff813ae9f1 <__jhash_trampoline+0x1>:	push   %r8
0xffffffff813ae9f3 <__jhash_trampoline+0x3>:	push   %r9
0xffffffff813ae9f5 <__jhash_trampoline+0x5>:	push   %r10
0xffffffff813ae9f7 <__jhash_trampoline+0x7>:	push   %r11
0xffffffff813ae9f9 <__jhash_trampoline+0x9>:	callq  0xffffffff813ae8a0 <__jhash>
0xffffffff813ae9fe <__jhash_trampoline+0xe>:	pop    %r11
0xffffffff813aea00 <__jhash_trampoline+0x10>:	pop    %r10
0xffffffff813aea02 <__jhash_trampoline+0x12>:	pop    %r9
0xffffffff813aea04 <__jhash_trampoline+0x14>:	pop    %r8
0xffffffff813aea06 <__jhash_trampoline+0x16>:	pop    %rcx
0xffffffff813aea07 <__jhash_trampoline+0x17>:	retq   

	In any event, this new patch does work correctly in my test that
originally failed, and it's debatable how much optimizing for old
systems is worthwhile.

	I only tested the non-SSE4.2 (i.e., old system) portion on
x86_64.

Tested-by: Jay Vosburgh <jay.vosburgh@canonical.com>

	-J

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

* Re: [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
  2014-12-04 19:27 ` Jay Vosburgh
@ 2014-12-04 19:32   ` Hannes Frederic Sowa
  0 siblings, 0 replies; 6+ messages in thread
From: Hannes Frederic Sowa @ 2014-12-04 19:32 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: netdev, Herbert Xu, Thomas Graf, Daniel Borkmann, Eric Dumazet

Hi Jay,

On Do, 2014-12-04 at 11:27 -0800, Jay Vosburgh wrote:
> Hannes Frederic Sowa <hannes@stressinduktion.org> wrote:
> 
> >By default the arch_fast_hash hashing function pointers are initialized
> >to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
> >updated to the CRC32 ones. This dispatching scheme incurs a function
> >pointer lookup and indirect call for every hashing operation.
> >
> >To keep the number of clobbered registers short the hashing primitives
> >are implemented in assembler. This makes it easier to do the dispatch
> >by alternative_call.
> 
> 	I have tested this on the same system that panicked with the
> original (now reverted) implementation (commit e5a2c8999576 "fast_hash:
> avoid indirect function calls"), and it functions correctly and does not
> panic.
> 
> 	I looked at the disassembly, and, as a data point, on a
> non-SSE4.2 system, the code generated is not as efficient as Hannes'
> original test patch, found here:
> 
> http://comments.gmane.org/gmane.linux.network/338430
> 
> 	In any event, this new patch does work correctly in my test that
> originally failed, and it's debatable how much optimizing for old
> systems is worthwhile.

Yes, that is expected. I also don't have a good idea on how to improve
the hashing on non-SSE4.2 systems in a reasonable amount of time.

> 	I only tested the non-SSE4.2 (i.e., old system) portion on
> x86_64.

I tried every possible setup this time, especially with openvswitch. I
covered ia32 with and without SSE4.2 as well as x86_64, and it always
behaved correctly. Last time the problem was that the static inline
never got emitted as a real function in OVS, whereas in the rhashtable
tests it did get synthesized into a normal out-of-line function because
of the indirect reference.
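
A minimal illustration of that effect (hypothetical names, plain C, not
from the tree): whether a static inline ends up as a real, callable
symbol depends on how it is referenced.

	#include <linux/jhash.h>

	static inline u32 my_hash(const void *p, u32 len, u32 seed)
	{
		return jhash(p, len, seed);
	}

	/* Indirect reference: taking the address forces the compiler to
	 * synthesize an out-of-line copy of my_hash() in this object file. */
	u32 (*hash_ptr)(const void *, u32, u32) = my_hash;

	u32 use_direct(const void *p, u32 len)
	{
		/* Direct call: usually just inlined, so no standalone
		 * my_hash symbol has to exist at all. */
		return my_hash(p, len, 0);
	}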

Thanks a lot,
Hannes

* Re: [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm
  2014-12-04 13:08 [PATCH net-next] arch_fast_hash: avoid indirect function calls and implement hash in asm Hannes Frederic Sowa
  2014-12-04 15:56 ` Herbert Xu
  2014-12-04 19:27 ` Jay Vosburgh
@ 2014-12-09 19:39 ` David Miller
  2 siblings, 0 replies; 6+ messages in thread
From: David Miller @ 2014-12-09 19:39 UTC (permalink / raw)
  To: hannes; +Cc: netdev, herbert, jay.vosburgh, tgraf, dborkman, eric.dumazet

From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Thu,  4 Dec 2014 14:08:50 +0100

> By default the arch_fast_hash hashing function pointers are initialized
> to jhash(2). If during boot-up a CPU with SSE4.2 is detected they get
> updated to the CRC32 ones. This dispatching scheme incurs a function
> pointer lookup and indirect call for every hashing operation.
> 
> To keep the number of clobbered registers short the hashing primitives
> are implemented in assembler. This makes it easier to do the dispatch
> by alternative_call.
> 
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: Jay Vosburgh <jay.vosburgh@canonical.com>
> Cc: Thomas Graf <tgraf@suug.ch>
> Cc: Daniel Borkmann <dborkman@redhat.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>

I'm not applying this, I want this whole facility removed instead.
