From: Richard Henderson
Subject: Re: [Qemu-devel] [PATCH 07/10] tb hash: hash phys_pc, pc, and flags with xxhash
Date: Tue, 5 Apr 2016 14:08:13 -0700
Message-ID: <5704293D.1070105@twiddle.net>
In-Reply-To: <20160405194028.GA6671@flamenco>
To: "Emilio G. Cota"
Cc: MTTCG Devel, Peter Maydell, Peter Crosthwaite, QEMU Developers,
    Sergey Fedorov, Paolo Bonzini, Alex Bennée

On 04/05/2016 12:40 PM, Emilio G. Cota wrote:
> On Tue, Apr 05, 2016 at 09:07:57 -0700, Richard Henderson wrote:
>> On 04/05/2016 08:48 AM, Paolo Bonzini wrote:
>>> I think it's fine to use the struct.  The exact size of the struct
>>> varies from 3 to 5 32-bit words, so it's hard to write nice
>>> size-dependent code for the hash.
>>
>> I don't think it is.  We have 3 integers.  It is trivial to create a
>> simple function of 2 multiplies, 2 adds, and a remainder.
>>
>> Take the primes from xxhash.h, for example:
>>
>>   (phys_pc * PRIME32_2 + pc * PRIME32_3 + flags)
>>     % PRIME32_1
>>     & (CODE_GEN_PHYS_HASH_SIZE - 1)
>>
>> Obviously, some bucket measurements should be taken, but I can well
>> imagine that this might perform just as well as the fully generic
>> hasher.
>
> That function doesn't perform well: 25.06s vs. 21.18s with xxh32.

Sure, that's just what came off the top of my head.  But the point is
that we can do better than dropping the data into memory, particularly
on hosts that do not support unaligned accesses, which is exactly what
the packed structure creates.
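(For reference, here is the formula quoted above written out as a
self-contained function.  This is only an illustrative sketch: the name
simple_tb_hash and the explicit table_size argument are not anything
in-tree, the PRIME32_* constants are copied from xxhash.h, and
table_size stands in for CODE_GEN_PHYS_HASH_SIZE, assumed to be a power
of two.)

#include <stdint.h>

#define PRIME32_1 2654435761U
#define PRIME32_2 2246822519U
#define PRIME32_3 3266489917U

/* Two multiplies, two adds, a remainder, then mask down to a
   power-of-two table size.  */
static inline unsigned simple_tb_hash(uint64_t phys_pc, uint64_t pc,
                                      uint32_t flags, unsigned table_size)
{
    uint64_t h = phys_pc * PRIME32_2 + pc * PRIME32_3 + flags;

    return (unsigned)(h % PRIME32_1) & (table_size - 1);
}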
> * inline:
>
> 00000000001a6800 :
>   1a6800:   48 83 ec 28             sub    $0x28,%rsp
>   1a6804:   69 cf 77 ca eb 85       imul   $0x85ebca77,%edi,%ecx
>   1a680a:   48 89 3c 24             mov    %rdi,(%rsp)
>   1a680e:   48 c1 ef 20             shr    $0x20,%rdi
>   1a6812:   69 ff 77 ca eb 85       imul   $0x85ebca77,%edi,%edi
>   1a6818:   48 89 74 24 08          mov    %rsi,0x8(%rsp)
>   1a681d:   64 48 8b 04 25 28 00    mov    %fs:0x28,%rax
>   1a6824:   00 00
>   1a6826:   48 89 44 24 18          mov    %rax,0x18(%rsp)
>   1a682b:   31 c0                   xor    %eax,%eax
>   1a682d:   81 c1 29 44 23 24       add    $0x24234429,%ecx
>   1a6833:   69 c6 77 ca eb 85       imul   $0x85ebca77,%esi,%eax
>   1a6839:   48 c1 ee 20             shr    $0x20,%rsi
>   1a683d:   81 ef 88 35 14 7a       sub    $0x7a143588,%edi
>   1a6843:   69 f6 77 ca eb 85       imul   $0x85ebca77,%esi,%esi
>   1a6849:   c1 c9 13                ror    $0x13,%ecx
>   1a684c:   c1 cf 13                ror    $0x13,%edi
>   1a684f:   83 c0 01                add    $0x1,%eax
>   1a6852:   69 c9 b1 79 37 9e       imul   $0x9e3779b1,%ecx,%ecx
>   1a6858:   c1 c8 13                ror    $0x13,%eax
>   1a685b:   81 c6 50 86 c8 61       add    $0x61c88650,%esi
>   1a6861:   69 ff b1 79 37 9e       imul   $0x9e3779b1,%edi,%edi
>   1a6867:   c1 ce 13                ror    $0x13,%esi
>   1a686a:   c1 c9 1f                ror    $0x1f,%ecx
>   1a686d:   69 c0 b1 79 37 9e       imul   $0x9e3779b1,%eax,%eax
>   1a6873:   c1 cf 19                ror    $0x19,%edi
>   1a6876:   69 f6 b1 79 37 9e       imul   $0x9e3779b1,%esi,%esi
>   1a687c:   8d 7c 39 14             lea    0x14(%rcx,%rdi,1),%edi
>   1a6880:   c1 c8 14                ror    $0x14,%eax
>   1a6883:   69 d2 3d ae b2 c2       imul   $0xc2b2ae3d,%edx,%edx
>   1a6889:   01 f8                   add    %edi,%eax
>   1a688b:   c1 ce 0e                ror    $0xe,%esi
>   1a688e:   01 c6                   add    %eax,%esi
>   1a6890:   01 f2                   add    %esi,%edx
>   1a6892:   c1 ca 0f                ror    $0xf,%edx
>   1a6895:   69 d2 2f eb d4 27       imul   $0x27d4eb2f,%edx,%edx
>   1a689b:   89 d0                   mov    %edx,%eax
>   1a689d:   c1 e8 0f                shr    $0xf,%eax
>   1a68a0:   31 d0                   xor    %edx,%eax
>   1a68a2:   69 d0 77 ca eb 85       imul   $0x85ebca77,%eax,%edx
>   1a68a8:   89 d0                   mov    %edx,%eax
>   1a68aa:   c1 e8 0d                shr    $0xd,%eax
>   1a68ad:   31 d0                   xor    %edx,%eax
>   1a68af:   69 d0 3d ae b2 c2       imul   $0xc2b2ae3d,%eax,%edx
>   1a68b5:   89 d0                   mov    %edx,%eax
>   1a68b7:   c1 e8 10                shr    $0x10,%eax
>   1a68ba:   31 d0                   xor    %edx,%eax
>   1a68bc:   48 8b 54 24 18          mov    0x18(%rsp),%rdx
>   1a68c1:   64 48 33 14 25 28 00    xor    %fs:0x28,%rdx
>   1a68c8:   00 00
>   1a68ca:   75 05                   jne    1a68d1
>   1a68cc:   48 83 c4 28             add    $0x28,%rsp
>   1a68d0:   c3                      retq

If I inline everything explicitly, we do even better.
r~

0000000000000000 :
   0:   44 69 c6 77 ca eb 85    imul   $0x85ebca77,%esi,%r8d
   7:   48 c1 ee 20             shr    $0x20,%rsi
   b:   69 cf 77 ca eb 85       imul   $0x85ebca77,%edi,%ecx
  11:   48 c1 ef 20             shr    $0x20,%rdi
  15:   41 83 c0 01             add    $0x1,%r8d
  19:   41 c1 c0 0d             rol    $0xd,%r8d
  1d:   81 c1 29 44 23 24       add    $0x24234429,%ecx
  23:   69 c6 77 ca eb 85       imul   $0x85ebca77,%esi,%eax
  29:   c1 c1 0d                rol    $0xd,%ecx
  2c:   41 69 f0 b1 79 37 9e    imul   $0x9e3779b1,%r8d,%esi
  33:   05 50 86 c8 61          add    $0x61c88650,%eax
  38:   69 ff 77 ca eb 85       imul   $0x85ebca77,%edi,%edi
  3e:   c1 c6 0c                rol    $0xc,%esi
  41:   c1 c0 0d                rol    $0xd,%eax
  44:   69 d2 3d ae b2 c2       imul   $0xc2b2ae3d,%edx,%edx
  4a:   81 ef 88 35 14 7a       sub    $0x7a143588,%edi
  50:   69 c9 b1 79 37 9e       imul   $0x9e3779b1,%ecx,%ecx
  56:   8d 54 16 14             lea    0x14(%rsi,%rdx,1),%edx
  5a:   c1 c7 0d                rol    $0xd,%edi
  5d:   69 c0 b1 79 37 9e       imul   $0x9e3779b1,%eax,%eax
  63:   d1 c1                   rol    %ecx
  65:   01 d1                   add    %edx,%ecx
  67:   c1 c8 0e                ror    $0xe,%eax
  6a:   69 d7 b1 79 37 9e       imul   $0x9e3779b1,%edi,%edx
  70:   01 c8                   add    %ecx,%eax
  72:   c1 c2 07                rol    $0x7,%edx
  75:   01 d0                   add    %edx,%eax
  77:   c1 c8 0f                ror    $0xf,%eax
  7a:   69 c0 2f eb d4 27       imul   $0x27d4eb2f,%eax,%eax
  80:   89 c2                   mov    %eax,%edx
  82:   c1 ea 0f                shr    $0xf,%edx
  85:   31 d0                   xor    %edx,%eax
  87:   69 c0 77 ca eb 85       imul   $0x85ebca77,%eax,%eax
  8d:   89 c2                   mov    %eax,%edx
  8f:   c1 ea 0d                shr    $0xd,%edx
  92:   31 d0                   xor    %edx,%eax
  94:   69 c0 3d ae b2 c2       imul   $0xc2b2ae3d,%eax,%eax
  9a:   89 c2                   mov    %eax,%edx
  9c:   c1 ea 10                shr    $0x10,%edx
  9f:   31 d0                   xor    %edx,%eax
  a1:   c3                      retq

#define PRIME32_1   2654435761U
#define PRIME32_2   2246822519U
#define PRIME32_3   3266489917U
#define PRIME32_4    668265263U
#define PRIME32_5    374761393U

#define XXH_rotl32(x, r)  ((x << r) | (x >> (32 - r)))

unsigned tb_hash_func(unsigned long phys_pc, unsigned long pc, unsigned flags)
{
    unsigned h32, seed = 1;
    unsigned w0, w1, w2, w3, w4;
    unsigned v1, v2, v3, v4;

    w0 = phys_pc;
    w1 = phys_pc >> 31 >> 1;
    w2 = pc;
    w3 = pc >> 31 >> 1;
    w4 = flags;

    v1 = seed + PRIME32_1 + PRIME32_2;
    v2 = seed + PRIME32_2;
    v3 = seed + 0;
    v4 = seed - PRIME32_1;

    v1 += w0 * PRIME32_2;
    v1 = XXH_rotl32(v1, 13);
    v1 *= PRIME32_1;

    v2 += w1 * PRIME32_2;
    v2 = XXH_rotl32(v2, 13);
    v2 *= PRIME32_1;

    v3 += w2 * PRIME32_2;
    v3 = XXH_rotl32(v3, 13);
    v3 *= PRIME32_1;

    v4 += w3 * PRIME32_2;
    v4 = XXH_rotl32(v4, 13);
    v4 *= PRIME32_1;

    h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)
        + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
    h32 += 20;

    h32 += w4 * PRIME32_3;
    h32 = XXH_rotl32(h32, 17) * PRIME32_4;

    h32 ^= h32 >> 15;
    h32 *= PRIME32_2;
    h32 ^= h32 >> 13;
    h32 *= PRIME32_3;
    h32 ^= h32 >> 16;

    return h32;
}
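(A quick, non-authoritative sketch of how the function above can be
exercised standalone: the sample addresses are arbitrary, and the
1 << 15 mask is only a stand-in for CODE_GEN_PHYS_HASH_SIZE when
reducing the 32-bit hash to a bucket index.)

#include <stdio.h>

int main(void)
{
    unsigned long phys_pc;
    unsigned long pc = 0x400000;
    unsigned flags = 0;

    /* Hash a run of consecutive guest PCs and show which bucket each
       would land in with a hypothetical 2^15-entry table.  */
    for (phys_pc = 0x400000; phys_pc < 0x400100; phys_pc += 4) {
        unsigned h = tb_hash_func(phys_pc, pc + (phys_pc - 0x400000), flags);
        printf("%#lx -> %#010x (bucket %u)\n",
               phys_pc, h, h & ((1u << 15) - 1));
    }
    return 0;
}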