From: Richard Henderson
Subject: Re: [Qemu-devel] [PATCH 07/10] tb hash: hash phys_pc, pc, and flags with xxhash
Date: Tue, 5 Apr 2016 14:08:13 -0700
Message-ID: <5704293D.1070105@twiddle.net>
In-Reply-To: <20160405194028.GA6671@flamenco>
To: "Emilio G. Cota"
Cc: MTTCG Devel, Peter Maydell, Peter Crosthwaite, QEMU Developers,
    Sergey Fedorov, Paolo Bonzini, Alex Bennée

On 04/05/2016 12:40 PM, Emilio G. Cota wrote:
> On Tue, Apr 05, 2016 at 09:07:57 -0700, Richard Henderson wrote:
>> On 04/05/2016 08:48 AM, Paolo Bonzini wrote:
>>> I think it's fine to use the struct.  The exact size of the struct
>>> varies from 3 to 5 32-bit words, so it's hard to write nice
>>> size-dependent code for the hash.
>>
>> I don't think it is.  We have 3 integers.  It is trivial to create a
>> simple function of 2 multiplies, 2 adds, and a remainder.
>>
>> Take the primes from xxhash.h, for example:
>>
>>   (phys_pc * PRIME32_2 + pc * PRIME32_3 + flags)
>>     % PRIME32_1
>>     & (CODE_GEN_PHYS_HASH_SIZE - 1)
>>
>> Obviously, some bucket measurements should be taken, but I can well
>> imagine that this might perform just as well as the fully generic
>> hasher.
>
> That function doesn't perform well: 25.06s vs. 21.18s with xxh32.

Sure, that's just what came off the top of my head.  But the point is
that we can do better than dropping the data into memory, particularly
on hosts that do not support unaligned accesses, which is exactly what
the packed structure creates.
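(For reference, here is the formula quoted above written out as a
self-contained function.  This is only an illustrative sketch: the name
simple_tb_hash and the explicit table_size argument are not anything
in-tree, the PRIME32_* constants are copied from xxhash.h, and
table_size stands in for CODE_GEN_PHYS_HASH_SIZE, assumed to be a power
of two.)

#include <stdint.h>

#define PRIME32_1 2654435761U
#define PRIME32_2 2246822519U
#define PRIME32_3 3266489917U

/* Two multiplies, two adds, a remainder, then mask down to a
   power-of-two table size.  */
static inline unsigned simple_tb_hash(uint64_t phys_pc, uint64_t pc,
                                      uint32_t flags, unsigned table_size)
{
    uint64_t h = phys_pc * PRIME32_2 + pc * PRIME32_3 + flags;

    return (unsigned)(h % PRIME32_1) & (table_size - 1);
}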
> * inline:
>
> 00000000001a6800 :
>   1a6800:   48 83 ec 28             sub    $0x28,%rsp
>   1a6804:   69 cf 77 ca eb 85       imul   $0x85ebca77,%edi,%ecx
>   1a680a:   48 89 3c 24             mov    %rdi,(%rsp)
>   1a680e:   48 c1 ef 20             shr    $0x20,%rdi
>   1a6812:   69 ff 77 ca eb 85       imul   $0x85ebca77,%edi,%edi
>   1a6818:   48 89 74 24 08          mov    %rsi,0x8(%rsp)
>   1a681d:   64 48 8b 04 25 28 00    mov    %fs:0x28,%rax
>   1a6824:   00 00
>   1a6826:   48 89 44 24 18          mov    %rax,0x18(%rsp)
>   1a682b:   31 c0                   xor    %eax,%eax
>   1a682d:   81 c1 29 44 23 24       add    $0x24234429,%ecx
>   1a6833:   69 c6 77 ca eb 85       imul   $0x85ebca77,%esi,%eax
>   1a6839:   48 c1 ee 20             shr    $0x20,%rsi
>   1a683d:   81 ef 88 35 14 7a       sub    $0x7a143588,%edi
>   1a6843:   69 f6 77 ca eb 85       imul   $0x85ebca77,%esi,%esi
>   1a6849:   c1 c9 13                ror    $0x13,%ecx
>   1a684c:   c1 cf 13                ror    $0x13,%edi
>   1a684f:   83 c0 01                add    $0x1,%eax
>   1a6852:   69 c9 b1 79 37 9e       imul   $0x9e3779b1,%ecx,%ecx
>   1a6858:   c1 c8 13                ror    $0x13,%eax
>   1a685b:   81 c6 50 86 c8 61       add    $0x61c88650,%esi
>   1a6861:   69 ff b1 79 37 9e       imul   $0x9e3779b1,%edi,%edi
>   1a6867:   c1 ce 13                ror    $0x13,%esi
>   1a686a:   c1 c9 1f                ror    $0x1f,%ecx
>   1a686d:   69 c0 b1 79 37 9e       imul   $0x9e3779b1,%eax,%eax
>   1a6873:   c1 cf 19                ror    $0x19,%edi
>   1a6876:   69 f6 b1 79 37 9e       imul   $0x9e3779b1,%esi,%esi
>   1a687c:   8d 7c 39 14             lea    0x14(%rcx,%rdi,1),%edi
>   1a6880:   c1 c8 14                ror    $0x14,%eax
>   1a6883:   69 d2 3d ae b2 c2       imul   $0xc2b2ae3d,%edx,%edx
>   1a6889:   01 f8                   add    %edi,%eax
>   1a688b:   c1 ce 0e                ror    $0xe,%esi
>   1a688e:   01 c6                   add    %eax,%esi
>   1a6890:   01 f2                   add    %esi,%edx
>   1a6892:   c1 ca 0f                ror    $0xf,%edx
>   1a6895:   69 d2 2f eb d4 27       imul   $0x27d4eb2f,%edx,%edx
>   1a689b:   89 d0                   mov    %edx,%eax
>   1a689d:   c1 e8 0f                shr    $0xf,%eax
>   1a68a0:   31 d0                   xor    %edx,%eax
>   1a68a2:   69 d0 77 ca eb 85       imul   $0x85ebca77,%eax,%edx
>   1a68a8:   89 d0                   mov    %edx,%eax
>   1a68aa:   c1 e8 0d                shr    $0xd,%eax
>   1a68ad:   31 d0                   xor    %edx,%eax
>   1a68af:   69 d0 3d ae b2 c2       imul   $0xc2b2ae3d,%eax,%edx
>   1a68b5:   89 d0                   mov    %edx,%eax
>   1a68b7:   c1 e8 10                shr    $0x10,%eax
>   1a68ba:   31 d0                   xor    %edx,%eax
>   1a68bc:   48 8b 54 24 18          mov    0x18(%rsp),%rdx
>   1a68c1:   64 48 33 14 25 28 00    xor    %fs:0x28,%rdx
>   1a68c8:   00 00
>   1a68ca:   75 05                   jne    1a68d1
>   1a68cc:   48 83 c4 28             add    $0x28,%rsp
>   1a68d0:   c3                      retq

If I inline everything explicitly, we do even better.
r~

0000000000000000 :
   0:   44 69 c6 77 ca eb 85    imul   $0x85ebca77,%esi,%r8d
   7:   48 c1 ee 20             shr    $0x20,%rsi
   b:   69 cf 77 ca eb 85       imul   $0x85ebca77,%edi,%ecx
  11:   48 c1 ef 20             shr    $0x20,%rdi
  15:   41 83 c0 01             add    $0x1,%r8d
  19:   41 c1 c0 0d             rol    $0xd,%r8d
  1d:   81 c1 29 44 23 24       add    $0x24234429,%ecx
  23:   69 c6 77 ca eb 85       imul   $0x85ebca77,%esi,%eax
  29:   c1 c1 0d                rol    $0xd,%ecx
  2c:   41 69 f0 b1 79 37 9e    imul   $0x9e3779b1,%r8d,%esi
  33:   05 50 86 c8 61          add    $0x61c88650,%eax
  38:   69 ff 77 ca eb 85       imul   $0x85ebca77,%edi,%edi
  3e:   c1 c6 0c                rol    $0xc,%esi
  41:   c1 c0 0d                rol    $0xd,%eax
  44:   69 d2 3d ae b2 c2       imul   $0xc2b2ae3d,%edx,%edx
  4a:   81 ef 88 35 14 7a       sub    $0x7a143588,%edi
  50:   69 c9 b1 79 37 9e       imul   $0x9e3779b1,%ecx,%ecx
  56:   8d 54 16 14             lea    0x14(%rsi,%rdx,1),%edx
  5a:   c1 c7 0d                rol    $0xd,%edi
  5d:   69 c0 b1 79 37 9e       imul   $0x9e3779b1,%eax,%eax
  63:   d1 c1                   rol    %ecx
  65:   01 d1                   add    %edx,%ecx
  67:   c1 c8 0e                ror    $0xe,%eax
  6a:   69 d7 b1 79 37 9e       imul   $0x9e3779b1,%edi,%edx
  70:   01 c8                   add    %ecx,%eax
  72:   c1 c2 07                rol    $0x7,%edx
  75:   01 d0                   add    %edx,%eax
  77:   c1 c8 0f                ror    $0xf,%eax
  7a:   69 c0 2f eb d4 27       imul   $0x27d4eb2f,%eax,%eax
  80:   89 c2                   mov    %eax,%edx
  82:   c1 ea 0f                shr    $0xf,%edx
  85:   31 d0                   xor    %edx,%eax
  87:   69 c0 77 ca eb 85       imul   $0x85ebca77,%eax,%eax
  8d:   89 c2                   mov    %eax,%edx
  8f:   c1 ea 0d                shr    $0xd,%edx
  92:   31 d0                   xor    %edx,%eax
  94:   69 c0 3d ae b2 c2       imul   $0xc2b2ae3d,%eax,%eax
  9a:   89 c2                   mov    %eax,%edx
  9c:   c1 ea 10                shr    $0x10,%edx
  9f:   31 d0                   xor    %edx,%eax
  a1:   c3                      retq

#define PRIME32_1   2654435761U
#define PRIME32_2   2246822519U
#define PRIME32_3   3266489917U
#define PRIME32_4    668265263U
#define PRIME32_5    374761393U

#define XXH_rotl32(x, r)  ((x << r) | (x >> (32 - r)))

unsigned tb_hash_func(unsigned long phys_pc, unsigned long pc, unsigned flags)
{
    unsigned h32, seed = 1;
    unsigned w0, w1, w2, w3, w4;
    unsigned v1, v2, v3, v4;

    w0 = phys_pc;
    w1 = phys_pc >> 31 >> 1;
    w2 = pc;
    w3 = pc >> 31 >> 1;
    w4 = flags;

    v1 = seed + PRIME32_1 + PRIME32_2;
    v2 = seed + PRIME32_2;
    v3 = seed + 0;
    v4 = seed - PRIME32_1;

    v1 += w0 * PRIME32_2;
    v1 = XXH_rotl32(v1, 13);
    v1 *= PRIME32_1;

    v2 += w1 * PRIME32_2;
    v2 = XXH_rotl32(v2, 13);
    v2 *= PRIME32_1;

    v3 += w2 * PRIME32_2;
    v3 = XXH_rotl32(v3, 13);
    v3 *= PRIME32_1;

    v4 += w3 * PRIME32_2;
    v4 = XXH_rotl32(v4, 13);
    v4 *= PRIME32_1;

    h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7)
        + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
    h32 += 20;

    h32 += w4 * PRIME32_3;
    h32 = XXH_rotl32(h32, 17) * PRIME32_4;

    h32 ^= h32 >> 15;
    h32 *= PRIME32_2;
    h32 ^= h32 >> 13;
    h32 *= PRIME32_3;
    h32 ^= h32 >> 16;

    return h32;
}
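(A quick, non-authoritative sketch of how the function above can be
exercised standalone: the sample addresses are arbitrary, and the
1 << 15 mask is only a stand-in for CODE_GEN_PHYS_HASH_SIZE when
reducing the 32-bit hash to a bucket index.)

#include <stdio.h>

int main(void)
{
    unsigned long phys_pc;
    unsigned long pc = 0x400000;
    unsigned flags = 0;

    /* Hash a run of consecutive guest PCs and show which bucket each
       would land in with a hypothetical 2^15-entry table.  */
    for (phys_pc = 0x400000; phys_pc < 0x400100; phys_pc += 4) {
        unsigned h = tb_hash_func(phys_pc, pc + (phys_pc - 0x400000), flags);
        printf("%#lx -> %#010x (bucket %u)\n",
               phys_pc, h, h & ((1u << 15) - 1));
    }
    return 0;
}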