Distribution of longest common hash prefixes

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Distribution of longest common hash prefixes
@ 2007-04-02 14:58 Peter Eriksen
  2007-04-02 15:20 ` Linus Torvalds
  2007-04-02 15:28 ` Peter Eriksen
  0 siblings, 2 replies; 20+ messages in thread
From: Peter Eriksen @ 2007-04-02 14:58 UTC (permalink / raw)
  To: git

Hello,

With the recent and ongoing discussion about a more efficient .idx
format, I got curious about how long the longest common prefix between
two hashes is in git various projects. So I did

$ git rev-list --objects HEAD | cut -b1-40 | sort >proj.list
$ lcprefix proj.list

and decided to compare all neighbour hashes with each other, finding the
longest common prefix in bits between them. These numbers were then
tabulated into bins on the number of bits.

Why does it look so sparse and strange?

The following is the output of the program bellow on the linux-2.6.git
archive:


0000d162d198a60d558ab4be3f54f608ff8b7473
0000de01ec150d1a291564818571f719a6a6190f
lcprefix = 20
00038c17317a3633f176f17876e69d8e31a5c708
00038ef0cad0a60ee7cace23ade5dfb325b7700d
lcprefix = 22
00108a8dd8d4d772ac6a91efde40191392ba4624
00108ba9a78dcef5629ead0e8bea35d0c08c9ea7
lcprefix = 23
001c2d57248586464da570017e368e188eb4d270
001c2d594f26b529fdabb6cf05deaa384f400e12
lcprefix = 28
002c9920c7bc3cbf74b4cdc03491f80f68917528
002c9922e55255556f1bebea8772aa58c9724825
lcprefix = 30
15d954e50cae6e816b534bf959c49a2920bef808
15d954e51e5b40cf4d5930708fe98076adb1063a
lcprefix = 35
d37bdb4d4930ddb390902c19cffe6552d93d3fcf
d37bdb4d4c9c821e4765f10c206fa613f7781b65
lcprefix = 37
 0:  0
 1:  0
 2:  0
 3:  0
 4:  0
 5:  0
 6:  0
 7:  0
 8:  0
 9:  0
10:  0
11:  0
12:  0
13:  0
14:  0
15:  0
16:  0
17:  0
18:  0
19:  8
20: 19
21:  0
22: 87
23: 70
24:  0
25:  0
26:  0
27:  0
28: 116
29:  0
30: 36975
31:  0
32:  0
33:  0
34:  0
35: 325071
36:  0
37: 76996
38:  0
39:  0
40:  0
41:  0
42:  0
43:  0
44:  0
45:  0
46:  0
47:  0
48:  0
49:  0
50:  0
51:  0
52:  0
53:  0
54:  0
55:  0
56:  0
57:  0
58:  0
59:  0
60:  0
61:  0
62:  0
63:  0


======================= >8 ============================
#include <stdio.h>
#include <string.h>


int h2d(char c)
{
	if ('a' <= c && c <= 'f')
		return c-'a'+10;
	else
		return c-'0';
}

int lcprefix(char *a, char *b)
{
	int i = 0;
	int j = 4;
	while (a[i] == b[i])
		i++;

	while ((h2d(a[i])<<j & 128) == (h2d(b[i])<<j & 128))
		j++;
	
	return i*4 + j - 4;
}

int main(int argc, char **argv)
{
	FILE *fp;
	char old[41];
	char cur[41];
	int lcp = 0;
	int table[64];
	int i;

	memset(table, 0, 64*sizeof(int));
	memset(old, '0', 40);
	old[40] = '\0';

	fp = fopen(argv[1], "r");
	fscanf(fp, "%s\n", cur);

	if (lcp < lcprefix(old, cur)) {
		lcp = lcprefix(old, cur);
	}

	table[lcp]++;
	while (fscanf(fp, "%s\n", cur) != EOF) {
		table[lcp]++;
		if (lcp < lcprefix(old, cur)) {
			printf("%s\n%s\n", old, cur);
			lcp = lcprefix(old, cur);
			printf("lcprefix = %d\n", lcp);
		}
		memcpy(old, cur, 40);
	}

	for(i = 0; i < 64; i++) {
		printf("%2d: %2d\n", i, table[i]);
	}
	
	return 0;
}

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-02 14:58 Distribution of longest common hash prefixes Peter Eriksen
@ 2007-04-02 15:20 ` Linus Torvalds
  2007-04-02 16:29   ` Randal L. Schwartz
  2007-04-02 15:28 ` Peter Eriksen
  1 sibling, 1 reply; 20+ messages in thread
From: Linus Torvalds @ 2007-04-02 15:20 UTC (permalink / raw)
  To: Peter Eriksen; +Cc: git



On Mon, 2 Apr 2007, Peter Eriksen wrote:
> 
> Why does it look so sparse and strange?

Because your program is buggy.

You do

	table[lcp]++;

even though "lcp" is always that *maximum* lcp at any time, not the 
current one!

Here's a fixed program (although that first lcp thing is still bogus, I
just left it as you had it), and fixed output.. 

	0000d162d198a60d558ab4be3f54f608ff8b7473
	0000de01ec150d1a291564818571f719a6a6190f
	lcprefix = 20
	00038c17317a3633f176f17876e69d8e31a5c708
	00038ef0cad0a60ee7cace23ade5dfb325b7700d
	lcprefix = 22
	00108a8dd8d4d772ac6a91efde40191392ba4624
	00108ba9a78dcef5629ead0e8bea35d0c08c9ea7
	lcprefix = 23
	001c2d57248586464da570017e368e188eb4d270
	001c2d594f26b529fdabb6cf05deaa384f400e12
	lcprefix = 28
	002c9920c7bc3cbf74b4cdc03491f80f68917528
	002c9922e55255556f1bebea8772aa58c9724825
	lcprefix = 30
	15d954e50cae6e816b534bf959c49a2920bef808
	15d954e51e5b40cf4d5930708fe98076adb1063a
	lcprefix = 35
	d37bdb4d4930ddb390902c19cffe6552d93d3fcf
	d37bdb4d4c9c821e4765f10c206fa613f7781b65
	lcprefix = 37
	 0:  1
	 1:  2
	 2:  4
	 3:  8
	 4: 16
	 5: 32
	 6: 64
	 7: 128
	 8: 256
	 9: 512
	10: 1024
	11: 2048
	12: 4096
	13: 8192
	14: 16384
	15: 32685
	16: 60921
	17: 86511
	18: 84426
	19: 61518
	20: 37450
	21: 20728
	22: 11035
	23: 5562
	24: 2812
	25: 1467
	26: 738
	27: 355
	28: 191
	29: 83
	30: 49
	31: 27
	32: 10
	33:  3
	34:  1
	35:  2
	36:  0
	37:  1
	38:  0
	39:  0
	40:  0
	41:  0
	42:  0
	43:  0
	44:  0
	45:  0
	46:  0
	47:  0
	48:  0
	49:  0
	50:  0
	51:  0
	52:  0
	53:  0
	54:  0
	55:  0
	56:  0
	57:  0
	58:  0
	59:  0
	60:  0
	61:  0
	62:  0
	63:  0

which looks much saner..

		Linus

---
#include <stdio.h>
#include <string.h>


int h2d(char c)
{
	if ('a' <= c && c <= 'f')
		return c-'a'+10;
	else
		return c-'0';
}

int lcprefix(char *a, char *b)
{
	int bits = 0;
	unsigned n1, n2;

	while (*a == *b) {
		bits += 4;
		a++;
		b++;
	}

	n1 = h2d(*a);
	n2 = h2d(*b);

	/* Would make more sense to start from bit 0.. */
	while ((n1 & 8) == (n2 & 8)) {
		bits++;
		n1 <<= 1;
		n2 <<= 1;
	}
	
	return bits;
}

int main(int argc, char **argv)
{
	FILE *fp;
	char old[41];
	char cur[41];
	int lcp = 0;
	int table[64];
	int i;

	memset(table, 0, 64*sizeof(int));
	memset(old, '0', 40);
	old[40] = '\0';

	fp = fopen(argv[1], "r");
	fscanf(fp, "%s\n", cur);

	if (lcp < lcprefix(old, cur)) {
		lcp = lcprefix(old, cur);
	}

	table[lcp]++;
	while (fscanf(fp, "%s\n", cur) != EOF) {
		int newlcp = lcprefix(old, cur);
		table[newlcp]++;
		if (lcp < newlcp) {
			printf("%s\n%s\n", old, cur);
			lcp = newlcp;
			printf("lcprefix = %d\n", newlcp);
		}
		memcpy(old, cur, 40);
	}

	for(i = 0; i < 64; i++) {
		printf("%2d: %2d\n", i, table[i]);
	}
	
	return 0;
}

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-02 14:58 Distribution of longest common hash prefixes Peter Eriksen
  2007-04-02 15:20 ` Linus Torvalds
@ 2007-04-02 15:28 ` Peter Eriksen
  1 sibling, 0 replies; 20+ messages in thread
From: Peter Eriksen @ 2007-04-02 15:28 UTC (permalink / raw)
  To: git

On Mon, Apr 02, 2007 at 04:58:57PM +0200, Peter Eriksen wrote:
...
> Why does it look so sparse and strange?

... and here is the nonbroken version, which is neither sparse nor
strange. Thanks to chris on #git for the nice programming lesson.
More correct output and program bellow.

Peter


 0:  1
 1:  2
 2:  4
 3:  8
 4: 16
 5: 32
 6: 64
 7: 128
 8: 256
 9: 512
10: 1024
11: 2048
12: 4096
13: 8192
14: 16384
15: 32685
16: 60921
17: 86511
18: 84426
19: 61518
20: 37450
21: 20728
22: 11035
23: 5562
24: 2812
25: 1467
26: 738
27: 355
28: 191
29: 83
30: 49
31: 27
32: 10
33:  3
34:  1
35:  2
36:  0
37:  1
38:  0
39:  0
40:  0
41:  0
42:  0
43:  0
44:  0
45:  0
46:  0
47:  0
48:  0
49:  0
50:  0
51:  0
52:  0
53:  0
54:  0
55:  0
56:  0
57:  0
58:  0
59:  0
60:  0
61:  0
62:  0
63:  0



#include <stdio.h>
#include <string.h>


int h2d(char c)
{
	if ('a' <= c && c <= 'f')
		return c-'a'+10;
	else
		return c-'0';
}

int lcprefix(char *a, char *b)
{
	int i = 0;
	int j = 4;
	while (a[i] == b[i])
		i++;

	while ((h2d(a[i])<<j & 128) == (h2d(b[i])<<j & 128))
		j++;
	
	return i*4 + j - 4;
}

int max(int a, int b)
{
	return a < b ? b : a;
}

int main(int argc, char **argv)
{
	FILE *fp;
	char old[41];
	char cur[41];
	int lcp = 0;
	int table[64];
	int i;
	int tmp;

	memset(table, 0, 64*sizeof(int));
	memset(old, '0', 40);
	old[40] = '\0';

	fp = fopen(argv[1], "r");
	fscanf(fp, "%s\n", cur);

	tmp = lcprefix(old, cur);
	table[tmp]++;
	lcp = max(lcp, tmp);

	while (fscanf(fp, "%s\n", cur) != EOF) {
		tmp = lcprefix(old, cur);
		table[tmp]++;
		lcp = max(lcp, tmp);
		if (lcp < tmp) {
			printf("%s\n%s\n", old, cur);
			lcp = lcprefix(old, cur);
			printf("lcprefix = %d\n", lcp);
		}
		memcpy(old, cur, 40);
	}

	for(i = 0; i < 64; i++) {
		printf("%2d: %2d\n", i, table[i]);
	}
	
	return 0;
}

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-02 15:20 ` Linus Torvalds
@ 2007-04-02 16:29   ` Randal L. Schwartz
  2007-04-02 17:00     ` Linus Torvalds
  2007-04-02 17:18     ` James Cloos
  0 siblings, 2 replies; 20+ messages in thread
From: Randal L. Schwartz @ 2007-04-02 16:29 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Peter Eriksen, git

>>>>> "Linus" == Linus Torvalds <torvalds@linux-foundation.org> writes:

Linus> On Mon, 2 Apr 2007, Peter Eriksen wrote:

Linus> #include <stdio.h>
Linus> #include <string.h>


Linus> int h2d(char c)
Linus> {
Linus> 	if ('a' <= c && c <= 'f')
Linus> 		return c-'a'+10;
Linus> 	else
Linus> 		return c-'0';
Linus> }

Linus> int lcprefix(char *a, char *b)
Linus> {
Linus> 	int bits = 0;
Linus> 	unsigned n1, n2;

Linus> 	while (*a == *b) {
Linus> 		bits += 4;
Linus> 		a++;
Linus> 		b++;
Linus> 	}

Linus> 	n1 = h2d(*a);
Linus> 	n2 = h2d(*b);

Linus> 	/* Would make more sense to start from bit 0.. */
Linus> 	while ((n1 & 8) == (n2 & 8)) {
Linus> 		bits++;
Linus> 		n1 <<= 1;
Linus> 		n2 <<= 1;
Linus> 	}
	
Linus> 	return bits;
Linus> }

Linus> int main(int argc, char **argv)
Linus> {
Linus> 	FILE *fp;
Linus> 	char old[41];
Linus> 	char cur[41];
Linus> 	int lcp = 0;
Linus> 	int table[64];
Linus> 	int i;

Linus> 	memset(table, 0, 64*sizeof(int));
Linus> 	memset(old, '0', 40);
Linus> 	old[40] = '\0';

Linus> 	fp = fopen(argv[1], "r");
Linus> 	fscanf(fp, "%s\n", cur);

Linus> 	if (lcp < lcprefix(old, cur)) {
Linus> 		lcp = lcprefix(old, cur);
Linus> 	}

Linus> 	table[lcp]++;
Linus> 	while (fscanf(fp, "%s\n", cur) != EOF) {
Linus> 		int newlcp = lcprefix(old, cur);
Linus> 		table[newlcp]++;
Linus> 		if (lcp < newlcp) {
Linus> 			printf("%s\n%s\n", old, cur);
Linus> 			lcp = newlcp;
Linus> 			printf("lcprefix = %d\n", newlcp);
Linus> 		}
Linus> 		memcpy(old, cur, 40);
Linus> 	}

Linus> 	for(i = 0; i < 64; i++) {
Linus> 		printf("%2d: %2d\n", i, table[i]);
Linus> 	}
	
Linus> 	return 0;
Linus> }

I don't have access to the linux-2.6 kernel, but on git.git at
d8b6a1a10b93666246984a50d64a163e71163aeb I get this:

    $ git-rev-list --objects HEAD | sort | perl -lne '
      substr($_, 40) = "";
      ($p ^ $_) =~ /^(\0*)/;
      $count[length $1]++;
      $p = $_;
      END { print "$_: $count[$_]" for 0..$#count }
    '
    0: 16
    1: 240
    2: 3839
    3: 24458
    4: 8275
    5: 619
    6: 45
    7: 
    8: 1

Yeay Perl. :)

-- 
Randal L. Schwartz - Stonehenge Consulting Services, Inc. - +1 503 777 0095
<merlyn@stonehenge.com> <URL:http://www.stonehenge.com/merlyn/>
Perl/Unix/security consulting, Technical writing, Comedy, etc. etc.
See PerlTraining.Stonehenge.com for onsite and open-enrollment Perl training!

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-02 16:29   ` Randal L. Schwartz
@ 2007-04-02 17:00     ` Linus Torvalds
  2007-04-02 17:17       ` Randal L. Schwartz
  2007-04-02 17:18     ` James Cloos
  1 sibling, 1 reply; 20+ messages in thread
From: Linus Torvalds @ 2007-04-02 17:00 UTC (permalink / raw)
  To: Randal L. Schwartz; +Cc: Peter Eriksen, git

On Mon, 2 Apr 2007, Randal L. Schwartz wrote:
> 
> I don't have access to the linux-2.6 kernel, but on git.git at
> d8b6a1a10b93666246984a50d64a163e71163aeb I get this:
> 
>     $ git-rev-list --objects HEAD | sort | perl -lne '
>       substr($_, 40) = "";
>       ($p ^ $_) =~ /^(\0*)/;
>       $count[length $1]++;
>       $p = $_;
>       END { print "$_: $count[$_]" for 0..$#count }
>     '
>     0: 16
>     1: 240
>     2: 3839
>     3: 24458
>     4: 8275
>     5: 619
>     6: 45
>     7: 
>     8: 1
> 
> Yeay Perl. :)

No yay yet.. That counts hex digits, not bits.

However, both this and Peter's original thing show an interesting pattern 
in common: for the case where the data is dense (ie a few bits in common), 
you actually don't end up counting "bits in common", but "edges when the 
bits change in the sorted output".

For example, in the above, the 16/240/3839 comes simply from the fact that 
there are sixteen times that the first digit changes (and that makes the 
program think that it has zero bits in common). There are 256 times that 
the two first digit changes, but 16 of those the first one changed too, so 
only in 240 cases did just the second digit change).

And there are 4096 places where the three first digit change, but 256 of 
those were already counted, so you get 3840 for the third case (but the 
git repo didn't have enough objects, so you missed one, and then the next 
ones will hit a peak and then start an exponential decrease.

So with a nice random linear distribution (which we'd expect from a good 
hash), you should see an exponential increase to a maximum (which you'd 
expect to be at "floor(lnx(nr-objects))", and then an exponential decrease 
right back.

With the kernel, with 439342 objects reachable from HEAD, the peak should 
be around 4 (for a base-16 thing) and around 18 for the binary thing. 
Which is exactly what you get..

		Linus

--- for the kernel, using your nybble-counter ---
0: 16
1: 240
2: 3840
3: 61357
4: 293375
5: 74775
6: 5372
7: 350
8: 16
9: 1

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-02 17:00     ` Linus Torvalds
@ 2007-04-02 17:17       ` Randal L. Schwartz
  2007-04-02 17:33         ` Randal L. Schwartz
  0 siblings, 1 reply; 20+ messages in thread
From: Randal L. Schwartz @ 2007-04-02 17:17 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Peter Eriksen, git

>>>>> "Linus" == Linus Torvalds <torvalds@linux-foundation.org> writes:

Linus> No yay yet.. That counts hex digits, not bits.

I thought the goal was to figure out how long (on the average) you had to give
a SHA1 to be "unique".

But even that's wrong, because of the following:

CAFEFEED357
DEADBEEF123
DEADBEEF456
DEADBEEF467
DEADBEEF478

for that sequence, I'd count 0, 8, 9, 9 when in fact, it should be 8, 9, 9, 9.
It's not the items in common with the previous value... it's the longer of the
items in common with the string on either side.  The easiest way for that
would be to use a 3-item window:

git-rev-list --objects HEAD | sort | perl -lne '
  substr($_, 40) = "";
  if (defined $p) {
    ($p ^ $_) =~ /^(\0*)/;
    $common = length $1;
    if (defined $pcommon) {
      $count[$pcommon > $common ? $pcommon : $common]++;
    }
  }
  $p = $_;
  $pcommon = $common;
  END { print "$_: $count[$_]" for 0..$#count }
'

this also fixes the bug where I compare the first line to nothing.
With this, I get (on git.git):

    0: 
    1: 
    2: 6
    3: 21153
    4: 15008
    5: 1232
    6: 90
    7: 
    8: 2

which now makes sense.  There are 2 items that need 9 hex chars
to be unique.

-- 
Randal L. Schwartz - Stonehenge Consulting Services, Inc. - +1 503 777 0095
<merlyn@stonehenge.com> <URL:http://www.stonehenge.com/merlyn/>
Perl/Unix/security consulting, Technical writing, Comedy, etc. etc.
See PerlTraining.Stonehenge.com for onsite and open-enrollment Perl training!

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-02 16:29   ` Randal L. Schwartz
  2007-04-02 17:00     ` Linus Torvalds
@ 2007-04-02 17:18     ` James Cloos
  1 sibling, 0 replies; 20+ messages in thread
From: James Cloos @ 2007-04-02 17:18 UTC (permalink / raw)
  To: git; +Cc: Linus Torvalds, Peter Eriksen, Randal L. Schwartz

>>>>> "Randal" == Randal L Schwartz <merlyn@stonehenge.com> writes:

Randal> I don't have access to the linux-2.6 kernel

As of commit b6a8b31, I get this for the kernel tree from Randal's code:

0: 16
1: 240
2: 3840
3: 61357
4: 293437
5: 74792
6: 5373
7: 350
8: 16
9: 1

(That is, of course, counting hex digits (aka nybbles), not bits as
Peter's code does.)

-JimC
-- 
James Cloos <cloos@jhcloos.com>         OpenPGP: 1024D/ED7DAEA6

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-02 17:17       ` Randal L. Schwartz
@ 2007-04-02 17:33         ` Randal L. Schwartz
  2007-04-03 17:04           ` James Cloos
  0 siblings, 1 reply; 20+ messages in thread
From: Randal L. Schwartz @ 2007-04-02 17:33 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Peter Eriksen, git

>>>>> "Randal" == Randal L Schwartz <merlyn@stonehenge.com> writes:

Randal> git-rev-list --objects HEAD | sort | perl -lne '
Randal>   substr($_, 40) = "";
Randal>   if (defined $p) {
Randal>     ($p ^ $_) =~ /^(\0*)/;
Randal>     $common = length $1;
Randal>     if (defined $pcommon) {
Randal>       $count[$pcommon > $common ? $pcommon : $common]++;
Randal>     }
Randal>   }
Randal>   $p = $_;
Randal>   $pcommon = $common;
Randal>   END { print "$_: $count[$_]" for 0..$#count }
Randal> '

And that's off by one on either end. :)

    git-rev-list --objects HEAD | sort | perl -lne '
      substr($_, 40) = "";
      if (defined $p) {
        ($p ^ $_) =~ /^(\0*)/;
        $common = length $1;
        if (defined $pcommon) {
          $count[$pcommon > $common ? $pcommon : $common]++;
        } else {
          $count[$common]++; # first item
        }
      }
      $p = $_;
      $pcommon = $common;
      END {
        $count[$common]++; # last item
        print "$_: $count[$_]" for 0..$#count;
      }
    '

Which now yields:

    0: 
    1: 
    2: 6
    3: 21155
    4: 15008
    5: 1232
    6: 90
    7: 
    8: 2

And *that* totals to 37493, which is the number of objects.  Yeay.

-- 
Randal L. Schwartz - Stonehenge Consulting Services, Inc. - +1 503 777 0095
<merlyn@stonehenge.com> <URL:http://www.stonehenge.com/merlyn/>
Perl/Unix/security consulting, Technical writing, Comedy, etc. etc.
See PerlTraining.Stonehenge.com for onsite and open-enrollment Perl training!

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-02 17:33         ` Randal L. Schwartz
@ 2007-04-03 17:04           ` James Cloos
  2007-04-03 17:11             ` Randal L. Schwartz
  0 siblings, 1 reply; 20+ messages in thread
From: James Cloos @ 2007-04-03 17:04 UTC (permalink / raw)
  To: git; +Cc: Linus Torvalds, Peter Eriksen, Randal L. Schwartz

>>>>> "Randal" == Randal L Schwartz <merlyn@stonehenge.com> writes:

Randal> git-rev-list --objects HEAD | sort | perl -lne '
Randal>   substr($_, 40) = "";
Randal>   if (defined $p) {
Randal>     ($p ^ $_) =~ /^(\0*)/;
Randal>     $common = length $1;
Randal>     if (defined $pcommon) {
Randal>       $count[$pcommon > $common ? $pcommon : $common]++;
Randal>     } else {
Randal>       $count[$common]++; # first item
Randal>     }
Randal>   }
Randal>   $p = $_;
Randal>   $pcommon = $common;
Randal>   END {
Randal>     $count[$common]++; # last item
Randal>     print "$_: $count[$_]" for 0..$#count;
Randal>   }
Randal> '

With that version the kernel gives:

0: 
1: 
2: 
3: 565
4: 288450
5: 139080
6: 10699
7: 700
8: 32
9: 2

Adding in  $_ = unpack("B*",pack("H*",$_));
to the script, to do the work on bits, gives:

14: 
15: 565
16: 14723
17: 66765
18: 107838
19: 99124
20: 67367
21: 39238
22: 21503
23: 10972
24: 5591
25: 2927
26: 1472
27: 709
28: 382
29: 166
30: 98
31: 54
32: 20
33: 6
34: 2
35: 4
36: 
37: 2

-JimC
-- 
James Cloos <cloos@jhcloos.com>         OpenPGP: 1024D/ED7DAEA6

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 17:04           ` James Cloos
@ 2007-04-03 17:11             ` Randal L. Schwartz
  2007-04-03 17:21               ` Shawn O. Pearce
  0 siblings, 1 reply; 20+ messages in thread
From: Randal L. Schwartz @ 2007-04-03 17:11 UTC (permalink / raw)
  To: James Cloos; +Cc: git, Linus Torvalds, Peter Eriksen

>>>>> "James" == James Cloos <cloos@jhcloos.com> writes:

James> With that version the kernel gives:

James> 0: 
James> 1: 
James> 2: 
James> 3: 565
James> 4: 288450
James> 5: 139080
James> 6: 10699
James> 7: 700
James> 8: 32
James> 9: 2

Fascinating.  So you can spell out *any* commit in linux-2.6.git with
10 hex chars.  What do we need 40 for, again? :)

-- 
Randal L. Schwartz - Stonehenge Consulting Services, Inc. - +1 503 777 0095
<merlyn@stonehenge.com> <URL:http://www.stonehenge.com/merlyn/>
Perl/Unix/security consulting, Technical writing, Comedy, etc. etc.
See PerlTraining.Stonehenge.com for onsite and open-enrollment Perl training!

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 17:11             ` Randal L. Schwartz
@ 2007-04-03 17:21               ` Shawn O. Pearce
  2007-04-03 17:50                 ` Linus Torvalds
  0 siblings, 1 reply; 20+ messages in thread
From: Shawn O. Pearce @ 2007-04-03 17:21 UTC (permalink / raw)
  To: Randal L. Schwartz; +Cc: James Cloos, git, Linus Torvalds, Peter Eriksen

"Randal L. Schwartz" <merlyn@stonehenge.com> wrote:
> >>>>> "James" == James Cloos <cloos@jhcloos.com> writes:
> 
> James> With that version the kernel gives:
> 
> James> 0: 
> James> 1: 
> James> 2: 
> James> 3: 565
> James> 4: 288450
> James> 5: 139080
> James> 6: 10699
> James> 7: 700
> James> 8: 32
> James> 9: 2
> 
> Fascinating.  So you can spell out *any* commit in linux-2.6.git with
> 10 hex chars.  What do we need 40 for, again? :)

Well, the other thing is those 2 commits at 9 bytes probably were
not that way a year ago.  One of those might have only needed 8,
and the other is newer, so now you need 9.

What the above tells me is that 8 is almost a safe default for our
abbreviations, but isn't safe enough, as there are collisions past 8.

-- 
Shawn.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 17:21               ` Shawn O. Pearce
@ 2007-04-03 17:50                 ` Linus Torvalds
  2007-04-03 18:22                   ` Junio C Hamano
  2007-04-04 21:03                   ` James Cloos
  0 siblings, 2 replies; 20+ messages in thread
From: Linus Torvalds @ 2007-04-03 17:50 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Randal L. Schwartz, James Cloos, git, Peter Eriksen

On Tue, 3 Apr 2007, Shawn O. Pearce wrote:
> 
> Well, the other thing is those 2 commits at 9 bytes probably were
> not that way a year ago.  One of those might have only needed 8,
> and the other is newer, so now you need 9.

Well, neither of the the two objects at 9 bytes may not be (and probably 
aren't) commits and of the 32 8-nibble cases who knows how many are 
actually commits (probably none), so an 8-byte SHA1 is *probably* unique 
at least if you just look at commits.

Remove the "--objects" to find out.

> What the above tells me is that 8 is almost a safe default for our
> abbreviations, but isn't safe enough, as there are collisions past 8.

Yeah, the short SHA1 form is obviously always going to be risky. But in 
practice, since people almost always use it just for commits, it's 
probably good enough in practice, and even if you get a collision in 8 
nibbles, most of the time it will probably be trivial to figure out which 
one was meant, so it's not like it's a disaster if somebody ends up 
reporting a bug with a non-unique abbreviation.

		Linus

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 17:50                 ` Linus Torvalds
@ 2007-04-03 18:22                   ` Junio C Hamano
  2007-04-03 19:27                     ` Linus Torvalds
  2007-04-03 19:34                     ` Nicolas Pitre
  2007-04-04 21:03                   ` James Cloos
  1 sibling, 2 replies; 20+ messages in thread
From: Junio C Hamano @ 2007-04-03 18:22 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Shawn O. Pearce, Randal L. Schwartz, James Cloos, git,
	Peter Eriksen

Linus Torvalds <torvalds@linux-foundation.org> writes:

> Yeah, the short SHA1 form is obviously always going to be risky. But in 
> practice, since people almost always use it just for commits, it's 
> probably good enough in practice, and even if you get a collision in 8 
> nibbles, most of the time it will probably be trivial to figure out which 
> one was meant, so it's not like it's a disaster if somebody ends up 
> reporting a bug with a non-unique abbreviation.

Are you hinting to update sha1_name.c::get_sha1() so that we do
not accept abbreviated non-commit object names?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 18:22                   ` Junio C Hamano
@ 2007-04-03 19:27                     ` Linus Torvalds
  2007-04-03 19:34                     ` Nicolas Pitre
  1 sibling, 0 replies; 20+ messages in thread
From: Linus Torvalds @ 2007-04-03 19:27 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Shawn O. Pearce, Randal L. Schwartz, James Cloos, git,
	Peter Eriksen

On Tue, 3 Apr 2007, Junio C Hamano wrote:
> 
> Are you hinting to update sha1_name.c::get_sha1() so that we do
> not accept abbreviated non-commit object names?

No, but it might be nice if we had some fairly graceful way of handling 
abbreviated SHA1's that ended up being ambiguous (maybe they weren't 
ambiguous in the original context, but became ambiguous later).

Some way of just listing the alternatives, and sorting - and showing - by 
type (so that if you know it's supposed to be a commit, you can trivially 
pick it out from other objects that happen to collide in the first <n> 
digits).

Right now we can do it with

	git-rev-list --objects --all | grep '^<abbrev-sha1>'

but that's actually not even correct (maybe the reason sha1_name decided 
it was ambiguous was due to an _unreachable_ SHA1?), and it's also very 
inefficient.

We could have some helper that just looked things up (it's easy enough to 
look up all potential SHA1 matches both in the filesystem and in a 
pack-file - no need for any rev-list thing that lists all objects).

Is this a pressing concern? Absolutely not. I don't think we've ever had 
any real problems with this, and you *can* do it by hand with a bit of 
inefficient scripting right now..

			Linus

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 18:22                   ` Junio C Hamano
  2007-04-03 19:27                     ` Linus Torvalds
@ 2007-04-03 19:34                     ` Nicolas Pitre
  2007-04-03 20:25                       ` Junio C Hamano
  1 sibling, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-04-03 19:34 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Linus Torvalds, Shawn O. Pearce, Randal L. Schwartz, James Cloos,
	git, Peter Eriksen

On Tue, 3 Apr 2007, Junio C Hamano wrote:

> Linus Torvalds <torvalds@linux-foundation.org> writes:
> 
> > Yeah, the short SHA1 form is obviously always going to be risky. But in 
> > practice, since people almost always use it just for commits, it's 
> > probably good enough in practice, and even if you get a collision in 8 
> > nibbles, most of the time it will probably be trivial to figure out which 
> > one was meant, so it's not like it's a disaster if somebody ends up 
> > reporting a bug with a non-unique abbreviation.
> 
> Are you hinting to update sha1_name.c::get_sha1() so that we do
> not accept abbreviated non-commit object names?

NO, I hope not.

Instead (and if the concern is real) we should error out when the 
abbreviated name is ambigous and impose no restriction otherwise.


Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 19:34                     ` Nicolas Pitre
@ 2007-04-03 20:25                       ` Junio C Hamano
  2007-04-03 20:39                         ` Nicolas Pitre
  0 siblings, 1 reply; 20+ messages in thread
From: Junio C Hamano @ 2007-04-03 20:25 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: Linus Torvalds, Shawn O. Pearce, Randal L. Schwartz, James Cloos,
	git, Peter Eriksen

Nicolas Pitre <nico@cam.org> writes:

> On Tue, 3 Apr 2007, Junio C Hamano wrote:
>
>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>> 
>> > Yeah, the short SHA1 form is obviously always going to be risky. But in 
>> > practice, since people almost always use it just for commits, it's 
>> > probably good enough in practice, and even if you get a collision in 8 
>> > nibbles, most of the time it will probably be trivial to figure out which 
>> > one was meant, so it's not like it's a disaster if somebody ends up 
>> > reporting a bug with a non-unique abbreviation.
>> 
>> Are you hinting to update sha1_name.c::get_sha1() so that we do
>> not accept abbreviated non-commit object names?
>
> NO, I hope not.
>
> Instead (and if the concern is real) we should error out when the 
> abbreviated name is ambigous and impose no restriction otherwise.

I stated it wrongly.  What I was getting at was that we might
want to consider an abbreviation that matches only a single
commit unambiguous even when there are ambiguous objects of
other kinds.

Not that I consider it a pressing issue, though.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 20:25                       ` Junio C Hamano
@ 2007-04-03 20:39                         ` Nicolas Pitre
  2007-04-03 23:08                           ` Olivier Galibert
  0 siblings, 1 reply; 20+ messages in thread
From: Nicolas Pitre @ 2007-04-03 20:39 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Linus Torvalds, Shawn O. Pearce, Randal L. Schwartz, James Cloos,
	git, Peter Eriksen

On Tue, 3 Apr 2007, Junio C Hamano wrote:

> I stated it wrongly.  What I was getting at was that we might
> want to consider an abbreviation that matches only a single
> commit unambiguous even when there are ambiguous objects of
> other kinds.

Maybe.  But by the time your object hash distribution starts showing 
ambiguous objects with a given abbreviated name between a commit and a 
non commit, I'll bet you'll start to see ambiguities between commits 
soon enough as well.

> Not that I consider it a pressing issue, though.

Indeed.  And even then it is not something really hard to implement 
either.

Nicolas

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 20:39                         ` Nicolas Pitre
@ 2007-04-03 23:08                           ` Olivier Galibert
  2007-04-03 23:22                             ` Linus Torvalds
  0 siblings, 1 reply; 20+ messages in thread
From: Olivier Galibert @ 2007-04-03 23:08 UTC (permalink / raw)
  To: Nicolas Pitre
  Cc: Junio C Hamano, Linus Torvalds, Shawn O. Pearce,
	Randal L. Schwartz, James Cloos, git, Peter Eriksen

On Tue, Apr 03, 2007 at 04:39:02PM -0400, Nicolas Pitre wrote:
> On Tue, 3 Apr 2007, Junio C Hamano wrote:
> 
> > I stated it wrongly.  What I was getting at was that we might
> > want to consider an abbreviation that matches only a single
> > commit unambiguous even when there are ambiguous objects of
> > other kinds.
> 
> Maybe.  But by the time your object hash distribution starts showing 
> ambiguous objects with a given abbreviated name between a commit and a 
> non commit, I'll bet you'll start to see ambiguities between commits 
> soon enough as well.

Isn't the number of objects an order of magnitude bigger than the
number of commits?  Well, I guess that depends on your workflow...

  OG.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 23:08                           ` Olivier Galibert
@ 2007-04-03 23:22                             ` Linus Torvalds
  0 siblings, 0 replies; 20+ messages in thread
From: Linus Torvalds @ 2007-04-03 23:22 UTC (permalink / raw)
  To: Olivier Galibert
  Cc: Nicolas Pitre, Junio C Hamano, Shawn O. Pearce,
	Randal L. Schwartz, James Cloos, git, Peter Eriksen

On Wed, 4 Apr 2007, Olivier Galibert wrote:
> 
> Isn't the number of objects an order of magnitude bigger than the
> number of commits?  Well, I guess that depends on your workflow...

Judging by the kernel tree, it's not an order of magnitude, although it's 
fairly close:

	[torvalds@woody linux]$ git rev-list --all | wc -l
	51156

	[torvalds@woody linux]$ git rev-list --all --objects | wc -l
	444265

So you have about 50k commit objects, and about 390k "other" objects. 
About 7.7 "other" objects per commit. Not quite an order-of-magnitude, but 
close.

Part of the reason for this is that the kernel people tend to encourage 
lots of smaller commits over single large commits, so we have lots of 
commits.

To counter-act that somewhat, the kernel tree is also pretty deep, so a 
lot of the "other" objects are actually the tree objects that create the 
directory structure - it's quite normal to have a single file (blob) 
change, and then three new trees that lead up to that file, and the one 
commit that explains it.

Other projects - like git itself - have relatively fewer tree objects, 
which is probably why the ratio for git itself is just 3.04 "other" 
objects for each commit (ie on average, commits probably touch two blobs 
and the top-level tree - about 10 commits, and 30k non-commit objects).

So repo layout matters. Iirc, last I did the statistics, the git 
repository had more blobs than trees, while the kernel repo had more trees 
than blobs. And the commits-to-other-objects is obviously fairly different 
as a result (I think both git and the kernel have the "many small changes" 
approach, so they're similar in that respect).

Other repositories probably have more "big changes". Especially if you 
create the repo initially by importing just big releases over time, you'll 
have relatively few commits, and lots of blob/tree changes. 

			Linus

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Distribution of longest common hash prefixes
  2007-04-03 17:50                 ` Linus Torvalds
  2007-04-03 18:22                   ` Junio C Hamano
@ 2007-04-04 21:03                   ` James Cloos
  1 sibling, 0 replies; 20+ messages in thread
From: James Cloos @ 2007-04-04 21:03 UTC (permalink / raw)
  To: git; +Cc: Shawn O. Pearce, Randal L. Schwartz, Linus Torvalds,
	Peter Eriksen

>>>>> "Linus" == Linus Torvalds <torvalds@linux-foundation.org> writes:

Linus> Well, neither of the the two objects at 9 bytes may not be (and
Linus> probably aren't) commits and of the 32 8-nibble cases who knows
Linus> how many are actually commits (probably none), so an 8-byte SHA1
Linus> is *probably* unique at least if you just look at commits.

Linus> Remove the "--objects" to find out.

That makes for:

0: 
1: 
2: 1
3: 23320
4: 25431
5: 2259
6: 134
7: 8

But is the kernel large enough to be sufficiently representative?

Did Jon complete an import of the 'zilla src?  

Has anyone tried to import OOo?

-JimC
-- 
James Cloos <cloos@jhcloos.com>         OpenPGP: 1024D/ED7DAEA6

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2007-04-04 21:12 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-04-02 14:58 Distribution of longest common hash prefixes Peter Eriksen
2007-04-02 15:20 ` Linus Torvalds
2007-04-02 16:29   ` Randal L. Schwartz
2007-04-02 17:00     ` Linus Torvalds
2007-04-02 17:17       ` Randal L. Schwartz
2007-04-02 17:33         ` Randal L. Schwartz
2007-04-03 17:04           ` James Cloos
2007-04-03 17:11             ` Randal L. Schwartz
2007-04-03 17:21               ` Shawn O. Pearce
2007-04-03 17:50                 ` Linus Torvalds
2007-04-03 18:22                   ` Junio C Hamano
2007-04-03 19:27                     ` Linus Torvalds
2007-04-03 19:34                     ` Nicolas Pitre
2007-04-03 20:25                       ` Junio C Hamano
2007-04-03 20:39                         ` Nicolas Pitre
2007-04-03 23:08                           ` Olivier Galibert
2007-04-03 23:22                             ` Linus Torvalds
2007-04-04 21:03                   ` James Cloos
2007-04-02 17:18     ` James Cloos
2007-04-02 15:28 ` Peter Eriksen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).