* [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
@ 2006-02-28  4:09 Nicolas Pitre
  2006-02-28  6:51 ` Junio C Hamano
  0 siblings, 1 reply; 11+ messages in thread
From: Nicolas Pitre @ 2006-02-28  4:09 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

The diff-delta code can exhibit O(m*n) behavior with some pathological
data sets where most hash entries end up in the same hash bucket.

The latest code rework reduced the block size, making it particularly
vulnerable to this issue, but the issue was always there and can be
triggered regardless of the block size.

This patch does two things:

1) the hashing has been reworked to offer a better distribution to
   attenuate the problem a bit, and

2) a limit is imposed to the number of entries that can exist in the 
   same hash bucket.

Because of the above the code is a bit more expensive on average, but
the problematic samples used to diagnose the issue are now orders of
magnitude less expensive to process, with only a slight loss in
compression.
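
For illustration, point 1 boils down to the following standalone
function (my own sketch of the reworked hash in the diff below; the
name is mine, the real code inlines the expression):

	/*
	 * Reworked 3-byte hash: each byte is spread with two shifts
	 * instead of one.  With hshift = (i - 8) / 2 for a table of
	 * 1 << i buckets (as computed in the patch), the result
	 * always stays below the table size, so no masking is needed.
	 */
	static unsigned int hash3(const unsigned char *data,
				  unsigned int hshift)
	{
		return data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
	}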

Signed-off-by: Nicolas Pitre <nico@cam.org>

---

For example, Carl Baldwin provided me with a couple of 20MB files, and
deltifying one against the other with test-delta takes around
SEVENTEEN MINUTES for only one delta on my P4 @ 3GHz when using the
original adler32 version with 16-byte blocks.  With the latest delta
code in the pu branch it simply takes forever (I interrupted it after
20 minutes of processing).  Now imagine using git-repack -a with a
window of 10.  With this patch on top of the small block patch (still
in the pu branch) that time dropped to only 10 seconds.  And the
resulting delta is still smaller than the 16-byte adler32 based
version:

blob 02220dae8cd219916ce52a12cda67322659e0e57 ->
blob af8f99dd11ca288f4e4a08f2626af2de8b3ecfd2
size (21187748 -> 19424744)
delta size (16 byte blocks): 5238542 in 16m55
delta size (3 byte blocks): [interrupted after 20 minutes]
delta size (3 byte blocks + this patch): 4910988 in 9.69 secs
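
For the 20MB pair above, the new per-bucket limit works out as in this
worked example (my own illustration of the hlimit computation in the
patch below, not code from it):

	#include <stdio.h>

	int main(void)
	{
		unsigned long trg_bufsize = 19424744;	/* target blob size above */
		unsigned int hlimit = (1 << 26) / trg_bufsize;	/* 67108864 / 19424744 = 3 */
		if (hlimit < 16)
			hlimit = 16;	/* the floor kicks in: at most 16 entries per bucket */
		printf("hlimit = %u\n", hlimit);
		return 0;
	}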

Other data points from the Linux kernel repository:

blob 9af06ba723df75fed49f7ccae5b6c9c34bc5115f ->
blob dfc9cd58dc065d17030d875d3fea6e7862ede143
size (491102 -> 496045)
delta size (16 byte blocks): 248899 in less than 0.1 sec
delta size (3 byte blocks): 136000 in 11.8 secs
delta size (3 byte blocks + this patch): 171688 in 0.79 sec

blob 4917ec509720a42846d513addc11cbd25e0e3c4f ->
blob dfc9cd58dc065d17030d875d3fea6e7862ede143
size (495831 -> 496045)
delta size (16 byte blocks): 226218 in less than 0.1 sec
delta size (3 byte blocks): 120948 in 11.7 secs
delta size (3 byte blocks + this patch): 157135 in 0.77 sec


 diff-delta.c |   69 +++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 56 insertions(+), 13 deletions(-)

1682f4b1b2899288d7761844a4cfd02772c464d1
diff --git a/diff-delta.c b/diff-delta.c
index 27f83a0..bb07926 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -30,19 +30,20 @@ struct index {
 
 static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
+				   unsigned long trg_bufsize,
 				   unsigned int *hash_shift)
 {
 	unsigned long hsize;
-	unsigned int hshift, i;
+	unsigned int i, hshift, hlimit, *hash_count;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 
 	/* determine index hash size */
 	hsize = bufsize / 4;
-	for (i = 8; (1 << i) < hsize && i < 16; i++);
+	for (i = 8; (1 << i) < hsize && i < 24; i += 2);
 	hsize = 1 << i;
-	hshift = i - 8;
+	hshift = (i - 8) / 2;
 	*hash_shift = hshift;
 
 	/* allocate lookup index */
@@ -53,15 +54,59 @@ static struct index ** delta_index(const
 	entry = mem + hsize * sizeof(*hash);
 	memset(hash, 0, hsize * sizeof(*hash));
 
-	/* then populate it */
+	/* allocate an array to count hash entries */
+	hash_count = calloc(hsize, sizeof(*hash_count));
+	if (!hash_count) {
+		free(hash);
+		return NULL;
+	}
+
+	/* then populate the index */
 	data = buf + bufsize - 2;
 	while (data > buf) {
 		entry->ptr = --data;
-		i = data[0] ^ data[1] ^ (data[2] << hshift);
+		i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
 		entry->next = hash[i];
 		hash[i] = entry++;
+		hash_count[i]++;
  	}
 
+	/*
+	 * Determine a limit on the number of entries in the same hash
+	 * bucket.  This guards us against pathological data sets causing
+	 * really bad hash distribution with most entries in the same hash
+	 * bucket, which would bring us to O(m*n) computing costs (m and n
+	 * corresponding to the reference and target buffer sizes).
+	 *
+	 * The larger the target buffer, the more important it is to have
+	 * short entry lists for each hash bucket.  With such a limit the
+	 * cost is bounded to something more like O(m+n).
+	 */
+	hlimit = (1 << 26) / trg_bufsize;
+	if (hlimit < 16)
+		hlimit = 16;
+
+	/*
+	 * Now make sure none of the hash buckets has more entries than
+	 * we're willing to test.  Otherwise we short-circuit the entry
+	 * list uniformly to still preserve a good distribution across
+	 * the reference buffer.
+	 */
+	for (i = 0; i < hsize; i++) {
+		if (hash_count[i] < hlimit)
+			continue;
+		entry = hash[i];
+		do {
+			struct index *keep = entry;
+			int skip = hash_count[i] / hlimit / 2;
+			do {
+				entry = entry->next;
+			} while(--skip && entry);
+			keep->next = entry;
+		} while(entry);
+	}
+	free(hash_count);
+
 	return hash;
 }
 
@@ -85,7 +130,7 @@ void *diff_delta(void *from_buf, unsigne
 
 	if (!from_size || !to_size)
 		return NULL;
-	hash = delta_index(from_buf, from_size, &hash_shift);
+	hash = delta_index(from_buf, from_size, to_size, &hash_shift);
 	if (!hash)
 		return NULL;
 
@@ -126,8 +171,8 @@ void *diff_delta(void *from_buf, unsigne
 
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
-		if (data + 2 < top) {
-			i = data[0] ^ data[1] ^ (data[2] << hash_shift);
+		if (data + 3 <= top) {
+			i = data[0] ^ ((data[1] ^ (data[2] << hash_shift)) << hash_shift);
 			for (entry = hash[i]; entry; entry = entry->next) {
 				const unsigned char *ref = entry->ptr;
 				const unsigned char *src = data;
@@ -138,11 +183,9 @@ void *diff_delta(void *from_buf, unsigne
 					ref_size = 0x10000;
 				if (ref_size <= msize)
 					break;
-				while (ref_size && *src++ == *ref) {
-					ref++;
-					ref_size--;
-				}
-				ref_size = ref - entry->ptr;
+				if (*ref != *src)
+					continue;
+				while (ref_size-- && *++src == *++ref);
 				if (msize < ref - entry->ptr) {
 					/* this is our best match so far */
 					msize = ref - entry->ptr;
-- 
1.2.3.g8fcf1-dirty


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-02-28  4:09 [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior Nicolas Pitre
@ 2006-02-28  6:51 ` Junio C Hamano
  2006-02-28  8:10   ` Junio C Hamano
  0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2006-02-28  6:51 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@cam.org> writes:

> The diff-delta code can exhibit O(m*n) behavior with some pathological
> data sets where most hash entries end up in the same hash bucket.

> Other data points from the Linux kernel repository:
>
> blob 9af06ba723df75fed49f7ccae5b6c9c34bc5115f ->
> blob dfc9cd58dc065d17030d875d3fea6e7862ede143
> size (491102 -> 496045)
> delta size (16 byte blocks): 248899 in less than 0.1 sec
> delta size (3 byte blocks): 136000 in 11.8 secs
> delta size (3 byte blocks + this patch): 171688 in 0.79 sec

These numbers are misleading.

The 36K objects pack I used in my previous tests gives 9971223
(from "next" branch) vs 9664546 (this patch) final pack size.
The wallclock time on my machine is 1m35 vs 3m30.  I doubt many
people are willing to pay 100% more waiting time for a 3% tighter
pack.

Although I do not mean to rain on your effort and the substantial
improvement you made with this patch, we need to admit that
improving a pathological corner case has a rather diminishing
effect on the overall picture.

But this definitely is an improvement nevertheless.  I should
try this on my wife's AMD64 (Cygwin).  The same datasets I used
in my previous complaint still seem to take a couple of seconds
(or more) each on my Duron 750 X-<.

A handful of additional ideas.

 * Lower the hash limit a bit more (obvious).  That might have
   a detrimental effect in the general case.

 * Study the hash bucket distribution for the pathological case
   and see if we can cheaply detect a pattern.  I suspect these
   cases have relatively few but heavily collided buckets with
   mostly sparse other entries.  If there is such an easily
   detectable pattern, maybe we can look for such a pattern at
   runtime, and cull hash entries more aggressively in such a
   case?

 * Try checking the target buffer to see if it would have many
   hits on the heavily collided hash entries from the source
   (maybe hash_count the target buffer as well; see the sketch
   after this list).

 * Have pack-object detect a pathological blob (the test patch I
   sent you previously uses the eye-candy timer for this
   purpose, but we could use getrusage() if we want to be more
   precise) by observing how much time is spent for a single
   round, and mark the blob as such, so we do not delta against
   it with other blobs in find_deltas when we are packing many
   objects.  It does not really matter in the big picture if we
   choose not to delta the pathological ones tightly, as long as
   they are relatively few.

 * Also in pack-object, have an optional backing-store to write
   out deltified representations for results that took more than
   a certain amount of time to produce in find_deltas(), and reuse
   them in write_object().
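
A sketch of the "hash_count the target buffer" idea (entirely
hypothetical; it reuses the 3-byte hash from Nicolas' patch and a
caller-provided probe_count array):

	/*
	 * Hash the target buffer with the same function used for the
	 * reference index and count, per bucket, how many target
	 * positions would probe it.  Buckets that are both heavily
	 * populated by the source and heavily probed by the target
	 * are the expensive ones and could be culled harder.
	 */
	static void count_target_probes(const unsigned char *buf,
					unsigned long size,
					unsigned int hshift,
					unsigned int *probe_count)
	{
		const unsigned char *data;
		for (data = buf; data + 3 <= buf + size; data++) {
			unsigned int i = data[0] ^
				((data[1] ^ (data[2] << hshift)) << hshift);
			probe_count[i]++;
		}
	}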

If anybody is interested, here are some other interesting pairs
from the Linux 2.6 repositories.

        2645d6239b74 cb968e7a0fcd
        357980942d73 5bb837052ef1
        5412dcb4b6d0 ac07e18abb1d
        5bb837052ef1 1f8744ad9cde
        63d827d7da07 5bb837052ef1
        9af06ba723df 1896a0eea7e5
        cb968e7a0fcd 97b5578ae9b1
        d8c5a8dbd086 97b5578ae9b1
        d8c5a8dbd086 cb968e7a0fcd
        dfc9cd58dc06 1896a0eea7e5
        fa0f82ec9fa0 b611e27470e1


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-02-28  6:51 ` Junio C Hamano
@ 2006-02-28  8:10   ` Junio C Hamano
  2006-02-28 17:05     ` Nicolas Pitre
  2006-03-04  2:28     ` Junio C Hamano
  0 siblings, 2 replies; 11+ messages in thread
From: Junio C Hamano @ 2006-02-28  8:10 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Junio C Hamano <junkio@cox.net> writes:

> Nicolas Pitre <nico@cam.org> writes:
>
>> blob 9af06ba723df75fed49f7ccae5b6c9c34bc5115f ->
>> blob dfc9cd58dc065d17030d875d3fea6e7862ede143
>> size (491102 -> 496045)
>> delta size (16 byte blocks): 248899 in less than 0.1 sec
>> delta size (3 byte blocks): 136000 in 11.8 secs
>> delta size (3 byte blocks + this patch): 171688 in 0.79 sec
>
> These numbers are misleading.
>
> The 36K objects pack I used in my previous tests gives 9971223
> (from "next" branch) vs 9664546 (this patch) final pack size.
> The wallclock time on my machine is 1m35 vs 3m30.  I doubt many
> people are willing to pay 100% more waiting time for a 3% tighter
> pack.

I tried an experimental patch to cull collided hash buckets
very aggressively.  I haven't applied your last "reuse index"
patch, though -- I think that is orthogonal and I'd like to
leave that to the next round.

With the same dataset: resulting pack is 9651096 vs 9664546
(your patch) final pack size, with wallclock 2m45s (user 2m31).
Still not good enough, and at the same time I wonder why it gets
_smaller_ results than yours.  But the generated pack unpacked
cleanly in a cloned linux-2.6 repository (having objects and
refs up to v2.6.14) and the result was fsck-objects clean.

I'd appreciate it if you can test it on the 20MB blobs and see
what happens if you have time.

BTW, the benchmark I've been doing is with this dataset:

	git rev-list --objects-edge v2.6.14..v2.6.15-rc1 >RL-N
        time git pack-objects <RL-N --stdout | wc -c

---
diff --git a/diff-delta.c b/diff-delta.c
index 0730b24..52df372 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -88,22 +88,35 @@ static struct index ** delta_index(const
 
 	/*
 	 * Now make sure none of the hash buckets has more entries than
-	 * we're willing to test.  Otherwise we short-circuit the entry
-	 * list uniformly to still preserve a good distribution across
-	 * the reference buffer.
+	 * we're willing to test.  Otherwise we cull the entry list to
+	 * limit identical three-byte prefixes and still preserve a good
+	 * distribution across the reference buffer.
 	 */
 	for (i = 0; i < hsize; i++) {
+		struct index **list, *bucket, *remaining;
+		int cnt;
 		if (hash_count[i] < hlimit)
 			continue;
-		entry = hash[i];
-		do {
-			struct index *keep = entry;
-			int skip = hash_count[i] / hlimit / 2;
-			do {
-				entry = entry->next;
-			} while(--skip && entry);
-			keep->next = entry;
-		} while(entry);
+		
+		bucket = NULL;
+		list = &bucket;
+		remaining = hash[i];
+		cnt = 0;
+		while (cnt < hlimit && remaining) {
+			struct index *this = remaining, *that;
+			remaining = remaining->next;
+			for (that = bucket; that; that = that->next) {
+				if (!memcmp(that->ptr, this->ptr, 3))
+					break;
+			}
+			if (that)
+				continue; /* discard */
+			cnt++;
+			*list = this;
+			list = &(this->next);
+			this->next = NULL;
+		}
+		hash[i] = bucket;
 	}
 	free(hash_count);
 


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-02-28  8:10   ` Junio C Hamano
@ 2006-02-28 17:05     ` Nicolas Pitre
  2006-03-01 10:38       ` Junio C Hamano
  2006-03-04  2:28     ` Junio C Hamano
  1 sibling, 1 reply; 11+ messages in thread
From: Nicolas Pitre @ 2006-02-28 17:05 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Mon, 27 Feb 2006, Junio C Hamano wrote:

> Although I do not mean to rain on your effort and the substantial
> improvement you made with this patch, we need to admit that
> improving a pathological corner case has a rather diminishing
> effect on the overall picture.

It does, unfortunately.

The problem is that the code in diff_delta() is exercised very heavily
during pack generation.  Even the simple hashing change in this patch,
which involves two shifts instead of only one, increased the CPU time
needed to generate a pack quite appreciably.  This is just to say that
this code is sensitive to even small details.

> But this definitely is an improvement nevertheless.  I should
> try this on my wife's AMD64 (Cygwin).  The same datasets I used
> in my previous complaint still seem to take a couple of seconds
> (or more) each on my Duron 750 X-<.

There is no miracle.  To prevent the extreme cases, additional tests
have to be performed in all cases, and therefore performance is
affected across the board.

> A handful additional ideas.
> 
>  * Lower the hash limit a bit more (obvious).  That might have
>    a detrimental effect in the general case.

It does.  The current parameters are what I think is the best compromise
between cost and compression.  And still, 99% of the cases don't need
a lower hash limit since their hash lists are already way below the
limit.  For the cases where it makes a difference, lowering the hash
limit also increases the cost associated with reworking the hash
list, so there comes a point where you actually increase the
resulting delta size while the CPU time stays constant.

>  * Study the hash bucket distribution for the pathological case
>    and see if we can cheaply detect a pattern.  I suspect these
>    cases have relatively few but heavily collided buckets with
>    mostly sparse other entries.  If there is such an easily
>    detectable pattern, maybe we can look for such a pattern at
>    runtime, and cull hash entries more aggressively in such a
>    case?

Same issue as above.  And "looking for such a pattern" really does
increase the CPU time in _all_ cases.  The search for pathological
cases therefore has to be as cheap as possible, which I think it is
right now (and it is already too costly for my taste).  And yet I don't
think there is really a need for further reduction of the hash list at
this point, since the pathological cases are handled gracefully
with this patch while still producing a nicely packed delta.

>  * Try checking the target buffer to see if it would have many
>    hits on the heavily collided hash entries from the source
>    (maybe hash_count the target buffer as well).

Again, that would add another significant CPU cost to all cases, even
those that don't need it at all.  The problem is not those pathological
cases anymore, since I think they are well under control now.  It is the
overhead those few pathological cases impose on the other 180000
well-behaved objects that is a problem.

>  * Have pack-object detect a pathological blob (the test patch I
>    sent you previously uses the eye-candy timer for this
>    purpose, but we could use getrusage() if we want to be more
>    precise) by observing how much time is spent for a single
>    round, and mark the blob as such, so we do not delta against
>    it with other blobs in find_deltas when we are packing many
>    objects.  It does not really matter in the big picture if we
>    choose not to delta the pathological ones tightly, as long as
>    they are relatively few.

That is one solution, but it doesn't address the root of the problem,
which is the cost of detecting those cases in the first place.

>  * Also in pack-object, have an optional backing-store to write
>    out deltified representations for results that took more than
>    a certain amount of time to produce in find_deltas(), and reuse
>    them in write_object().

The pack reusing code is pretty effective in doing so already, isn't it?
Since using git-repack -f should not be the common case, those
pathological cases (now taking one second instead of 60 or more) should
be reused most of the time.

> I tried an experimental patch to cull collided hash buckets
> very aggressively.  I haven't applied your last "reuse index"
> patch, though -- I think that is orthogonal and I'd like to
> leave that to the next round.

It is indeed orthogonal and I think you could apply it to the next
branch without the other patches (it should apply with few problems).
This is an obvious and indisputable gain, even more so if pack-objects
is reworked to reduce memory usage by keeping only one live index for
multiple consecutive delta attempts.

> With the same dataset: resulting pack is 9651096 vs 9664546
> (your patch) final pack size, with wallclock 2m45s (user 2m31).
> Still not good enough, and at the same time I wonder why it gets
> _smaller_ results than yours.

It really becomes hard to find the best balance, especially when the
resulting delta is then fed through zlib.  Sometimes a larger delta will
compress better, sometimes not.  My test bench was the whole git
repository and the kernel repository, and with such a large number of
objects it seems that smaller deltas always translate into smaller
packs.  But that might not always be the case.

> I'd appreciate it if you can test it on the 20MB blobs and see
> what happens if you have time.

Before your patch:
user 0m9.713s, delta size = 4910988 bytes, or 1744110 compressed.

With your patch:
user 0m3.948s, delta size = 6753517 bytes, or 1978803 once compressed.

BTW, there is more potential for improvement in the delta code (but I
have real work to do now, so)...

Let's suppose the reference buffer has:

 
***********************************************************************/

This is common with some comment block styles.  Now that line will end
up with multiple blocks that hash to the same thing and are linked
successively in the same hash bucket, except for the last ones where the
'/' is involved in the hashing.

Now suppose the target buffer also contains a line similar to the above
but one '*' character longer.  The first "***" will be hashed to x, then
x will be looked up in the reference index, and the first entry,
corresponding to the first character of the line above, will be
returned.  At that point a forward search is started to find out how
much matches.  In this case it will match up to the '/' in the reference
buffer while the target will have another '*'.  The length will be
recorded as the best match; so far so good.

Now the next entry from the same hash bucket will be tested in case it
might provide a starting point for a longer match.  But we know in this
case that the location of the second '*' in the reference buffer will be
returned.  And we obviously know that this cannot match more data than
the previous attempt; in fact it will match one byte less.  The hash
entry after that will return the location of the third '*' for yet
another byte less to match, then the fourth '*' for a still shorter
match, and so on, repeating those useless string comparisons many times.

One improvement might consist of counting the number of consecutive
identical bytes when starting a compare, and then skipping as many hash
entries (minus the block size) before looping again over the remaining
entries in the same hash bucket.

The other case to distinguish is when the target buffer instead has a
_shorter_ line, say 5 fewer '*'s before the final '/'.  Here we would
loop 4 times matching only a bunch of '*' before we finally match the
'/' as well on the fifth attempt, which becomes the best match.  In this
case we could test whether the repeated byte we noticed at the start is
present further along in the reference buffer, and if so simply skip
hash entries while searching forward in the reference buffer.  When the
reference buffer stops matching the repeated character, the comparison
is resumed with the target buffer, and in this case the '/' would match
right away, avoiding those four extra loops recomparing all those '*'
needlessly.

Such improvements to the search algorithm might make it significantly
faster, even in the presence of multiple hash entries all located in
the same bucket.  Or maybe not, if the data set has few repeated
characters.  And this is probably worthwhile only on top of my small
block patch.  Only experimentation could tell.  Anyone willing to
try?
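
To make the first idea concrete, here is a rough, untested sketch of
the run counting (names are mine):

	/*
	 * Count how many leading bytes of 'data' repeat its first
	 * byte, up to 'len' bytes.  After matching against an entry
	 * that starts inside a run of length r, the following r - 1
	 * entries in the same bucket can only yield shorter matches,
	 * so the caller could skip that many entries (minus the
	 * block size) before comparing again.
	 */
	static unsigned int run_length(const unsigned char *data,
				       unsigned int len)
	{
		unsigned int r = 1;
		while (r < len && data[r] == data[0])
			r++;
		return r;
	}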


Nicolas


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-02-28 17:05     ` Nicolas Pitre
@ 2006-03-01 10:38       ` Junio C Hamano
  2006-03-01 17:22         ` Nicolas Pitre
  0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2006-03-01 10:38 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@cam.org> writes:

>> I tried an experimental patch to cull collided hash buckets
>> very aggressively.  I haven't applied your last "reuse index"
>> patch, though -- I think that is orthogonal and I'd like to
>> leave that to the next round.
>
> It is indeed orthogonal and I think you could apply it to the next
> branch without the other patches (it should apply with few problems).
> This is an obvious and indisputable gain, even more so if pack-objects
> is reworked to reduce memory usage by keeping only one live index for
> multiple consecutive delta attempts.

Umm.  The hash-index is rather huge, isn't it?  I did not
realize it was a two-pointer structure for every byte in the
source material, and we typically delta from larger to smaller,
so we will keep about 10x the unpacked source.  Until we swap
the windowing around, that means about 100x the unpacked source
with the default window size.

Also, I am not sure which one is more costly: building the
hash-index, or using it to search inside the target.  I somehow got
the impression that the former is relatively cheap, and that is
what is being cached here.

> Let's suppose the reference buffer has:
>  
> ***********************************************************************/
>...
> One improvement might consist of counting the number of consecutive
> identical bytes when starting a compare, and then skipping as many hash
> entries (minus the block size) before looping again over the remaining
> entries in the same hash bucket.

Umm, again.  Consecutive identical bytes (BTW, I think "* * *"
and "** ** **" patterns have the same collision issues without
being consecutive bytes, so such an optimization may be trickier
and cost more), when emitted as literals, would compress well,
wouldn't they?  At the end of the day, I think what matters is
the size of the deflated delta, since going to disk to read it out
is more expensive than deflating and applying.  I think you made
a suggestion along the same lines: capping the max delta size used
by try_delta() more precisely by taking the deflated size into
account.
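
Taking the deflated size into account could be prototyped with plain
zlib along these lines (my sketch, not what pack-objects does today;
compress2() and compressBound() are standard zlib calls):

	#include <stdlib.h>
	#include <zlib.h>

	/* Deflated size of a candidate delta, or 0 on error.  try_delta()
	 * could compare candidates on this instead of the raw size. */
	static unsigned long deflated_size(const void *delta, unsigned long size)
	{
		uLongf len = compressBound(size);
		Bytef *tmp = malloc(len);
		unsigned long ret = 0;
		if (tmp && compress2(tmp, &len, delta, size, Z_BEST_SPEED) == Z_OK)
			ret = len;
		free(tmp);
		return ret;
	}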


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-03-01 10:38       ` Junio C Hamano
@ 2006-03-01 17:22         ` Nicolas Pitre
  0 siblings, 0 replies; 11+ messages in thread
From: Nicolas Pitre @ 2006-03-01 17:22 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Wed, 1 Mar 2006, Junio C Hamano wrote:

> Nicolas Pitre <nico@cam.org> writes:
> 
> >> I tried an experimental patch to cull collided hash buckets
> >> very aggressively.  I haven't applied your last "reuse index"
> >> patch, though -- I think that is orthogonal and I'd like to
> >> leave that to the next round.
> >
> > It is indeed orthogonal and I think you could apply it to the next
> > branch without the other patches (it should apply with few problems).
> > This is an obvious and indisputable gain, even more so if pack-objects
> > is reworked to reduce memory usage by keeping only one live index for
> > multiple consecutive delta attempts.
> 
> Umm.  The hash-index is rather huge, isn't it?  I did not
> realize it was a two-pointer structure for every byte in the
> source material, and we typically delta from larger to smaller,
> so we will keep about 10x the unpacked source.  Until we swap
> the windowing around, that means about 100x the unpacked source
> with the default window size.

That's why I said that the window reversal has to be done as well to be
effective.  As for the index itself, it can be reduced to a single
pointer, since the "ptr" value can be deduced from the offset of the
index entry.
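
Something like this is what the single-pointer reduction could look
like (a sketch with hypothetical names; the exact offset arithmetic
depends on how the population loop indexes the buffer):

	struct index1 {
		struct index1 *next;	/* the 'ptr' member is gone */
	};

	/*
	 * Entries are carved out of one array in buffer order:
	 * pool[0] indexes the block at buf + bufsize - 3 and pool[k]
	 * the block k bytes before it, so the source pointer can be
	 * recomputed from the entry's position in the pool.
	 */
	static inline const unsigned char *entry_ptr(const struct index1 *e,
						     const struct index1 *pool,
						     const unsigned char *buf,
						     unsigned long bufsize)
	{
		return buf + (bufsize - 3) - (e - pool);
	}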

> Also, I am not sure which one is more costly: building the
> hash-index, or using it to search inside the target.  I somehow got
> the impression that the former is relatively cheap, and that is
> what is being cached here.

Yes, but caching it saves 10% of CPU time, probably more once the
window is swapped around, due to lower memory usage.

> > Let's suppose the reference buffer has:
> >  
> > ***********************************************************************/
> >...
> > One improvement might consist of counting the number of consecutive
> > identical bytes when starting a compare, and then skipping as many hash
> > entries (minus the block size) before looping again over the remaining
> > entries in the same hash bucket.
> 
> Umm, again.  Consecutive identical bytes (BTW, I think "* * *"
> and "** ** **" patterns have the same collision issues without
> being consecutive bytes, so such an optimization may be trickier
> and cost more),

First, those "** ** **" are less frequent in general.  Next, they will
be spread among 3 hash buckets instead of all in the same one.  And with
large binary files with lots of zeroes, scanning over those areas in
one pass instead of iterating over them from every offset would help
enormously as well, even without limiting the hash list length.

> when emitted as literals, would compress well,
> wouldn't they?  At the end of the day, I think what matters is
> the size of the deflated delta, since going to disk to read it out
> is more expensive than deflating and applying.  I think you made
> a suggestion along the same lines: capping the max delta size used
> by try_delta() more precisely by taking the deflated size into
> account.

Yes.  But deflating a bunch of literal characters will never be as
dense as a 4-byte delta sequence that might expand to hundreds of bytes.
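
For reference, a copy command in the pack delta format covers a long
span in very few bytes; a minimal encoder sketch (simplified: it
handles offsets and sizes below 2^16 only, and omits the size == 0
special case meaning 0x10000):

	/*
	 * First byte has the MSB set; its low bits flag which offset
	 * and size bytes follow (zero bytes are simply omitted).
	 * Copying 200 bytes from offset 0x1234 costs 4 bytes total:
	 * 0x93 0x34 0x12 0xc8.
	 */
	static unsigned char *emit_copy(unsigned char *out,
					unsigned int off, unsigned int size)
	{
		unsigned char cmd = 0x80, *p = out + 1;
		if (off & 0xff)    { *p++ = off & 0xff;         cmd |= 0x01; }
		if (off & 0xff00)  { *p++ = (off >> 8) & 0xff;  cmd |= 0x02; }
		if (size & 0xff)   { *p++ = size & 0xff;        cmd |= 0x10; }
		if (size & 0xff00) { *p++ = (size >> 8) & 0xff; cmd |= 0x20; }
		*out = cmd;
		return p;
	}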


Nicolas


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-02-28  8:10   ` Junio C Hamano
  2006-02-28 17:05     ` Nicolas Pitre
@ 2006-03-04  2:28     ` Junio C Hamano
  2006-03-04  2:39       ` Junio C Hamano
                         ` (2 more replies)
  1 sibling, 3 replies; 11+ messages in thread
From: Junio C Hamano @ 2006-03-04  2:28 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Junio C Hamano <junkio@cox.net> writes:

>> These numbers are misleading.
>>
>> The 36K objects pack I used in my previous tests gives 9971223
>> (from "next" branch) vs 9664546 (this patch) final pack size.
>> The wallclock time on my machine is 1m35 vs 3m30.  I doubt many
>> people are willing to pay 100% more waiting time for a 3% tighter
>> pack.

	I started to suspect that the above benchmark was
unreliable, because I do not know how the original linux-2.6
repository I used for my testing was packed, and I was letting
the delta reuse do its thing, so the times were probably
off (the 3-byte finer-grained deltifier would have spent a lot
more time during the initial packing) and the resulting sizes were
too (the finer-grained one would have packed things a lot more
tightly if the deltas it was allowed to reuse had been made by
itself, not by the old code).

	So I tried a fresh packing without reusing deltas at all,
with three variants: the original 16-byte one (master), the
3-byte finer-grained one with your hash limit (nico), and the
3-byte finer-grained one with my more aggressive hash limit (jc).
The benchmark is not meant to be scientific, but to see how well
it serves the kernel community ;-).


	The first round.  The set of objects packed was from
today's Linus tip (everything down to the epoch, v2.6.12-rc2), 193309
objects in total, on my Duron 750 with slow disks.

	real		user		sys		bytes		savings
master	11m17.121s	10m23.280s	0m47.290s       109045599	N/A
nico	25m37.058s	23m0.770s       2m20.460s	104401392	4.25%
jc	24m12.072s	21m45.120s	2m16.400s	104409761	4.25%

My hack does not change things much in the overall picture
compared to yours, although it does cut the runtime by about
5%.  We can immediately see that the finer-grained ones are
significantly more expensive than the original loose one either
way.


	The next round of tests is to place the "full pack"
generated in the previous experiment in .git/objects/pack and
generate packs by reusing deltas.  When testing the "master"
version, I move the pack produced above with "master" under
.git/objects/pack, and run this to emulate pack generation for a
downloader who has v2.6.14 and wants v2.6.15-rc1 (the same 36K
objects test I did previously):

	git rev-list --objects-edge v2.6.14..v2.6.15-rc1 |
        time git-pack-objects --stdout | wc -c

When switching to the "nico" deltifier implementation, I start
with the full pack generated by "nico" in .git/objects/pack to
make the comparison fair.  Here is the result:

	real		user		sys		bytes		savings
master	1m37.703s       1m28.970s	0m5.860s        9968386		N/A
nico	3m39.982s	3m24.420s	0m14.380s	9182761		7.88%
jc	3m0.434s	2m35.940s	0m13.730s       9139460		8.31%

In a thin transfer, base objects are omitted, so the generated pack
has a higher delta to non-delta ratio than usual -- and that is
the reason we see much larger savings.  Still, we are paying quite
a lot of overhead for the finer-grained delta code.


	The last test is not a thin pack, but still reuses
deltas.  This emulates "git repack" performance.  Again, I
had the matching pack in .git/objects/pack to make the
comparison fair:

	git rev-list --objects v2.6.14..v2.6.15-rc1 |
        time git-pack-objects --stdout | wc -c

And here is the result:

	real		user		sys		bytes		savings
master	1m0.866s	0m57.590s	0m2.810s	34601417	N/A
nico	2m8.059s	2m0.360s	0m6.350s	33888419	2.06%
jc	1m49.894s	1m42.250s	0m6.780s	33838262	2.20%

This one does not get much savings compared to the full pack
case, but still spends about twice the time.


	In short, I'd love to get tighter packing, but as it currently
stands, I do not think a 5% pack size reduction warrants making
the operation 100% slower by default.  And the experiment I did
does not account for the memory pressure the new deltifier
imposes on us (two pointers for every byte of source material --
it could be reduced to one pointer, though), so when we start
talking about huge files I am not sure we can keep the runtime
reasonably low.

	It may be an option to have a flag to say "I want a much
tighter pack made; you can spend all the time you want while I am
sleeping" and switch between the two deltifiers, but otherwise...

	One thing I do not understand is why my patch improves
the compressed result size.  The patch was primarily meant to
brutally pessimize the selection of "copied from source" candidates
to avoid the corner-case performance problems, so I would understand
why it is faster than yours, but I expected it to *always* create a
looser pack.  I am puzzled.

	Swapping the window scan order is orthogonal to this, so
maybe we could do it first and make it work, then start reusing
the refindex to see how much things improve.  But we should be
reminded of the recent report that pack-objects ran out of memory
even with the current code, which does not reuse the refindex, when
packing 8 huge objects X-<.


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-03-04  2:28     ` Junio C Hamano
@ 2006-03-04  2:39       ` Junio C Hamano
  2006-03-04  3:01       ` Nicolas Pitre
  2006-03-04  6:21       ` Linus Torvalds
  2 siblings, 0 replies; 11+ messages in thread
From: Junio C Hamano @ 2006-03-04  2:39 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

Junio C Hamano <junkio@cox.net> writes:

> 	The first round.  The set of objects packed was from
> today's Linus tip (everything down to the epoch, v2.6.12-rc2), 193309
> objects in total, on my Duron 750 with slow disks.
>
> 	  real		user		sys		bytes		savings
> master  11m17.121s	10m23.280s	0m47.290s       109045599	N/A
> nico	  25m37.058s	23m0.770s       2m20.460s	104401392	4.25%
> jc	  24m12.072s	21m45.120s	2m16.400s	104409761	4.25%

Minor correction in the numbers: the sizes for nico and jc are
swapped.  The jc variant created the smallest pack in this
experiment.

Which puzzles me even more...


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-03-04  2:28     ` Junio C Hamano
  2006-03-04  2:39       ` Junio C Hamano
@ 2006-03-04  3:01       ` Nicolas Pitre
  2006-03-04  6:21       ` Linus Torvalds
  2 siblings, 0 replies; 11+ messages in thread
From: Nicolas Pitre @ 2006-03-04  3:01 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Fri, 3 Mar 2006, Junio C Hamano wrote:

> 	In short, I'd love to get tighter packing, but as it currently
> stands, I do not think a 5% pack size reduction warrants making
> the operation 100% slower by default.

Agreed.  Please just drop that one patch for now.

I'll rework the hash limiting patch so the 16-byte block version behaves 
better with the large 20MB files.


Nicolas


* Re: [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
  2006-03-04  2:28     ` Junio C Hamano
  2006-03-04  2:39       ` Junio C Hamano
  2006-03-04  3:01       ` Nicolas Pitre
@ 2006-03-04  6:21       ` Linus Torvalds
  2 siblings, 0 replies; 11+ messages in thread
From: Linus Torvalds @ 2006-03-04  6:21 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nicolas Pitre, git



On Fri, 3 Mar 2006, Junio C Hamano wrote:
> 
> 	The first round.  The set of objects packed was from
> today's Linus tip (everything down to the epoch, v2.6.12-rc2), 193309
> objects in total, on my Duron 750 with slow disks.
> 
> 	real		user		sys		bytes		savings
> master	11m17.121s	10m23.280s	0m47.290s       109045599	N/A
> nico	25m37.058s	23m0.770s       2m20.460s	104401392	4.25%
> jc	24m12.072s	21m45.120s	2m16.400s	104409761	4.25%

Ouch.

Btw, it's often worth using /usr/bin/time instead of the bash built-in
time.  Why?  Because /usr/bin/time reports one absolutely _hugely_
important number (maybe more important than almost any of the other
numbers in there).

Which one?  It's the "minor pagefaults" count.  That's a very good
approximation of memory usage overhead.
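
For example, wrapping the earlier benchmark (the pagefault counts
appear in GNU time's default output):

	$ git rev-list --objects-edge v2.6.14..v2.6.15-rc1 |
	  /usr/bin/time git-pack-objects --stdout | wc -c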

We've already had one person report that they ran out of memory when 
packing. So memory usage is actually a problem.

Anyway, it looks like the 16-byte window is the way to go, regardless
of any memory use issue.

		Linus


* [PATCH] diff-delta: bound hash list length to avoid O(m*n) behavior
@ 2006-03-08 19:32 Nicolas Pitre
  0 siblings, 0 replies; 11+ messages in thread
From: Nicolas Pitre @ 2006-03-08 19:32 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git


The diff-delta code can exhibit O(m*n) behavior with some pathological
data sets where most hash entries end up in the same hash bucket.

To prevent this, a limit is imposed on the number of entries that can
exist in the same hash bucket.

Because of the above the code is a tiny bit more expensive on average,
even though some small optimizations were added as well to attenuate
the overhead.  But the problematic samples used to diagnose the issue
are now orders of magnitude less expensive to process, with only a
slight loss in compression.

Signed-off-by: Nicolas Pitre <nico@cam.org>

---

For example, Carl Baldwin provided me with a couple of 20MB files, and
deltifying one against the other with test-delta takes around
TEN MINUTES for only one delta on my P4 @ 3GHz.

Now imagine using git-repack -a with the default window
of 10 ...

With this patch the test-delta time dropped to only 9 seconds.  And the 
resulting delta, once compressed, is about 2% larger.

diff --git a/diff-delta.c b/diff-delta.c
index 2ed5984..aaee7be 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -40,17 +40,18 @@ struct index {
 
 static struct index ** delta_index(const unsigned char *buf,
 				   unsigned long bufsize,
+				   unsigned long trg_bufsize,
 				   unsigned int *hash_shift)
 {
-	unsigned int hsize, hshift, entries, blksize, i;
+	unsigned int i, hsize, hshift, hlimit, entries, *hash_count;
 	const unsigned char *data;
 	struct index *entry, **hash;
 	void *mem;
 
 	/* determine index hash size */
-	entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE;
+	entries = bufsize / BLK_SIZE;
 	hsize = entries / 4;
-	for (i = 4; (1 << i) < hsize && i < 16; i++);
+	for (i = 4; (1 << i) < hsize && i < 31; i++);
 	hsize = 1 << i;
 	hshift = 32 - i;
 	*hash_shift = hshift;
@@ -63,20 +64,62 @@ static struct index ** delta_index(const
 	entry = mem + hsize * sizeof(*hash);
 	memset(hash, 0, hsize * sizeof(*hash));
 
-	/* then populate it */
+	/* allocate an array to count hash entries */
+	hash_count = calloc(hsize, sizeof(*hash_count));
+	if (!hash_count) {
+		free(hash);
+		return NULL;
+	}
+
+	/* then populate the index */
 	data = buf + entries * BLK_SIZE - BLK_SIZE;
-	blksize = bufsize - (data - buf);
 	while (data >= buf) {
-		unsigned int val = adler32(0, data, blksize);
+		unsigned int val = adler32(0, data, BLK_SIZE);
 		i = HASH(val, hshift);
 		entry->ptr = data;
 		entry->val = val;
 		entry->next = hash[i];
 		hash[i] = entry++;
-		blksize = BLK_SIZE;
+		hash_count[i]++;
 		data -= BLK_SIZE;
  	}
 
+	/*
+	 * Determine a limit on the number of entries in the same hash
+	 * bucket.  This guards us against pathological data sets causing
+	 * really bad hash distribution with most entries in the same hash
+	 * bucket, which would bring us to O(m*n) computing costs (m and n
+	 * corresponding to the reference and target buffer sizes).
+	 *
+	 * The larger the target buffer, the more important it is to have
+	 * short entry lists for each hash bucket.  With such a limit the
+	 * cost is bounded to something more like O(m+n).
+	 */
+	hlimit = (1 << 26) / trg_bufsize;
+	if (hlimit < 4*BLK_SIZE)
+		hlimit = 4*BLK_SIZE;
+
+	/*
+	 * Now make sure none of the hash buckets has more entries than
+	 * we're willing to test.  Otherwise we cull the entry list
+	 * uniformly to still preserve a good distribution across
+	 * the reference buffer.
+	 */
+	for (i = 0; i < hsize; i++) {
+		if (hash_count[i] < hlimit)
+			continue;
+		entry = hash[i];
+		do {
+			struct index *keep = entry;
+			int skip = hash_count[i] / hlimit / 2;
+			do {
+				entry = entry->next;
+			} while(--skip && entry);
+			keep->next = entry;
+		} while(entry);
+	}
+	free(hash_count);
+
 	return hash;
 }
 
@@ -100,7 +143,7 @@ void *diff_delta(void *from_buf, unsigne
 
 	if (!from_size || !to_size)
 		return NULL;
-	hash = delta_index(from_buf, from_size, &hash_shift);
+	hash = delta_index(from_buf, from_size, to_size, &hash_shift);
 	if (!hash)
 		return NULL;
 
@@ -141,29 +184,27 @@ void *diff_delta(void *from_buf, unsigne
 
 	while (data < top) {
 		unsigned int moff = 0, msize = 0;
-		unsigned int blksize = MIN(top - data, BLK_SIZE);
-		unsigned int val = adler32(0, data, blksize);
-		i = HASH(val, hash_shift);
-		for (entry = hash[i]; entry; entry = entry->next) {
-			const unsigned char *ref = entry->ptr;
-			const unsigned char *src = data;
-			unsigned int ref_size = ref_top - ref;
-			if (entry->val != val)
-				continue;
-			if (ref_size > top - src)
-				ref_size = top - src;
-			while (ref_size && *src++ == *ref) {
-				ref++;
-				ref_size--;
-			}
-			ref_size = ref - entry->ptr;
-			if (ref_size > msize) {
-				/* this is our best match so far */
-				moff = entry->ptr - ref_data;
-				msize = ref_size;
-				if (msize >= 0x10000) {
-					msize = 0x10000;
+		if (data + BLK_SIZE <= top) {
+			unsigned int val = adler32(0, data, BLK_SIZE);
+			i = HASH(val, hash_shift);
+			for (entry = hash[i]; entry; entry = entry->next) {
+				const unsigned char *ref = entry->ptr;
+				const unsigned char *src = data;
+				unsigned int ref_size = ref_top - ref;
+				if (entry->val != val)
+					continue;
+				if (ref_size > top - src)
+					ref_size = top - src;
+				if (ref_size > 0x10000)
+					ref_size = 0x10000;
+				if (ref_size <= msize)
 					break;
+				while (ref_size-- && *src++ == *ref)
+					ref++;
+				if (msize < ref - entry->ptr) {
+					/* this is our best match so far */
+					msize = ref - entry->ptr;
+					moff = entry->ptr - ref_data;
 				}
 			}
 		}

