* [PATCH] diff-delta: produce optimal pack data @ 2006-02-22 1:45 Nicolas Pitre 2006-02-24 8:49 ` Junio C Hamano 0 siblings, 1 reply; 35+ messages in thread From: Nicolas Pitre @ 2006-02-22 1:45 UTC (permalink / raw) To: Junio C Hamano; +Cc: git Indexing based on adler32 has a match precision based on the block size (currently 16). Lowering the block size would produce smaller deltas but the indexing memory and computing cost increases significantly. For optimal delta result the indexing block size should be 3 with an increment of 1 (instead of 16 and 16). With such low params the adler32 becomes a clear overhead increasing the time for git-repack by a factor of 3. And with such small blocks the adler 32 is not very useful as the whole of the block bits can be used directly. This patch replaces the adler32 with an open coded index value based on 3 characters directly. This gives sufficient bits for hashing and allows for optimal delta with reasonable CPU cycles. The resulting packs are 6% smaller on average. The increase in CPU time is about 25%. But this cost is now hidden by the delta reuse patch while the saving on data transfers is always there. Signed-off-by: Nicolas Pitre <nico@cam.org> --- diff-delta.c | 77 +++++++++++++++++++++++----------------------------------- 1 files changed, 30 insertions(+), 47 deletions(-) 54aa50fb403981a9292453b76d894a79da9698de diff --git a/diff-delta.c b/diff-delta.c index 2ed5984..27f83a0 100644 --- a/diff-delta.c +++ b/diff-delta.c @@ -20,21 +20,11 @@ #include <stdlib.h> #include <string.h> -#include <zlib.h> #include "delta.h" -/* block size: min = 16, max = 64k, power of 2 */ -#define BLK_SIZE 16 - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -#define GR_PRIME 0x9e370001 -#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift)) - struct index { const unsigned char *ptr; - unsigned int val; struct index *next; }; @@ -42,21 +32,21 @@ static struct index ** delta_index(const unsigned long bufsize, unsigned int *hash_shift) { - unsigned int hsize, hshift, entries, blksize, i; + unsigned long hsize; + unsigned int hshift, i; const unsigned char *data; struct index *entry, **hash; void *mem; /* determine index hash size */ - entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE; - hsize = entries / 4; - for (i = 4; (1 << i) < hsize && i < 16; i++); + hsize = bufsize / 4; + for (i = 8; (1 << i) < hsize && i < 16; i++); hsize = 1 << i; - hshift = 32 - i; + hshift = i - 8; *hash_shift = hshift; /* allocate lookup index */ - mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry)); + mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry)); if (!mem) return NULL; hash = mem; @@ -64,17 +54,12 @@ static struct index ** delta_index(const memset(hash, 0, hsize * sizeof(*hash)); /* then populate it */ - data = buf + entries * BLK_SIZE - BLK_SIZE; - blksize = bufsize - (data - buf); - while (data >= buf) { - unsigned int val = adler32(0, data, blksize); - i = HASH(val, hshift); - entry->ptr = data; - entry->val = val; + data = buf + bufsize - 2; + while (data > buf) { + entry->ptr = --data; + i = data[0] ^ data[1] ^ (data[2] << hshift); entry->next = hash[i]; hash[i] = entry++; - blksize = BLK_SIZE; - data -= BLK_SIZE; } return hash; @@ -141,29 +126,27 @@ void *diff_delta(void *from_buf, unsigne while (data < top) { unsigned int moff = 0, msize = 0; - unsigned int blksize = MIN(top - data, BLK_SIZE); - unsigned int val = adler32(0, data, blksize); - i = HASH(val, hash_shift); - for (entry = hash[i]; entry; entry = entry->next) { - const unsigned char 
*ref = entry->ptr; - const unsigned char *src = data; - unsigned int ref_size = ref_top - ref; - if (entry->val != val) - continue; - if (ref_size > top - src) - ref_size = top - src; - while (ref_size && *src++ == *ref) { - ref++; - ref_size--; - } - ref_size = ref - entry->ptr; - if (ref_size > msize) { - /* this is our best match so far */ - moff = entry->ptr - ref_data; - msize = ref_size; - if (msize >= 0x10000) { - msize = 0x10000; + if (data + 2 < top) { + i = data[0] ^ data[1] ^ (data[2] << hash_shift); + for (entry = hash[i]; entry; entry = entry->next) { + const unsigned char *ref = entry->ptr; + const unsigned char *src = data; + unsigned int ref_size = ref_top - ref; + if (ref_size > top - src) + ref_size = top - src; + if (ref_size > 0x10000) + ref_size = 0x10000; + if (ref_size <= msize) break; + while (ref_size && *src++ == *ref) { + ref++; + ref_size--; + } + ref_size = ref - entry->ptr; + if (msize < ref - entry->ptr) { + /* this is our best match so far */ + msize = ref - entry->ptr; + moff = entry->ptr - ref_data; } } } -- 1.2.2.g6643-dirty ^ permalink raw reply related [flat|nested] 35+ messages in thread
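To make the indexing scheme of the patch concrete, here is a small self-contained toy program (not part of the patch): it indexes every overlapping 3-byte window of a buffer with the same open-coded hash and chains colliding windows in buckets. The sample string, bucket count and shift are made up for illustration; the real code sizes the table from the buffer length as shown in the diff above.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct index_entry {
	const unsigned char *ptr;
	struct index_entry *next;
};

int main(void)
{
	const unsigned char ref[] = "the quick brown fox jumps over the lazy dog";
	unsigned long bufsize = sizeof(ref) - 1;
	unsigned int hshift = 2;                  /* hsize is 1 << (hshift + 8) ... */
	unsigned long hsize = 1 << (hshift + 8);  /* ... so the xor below always fits */
	struct index_entry *pool = calloc(bufsize, sizeof(*pool));
	struct index_entry **hash = calloc(hsize, sizeof(*hash));
	struct index_entry *entry = pool;
	const unsigned char *data, *probe;
	unsigned int i;

	if (!pool || !hash)
		return 1;

	/* populate: walk backwards so earlier offsets end up at the bucket head */
	data = ref + bufsize - 2;
	while (data > ref) {
		entry->ptr = --data;
		i = data[0] ^ data[1] ^ (data[2] << hshift);
		entry->next = hash[i];
		hash[i] = entry++;
	}

	/* look up the 3-byte window "the": it occurs twice in the buffer */
	probe = (const unsigned char *)"the";
	i = probe[0] ^ probe[1] ^ (probe[2] << hshift);
	for (entry = hash[i]; entry; entry = entry->next)
		if (!memcmp(entry->ptr, probe, 3))
			printf("match at offset %ld\n", (long)(entry->ptr - ref));

	free(pool);
	free(hash);
	return 0;
}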
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-22 1:45 [PATCH] diff-delta: produce optimal pack data Nicolas Pitre
@ 2006-02-24 8:49 ` Junio C Hamano
  2006-02-24 15:37 ` Nicolas Pitre
  2006-02-24 17:44 ` Carl Baldwin
  0 siblings, 2 replies; 35+ messages in thread
From: Junio C Hamano @ 2006-02-24 8:49 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: git

Nicolas Pitre <nico@cam.org> writes:

> Indexing based on adler32 has a match precision based on the block size
> (currently 16).  Lowering the block size would produce smaller deltas
> but the indexing memory and computing cost increases significantly.

Indeed.

I had this patch in my personal tree for a while.  I was wondering why
sometimes the progress indication during the "Deltifying" stage stops for
literally several seconds, or more.

In the Linux 2.6 repository, these object pairs take forever to delta.

  blob 9af06ba723df75fed49f7ccae5b6c9c34bc5115f ->
  blob dfc9cd58dc065d17030d875d3fea6e7862ede143
  size (491102 -> 496045)
  58 seconds

  blob 4917ec509720a42846d513addc11cbd25e0e3c4f ->
  blob dfc9cd58dc065d17030d875d3fea6e7862ede143
  size (495831 -> 496045)
  64 seconds

Admittedly, these are *BAD* input samples (a binary firmware blob with
many similar looking ", 0x" sequences).  I can see that trying really
hard to reuse source material would take significant computation.

However, this is simply unacceptable.

The new algorithm takes 58 seconds to produce 136000 bytes of delta,
while the old one takes 0.25 seconds to produce 248899 (I am using the
test-delta program in the git.git distribution).  The compression ratio
is significantly better, but this is unusable even for offline archival
use (remember, pack delta selection needs to do window=10 such
deltification trials to come up with the best delta, so you are spending
10 minutes to save 100k from one oddball blob), let alone on-the-fly pack
generation for network transfer.

Maybe we would want the two implementations next to each other, and
internally check whether one is taking too many cycles compared to the
input size, then switch to the cheaper version?

^ permalink raw reply [flat|nested] 35+ messages in thread
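For anyone wanting to reproduce such timings on a single blob pair, a rough harness along the lines of the test-delta program mentioned above might look like the following. It assumes it is built inside the git tree against diff-delta.c and delta.h; the file-loading helper is ad hoc, and the 0 for the last diff_delta() argument mirrors what test-delta passes (no cap on the delta size).

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "delta.h"

static void *slurp(const char *path, unsigned long *len)
{
	FILE *f = fopen(path, "rb");
	void *buf;
	long sz;

	if (!f)
		return NULL;
	fseek(f, 0, SEEK_END);
	sz = ftell(f);
	rewind(f);
	buf = malloc(sz);
	if (buf && fread(buf, 1, sz, f) != (size_t)sz) {
		free(buf);
		buf = NULL;
	}
	fclose(f);
	*len = sz;
	return buf;
}

int main(int argc, char **argv)
{
	unsigned long from_len, to_len, delta_len;
	void *from, *to, *delta;
	clock_t t0;

	if (argc != 3)
		return 1;
	from = slurp(argv[1], &from_len);
	to = slurp(argv[2], &to_len);
	if (!from || !to)
		return 1;

	t0 = clock();
	delta = diff_delta(from, from_len, to, to_len, &delta_len, 0);
	printf("%lu -> %lu: delta %lu bytes in %.2f seconds\n",
	       from_len, to_len, delta ? delta_len : 0,
	       (double)(clock() - t0) / CLOCKS_PER_SEC);
	return 0;
}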
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-24 8:49 ` Junio C Hamano
@ 2006-02-24 15:37 ` Nicolas Pitre
  2006-02-24 23:55 ` Junio C Hamano
  2006-02-24 17:44 ` Carl Baldwin
  1 sibling, 1 reply; 35+ messages in thread
From: Nicolas Pitre @ 2006-02-24 15:37 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git

On Fri, 24 Feb 2006, Junio C Hamano wrote:

> Nicolas Pitre <nico@cam.org> writes:
>
> > Indexing based on adler32 has a match precision based on the block size
> > (currently 16).  Lowering the block size would produce smaller deltas
> > but the indexing memory and computing cost increases significantly.
>
> Indeed.
>
> I had this patch in my personal tree for a while.  I was wondering why
> sometimes the progress indication during the "Deltifying" stage stops
> for literally several seconds, or more.

Note that above I'm saying that _keeping_ adler32 for small blocks takes
even longer.  In other words, for small blocks, the version not using
adler32 is about 3 times faster.

I also noticed the significant slowdown after I made the improved
progress patch.  The idea now has to do with detecting pathological cases
and breaking out of them early.

> In the Linux 2.6 repository, these object pairs take forever to delta.
>
>   blob 9af06ba723df75fed49f7ccae5b6c9c34bc5115f ->
>   blob dfc9cd58dc065d17030d875d3fea6e7862ede143
>   size (491102 -> 496045)
>   58 seconds
>
>   blob 4917ec509720a42846d513addc11cbd25e0e3c4f ->
>   blob dfc9cd58dc065d17030d875d3fea6e7862ede143
>   size (495831 -> 496045)
>   64 seconds

Thanks for this.  I'll see what I can do to tweak the code to better cope
with those.  Just keep my fourth delta patch in the pu branch for now.

Nicolas

^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 15:37 ` Nicolas Pitre @ 2006-02-24 23:55 ` Junio C Hamano 0 siblings, 0 replies; 35+ messages in thread From: Junio C Hamano @ 2006-02-24 23:55 UTC (permalink / raw) To: Nicolas Pitre; +Cc: git Nicolas Pitre <nico@cam.org> writes: > On Fri, 24 Feb 2006, Junio C Hamano wrote: > >> In Linux 2.6 repository, these object pairs take forever to >> delta. >> >> blob 9af06ba723df75fed49f7ccae5b6c9c34bc5115f -> >> blob dfc9cd58dc065d17030d875d3fea6e7862ede143 >> size (491102 -> 496045) >> 58 seconds >> >> blob 4917ec509720a42846d513addc11cbd25e0e3c4f -> >> blob dfc9cd58dc065d17030d875d3fea6e7862ede143 >> size (495831 -> 496045) >> 64 seconds > > Thanks for this. I'll see what I can do to tweak the code to better > cope with those. Just keep my fourth delta patch in the pu branch for > now. If apply this on top of pack-objects.c, you can find more of them yourself. --- diff --git a/pack-objects.c b/pack-objects.c index be7a200..3f88e86 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -62,6 +62,7 @@ static const char *base_name; static unsigned char pack_file_sha1[20]; static int progress = 1; static volatile int progress_update = 0; +static volatile int progress_update_cnt = 0; /* * The object names in objects array are hashed with this hashtable, @@ -826,6 +827,7 @@ static int try_delta(struct unpacked *cu struct object_entry *old_entry = old->entry; int old_preferred = (old_entry->preferred_base || old_entry->based_on_preferred); + int last_up; unsigned long size, oldsize, delta_size, sizediff; long max_size; void *delta_buf; @@ -890,8 +892,17 @@ static int try_delta(struct unpacked *cu } if (sizediff >= max_size) return -1; + last_up = progress_update_cnt; delta_buf = diff_delta(old->data, oldsize, cur->data, size, &delta_size, max_size); + if (last_up + 1 < progress_update_cnt) { + /* It took more than one second */ + fprintf(stderr, "%d -> %d: %s -> ", + last_up, progress_update_cnt, + sha1_to_hex(old_entry->sha1)); + fprintf(stderr, "%s (%lu -> %lu)\n", + sha1_to_hex(cur_entry->sha1), oldsize, size); + } if (!delta_buf) return 0; cur_entry->delta = old_entry; @@ -906,6 +917,7 @@ static void progress_interval(int signum { signal(SIGALRM, progress_interval); progress_update = 1; + progress_update_cnt++; } static void find_deltas(struct object_entry **list, int window, int depth) ^ permalink raw reply related [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 8:49 ` Junio C Hamano 2006-02-24 15:37 ` Nicolas Pitre @ 2006-02-24 17:44 ` Carl Baldwin 2006-02-24 17:56 ` Nicolas Pitre 1 sibling, 1 reply; 35+ messages in thread From: Carl Baldwin @ 2006-02-24 17:44 UTC (permalink / raw) To: Junio C Hamano; +Cc: Nicolas Pitre, git Junio, This message came to me at exactly the right time. Yesterday I was exploring using git as the content storage back-end for some binary files. Up until now I've only used it for software projects. I found the largest RCS file that we had in our current back-end. It contained twelve versions of a binary file. Each version averaged about 20 MB. The ,v file from RCS was about 250MB. I did some experiments on these binary files. First, gzip consistantly is able to compress these files to about 10% their original size. So, they are quite inflated. Second, xdelta would produce a delta between two neighboring revisions of about 2.5MB in size that would compress down to about 2MB. (about the same size as the next revision compressed without deltification so packing is ineffective here). I added these 12 revisions to several version control back-ends including subversion and git. Git produced a much smaller repository size than the others simply due to the compression that it applies to objects. It also was at least as fast as the others. The problem came when I tried to clone this repository. git-pack-objects chewed on these 12 revisions for over an hour before I finally interrupted it. As far as I could tell, it hadn't made much progress. My other complaint was that git prune ran slow (~8 seconds on my very fast machine with fast disk access) on a repository with only these twelve revisions in it (37 total objects in the object store). This is because 'git prune' actually ends up running fsck on all of the objects which verifies the sha1 of each object. This seems like a lot of work just to prune unwanted objects. What would you say to a --fast option to git-prune that would avoid most of what fsck does including verifying sha1 for each object? Anyway, that was a tangent. I looked into to overriding the --depth option to git-pack-objects and set it to 0. However, this isn't trivial. git-pack-objects is never called directly by the user. It is only called through things like 'git clone', 'git push' and 'git repack'. What do you think about this? Could we add a configuration option that could be set for the repository? Something smarter like what you suggest where git would pack small text files but give up on large binaries would be optimal. I've already determined that packing a repository with this type of largish binary file doesn't do any good but there doesn't seem to be a way to avoid packing when doing network operations. Thoughts? Carl On Fri, Feb 24, 2006 at 12:49:13AM -0800, Junio C Hamano wrote: > Nicolas Pitre <nico@cam.org> writes: > > > Indexing based on adler32 has a match precision based on the block size > > (currently 16). Lowering the block size would produce smaller deltas > > but the indexing memory and computing cost increases significantly. > > Indeed. > > I had this patch in my personal tree for a while. I was > wondring why sometimes progress indication during "Deltifying" > stage stops for literally several seconds, or more. > > In Linux 2.6 repository, these object pairs take forever to > delta. 
> > blob 9af06ba723df75fed49f7ccae5b6c9c34bc5115f -> > blob dfc9cd58dc065d17030d875d3fea6e7862ede143 > size (491102 -> 496045) > 58 seconds > > blob 4917ec509720a42846d513addc11cbd25e0e3c4f -> > blob dfc9cd58dc065d17030d875d3fea6e7862ede143 > size (495831 -> 496045) > 64 seconds > > Admittedly, these are *BAD* input samples (a binary firmware > blob with many similar looking ", 0x" sequences). I can see > that trying to reuse source materials really hard would take > significant computation. > > However, this is simply unacceptable. > > The new algoritm takes 58 seconds to produce 136000 bytes of > delta, while the old takes 0.25 seconds to produce 248899 (I am > using the test-delta program in git.git distribution). The > compression ratio is significantly better, but this is unusable > even for offline archival use (remember, pack delta selection > needs to do window=10 such deltification trials to come up with > the best delta, so you are spending 10 minutes to save 100k from > one oddball blob), let alone on-the-fly pack generation for > network transfer. > > Maybe we would want two implementation next to each other, and > internally see if it is taking too much cycles compared to the > input size then switch to cheaper version? > > - > To unsubscribe from this list: send the line "unsubscribe git" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Carl Baldwin RADCAD (R&D CAD) Hewlett Packard Company MS 88 work: 970 898-1523 3404 E. Harmony Rd. work: Carl.N.Baldwin@hp.com Fort Collins, CO 80525 home: Carl@ecBaldwin.net - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 17:44 ` Carl Baldwin @ 2006-02-24 17:56 ` Nicolas Pitre 2006-02-24 18:35 ` Carl Baldwin 2006-02-24 18:49 ` Carl Baldwin 0 siblings, 2 replies; 35+ messages in thread From: Nicolas Pitre @ 2006-02-24 17:56 UTC (permalink / raw) To: Carl Baldwin; +Cc: Junio C Hamano, git On Fri, 24 Feb 2006, Carl Baldwin wrote: > Junio, > > This message came to me at exactly the right time. Yesterday I was > exploring using git as the content storage back-end for some binary > files. Up until now I've only used it for software projects. > > I found the largest RCS file that we had in our current back-end. It > contained twelve versions of a binary file. Each version averaged about > 20 MB. The ,v file from RCS was about 250MB. I did some experiments on > these binary files. > > First, gzip consistantly is able to compress these files to about 10% > their original size. So, they are quite inflated. Second, xdelta would > produce a delta between two neighboring revisions of about 2.5MB in size > that would compress down to about 2MB. (about the same size as the next > revision compressed without deltification so packing is ineffective > here). > > I added these 12 revisions to several version control back-ends > including subversion and git. Git produced a much smaller repository > size than the others simply due to the compression that it applies to > objects. It also was at least as fast as the others. > > The problem came when I tried to clone this repository. > git-pack-objects chewed on these 12 revisions for over an hour before I > finally interrupted it. As far as I could tell, it hadn't made much > progress. I must ask if you had applied my latest delta patches? Also did you use a recent version of git that implements pack data reuse? Nicolas ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 17:56 ` Nicolas Pitre @ 2006-02-24 18:35 ` Carl Baldwin 2006-02-24 18:57 ` Nicolas Pitre 2006-02-24 18:49 ` Carl Baldwin 1 sibling, 1 reply; 35+ messages in thread From: Carl Baldwin @ 2006-02-24 18:35 UTC (permalink / raw) To: Nicolas Pitre; +Cc: Junio C Hamano, git On Fri, Feb 24, 2006 at 12:56:04PM -0500, Nicolas Pitre wrote: My version is 1.2.1. I have not been following your work. When was pack data reuse introduced? From where can I obtain your delta patches? There is really no opportunity for pack-data reuse in this case. The repository had never been packed or cloned in the first place. As I said, I do not intend to pack these binary files at all since there is no benefit in this case. The delta patches may help but I can't say for sure since I don't know anything about them. Let me know where I can get them. Carl > > I must ask if you had applied my latest delta patches? > > Also did you use a recent version of git that implements pack data > reuse? > > > Nicolas > - > To unsubscribe from this list: send the line "unsubscribe git" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Carl Baldwin RADCAD (R&D CAD) Hewlett Packard Company MS 88 work: 970 898-1523 3404 E. Harmony Rd. work: Carl.N.Baldwin@hp.com Fort Collins, CO 80525 home: Carl@ecBaldwin.net - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-24 18:35 ` Carl Baldwin
@ 2006-02-24 18:57 ` Nicolas Pitre
  2006-02-24 19:23 ` Carl Baldwin
  0 siblings, 1 reply; 35+ messages in thread
From: Nicolas Pitre @ 2006-02-24 18:57 UTC (permalink / raw)
To: Carl Baldwin; +Cc: Junio C Hamano, git

On Fri, 24 Feb 2006, Carl Baldwin wrote:

> My version is 1.2.1.  I have not been following your work.  When was
> pack data reuse introduced?

Try out version 1.2.3.

> From where can I obtain your delta patches?

Forget them for now -- they won't help you.

> There is really no opportunity for pack-data reuse in this case.  The
> repository had never been packed or cloned in the first place.  As I
> said, I do not intend to pack these binary files at all since there is
> no benefit in this case.

Yes there is, as long as you have version 1.2.3.  The clone logic will
simply reuse already packed data without attempting to recompute it.

> The delta patches may help but I can't say for sure since I don't know
> anything about them.

They (actually the last one) might help reduce the size of resulting
packs, but it currently has performance problems with some pathological
data sets.

I think you really should try git version 1.2.3 with a packed repository.
It might handle your special case just fine.

Nicolas

^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 18:57 ` Nicolas Pitre @ 2006-02-24 19:23 ` Carl Baldwin 2006-02-24 20:02 ` Nicolas Pitre 2006-02-24 20:02 ` Linus Torvalds 0 siblings, 2 replies; 35+ messages in thread From: Carl Baldwin @ 2006-02-24 19:23 UTC (permalink / raw) To: Nicolas Pitre; +Cc: Junio C Hamano, git On Fri, Feb 24, 2006 at 01:57:20PM -0500, Nicolas Pitre wrote: > On Fri, 24 Feb 2006, Carl Baldwin wrote: > > > My version is 1.2.1. I have not been following your work. When was > > pack data reuse introduced? > > Try out version 1.2.3. I'm on it now. > > From where can I obtain your delta patches? > > Forget them for now -- they won't help you. Ah, I have been looking at your patches and clearly they will not help. > > There is really no opportunity for pack-data reuse in this case. The > > repository had never been packed or cloned in the first place. As I > > said, I do not intend to pack these binary files at all since there is > > no benefit in this case. > > Yes there is, as long as you have version 1.2.3. The clone logic will > simply reuse already packed data without attempting to recompute it. I meant that there is no benefit in disk space usage. Packing may actually increase my disk space usage in this case. Refer to what I said about experimentally running gzip and xdelta on the files independantly of git. I see what you're saying about this data reuse helping to speed up subsequent cloning operations. However, if packing takes this long and doesn't give me any disk space savings then I don't want to pay the very heavy price of packing these files even the first time nor do I want to pay the price incrementally. The most I would tolerate for the first pack is a few seconds. The most I would tolerate for any incremental pack is about 1 second. BTW, git repack has been going for 30 minutes and has packed 4/36 objects. :-) > I think you really should try git version 1.2.3 with a packed > repository. It might handle your special case just fine. No, not when I'm not willing to pay the price to pack even once. This isn't a case where I have one such repository and 'once its been packed then its packed'. This is only one example of such a repository. I am looking for a process for revisioning this type of data that will be used over and over. Git may not be the answer here but it sure is looking good in many other ways. I think the right answer would be for git to avoid trying to pack files like this. Junio mentioned something like this in his message. Thanks for your input. Cheers, Carl -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Carl Baldwin RADCAD (R&D CAD) Hewlett Packard Company MS 88 work: 970 898-1523 3404 E. Harmony Rd. work: Carl.N.Baldwin@hp.com Fort Collins, CO 80525 home: Carl@ecBaldwin.net - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-24 19:23 ` Carl Baldwin
@ 2006-02-24 20:02 ` Nicolas Pitre
  2006-02-24 20:40 ` Carl Baldwin
  0 siblings, 1 reply; 35+ messages in thread
From: Nicolas Pitre @ 2006-02-24 20:02 UTC (permalink / raw)
To: Carl Baldwin; +Cc: Junio C Hamano, git

On Fri, 24 Feb 2006, Carl Baldwin wrote:

> I see what you're saying about this data reuse helping to speed up
> subsequent cloning operations.  However, if packing takes this long and
> doesn't give me any disk space savings then I don't want to pay the very
> heavy price of packing these files even the first time nor do I want to
> pay the price incrementally.

Of course.  There is admittedly a problem here.  I'm just abusing a bit
of your time to properly identify its parameters.

> The most I would tolerate for the first pack is a few seconds.  The most
> I would tolerate for any incremental pack is about 1 second.

Well that is probably a bit tight.  Ideally it should be linear with the
size of the data set to process.  If you have 10 files 10MB each it
should take about the same time to pack as 10000 files of 10KB each.  Of
course incrementally packing one additional 10MB file might take more
than a second although it is only one file.

> BTW, git repack has been going for 30 minutes and has packed 4/36
> objects.  :-)

Pathetic.

> I think the right answer would be for git to avoid trying to pack files
> like this.  Junio mentioned something like this in his message.

Yes.  First we could add an additional parameter to the repacking
strategy, which is the undeltified but deflated size of an object.  That
would prevent any delta from becoming bigger than the simply deflated
version.

That leaves the delta performance issue.  I think I know what the problem
is.  I'm not sure I know what the best solution would be though.  The
pathological data set is easy to identify quickly, and one strategy might
simply be to bail out early when it happens and therefore not attempt any
delta.

However, if you could let me play with two samples of your big file I'd
be grateful.  If so I'd like to make git work well with your data set
too, which is not that uncommon after all.

Nicolas

^ permalink raw reply [flat|nested] 35+ messages in thread
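A self-contained sketch of that first idea (never accept a delta larger than what plain deflate already achieves for the target) might look like the following. In git the deflated size is already available for stored objects, so it would not need to be recomputed with zlib as done here; the size / 2 - 20 starting point is the existing pack-objects heuristic, and the rest is only an illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

static unsigned long deflated_size(const unsigned char *buf, unsigned long size)
{
	uLongf out_len = compressBound(size);
	Bytef *tmp = malloc(out_len);

	if (!tmp || compress(tmp, &out_len, buf, size) != Z_OK)
		out_len = size;	/* be conservative on failure */
	free(tmp);
	return out_len;
}

int main(void)
{
	unsigned char target[1 << 16];
	unsigned long size = sizeof(target);
	unsigned long max_size, z_size;

	memset(target, 0, size);	/* highly compressible stand-in data */

	max_size = size / 2 - 20;	/* current pack-objects heuristic */
	z_size = deflated_size(target, size);
	if (max_size > z_size)
		max_size = z_size;	/* a delta must beat plain deflate */

	printf("size=%lu deflate=%lu delta cap=%lu\n", size, z_size, max_size);
	return 0;
}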
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-24 20:02 ` Nicolas Pitre
@ 2006-02-24 20:40 ` Carl Baldwin
  2006-02-24 21:12 ` Nicolas Pitre
  0 siblings, 1 reply; 35+ messages in thread
From: Carl Baldwin @ 2006-02-24 20:40 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git

On Fri, Feb 24, 2006 at 03:02:07PM -0500, Nicolas Pitre wrote:
> Well that is probably a bit tight.  Ideally it should be linear with the
> size of the data set to process.  If you have 10 files 10MB each it
> should take about the same time to pack as 10000 files of 10KB each.
> Of course incrementally packing one additional 10MB file might take more
> than a second although it is only one file.

Well, I might not have been fair here.  I tried an experiment where I
packed each of the twelve large blob objects explicitly one-by-one using
git-pack-objects.  Incrementally packing each single object was very
fast.  Well under a second per object on my machine.

After the twelve large objects were packed into individual packs the
rest of the packing went very quickly and git v1.2.3's data reuse worked
very well.  This was sort of my attempt at simulating how things would
be if git avoided deltification of each of these large files.  I'm sorry
to have been so harsh earlier; I just didn't understand that
incrementally packing one-by-one was going to help this much.

This gives me hope that if somehow git were to not attempt to deltify
these objects then performance would be much better than acceptable.

[snip]
> However, if you could let me play with two samples of your big file I'd
> be grateful.  If so I'd like to make git work well with your data set
> too, which is not that uncommon after all.

I would be happy to do this.  I will probably need to scrub a bit and
make sure that the result shows the same characteristics.  How would you
like me to deliver these files to you?  They are about 25 MB deflated.

Carl

-- 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 Carl Baldwin                         RADCAD (R&D CAD)
 Hewlett Packard Company
 MS 88                                work: 970 898-1523
 3404 E. Harmony Rd.                  work: Carl.N.Baldwin@hp.com
 Fort Collins, CO 80525               home: Carl@ecBaldwin.net
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 20:40 ` Carl Baldwin @ 2006-02-24 21:12 ` Nicolas Pitre 2006-02-24 22:50 ` Carl Baldwin 0 siblings, 1 reply; 35+ messages in thread From: Nicolas Pitre @ 2006-02-24 21:12 UTC (permalink / raw) To: Carl Baldwin; +Cc: Junio C Hamano, git On Fri, 24 Feb 2006, Carl Baldwin wrote: > On Fri, Feb 24, 2006 at 03:02:07PM -0500, Nicolas Pitre wrote: > > Well that is probably a bit tight. Ideally it should be linear with the > > size of the data set to process. If you have 10 files 10MB each it > > should take about the same time to pack than 10000 files of 10KB each. > > Of course incrementally packing one additional 10MB file might take more > > than a second although it is only one file. > > Well, I might not have been fair here. I tried an experiment where I > packed each of the twelve large blob objects explicitly one-by-one using > git-pack-objects. Incrementally packing each single object was very > fast. Well under a second per object on my machine. > > After the twelve large objects were packed into individual packs the > rest of the packing went very quickly and git v1.2.3's date reuse worked > very well. This was sort of my attempt at simulating how things would > be if git avoided deltification of each of these large files. I'm sorry > to have been so harsh earlier I just didn't understand that > incrementally packing one-by-one was going to help this much. Hmmmmmmm.... I don't think I understand what is going on here. You say that, if you add those big files and incrementally repack after each commit using git repack with no option, then it requires only about one second each time. Right? But if you use "git-repack -a -f" then it is gone for more than an hour? I'd expect something like 2 * (sum i for i = 1 to 10) i.e. in the 110 second range due to the combinatorial effect when repacking everything. This is far from one hour and something appears to be really really wrong. How many files besides those 12 big blobs do you have? > This gives me hope that if somehow git were to not attempt to deltify > these objects then performance would be much better than acceptible. > > [snip] > > However, if you could let me play with two samples of your big file I'd > > be grateful. If so I'd like to make git work well with your data set > > too which is not that uncommon after all. > > I would be happy to do this. I will probably need to scrub a bit and > make sure that the result shows the same characteristics. How would you > like me to deliver these files to you? They are about 25 MB deflated. If you can add them into a single .tgz with instructions on how to reproduce the issue and provide me with an URL where I can fetch it that'd be perfect. Nicolas ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-24 21:12 ` Nicolas Pitre
@ 2006-02-24 22:50 ` Carl Baldwin
  2006-02-25 3:53 ` Nicolas Pitre
  0 siblings, 1 reply; 35+ messages in thread
From: Carl Baldwin @ 2006-02-24 22:50 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git

On Fri, Feb 24, 2006 at 04:12:14PM -0500, Nicolas Pitre wrote:
> On Fri, 24 Feb 2006, Carl Baldwin wrote:
> > After the twelve large objects were packed into individual packs the
> > rest of the packing went very quickly and git v1.2.3's data reuse worked
> > very well.  This was sort of my attempt at simulating how things would
> > be if git avoided deltification of each of these large files.  I'm sorry
> > to have been so harsh earlier; I just didn't understand that
> > incrementally packing one-by-one was going to help this much.
>
> Hmmmmmmm....
>
> I don't think I understand what is going on here.
>
> You say that, if you add those big files and incrementally repack after
> each commit using git repack with no option, then it requires only about
> one second each time.  Right?

Well, actually I was packing them individually by calling
git-pack-objects directly on each blob.  I'll try doing it exactly as you
describe...

Ok, I tried it.  Basically I do the following.

  % mkdir test
  % cd test
  % git init-db
  % cp ../files/binfile.1 binfile
  % time git add binfile

  real    0m2.459s
  user    0m2.443s
  sys     0m0.019s

  % git commit -a -m "Rev 1"
  % time git repack
  [snip]
  real    0m1.111s
  user    0m1.046s
  sys     0m0.061s

  % for i in $(seq 2 12); do
        cp ../files/binfile.$i binfile
        time git commit -a -m "Rev $i"
        time git repack
    done

Each commit takes around 2.8-3.5 seconds and each repack takes about
1.2-1.5 seconds.  These are pretty reasonable.

Now, I try 'git repack -a -f' (or even without -f) and it goes out to
lunch.  I think it would take on the order of a day to actually finish
because it wasn't very far after an hour.

[snip]
> How many files besides those 12 big blobs do you have?

This repository has been completely stripped down to the 12 revisions of
the one file.  So, there are 36 objects.  12 blobs.  12 trees.  12
commits.  That is all.

[snip]
> If you can add them into a single .tgz with instructions on how
> to reproduce the issue and provide me with an URL where I can fetch it
> that'd be perfect.

I will do this in an email off of the list because these files really
shouldn't be available on a public list.

Carl

-- 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 Carl Baldwin                         RADCAD (R&D CAD)
 Hewlett Packard Company
 MS 88                                work: 970 898-1523
 3404 E. Harmony Rd.                  work: Carl.N.Baldwin@hp.com
 Fort Collins, CO 80525               home: Carl@ecBaldwin.net
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-24 22:50 ` Carl Baldwin
@ 2006-02-25 3:53 ` Nicolas Pitre
  0 siblings, 0 replies; 35+ messages in thread
From: Nicolas Pitre @ 2006-02-25 3:53 UTC (permalink / raw)
To: Carl Baldwin; +Cc: Junio C Hamano, git

On Fri, 24 Feb 2006, Carl Baldwin wrote:

> > If you can add them into a single .tgz with instructions on how
> > to reproduce the issue and provide me with an URL where I can fetch it
> > that'd be perfect.
>
> I will do this in an email off of the list because these files really
> shouldn't be available on a public list.

OK I have the files, and I can confirm that your problem is of the same
combinatorial explosion type I already talked about, _even_ with the
version using adler32.

This is really O(m*n), where m and n are the sizes of two consecutive
versions of the file.  But since m and n are both _huge_, the delta code
really goes out to lunch.

I'm working on a patch to cap this to something like O(m+n).  Stay tuned.

Nicolas

^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 19:23 ` Carl Baldwin 2006-02-24 20:02 ` Nicolas Pitre @ 2006-02-24 20:02 ` Linus Torvalds 2006-02-24 20:19 ` Nicolas Pitre 2006-02-24 20:53 ` Junio C Hamano 1 sibling, 2 replies; 35+ messages in thread From: Linus Torvalds @ 2006-02-24 20:02 UTC (permalink / raw) To: Carl Baldwin; +Cc: Nicolas Pitre, Junio C Hamano, git On Fri, 24 Feb 2006, Carl Baldwin wrote: > > I meant that there is no benefit in disk space usage. Packing may > actually increase my disk space usage in this case. Refer to what I > said about experimentally running gzip and xdelta on the files > independantly of git. Yes. The deltas tend to compress a lot less well than "normal" files. > I see what you're saying about this data reuse helping to speed up > subsequent cloning operations. However, if packing takes this long and > doesn't give me any disk space savings then I don't want to pay the very > heavy price of packing these files even the first time nor do I want to > pay the price incrementally. I would look at tuning the heuristics in "try_delta()" (pack-objects.c) a bit. That's the place that decides whether to even bother trying to make a delta, and how big a delta is acceptable. For example, looking at them, I already see one bug: .. sizediff = oldsize > size ? oldsize - size : size - oldsize; if (sizediff > size / 8) return -1; .. we really should compare sizediff to the _smaller_ of the two sizes, and skip the delta if the difference in sizes is bound to be bigger than that. However, the "size / 8" thing isn't a very strict limit anyway, so this probably doesn't matter (and I think Nico already removed it as part of his patches: the heuristic can make us avoid some deltas that would be ok). The other thing to look at is "max_size": right now it initializes that to "size / 2 - 20", which just says that we don't ever want a delta that is larger than about half the result (plus the 20 byte overhead for pointing to the thing we delta against). Again, if you feel that normal compression compresses better than half, you could try changing that to .. max_size = size / 4 - 20; .. or something like that instead (but then you need to check that it's still positive - otherwise the comparisons with unsigned later on are screwed up. Right now that value is guaranteed to be positive if only because we already checked .. if (size < 50) return -1; .. earlier). NOTE! Every SINGLE one of those heuristics are just totally made up by yours truly, and have no testing behind them. They're more of the type "that sounds about right" than "this is how it must be". As mentioned, Nico has already been playing with the heuristics - but he wanted better packs, not better CPU usage, so he went the other way from what you would want to try.. Linus ^ permalink raw reply [flat|nested] 35+ messages in thread
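The two tweaks described above, comparing the size difference against the smaller of the two sizes and guarding max_size against going non-positive, could be sketched as a stand-alone helper like this. The cut-off constants are the ones quoted from try_delta(); the function itself is only an illustration, not a tested patch.

#include <stdio.h>

/* Return the maximum acceptable delta size for a (source, target) pair,
 * or 0 if a delta attempt does not look worthwhile at all. */
static unsigned long delta_size_cap(unsigned long oldsize, unsigned long size)
{
	unsigned long smaller = oldsize < size ? oldsize : size;
	unsigned long sizediff = oldsize > size ? oldsize - size : size - oldsize;
	unsigned long max_size;

	if (size < 50)			/* too small to be worth a delta */
		return 0;
	if (sizediff > smaller / 8)	/* compare against the _smaller_ size */
		return 0;
	max_size = size / 2 - 20;	/* or size / 4 - 20 for stricter packing */
	if ((long)max_size <= 0)	/* needed if the formula is tightened */
		return 0;
	return max_size;
}

int main(void)
{
	/* the slow blob pair quoted earlier in the thread */
	printf("cap: %lu\n", delta_size_cap(491102, 496045));
	/* wildly different sizes: not worth trying */
	printf("cap: %lu\n", delta_size_cap(1000, 100));
	return 0;
}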
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-24 20:02 ` Linus Torvalds
@ 2006-02-24 20:19 ` Nicolas Pitre
  0 siblings, 0 replies; 35+ messages in thread
From: Nicolas Pitre @ 2006-02-24 20:19 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Carl Baldwin, Junio C Hamano, git

On Fri, 24 Feb 2006, Linus Torvalds wrote:

> The other thing to look at is "max_size": right now it initializes that to
> "size / 2 - 20", which just says that we don't ever want a delta that is
> larger than about half the result (plus the 20 byte overhead for pointing
> to the thing we delta against).  Again, if you feel that normal compression
> compresses better than half, you could try changing that to
>
> ..
> 	max_size = size / 4 - 20;
> ..

Like I mentioned, max_size should also be capped by the deflated
undeltified object size.  This value is easy to get since plain objects
are already deflated.

> NOTE! Every SINGLE one of those heuristics are just totally made up by
> yours truly, and have no testing behind them.  They're more of the type
> "that sounds about right" than "this is how it must be".  As mentioned,
> Nico has already been playing with the heuristics - but he wanted better
> packs, not better CPU usage, so he went the other way from what you would
> want to try..

Actually, it's a good balance I'm after.  Using 30% more CPU for 10%
smaller packs is OK I'd say.  Using 100 times the CPU for 50% saving on
only one particular delta is not acceptable.  And using more than one
hour for 200MB of data with the current window default is not acceptable
either.

Nicolas

^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 20:02 ` Linus Torvalds 2006-02-24 20:19 ` Nicolas Pitre @ 2006-02-24 20:53 ` Junio C Hamano 2006-02-24 21:39 ` Nicolas Pitre 1 sibling, 1 reply; 35+ messages in thread From: Junio C Hamano @ 2006-02-24 20:53 UTC (permalink / raw) To: Nicolas Pitre; +Cc: git, Linus Torvalds, Carl Baldwin Linus Torvalds <torvalds@osdl.org> writes: > NOTE! Every SINGLE one of those heuristics are just totally made up by > yours truly, and have no testing behind them. They're more of the type > "that sounds about right" than "this is how it must be". As mentioned, > Nico has already been playing with the heuristics - but he wanted better > packs, not better CPU usage, so he went the other way from what you would > want to try.. I haven't looked at Nico's original or updated code closely at all, but two things come to mind. (1) if we could tell the particular data is intrinsically diff_delta unfriendly and diff_delta would waste too much time when tried to delta against almost _any_ other blob, then it might help to give an interface in diff-delta.c for the caller to check for such a blob without even trying diff_delta. (2) otherwise, if diff_delta could detect it would spend too many cycles to finish its work for a particular input early on, we might want it to bail out instead of trying a complete job. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 20:53 ` Junio C Hamano @ 2006-02-24 21:39 ` Nicolas Pitre 2006-02-24 21:48 ` Nicolas Pitre 2006-02-25 0:45 ` Linus Torvalds 0 siblings, 2 replies; 35+ messages in thread From: Nicolas Pitre @ 2006-02-24 21:39 UTC (permalink / raw) To: Junio C Hamano; +Cc: git, Linus Torvalds, Carl Baldwin On Fri, 24 Feb 2006, Junio C Hamano wrote: > I haven't looked at Nico's original or updated code closely at > all, but two things come to mind. > > (1) if we could tell the particular data is intrinsically > diff_delta unfriendly and diff_delta would waste too much > time when tried to delta against almost _any_ other blob, > then it might help to give an interface in diff-delta.c for > the caller to check for such a blob without even trying > diff_delta. > > (2) otherwise, if diff_delta could detect it would spend too > many cycles to finish its work for a particular input early > on, we might want it to bail out instead of trying a > complete job. I have a patch that implements an hybrid approach. Currently, diff-delta takes blocks of data in the reference file and hash them. When the target file is scanned, it uses the hash to match blocks from the target file with the reference file. If blocks are hashed evenly the cost of producing a delta is at most O(n+m) where n and m are the size of the reference and target files respectively. In other words, with good data set the cost is linear. But if many blocks from the reference buffer do hash to the same bucket then for each block in the target file many blocks from the reference buffer have to be tested against, making it tend towards O(n^m) which is pretty highly exponential. The solution I'm investigating is to put a limit on the number of entries in the same hash bucket so to bring the cost back to something more linear. That means the delta might miss on better matches that have not been hashed but still benefit from a limited set. Experience seems to show that the time to deltify the first two blobs you found to be problematic can be reduced by 2 orders of magnitude with about only 10% increase in the resulting delta size, and still nearly 40% smaller than what the current delta code produces. The question is how to determine the best limit on the number of entries in the same hash bucket. Nicolas ^ permalink raw reply [flat|nested] 35+ messages in thread
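One way to realize that limit, sketched here as a pass over the finished index: walk each bucket and cut the chain after a fixed number of entries. HASH_LIMIT and the exact struct layout are assumptions for illustration, and picking the limit well is precisely the open question above.

#include <stddef.h>

#define HASH_LIMIT 64	/* made-up value; the right number is the open question */

struct index {
	const unsigned char *ptr;
	struct index *next;
};

/* Cap every hash bucket at HASH_LIMIT entries so a block from the target
 * file never has to be compared against an unbounded number of reference
 * positions.  The entries all live in one pool allocation, so truncating
 * the chains leaks nothing. */
static void trim_hash_buckets(struct index **hash, unsigned long hsize)
{
	unsigned long i;

	for (i = 0; i < hsize; i++) {
		struct index *entry = hash[i];
		unsigned int count = 0;

		while (entry) {
			if (++count >= HASH_LIMIT) {
				entry->next = NULL;	/* drop the rest of the chain */
				break;
			}
			entry = entry->next;
		}
	}
}

With the populate loop shown earlier in the thread, the head of each bucket holds the entries closest to the start of the reference buffer, so this keeps those; a smarter version might instead keep a sample spread across the whole buffer.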
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 21:39 ` Nicolas Pitre @ 2006-02-24 21:48 ` Nicolas Pitre 2006-02-25 0:45 ` Linus Torvalds 1 sibling, 0 replies; 35+ messages in thread From: Nicolas Pitre @ 2006-02-24 21:48 UTC (permalink / raw) To: Junio C Hamano; +Cc: git, Linus Torvalds, Carl Baldwin On Fri, 24 Feb 2006, Nicolas Pitre wrote: > If blocks are hashed evenly the cost of producing a delta is at most > O(n+m) where n and m are the size of the reference and target files > respectively. In other words, with good data set the cost is linear. > > But if many blocks from the reference buffer do hash to the same bucket > then for each block in the target file many blocks from the reference > buffer have to be tested against, making it tend towards O(n^m) which is > pretty highly exponential. Well, actually this is rather O(n*m) not O(n^m), but bad nevertheless. Nicolas ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 21:39 ` Nicolas Pitre 2006-02-24 21:48 ` Nicolas Pitre @ 2006-02-25 0:45 ` Linus Torvalds 2006-02-25 3:07 ` Nicolas Pitre 1 sibling, 1 reply; 35+ messages in thread From: Linus Torvalds @ 2006-02-25 0:45 UTC (permalink / raw) To: Nicolas Pitre; +Cc: Junio C Hamano, git, Carl Baldwin On Fri, 24 Feb 2006, Nicolas Pitre wrote: > > Currently, diff-delta takes blocks of data in the reference file and > hash them. When the target file is scanned, it uses the hash to match > blocks from the target file with the reference file. > > If blocks are hashed evenly the cost of producing a delta is at most > O(n+m) where n and m are the size of the reference and target files > respectively. In other words, with good data set the cost is linear. Assuming the hash is good, of course. I think this was the problem with you trying something simpler than adler32.. > But if many blocks from the reference buffer do hash to the same bucket > then for each block in the target file many blocks from the reference > buffer have to be tested against, making it tend towards O(n^m) which is > pretty highly exponential. > > The solution I'm investigating is to put a limit on the number of > entries in the same hash bucket so to bring the cost back to something > more linear. That means the delta might miss on better matches that > have not been hashed but still benefit from a limited set. Sounds fair enough. However, you migt also want to consider another approach.. One of the biggest costs for the xdelta algorithm is probably just the "delta_prepare()", but at the same time, that is constant wrt the source buffer. Now, the sad part is that when I wrote pack-objects, I didn't really understand the diff-delta algorithm, I just plugged it in. Which means that when I did it, I made the (obvious and simple) decision to keep the _result_ that we are looking at constant, and try to delta against different sources. HOWEVER. I suspect you already see where this is going.. We _could_ switch the "pack-objects" window handling around, and instead of looking at the object we want to pack, and looking at the ten (or "window") previous objects to delta against, we could do it the other way around: keep the object we delta against constant, and see what deltas we could prepare for the ten next objects. And since the source would now be constant, you'd need to do the "delta_prepare()" just _once_ per window, instead of every single time. Now, I haven't done any profiling on the diff-delta code, and maybe my guess that delta_prepare() is a pretty expensive part is wrong, and maybe it wouldn't help to switch the window probing around. But I thought I'd mention it as one thing to explore.. Linus ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-25 0:45 ` Linus Torvalds @ 2006-02-25 3:07 ` Nicolas Pitre 2006-02-25 4:05 ` Linus Torvalds 0 siblings, 1 reply; 35+ messages in thread From: Nicolas Pitre @ 2006-02-25 3:07 UTC (permalink / raw) To: Linus Torvalds; +Cc: Junio C Hamano, git, Carl Baldwin On Fri, 24 Feb 2006, Linus Torvalds wrote: > > > On Fri, 24 Feb 2006, Nicolas Pitre wrote: > > > > Currently, diff-delta takes blocks of data in the reference file and > > hash them. When the target file is scanned, it uses the hash to match > > blocks from the target file with the reference file. > > > > If blocks are hashed evenly the cost of producing a delta is at most > > O(n+m) where n and m are the size of the reference and target files > > respectively. In other words, with good data set the cost is linear. > > Assuming the hash is good, of course. > > I think this was the problem with you trying something simpler than > adler32.. Well, that's the compromize to make. By default the version with adler32 used 16 byte blocks to index the reference buffer. That means you can match target data against the reference only if whole 16 byte blocks match. Then, if you fix a typo in the target buffer then you'll inevitably need 16 literal bytes in the delta instead of only one because you won't be able to resynchronize with the reference buffer until the next 16 byte block. What I've made in my last delta patch is to reduce that 16 byte block to only 3 bytes. Why 3 bytes? Because less than that produces smaller delta data if done with literal bytes directly, and 3 bytes provided enough bits to hash. I also made those 3 byte blocks overlap so indexing would start at any offset with byte precision. This really allows for optimal deltas so that they cannot be smaller. Now the problem comes when indexing a reference file full of: 0x46f8, 0x000b, 0x42fe, 0x0000, 0xffc0, 0x0001, 0xff00, 0x0008, 0x03e0, 0x0009, 0x0f01, 0x0003, 0x8072, 0x0000, 0x0400, 0x0000, 0x0046, 0x0003, 0x9180, 0x0001, 0x0003, 0x0008, 0x02eb, 0x0003, 0x8072, 0x0000, 0x0400, 0x0000, 0x8010, 0x0008, 0x0010, 0x0000, 0x0361, 0x0003, 0x037e, 0x0004, 0x3941, 0x0002, 0x0b0f, 0x0003, 0x8072, 0x0000, 0x0400, 0x0000, 0x000a, 0x000b, 0x0346, 0x000c, 0x11fe, 0x0000, 0x3717, 0x0003, 0x8072, 0x0000, 0x0400, 0x0000, 0x8010, 0x0008, 0x000e, 0x0000, 0x0361, 0x0003, 0x8060, 0x0000, There is a bunch of ", 0x" that get hashed to the same thing. And when the second phase i.e. trying to find the best match into the reference buffer for each occurrence of the same many ", 0x" in the target buffer you get a conbinatorial explosion. The adler32 made that particular example a non issue since the likelyhood of many 16 byte blocks to be the same is pretty low in this case. But the flaw remains if for example there is lots of similar 16 byte blocks, like a binary file with lots of zeroes for example. In fact, the performance problem Carl is having does use the diff-delta version still using adler32. > > But if many blocks from the reference buffer do hash to the same bucket > > then for each block in the target file many blocks from the reference > > buffer have to be tested against, making it tend towards O(n^m) which is > > pretty highly exponential. > > > > The solution I'm investigating is to put a limit on the number of > > entries in the same hash bucket so to bring the cost back to something > > more linear. That means the delta might miss on better matches that > > have not been hashed but still benefit from a limited set. 
> > Sounds fair enough. Testing appear to show that this is a worthwhile safety valve. And in most case that safety valve should not be activated at all. > However, you migt also want to consider another approach.. > > One of the biggest costs for the xdelta algorithm is probably just the > "delta_prepare()", but at the same time, that is constant wrt the source > buffer. Actually it is not that costly. Much much less than computing the sha1 of the same buffer for example. > Now, the sad part is that when I wrote pack-objects, I didn't really > understand the diff-delta algorithm, I just plugged it in. Which means > that when I did it, I made the (obvious and simple) decision to keep the > _result_ that we are looking at constant, and try to delta against > different sources. > > HOWEVER. > > I suspect you already see where this is going.. > > We _could_ switch the "pack-objects" window handling around, and instead > of looking at the object we want to pack, and looking at the ten (or > "window") previous objects to delta against, we could do it the other way > around: keep the object we delta against constant, and see what deltas we > could prepare for the ten next objects. > > And since the source would now be constant, you'd need to do the > "delta_prepare()" just _once_ per window, instead of every single time. Might be worth trying. Actually, this can be tested without even changing the window handling just yet, since diff-delta() could return the index data instead of freeing it, and pack-objects can store it along side with the object data it tries to delta against. That wouldn't be memory efficient, but at least that would give an idea of the magnitude of the saving on CPU time. But I really doubt that'll save more than a few percent. > > Now, I haven't done any profiling on the diff-delta code, and maybe my > guess that delta_prepare() is a pretty expensive part is wrong, and maybe > it wouldn't help to switch the window probing around. But I thought I'd > mention it as one thing to explore.. Just to give you an idea, the bulk of my current "prepare" code looks like this: /* then populate the index */ data = buf + bufsize - 2; while (data > buf) { entry->ptr = --data; i = (data[0] << hshift) ^ data[1]; i ^= (i << hshift) ^ data[2]; entry->next = hash[i]; hash[i] = entry++; } As you can see it is pretty lightweight. But that would probably be a worthwhile optimization to have even if it saves 10% of CPU time. Nicolas ^ permalink raw reply [flat|nested] 35+ messages in thread
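That experiment, sketched as an interface change (all of these names are hypothetical, not an existing git API): diff-delta would be split so the index over a source buffer is built once and handed back to the caller, and pack-objects would keep it next to the object data for the lifetime of the window.

/* hypothetical split of diff-delta's entry point */
struct delta_index;	/* opaque: the hash table over 3-byte windows of the source */

struct delta_index *create_delta_index(const void *src_buf, unsigned long src_size);
void free_delta_index(struct delta_index *index);

/* try the prebuilt index against any number of target buffers */
void *create_delta(const struct delta_index *index,
                   const void *trg_buf, unsigned long trg_size,
                   unsigned long *delta_size, unsigned long max_delta_size);

With that split, each object's index would be built once instead of once per pairing in the window; whether that saves more than a few percent is exactly the question raised above.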
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-25 3:07 ` Nicolas Pitre @ 2006-02-25 4:05 ` Linus Torvalds 2006-02-25 5:10 ` Nicolas Pitre 0 siblings, 1 reply; 35+ messages in thread From: Linus Torvalds @ 2006-02-25 4:05 UTC (permalink / raw) To: Nicolas Pitre; +Cc: Junio C Hamano, git, Carl Baldwin On Fri, 24 Feb 2006, Nicolas Pitre wrote: > > Well, that's the compromize to make. By default the version with > adler32 used 16 byte blocks to index the reference buffer. That means > you can match target data against the reference only if whole 16 byte > blocks match. Then, if you fix a typo in the target buffer then you'll > inevitably need 16 literal bytes in the delta instead of only > one because you won't be able to resynchronize with the reference buffer > until the next 16 byte block. That shouldn't be true. Once you find a 16-byte match, you can search forward (in theory you should be able to search backwards too, but by that time we've already expanded the non-matching previous bytes, I think). But I'm no xdelta expert, so I migt be wrong. > What I've made in my last delta patch is to reduce that 16 byte block to > only 3 bytes. Why 3 bytes? Because less than that produces smaller > delta data if done with literal bytes directly, and 3 bytes provided > enough bits to hash. On the other hand, the cost is that your lookups _are_ going to be more expensive. Regardless of how good the hash is, basically you have 16/3 more hash-entries to look up, so you've made compression more expensive in footprint, at least (I assume you've made the hash appropriately larger). Also, at 3 bytes, insertion is at least equally dense (three bytes of data vs three bytes of offset into the source), and can be worse (the offset might be 5 bytes, no?). So it would seem like you'd be better off with 4+ bytes, at which point the delta should be a win. Have you tried some half-way point, like ~8 bytes? > Now the problem comes when indexing a reference file full of: > > 0x46f8, 0x000b, 0x42fe, 0x0000, 0xffc0, 0x0001, 0xff00, 0x0008, ... > > There is a bunch of ", 0x" that get hashed to the same thing. You'll find a lot of that in any file: three or four bytes of similarity just doesn't sound worthwhile to go digging after. > The adler32 made that particular example a non issue since the > likelyhood of many 16 byte blocks to be the same is pretty low in this > case. But the flaw remains if for example there is lots of similar 16 > byte blocks, like a binary file with lots of zeroes for example. In > fact, the performance problem Carl is having does use the diff-delta > version still using adler32. Agreed. I think limiting the hash length is a fine idea regardless, I just think it sounds dangerous with the three-byte thing where a lot of matches should be expected (never mind ", 0x", just things like newlines and tabs in source code). Only considering 16-byte sequences of similarities is a trade-off of packing cost vs win.. Not saying other block-sizes aren't worth testing, but I suspect trying too hard is going to be too costly. Especially as deltas compress _worse_ the smaller they are. Bigger "insert" chunks probably compress a lot better than a copy chunk. Have you looked at the delta size vs compression? Linus ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data
  2006-02-25 4:05 ` Linus Torvalds
@ 2006-02-25 5:10 ` Nicolas Pitre
  2006-02-25 5:35 ` Nicolas Pitre
  2006-02-25 19:18 ` [PATCH] diff-delta: produce optimal pack data Linus Torvalds
  0 siblings, 2 replies; 35+ messages in thread
From: Nicolas Pitre @ 2006-02-25 5:10 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Junio C Hamano, git, Carl Baldwin

On Fri, 24 Feb 2006, Linus Torvalds wrote:

> On Fri, 24 Feb 2006, Nicolas Pitre wrote:
> >
> > Well, that's the compromize to make.  By default the version with
> > adler32 used 16 byte blocks to index the reference buffer.  That means
> > you can match target data against the reference only if whole 16 byte
> > blocks match.  Then, if you fix a typo in the target buffer then you'll
> > inevitably need 16 literal bytes in the delta instead of only
> > one because you won't be able to resynchronize with the reference buffer
> > until the next 16 byte block.
>
> That shouldn't be true.  Once you find a 16-byte match, you can search
> forward (in theory you should be able to search backwards too, but by that
> time we've already expanded the non-matching previous bytes, I think).

Obviously, but that's not my point.  What I mean is that small changes
will kind of sabotage the match, since you'll have to scan forward
(adding literal bytes to the delta output in the meantime) until you find
another 16 byte block that matches.  This is harder to find than a 3 byte
block.  Hence you'll end up adding more literal bytes (up to 16 of them)
until you can match the reference buffer again, even if you only flipped
one byte in the target buffer.  In some cases that makes for deltas that
are twice as large.

> > What I've made in my last delta patch is to reduce that 16 byte block to
> > only 3 bytes.  Why 3 bytes?  Because less than that produces smaller
> > delta data if done with literal bytes directly, and 3 bytes provided
> > enough bits to hash.
>
> On the other hand, the cost is that your lookups _are_ going to be more
> expensive.  Regardless of how good the hash is, basically you have 16/3
> more hash-entries to look up, so you've made compression more expensive in
> footprint, at least (I assume you've made the hash appropriately larger).

Yes, the hash is larger.  There is a cost in memory usage but not really
in CPU cycles.

> Also, at 3 bytes, insertion is at least equally dense (three bytes of data
> vs three bytes of offset into the source), and can be worse (the offset
> might be 5 bytes, no?).  So it would seem like you'd be better off with 4+
> bytes, at which point the delta should be a win.

The code already discriminates between the space needed for a block copy
notation (given the offset and size) and the space needed for the
equivalent literal bytes, so the optimal encoding is always chosen
already.  In fact, if you want to copy up to 15 bytes from offset 0 that
will be encoded with only 2 bytes in the delta.  The only case that is
suboptimal is when you want to copy only two bytes from offset 0 (2 delta
bytes), but a two byte block is never matched by the hash lookup since
the hash is computed over 3 bytes.  In that case 2 literal bytes will be
added to the delta plus the opcode = 3 bytes.  I considered that special
case not worth it.  However, copying a block of 3 bytes that gets encoded
into 3 bytes of delta is quite common (that'd take 4 bytes of delta if
they were literals).

As for using more bytes for block hashing, that increases the number of
cycles needed to compute the hash.
The adler32 version reads 16 bytes for every byte offset in the target file while my latest version only reads 3 bytes for every byte offset. So in effect my target hash computation is faster than the adler32 one. However there are potentially more entries in the same hash bucket to validate, especially with repetitive data. > Have you tried some half-way point, like ~8 bytes? Yes, and while the needed cycles tend to remain the same on average, the resulting pack gets larger. > > Now the problem comes when indexing a reference file full of: > > > > 0x46f8, 0x000b, 0x42fe, 0x0000, 0xffc0, 0x0001, 0xff00, 0x0008, > ... > > > > There is a bunch of ", 0x" that get hashed to the same thing. > > You'll find a lot of that in any file: three or four bytes of similarity > just doesn't sound worthwhile to go digging after. Well, after having experimented a lot with multiple parameters, I think they are worth it after all. Not only do they provide for optimal deltas, but their hash is faster to compute than larger blocks, which seems to counterbalance the cost of the larger hash list. > > The adler32 made that particular example a non issue since the > > likelihood of many 16 byte blocks to be the same is pretty low in this > > case. But the flaw remains if for example there are lots of similar 16 > > byte blocks, like a binary file with lots of zeroes for example. In > > fact, the performance problem Carl is having does use the diff-delta > > version still using adler32. > > Agreed. I think limiting the hash length is a fine idea regardless, I just > think it sounds dangerous with the three-byte thing where a lot of matches > should be expected (never mind ", 0x", just things like newlines and tabs > in source code). They are usually fewer than the number of lines. Yet if you have a 1000 line source file and we keep, say, only 50 hashed "\n\t\t" entries, this is sufficient to provide enough opportunities for matching that plus common patterns like a following "if (" for example. > Only considering 16-byte sequences of similarities is a trade-off of > packing cost vs win.. Not saying other block-sizes aren't worth testing, > but I suspect trying too hard is going to be too costly. I of course looked at the time to pack vs the size reduction in my tests. And really, like I said above, the cost is well balanced. The only issue is that smaller blocks are more likely to run into pathological data sets. But that problem does exist with larger blocks too, to a lesser degree of course but still. For example, using a 16 block size with adler32, computing a delta between two files > Especially as deltas compress _worse_ the smaller they are. Bigger > "insert" chunks probably compress a lot better than a copy chunk. Yes, but given that we favor deltas from larger to smaller already, those inserts are not making much difference. They have to be quite large to effectively provide better zlib compression. > Have you looked at the delta size vs compression? That's certainly an additional test worth adding to try_delta(). max_size should be smaller than the original object deflated size, making sure we won't store deltas that might end up larger than the undeltified object. Nicolas ^ permalink raw reply [flat|nested] 35+ messages in thread
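To make the byte accounting in the message above concrete, here is a sketch of the two delta opcodes being discussed: an insert is one opcode byte followed by the literal bytes, while a copy is one opcode byte followed only by the non-zero offset and size bytes, which is why a 15 byte copy from offset 0 costs just 2 bytes. This is an illustrative encoder written for this note in the spirit of the pack delta format, not the code from diff-delta.c.

#include <stddef.h>

/* Insert 'n' literal bytes (1 <= n <= 127): 1 opcode byte + n literals. */
static size_t emit_insert(unsigned char *out, const unsigned char *lit, size_t n)
{
	size_t i;

	out[0] = (unsigned char)n;	/* high bit clear: literal insert */
	for (i = 0; i < n; i++)
		out[1 + i] = lit[i];
	return 1 + n;
}

/* Copy 'size' bytes from 'offset' in the reference: only non-zero
 * offset/size bytes are emitted, and the opcode records which follow. */
static size_t emit_copy(unsigned char *out, unsigned long offset, unsigned long size)
{
	unsigned char cmd = 0x80;	/* high bit set: copy from reference */
	size_t n = 1;
	int i;

	for (i = 0; i < 4; i++, offset >>= 8)
		if (offset & 0xff) {
			out[n++] = offset & 0xff;
			cmd |= 1 << i;
		}
	for (i = 0; i < 3; i++, size >>= 8)
		if (size & 0xff) {
			out[n++] = size & 0xff;
			cmd |= 0x10 << i;
		}
	out[0] = cmd;
	return n;	/* e.g. a 15-byte copy from offset 0 takes 2 bytes */
}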
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-25 5:10 ` Nicolas Pitre @ 2006-02-25 5:35 ` Nicolas Pitre 2006-03-07 23:48 ` [RFH] zlib gurus out there? Junio C Hamano 2006-02-25 19:18 ` [PATCH] diff-delta: produce optimal pack data Linus Torvalds 1 sibling, 1 reply; 35+ messages in thread From: Nicolas Pitre @ 2006-02-25 5:35 UTC (permalink / raw) To: Linus Torvalds; +Cc: Junio C Hamano, git, Carl Baldwin Oops.... Forgot to complete one paragraph. On Sat, 25 Feb 2006, Nicolas Pitre wrote: > I of course looked at the time to pack vs the size reduction in my > tests. And really, like I said above, the cost is well balanced. The > only issue is that smaller blocks are more likely to run into > pathological data sets. But that problem does exist with larger blocks > too, to a lesser degree of course but still. For example, using a 16 > block size with adler32, computing a delta between two files ... as provided by Carl takes up to _nine_ minutes for a _single_ delta! So regardless of the block size used, the issue right now has more to do with that combinatorial explosion than the actual block size. And preventing that pathological case from expanding out of bounds is pretty easy to do. OK, I just tested a tentative patch to trap that case, and the time to delta those two 20MB files dropped from over 9 minutes to only 36 seconds here, with less than 10% difference in delta size. So I think I might be on the right track. Further tuning might help even more. Nicolas ^ permalink raw reply [flat|nested] 35+ messages in thread
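The tentative patch mentioned above is not included in this message; the sketch below only illustrates the general idea of trapping the pathological case by putting an upper bound on how much of a single hash bucket gets examined. The structure, the constant and the helper are hypothetical.

#include <stddef.h>

#define MAX_CHAIN 64	/* hypothetical cut-off */

struct index_entry {
	const unsigned char *ptr;
	struct index_entry *next;
};

/* Walk one hash bucket, but refuse to examine more than MAX_CHAIN
 * entries so that highly repetitive input cannot blow up the runtime. */
static const unsigned char *pick_candidate(struct index_entry *bucket)
{
	const unsigned char *best = NULL;
	int examined = 0;
	struct index_entry *e;

	for (e = bucket; e; e = e->next) {
		if (++examined > MAX_CHAIN)
			break;		/* give up on over-popular buckets */
		/* real code would compare e->ptr against the target here
		   and remember the longest match; we just keep the last one */
		best = e->ptr;
	}
	return best;
}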
* [RFH] zlib gurus out there? 2006-02-25 5:35 ` Nicolas Pitre @ 2006-03-07 23:48 ` Junio C Hamano 2006-03-08 0:59 ` Linus Torvalds 2006-03-08 10:45 ` [PATCH] write_sha1_file(): Perform Z_FULL_FLUSH between header and data Sergey Vlasov 0 siblings, 2 replies; 35+ messages in thread From: Junio C Hamano @ 2006-03-07 23:48 UTC (permalink / raw) To: git; +Cc: Nicolas Pitre, Linus Torvalds

I've been staring at reusing existing data while packing, and this occurred to me... During packing, suppose that we chose to store an object in base form, undeltified. And also suppose we have that object loose in the .git/objects/??/ directory. We already have it in deflated form, but with its own header. I started wondering if we can somehow reuse this. A short object format brush-up lesson is in order here.

* An undeltified object in a pack is represented like this:
  (1) the header is dense variable-size binary data that encodes the type and the inflated length;
  (2) the deflated data immediately follows the header.

* On the other hand, a loose object is represented like this:
  (1) the header looks like sprintf("%s %lu%c", type, len, 0);
  (2) concatenate the data to the header;
  (3) the SHA1 checksum of the above becomes the object name;
  (4) deflate the header and data using the same z_stream, in two steps, like this (sha1_file.c::write_sha1_file):

	/* Compress it */
	stream.next_out = compressed;
	stream.avail_out = size;

	/* First header.. */
	stream.next_in = hdr;
	stream.avail_in = hdrlen;
	while (deflate(&stream, 0) == Z_OK)
		/* nothing */;

	/* Then the data itself.. */
	stream.next_in = buf;
	stream.avail_in = len;
	while (deflate(&stream, Z_FINISH) == Z_OK)
		/* nothing */;
	deflateEnd(&stream);

	size = stream.total_out;

So I thought... if we cause a full flush after the header part, I can find the flush boundaries from a loose object file and copy the rest into a packfile I am generating, after placing the binary encoded header. If this works, we do not have to inflate the loose object to read it and then deflate it again to store it in the pack. We will get better packing as well, since we deflate loose objects with Z_BEST_COMPRESSION, while packs are done with Z_DEFAULT_COMPRESSION. While pack-objects reads from a loose object, if we can detect that there is no full flush after the header, we would do the traditional inflate-deflate cycle, so this would be backward compatible.

However, I am stuck with the first step, which is to do a full flush after the header. An obvious change to the code quoted above writes out a corrupt object:

	/* First header.. */
	stream.next_in = hdr;
	stream.avail_in = hdrlen;
-	while (deflate(&stream, 0) == Z_OK)
+	while (deflate(&stream, Z_FULL_FLUSH) == Z_OK)
		/* nothing */;

git-fsck-objects complains that the sha1 does not match. It appears that sha1_file.c::unpack_sha1_rest() somehow barfs upon seeing the full flush, but I haven't dug into it yet. Would anybody with more experience with zlib want to help? ^ permalink raw reply [flat|nested] 35+ messages in thread
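As a companion to point (1) of the pack representation above, the "dense variable size binary" header packs the type and the inflated length roughly as sketched below: the type and the low 4 bits of the size go into the first byte, and each continuation byte (high bit set) carries 7 more size bits. This is written from memory as an illustration rather than quoted from the git source.

/* Sketch of the pack entry header encoding: returns the number of
 * header bytes written into 'hdr'.  Illustrative only. */
static int encode_pack_header(unsigned char *hdr, unsigned type, unsigned long size)
{
	int n = 0;
	unsigned char c = (type << 4) | (size & 0x0f);

	size >>= 4;
	while (size) {
		hdr[n++] = c | 0x80;	/* more size bytes follow */
		c = size & 0x7f;
		size >>= 7;
	}
	hdr[n++] = c;
	return n;
}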
* Re: [RFH] zlib gurus out there? 2006-03-07 23:48 ` [RFH] zlib gurus out there? Junio C Hamano @ 2006-03-08 0:59 ` Linus Torvalds 2006-03-08 1:22 ` Junio C Hamano 2006-03-08 10:45 ` [PATCH] write_sha1_file(): Perform Z_FULL_FLUSH between header and data Sergey Vlasov 1 sibling, 1 reply; 35+ messages in thread From: Linus Torvalds @ 2006-03-08 0:59 UTC (permalink / raw) To: Junio C Hamano; +Cc: git, Nicolas Pitre On Tue, 7 Mar 2006, Junio C Hamano wrote: > > However, I am stuck with the first step, which is to do a full > flush after the header. An obvious change to the code quoted > above writes out a corrupt object: > > /* First header.. */ > stream.next_in = hdr; > stream.avail_in = hdrlen; > - while (deflate(&stream, 0) == Z_OK) > + while (deflate(&stream, Z_FULL_FLUSH) == Z_OK) > /* nothing */; No, I don't think that's good. You're only doing a partial deflate, you can't ask for a Z_FULL_FLUSH. That only works if you give it the whole buffer, and you don't. iirc, wtf, wdik, and ianal. Linus ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [RFH] zlib gurus out there? 2006-03-08 0:59 ` Linus Torvalds @ 2006-03-08 1:22 ` Junio C Hamano 2006-03-08 2:00 ` Linus Torvalds 0 siblings, 1 reply; 35+ messages in thread From: Junio C Hamano @ 2006-03-08 1:22 UTC (permalink / raw) To: Linus Torvalds; +Cc: git

Linus Torvalds <torvalds@osdl.org> writes:

> On Tue, 7 Mar 2006, Junio C Hamano wrote:
>>
>> However, I am stuck with the first step, which is to do a full
>> flush after the header. An obvious change to the code quoted
>> above writes out a corrupt object:
>>
>> 	/* First header.. */
>> 	stream.next_in = hdr;
>> 	stream.avail_in = hdrlen;
>> -	while (deflate(&stream, 0) == Z_OK)
>> +	while (deflate(&stream, Z_FULL_FLUSH) == Z_OK)
>> 		/* nothing */;
>
> No, I don't think that's good. You're only doing a partial deflate, you
> can't ask for a Z_FULL_FLUSH. That only works if you give it the whole
> buffer, and you don't.

So, in short there is no way to create:

	hdr part deflated.
	flush.
	data part deflated independently.

and have the current sha1_read_file() not to notice that flush, while I can inspect the deflated stream to find the "flush", and copy only the deflated data part into a pack? Bummer... I was really shooting for full backward compatibility. ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [RFH] zlib gurus out there? 2006-03-08 1:22 ` Junio C Hamano @ 2006-03-08 2:00 ` Linus Torvalds 2006-03-08 9:46 ` Johannes Schindelin 0 siblings, 1 reply; 35+ messages in thread From: Linus Torvalds @ 2006-03-08 2:00 UTC (permalink / raw) To: Junio C Hamano; +Cc: git On Tue, 7 Mar 2006, Junio C Hamano wrote: > > > > No, I don't think that's good. You're only doing a partial deflate, you > > can't ask for a Z_FULL_FLUSH. That only works if you give it the whole > > buffer, and you don't. Actually, I misread what you were trying to do, and thought this was the inflate phase, not the deflate. Now that I understand what you want, > So, in short there is no way to create: > > hdr part deflated. > flush. > data part deflated independently. > > and have the current sha1_read_file() not to notice that flush, Actually, try the patch you already tried, except you'll need to add a deflateEnd(&stream); deflateInit(&stream, Z_BEST_COMPRESSION); .. set up output parameters again .. and you need to change the initial size = deflateBound(&stream, len+hdrlen); to size = deflateBound(&stream, len) + deflateBound(&stream, hdrlen); and then you might be ok. That said, I'm not sure I agree with what you're trying to do. Linus ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [RFH] zlib gurus out there? 2006-03-08 2:00 ` Linus Torvalds @ 2006-03-08 9:46 ` Johannes Schindelin 0 siblings, 0 replies; 35+ messages in thread From: Johannes Schindelin @ 2006-03-08 9:46 UTC (permalink / raw) To: Linus Torvalds; +Cc: Junio C Hamano, git Hi, On Tue, 7 Mar 2006, Linus Torvalds wrote: > On Tue, 7 Mar 2006, Junio C Hamano wrote: > > > > > > No, I don't think that's good. You're only doing a partial deflate, you > > > can't ask for a Z_FULL_FLUSH. That only works if you give it the whole > > > buffer, and you don't. > > Actually, I misread what you were trying to do, and thought this was the > inflate phase, not the deflate. I don't think it matters if it is inflate or deflate. ZLib keeps an internal state depending on the data. That is the whole reason why the packing is so good: it uses the redundancy in the data already seen to construct a codebook. (And that's also the reason why you can't start to deflate in the middle.) Ciao, Dscho ^ permalink raw reply [flat|nested] 35+ messages in thread
* [PATCH] write_sha1_file(): Perform Z_FULL_FLUSH between header and data 2006-03-07 23:48 ` [RFH] zlib gurus out there? Junio C Hamano 2006-03-08 0:59 ` Linus Torvalds @ 2006-03-08 10:45 ` Sergey Vlasov 2006-03-08 11:04 ` Junio C Hamano 1 sibling, 1 reply; 35+ messages in thread From: Sergey Vlasov @ 2006-03-08 10:45 UTC (permalink / raw) To: Junio C Hamano; +Cc: git, Nicolas Pitre, Linus Torvalds

Data after Z_FULL_FLUSH will be compressed independently of the header, and could therefore be reused without recompressing when creating a pack.
---
This passes "make test" and unpacking of the whole git repo with git-fsck-objects afterwards.

However, a straight reuse still will not be possible, because sha1write_compressed() uses deflateInit(&stream, Z_DEFAULT_COMPRESSION), which writes zlib headers around the deflate stream, and the zlib footer contains adler32 checksum. So, as a minimum, you will need to decompress the object data, calculate its adler32 checksum and write the zlib header yourself.

 sha1_file.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

8b12d9a58e87a4c5b5a2a7b20d06fe29a5afb903
diff --git a/sha1_file.c b/sha1_file.c
index a80d849..34d4da4 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1399,7 +1399,8 @@ int write_sha1_file(void *buf, unsigned
 	/* Set it up */
 	memset(&stream, 0, sizeof(stream));
 	deflateInit(&stream, Z_BEST_COMPRESSION);
-	size = deflateBound(&stream, len+hdrlen);
+	/* Additional 6 bytes for the Z_FULL_FLUSH marker */
+	size = deflateBound(&stream, hdrlen) + 6 + deflateBound(&stream, len);
 	compressed = xmalloc(size);
 
 	/* Compress it */
@@ -1412,6 +1413,10 @@ int write_sha1_file(void *buf, unsigned
 	while (deflate(&stream, 0) == Z_OK)
 		/* nothing */;
 
+	/* Flush before data */
+	while (deflate(&stream, Z_FULL_FLUSH) == Z_OK)
+		/* nothing */;
+
 	/* Then the data itself.. */
 	stream.next_in = buf;
 	stream.avail_in = len;
-- 
1.2.GIT
^ permalink raw reply related [flat|nested] 35+ messages in thread
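With loose objects written like this, a packer could in principle locate the start of the independently deflated data by scanning for the byte-aligned empty stored block (the 00 00 FF FF marker) that zlib emits for a full flush. The helper below is only a sketch of that idea; it assumes the marker does not also occur by accident inside the short deflated header, so real code would need to parse the stream more carefully.

#include <string.h>

/* Return the offset just past the first Z_FULL_FLUSH marker in a zlib
 * stream, or 0 if none is found.  Sketch only. */
static unsigned long find_flush_boundary(const unsigned char *zbuf, unsigned long len)
{
	static const unsigned char marker[4] = { 0x00, 0x00, 0xff, 0xff };
	unsigned long i;

	for (i = 2; i + 4 <= len; i++)		/* skip the 2-byte zlib header */
		if (!memcmp(zbuf + i, marker, 4))
			return i + 4;		/* deflated data starts here */
	return 0;
}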
* Re: [PATCH] write_sha1_file(): Perform Z_FULL_FLUSH between header and data 2006-03-08 10:45 ` [PATCH] write_sha1_file(): Perform Z_FULL_FLUSH between header and data Sergey Vlasov @ 2006-03-08 11:04 ` Junio C Hamano 2006-03-08 14:17 ` Sergey Vlasov 0 siblings, 1 reply; 35+ messages in thread From: Junio C Hamano @ 2006-03-08 11:04 UTC (permalink / raw) To: Sergey Vlasov; +Cc: git Sergey Vlasov <vsu@altlinux.ru> writes: > However, a straight reuse still will not be possible, because > sha1write_compressed() uses deflateInit(&stream, Z_DEFAULT_COMPRESSION), > which writes zlib headers around the deflate stream, and the zlib footer > contains adler32 checksum. So, as a minimum, you will need to > decompress the object data, calculate its adler32 checksum and write the > zlib header yourself. Hmph. Thanks for helping, but it sounds like my original plan was not useful at all. Probably inflating would be still cheaper than inflating and then deflating, but it would not be as cool as a straight copy. Sigh... ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] write_sha1_file(): Perform Z_FULL_FLUSH between header and data 2006-03-08 11:04 ` Junio C Hamano @ 2006-03-08 14:17 ` Sergey Vlasov 0 siblings, 0 replies; 35+ messages in thread From: Sergey Vlasov @ 2006-03-08 14:17 UTC (permalink / raw) To: Junio C Hamano; +Cc: git

On Wed, Mar 08, 2006 at 03:04:14AM -0800, Junio C Hamano wrote:
> Sergey Vlasov <vsu@altlinux.ru> writes:
> > However, a straight reuse still will not be possible, because
> > sha1write_compressed() uses deflateInit(&stream, Z_DEFAULT_COMPRESSION),
> > which writes zlib headers around the deflate stream, and the zlib footer
> > contains adler32 checksum. So, as a minimum, you will need to
> > decompress the object data, calculate its adler32 checksum and write the
> > zlib header yourself.
>
> Hmph. Thanks for helping, but it sounds like my original plan
> was not useful at all. Probably inflating would be still
> cheaper than inflating and then deflating, but it would not be
> as cool as a straight copy. Sigh...

Actually you can calculate adler32 checksum of object data from adler32(header+data) (available at the end of the loose object file), adler32(header) (which you will need to calculate) and len(data) (which is available in the header):

#define ADLER32_BASE 65521UL

unsigned int adler32_split(unsigned int adler_full, unsigned int adler_1,
			   unsigned long len_2)
{
	unsigned long s1_1 = adler_1 & 0xffff;
	unsigned long s1_2 = (adler_1 >> 16) & 0xffff;
	unsigned long rem = len_2 % ADLER32_BASE;
	unsigned long s_1_offset = (s1_1 + ADLER32_BASE - 1) % ADLER32_BASE;
	unsigned long s_2_offset = (s1_2 + s_1_offset*rem) % ADLER32_BASE;
	unsigned long sf_1 = adler_full & 0xffff;
	unsigned long sf_2 = (adler_full >> 16) & 0xffff;
	unsigned long s2_1 = (sf_1 + ADLER32_BASE - s_1_offset) % ADLER32_BASE;
	unsigned long s2_2 = (sf_2 + ADLER32_BASE - s_2_offset) % ADLER32_BASE;
	return (s2_2 << 16) | s2_1;
}

However, the resulting code probably won't be pretty... ^ permalink raw reply [flat|nested] 35+ messages in thread
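A quick way to sanity-check the helper above against zlib itself; this little harness is hypothetical and assumes adler32_split() from the message is in scope:

#include <assert.h>
#include <zlib.h>

int main(void)
{
	const unsigned char hdr[] = "blob 12";		/* part 1: a made-up header */
	const unsigned char data[] = "hello world";	/* part 2: the payload */
	uLong a_hdr = adler32(adler32(0L, Z_NULL, 0), hdr, sizeof(hdr));
	uLong a_full = adler32(a_hdr, data, sizeof(data));
	uLong a_data = adler32(adler32(0L, Z_NULL, 0), data, sizeof(data));

	/* recover adler32(data) from adler32(hdr+data), adler32(hdr), len(data) */
	assert(adler32_split(a_full, a_hdr, sizeof(data)) == a_data);
	return 0;
}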
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-25 5:10 ` Nicolas Pitre 2006-02-25 5:35 ` Nicolas Pitre @ 2006-02-25 19:18 ` Linus Torvalds 1 sibling, 0 replies; 35+ messages in thread From: Linus Torvalds @ 2006-02-25 19:18 UTC (permalink / raw) To: Nicolas Pitre; +Cc: Junio C Hamano, git, Carl Baldwin On Sat, 25 Feb 2006, Nicolas Pitre wrote: > > Yes, the hash is larger. There is a cost in memory usage but not really > in CPU cycles. Note that memory usage translates almost 1:1 (or worse) to CPU cycles in almost all real-life behaviours. Only in carefully tuned benchmarks does it not. Increased memory usage means more paging, and worse cache behaviour. Now, hashes aren't wonderful for caches in the first place, but imagine the hump you pass when the data doesn't fit in a 64kB L1 any more (or a 256kB L2). Huge. > > You'll find a lot of that in any file: three or four bytes of similarity > > just doesn't sound worthwhile to go digging after. > > Well, after having experimented a lot with multiple parameters, I think > they are worth it after all. Not only do they provide for optimal deltas, > but their hash is faster to compute than larger blocks, which seems to > counterbalance the cost of the larger hash list. Hey, numbers talk. If you've got the numbers, I'll just shut up ;) Linus ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 17:56 ` Nicolas Pitre 2006-02-24 18:35 ` Carl Baldwin @ 2006-02-24 18:49 ` Carl Baldwin 2006-02-24 19:03 ` Nicolas Pitre 1 sibling, 1 reply; 35+ messages in thread From: Carl Baldwin @ 2006-02-24 18:49 UTC (permalink / raw) To: Nicolas Pitre; +Cc: Junio C Hamano, git I've updated to a very current master branch. This seems to include the pack data reuse stuff. I've not made an attempt yet to apply your delta patches. git-repack quickly gets up to 5% (2/36) and hangs there. I'll let it run for a while just to see how far it claims to get. I'm not hopeful. Maybe your patches can help? Carl On Fri, Feb 24, 2006 at 12:56:04PM -0500, Nicolas Pitre wrote: > On Fri, 24 Feb 2006, Carl Baldwin wrote: > > > Junio, > > > > This message came to me at exactly the right time. Yesterday I was > > exploring using git as the content storage back-end for some binary > > files. Up until now I've only used it for software projects. > > > > I found the largest RCS file that we had in our current back-end. It > > contained twelve versions of a binary file. Each version averaged about > > 20 MB. The ,v file from RCS was about 250MB. I did some experiments on > > these binary files. > > > > First, gzip consistently is able to compress these files to about 10% > > their original size. So, they are quite inflated. Second, xdelta would > > produce a delta between two neighboring revisions of about 2.5MB in size > > that would compress down to about 2MB (about the same size as the next > > revision compressed without deltification, so packing is ineffective > > here). > > > > I added these 12 revisions to several version control back-ends > > including subversion and git. Git produced a much smaller repository > > size than the others simply due to the compression that it applies to > > objects. It also was at least as fast as the others. > > > > The problem came when I tried to clone this repository. > > git-pack-objects chewed on these 12 revisions for over an hour before I > > finally interrupted it. As far as I could tell, it hadn't made much > > progress. > > I must ask if you had applied my latest delta patches? > > Also did you use a recent version of git that implements pack data > reuse? > > > Nicolas -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Carl Baldwin RADCAD (R&D CAD) Hewlett Packard Company MS 88 work: 970 898-1523 3404 E. Harmony Rd. work: Carl.N.Baldwin@hp.com Fort Collins, CO 80525 home: Carl@ecBaldwin.net - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ^ permalink raw reply [flat|nested] 35+ messages in thread
* Re: [PATCH] diff-delta: produce optimal pack data 2006-02-24 18:49 ` Carl Baldwin @ 2006-02-24 19:03 ` Nicolas Pitre 0 siblings, 0 replies; 35+ messages in thread From: Nicolas Pitre @ 2006-02-24 19:03 UTC (permalink / raw) To: Carl Baldwin; +Cc: Junio C Hamano, git On Fri, 24 Feb 2006, Carl Baldwin wrote: > I've updated to a very current master branch. This seems to include the > pack data reuse stuff. I've not made an attempt yet to apply your delta > patches. > > git-repack quickly gets up to 5% (2/36) and hangs there. I'll let it > run for a while just to see how far it claims to get. I'm not hopeful. It should complete sometime, probably after the same amount of time needed by your previous clone attempt. But after that any clone operation should be quick. This is clearly unacceptable but at least with the pack data reuse you should suffer only once for the initial repack. > Maybe your patches can help? No. They actually make things worse performance-wise, much worse in some special cases. Is it possible for me to have access to 2 consecutive versions of your big binary file? Nicolas ^ permalink raw reply [flat|nested] 35+ messages in thread
end of thread, other threads:[~2006-03-08 14:17 UTC | newest] Thread overview: 35+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2006-02-22 1:45 [PATCH] diff-delta: produce optimal pack data Nicolas Pitre 2006-02-24 8:49 ` Junio C Hamano 2006-02-24 15:37 ` Nicolas Pitre 2006-02-24 23:55 ` Junio C Hamano 2006-02-24 17:44 ` Carl Baldwin 2006-02-24 17:56 ` Nicolas Pitre 2006-02-24 18:35 ` Carl Baldwin 2006-02-24 18:57 ` Nicolas Pitre 2006-02-24 19:23 ` Carl Baldwin 2006-02-24 20:02 ` Nicolas Pitre 2006-02-24 20:40 ` Carl Baldwin 2006-02-24 21:12 ` Nicolas Pitre 2006-02-24 22:50 ` Carl Baldwin 2006-02-25 3:53 ` Nicolas Pitre 2006-02-24 20:02 ` Linus Torvalds 2006-02-24 20:19 ` Nicolas Pitre 2006-02-24 20:53 ` Junio C Hamano 2006-02-24 21:39 ` Nicolas Pitre 2006-02-24 21:48 ` Nicolas Pitre 2006-02-25 0:45 ` Linus Torvalds 2006-02-25 3:07 ` Nicolas Pitre 2006-02-25 4:05 ` Linus Torvalds 2006-02-25 5:10 ` Nicolas Pitre 2006-02-25 5:35 ` Nicolas Pitre 2006-03-07 23:48 ` [RFH] zlib gurus out there? Junio C Hamano 2006-03-08 0:59 ` Linus Torvalds 2006-03-08 1:22 ` Junio C Hamano 2006-03-08 2:00 ` Linus Torvalds 2006-03-08 9:46 ` Johannes Schindelin 2006-03-08 10:45 ` [PATCH] write_sha1_file(): Perform Z_FULL_FLUSH between header and data Sergey Vlasov 2006-03-08 11:04 ` Junio C Hamano 2006-03-08 14:17 ` Sergey Vlasov 2006-02-25 19:18 ` [PATCH] diff-delta: produce optimal pack data Linus Torvalds 2006-02-24 18:49 ` Carl Baldwin 2006-02-24 19:03 ` Nicolas Pitre
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).