git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] improve delta long block matching with big files
@ 2007-05-26  1:38 Nicolas Pitre
  0 siblings, 0 replies; 2+ messages in thread
From: Nicolas Pitre @ 2007-05-26  1:38 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Martin Koegler noted that create_delta() performs a new hash lookup
after every block copy encoding which are currently limited to 64KB.

In case of larger identical blocks, the next hash lookup would normally
point to the next 64KB block in the reference buffer and multiple block
copy operations will be consecutively encoded.

It is however possible that the reference buffer be sparsely indexed if
hash buckets have been trimmed down in create_delta_index() when hashing
of the reference buffer isn't well balanced.  In that case the hash
lookup following a block copy might fail to match anything and the fact
that the reference buffer still matches beyond the previous 64KB block
will be missed.

Let's rework the code so that buffer comparison isn't bounded to 64KB
anymore.  The match size should be as large as possible up front and
only then should multiple block copy be encoded to cover it all.
Also, fewer hash lookups will be performed in the end.

According to Martin, this patch should reduce his 92MB pack down to 75MB
with the dataset he has.

Tests performed on the Linux kernel repo show a slightly smaller pack and
a slightly faster repack.

Signed-off-by: Nicolas Pitre <nico@cam.org>

diff --git a/diff-delta.c b/diff-delta.c
index 9f998d0..09e6bcd 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -246,7 +246,7 @@ create_delta(const struct delta_index *index,
 	     const void *trg_buf, unsigned long trg_size,
 	     unsigned long *delta_size, unsigned long max_size)
 {
-	unsigned int i, outpos, outsize, val;
+	unsigned int i, outpos, outsize, moff, msize, val;
 	int inscnt;
 	const unsigned char *ref_data, *ref_top, *data, *top;
 	unsigned char *out;
@@ -291,30 +291,33 @@ create_delta(const struct delta_index *index,
 	}
 	inscnt = i;
 
+	moff = 0;
+	msize = 0;
 	while (data < top) {
-		unsigned int moff = 0, msize = 0;
-		struct index_entry *entry;
-		val ^= U[data[-RABIN_WINDOW]];
-		val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
-		i = val & index->hash_mask;
-		for (entry = index->hash[i]; entry; entry = entry->next) {
-			const unsigned char *ref = entry->ptr;
-			const unsigned char *src = data;
-			unsigned int ref_size = ref_top - ref;
-			if (entry->val != val)
-				continue;
-			if (ref_size > top - src)
-				ref_size = top - src;
-			if (ref_size > 0x10000)
-				ref_size = 0x10000;
-			if (ref_size <= msize)
-				break;
-			while (ref_size-- && *src++ == *ref)
-				ref++;
-			if (msize < ref - entry->ptr) {
-				/* this is our best match so far */
-				msize = ref - entry->ptr;
-				moff = entry->ptr - ref_data;
+		if (msize < 4096) {
+			struct index_entry *entry;
+			val ^= U[data[-RABIN_WINDOW]];
+			val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
+			i = val & index->hash_mask;
+			for (entry = index->hash[i]; entry; entry = entry->next) {
+				const unsigned char *ref = entry->ptr;
+				const unsigned char *src = data;
+				unsigned int ref_size = ref_top - ref;
+				if (entry->val != val)
+					continue;
+				if (ref_size > top - src)
+					ref_size = top - src;
+				if (ref_size <= msize)
+					break;
+				while (ref_size-- && *src++ == *ref)
+					ref++;
+				if (msize < ref - entry->ptr) {
+					/* this is our best match so far */
+					msize = ref - entry->ptr;
+					moff = entry->ptr - ref_data;
+					if (msize >= 4096) /* good enough */
+						break;
+				}
 			}
 		}
 
@@ -327,27 +330,13 @@ create_delta(const struct delta_index *index,
 				out[outpos - inscnt - 1] = inscnt;
 				inscnt = 0;
 			}
+			msize = 0;
 		} else {
+			unsigned int left;
 			unsigned char *op;
 
-			if (msize >= RABIN_WINDOW) {
-				const unsigned char *sk;
-				sk = data + msize - RABIN_WINDOW;
-				val = 0;
-				for (i = 0; i < RABIN_WINDOW; i++)
-					val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
-			} else {
-				const unsigned char *sk = data + 1;
-				for (i = 1; i < msize; i++) {
-					val ^= U[sk[-RABIN_WINDOW]];
-					val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
-				}
-			}
-
 			if (inscnt) {
 				while (moff && ref_data[moff-1] == data[-1]) {
-					if (msize == 0x10000)
-						break;
 					/* we can match one byte back */
 					msize++;
 					moff--;
@@ -363,23 +352,40 @@ create_delta(const struct delta_index *index,
 				inscnt = 0;
 			}
 
-			data += msize;
+			/* A copy op is currently limited to 64KB (pack v2) */
+			left = (msize < 0x10000) ? 0 : (msize - 0x10000);
+			msize -= left;
+
 			op = out + outpos++;
 			i = 0x80;
 
-			if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
-			moff >>= 8;
-			if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
-			moff >>= 8;
-			if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
-			moff >>= 8;
-			if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
+			if (moff & 0x000000ff)
+				out[outpos++] = moff >> 0,  i |= 0x01;
+			if (moff & 0x0000ff00)
+				out[outpos++] = moff >> 8,  i |= 0x02;
+			if (moff & 0x00ff0000)
+				out[outpos++] = moff >> 16, i |= 0x04;
+			if (moff & 0xff000000)
+				out[outpos++] = moff >> 24, i |= 0x08;
 
-			if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
-			msize >>= 8;
-			if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
+			if (msize & 0x00ff)
+				out[outpos++] = msize >> 0, i |= 0x10;
+			if (msize & 0xff00)
+				out[outpos++] = msize >> 8, i |= 0x20;
 
 			*op = i;
+
+			data += msize;
+			moff += msize;
+			msize = left;
+
+			if (msize < 4096) {
+				int j;
+				val = 0;
+				for (j = -RABIN_WINDOW; j < 0; j++)
+					val = ((val << 8) | data[j])
+					      ^ T[val >> RABIN_SHIFT];
+			}
 		}
 
 		if (outpos >= outsize - MAX_OP_SIZE) {

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] improve delta long block matching with big files
@ 2007-05-26 15:19 Martin Koegler
  0 siblings, 0 replies; 2+ messages in thread
From: Martin Koegler @ 2007-05-26 15:19 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, git

Nicolas Pitre wrote:
> Martin Koegler noted that create_delta() performs a new hash lookup
> after every block copy encoding which are currently limited to 64KB.
> 
> In case of larger identical blocks, the next hash lookup would normally
> point to the next 64KB block in the reference buffer and multiple block
> copy operations will be consecutively encoded.
> 
> It is however possible that the reference buffer be sparsely indexed if
> hash buckets have been trimmed down in create_delta_index() when hashing
> of the reference buffer isn't well balanced.  In that case the hash
> lookup following a block copy might fail to match anything and the fact
> that the reference buffer still matches beyond the previous 64KB block
> will be missed.
> 
> Let's rework the code so that buffer comparison isn't bounded to 64KB
> anymore.  The match size should be as large as possible up front and
> only then should multiple block copy be encoded to cover it all.
> Also, fewer hash lookups will be performed in the end.
> 
> According to Martin, this patch should reduce his 92MB pack down to 75MB
> with the dataset he has.
> 
> Tests performed on the Linux kernel repo show a slightly smaller pack and
> a slightly faster repack.
>
Acked-by: Martin Koegler <mkoegler@auto.tuwien.ac.at>
> Signed-off-by: Nicolas Pitre <nico@cam.org>
---

The patch results in a 75 MB pack file for my repository and is
faster:

Total 6452 (delta 4581), reused 1522 (delta 0)
10073.11user 5200.33system 4:14:36elapsed 99%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (0major+1371504760minor)pagefaults 0swaps

mfg Martin Kögler

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2007-05-26 15:19 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-05-26  1:38 [PATCH] improve delta long block matching with big files Nicolas Pitre
  -- strict thread matches above, loose matches on Subject: below --
2007-05-26 15:19 Martin Koegler

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).