git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Nicolas Pitre <nico@cam.org>
To: Junio C Hamano <junkio@cox.net>
Cc: git@vger.kernel.org
Subject: [PATCH] improve diff-delta with sparse and/or repetitive data
Date: Tue, 02 May 2006 23:31:00 -0400 (EDT)	[thread overview]
Message-ID: <Pine.LNX.4.64.0605022226070.28543@localhost.localdomain> (raw)

It is useless to preserve multiple hash entries for consecutive blocks 
with the same hash.  Keeping only the first one will allow for matching 
the longest string of identical bytes while subsequent blocks will only 
allow for shorter matches.  The backward matching code will match the 
end of it as necessary.

This improves both performances (no repeated string compare with long 
successions of identical bytes, or even small group of bytes), as well 
as compression (less likely to need random hash bucket entry culling), 
especially with sparse files.

With well behaved data sets this patch doesn't change much.

Signed-off-by: Nicolas Pitre <nico@cam.org>

---

diff --git a/diff-delta.c b/diff-delta.c
index 35e517d..c618875 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -136,11 +136,12 @@ struct delta_index {
 
 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 {
-	unsigned int i, hsize, hmask, entries, *hash_count;
+	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 	const unsigned char *data, *buffer = buf;
 	struct delta_index *index;
 	struct index_entry *entry, **hash;
 	void *mem;
+	unsigned long memsize;
 
 	if (!buf || !bufsize)
 		return NULL;
@@ -155,9 +156,10 @@ struct delta_index * create_delta_index(
 	hmask = hsize - 1;
 
 	/* allocate lookup index */
-	mem = malloc(sizeof(*index) +
-		     sizeof(*hash) * hsize +
-		     sizeof(*entry) * entries);
+	memsize = sizeof(*index) +
+		  sizeof(*hash) * hsize +
+		  sizeof(*entry) * entries;
+	mem = malloc(memsize);
 	if (!mem)
 		return NULL;
 	index = mem;
@@ -179,18 +181,26 @@ struct delta_index * create_delta_index(
 	}
 
 	/* then populate the index */
-	data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
-	while (data >= buffer) {
+	prev_val = ~0;
+	for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
+	     data >= buffer;
+	     data -= RABIN_WINDOW) {
 		unsigned int val = 0;
 		for (i = 1; i <= RABIN_WINDOW; i++)
 			val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
-		i = val & hmask;
-		entry->ptr = data + RABIN_WINDOW;
-		entry->val = val;
-		entry->next = hash[i];
-		hash[i] = entry++;
-		hash_count[i]++;
-		data -= RABIN_WINDOW;
+		if (val == prev_val) {
+			/* keep the lowest of consecutive identical blocks */
+			entry[-1].ptr = data + RABIN_WINDOW;
+		} else {
+			prev_val = val;
+			i = val & hmask;
+			entry->ptr = data + RABIN_WINDOW;
+			entry->val = val;
+			entry->next = hash[i];
+			hash[i] = entry++;
+			hash_count[i]++;
+			entries--;
+		}
 	}
 
 	/*
@@ -220,6 +230,10 @@ struct delta_index * create_delta_index(
 	}
 	free(hash_count);
 
+	/* If we didn't use all hash entries, free the unused memory. */
+	if (entries)
+		index = realloc(index, memsize - entries * sizeof(*entry));
+
 	return index;
 }
 

             reply	other threads:[~2006-05-03  3:31 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-05-03  3:31 Nicolas Pitre [this message]
2006-05-03  3:46 ` [PATCH] tiny optimization to diff-delta Nicolas Pitre

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Pine.LNX.4.64.0605022226070.28543@localhost.localdomain \
    --to=nico@cam.org \
    --cc=git@vger.kernel.org \
    --cc=junkio@cox.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).