* [PATCH] improve diff-delta with sparse and/or repetitive data
@ 2006-05-03 3:31 Nicolas Pitre
2006-05-03 3:46 ` [PATCH] tiny optimization to diff-delta Nicolas Pitre
0 siblings, 1 reply; 2+ messages in thread
From: Nicolas Pitre @ 2006-05-03 3:31 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
It is useless to preserve multiple hash entries for consecutive blocks
with the same hash. Keeping only the first one will allow for matching
the longest string of identical bytes while subsequent blocks will only
allow for shorter matches. The backward matching code will match the
end of it as necessary.
This improves both performances (no repeated string compare with long
successions of identical bytes, or even small group of bytes), as well
as compression (less likely to need random hash bucket entry culling),
especially with sparse files.
With well behaved data sets this patch doesn't change much.
Signed-off-by: Nicolas Pitre <nico@cam.org>
---
diff --git a/diff-delta.c b/diff-delta.c
index 35e517d..c618875 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -136,11 +136,12 @@ struct delta_index {
struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
{
- unsigned int i, hsize, hmask, entries, *hash_count;
+ unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
const unsigned char *data, *buffer = buf;
struct delta_index *index;
struct index_entry *entry, **hash;
void *mem;
+ unsigned long memsize;
if (!buf || !bufsize)
return NULL;
@@ -155,9 +156,10 @@ struct delta_index * create_delta_index(
hmask = hsize - 1;
/* allocate lookup index */
- mem = malloc(sizeof(*index) +
- sizeof(*hash) * hsize +
- sizeof(*entry) * entries);
+ memsize = sizeof(*index) +
+ sizeof(*hash) * hsize +
+ sizeof(*entry) * entries;
+ mem = malloc(memsize);
if (!mem)
return NULL;
index = mem;
@@ -179,18 +181,26 @@ struct delta_index * create_delta_index(
}
/* then populate the index */
- data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
- while (data >= buffer) {
+ prev_val = ~0;
+ for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
+ data >= buffer;
+ data -= RABIN_WINDOW) {
unsigned int val = 0;
for (i = 1; i <= RABIN_WINDOW; i++)
val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
- i = val & hmask;
- entry->ptr = data + RABIN_WINDOW;
- entry->val = val;
- entry->next = hash[i];
- hash[i] = entry++;
- hash_count[i]++;
- data -= RABIN_WINDOW;
+ if (val == prev_val) {
+ /* keep the lowest of consecutive identical blocks */
+ entry[-1].ptr = data + RABIN_WINDOW;
+ } else {
+ prev_val = val;
+ i = val & hmask;
+ entry->ptr = data + RABIN_WINDOW;
+ entry->val = val;
+ entry->next = hash[i];
+ hash[i] = entry++;
+ hash_count[i]++;
+ entries--;
+ }
}
/*
@@ -220,6 +230,10 @@ struct delta_index * create_delta_index(
}
free(hash_count);
+ /* If we didn't use all hash entries, free the unused memory. */
+ if (entries)
+ index = realloc(index, memsize - entries * sizeof(*entry));
+
return index;
}
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH] tiny optimization to diff-delta
2006-05-03 3:31 [PATCH] improve diff-delta with sparse and/or repetitive data Nicolas Pitre
@ 2006-05-03 3:46 ` Nicolas Pitre
0 siblings, 0 replies; 2+ messages in thread
From: Nicolas Pitre @ 2006-05-03 3:46 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
This is my assembly freak side looking at generated code again. And
since create_delta() is certainly pretty high on the radar every bits
count. In this case shorter code is generated if hash_mask is not
copied to a local variable.
Signed-off-by: Nicolas Pitre <nico@cam.org>
---
diff --git a/diff-delta.c b/diff-delta.c
index 26540d8..c618875 100644
--- a/diff-delta.c
+++ b/diff-delta.c
@@ -253,7 +253,7 @@ create_delta(const struct delta_index *i
const void *trg_buf, unsigned long trg_size,
unsigned long *delta_size, unsigned long max_size)
{
- unsigned int i, outpos, outsize, hash_mask, val;
+ unsigned int i, outpos, outsize, val;
int inscnt;
const unsigned char *ref_data, *ref_top, *data, *top;
unsigned char *out;
@@ -289,7 +289,6 @@ create_delta(const struct delta_index *i
ref_top = ref_data + index->src_size;
data = trg_buf;
top = trg_buf + trg_size;
- hash_mask = index->hash_mask;
outpos++;
val = 0;
@@ -304,7 +303,7 @@ create_delta(const struct delta_index *i
struct index_entry *entry;
val ^= U[data[-RABIN_WINDOW]];
val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
- i = val & hash_mask;
+ i = val & index->hash_mask;
for (entry = index->hash[i]; entry; entry = entry->next) {
const unsigned char *ref = entry->ptr;
const unsigned char *src = data;
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2006-05-03 3:46 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-05-03 3:31 [PATCH] improve diff-delta with sparse and/or repetitive data Nicolas Pitre
2006-05-03 3:46 ` [PATCH] tiny optimization to diff-delta Nicolas Pitre
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).