[PATCH] improved delta support for git

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] improved delta support for git
@ 2005-05-12  3:51 Nicolas Pitre
  2005-05-12  4:36 ` Junio C Hamano
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Nicolas Pitre @ 2005-05-12  3:51 UTC (permalink / raw)
  To: git


OK, here's some improved support for delta objects in a git repository. 
This patch adds the ability to create and restore delta objects with the 
git-mkdelta command.  A list of objects is provided and the 
corresponding delta chain is created.  The maximum depth of a delta 
chain can be specified with the -d argument.  If a max depth of 0 is 
provided then all given objects are undeltafied and replaced by their 
original version.  With the -v argument a lot of lovely details are 
printed out.

Also included is a script to deltafy an entire repository.  Simply 
execute git-deltafy-script to create deltas of objects corresponding to 
successive previous versions of every files.  Running 
'git-deltafy-script -d 0' will revert everything to non deltafied form.

I've yet to add suport to fsck-cache to understand delta objects.  It is 
advised to undeltafy your repository before running it otherwise you'll 
see lots of reported errors.  Once undeltafied you should have good 
output from fsck-cache again.

Please backup your repository before playing with this for now... just 
in case.

If you happen to have the whole kernel history in your repository I'd be 
interested to know what the space figure is and how it performs.  So far 
I tested a tar of the .git/objects directory from git's git repository.  
This is to estimate the real data size without the filesystem block 
round up. The undeltafied repository created a 1708kb tar file while the 
deltafied repository created a 1173kb tar file.  The chunking storage 
code should be considered for real life usage of course.

There are probably things to experiment in order to save space further, 
such as deltafying tree objects, and in the context of Linux, deltafying 
files with lots of similitudes between content in diferent include/asm-* 
subdirectories.

Signed-off-by: Nicolas Pitre <nico@cam.org>

Index: git/diff-delta.c
===================================================================
--- /dev/null
+++ git/diff-delta.c
@@ -0,0 +1,330 @@
+/*
+ * diff-delta.c: generate a delta between two buffers
+ *
+ *  Many parts of this file have been lifted from LibXDiff version 0.10.
+ *  http://www.xmailserver.org/xdiff-lib.html
+ *
+ *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2003	Davide Libenzi
+ *
+ *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
+ *
+ *  This file is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ */
+
+#include <stdlib.h>
+#include "delta.h"
+
+
+/* block size: min = 16, max = 64k, power of 2 */
+#define BLK_SIZE 16
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define GR_PRIME 0x9e370001
+#define HASH(v, b) (((unsigned int)(v) * GR_PRIME) >> (32 - (b)))
+	
+/* largest prime smaller than 65536 */
+#define BASE 65521
+
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+#define NMAX 5552
+
+#define DO1(buf, i)  { s1 += buf[i]; s2 += s1; }
+#define DO2(buf, i)  DO1(buf, i); DO1(buf, i + 1);
+#define DO4(buf, i)  DO2(buf, i); DO2(buf, i + 2);
+#define DO8(buf, i)  DO4(buf, i); DO4(buf, i + 4);
+#define DO16(buf)    DO8(buf, 0); DO8(buf, 8);
+
+static unsigned int adler32(unsigned int adler, const unsigned char *buf, int len)
+{
+	int k;
+	unsigned int s1 = adler & 0xffff;
+	unsigned int s2 = adler >> 16;
+
+	while (len > 0) {
+		k = MIN(len, NMAX);
+		len -= k;
+		while (k >= 16) {
+			DO16(buf);
+			buf += 16;
+			k -= 16;
+		}
+		if (k != 0)
+			do {
+				s1 += *buf++;
+				s2 += s1;
+			} while (--k);
+		s1 %= BASE;
+		s2 %= BASE;
+	}
+
+	return (s2 << 16) | s1;
+}
+
+static unsigned int hashbits(unsigned int size)
+{
+	unsigned int val = 1, bits = 0;
+	while (val < size && bits < 32) {
+		val <<= 1;
+	       	bits++;
+	}
+	return bits ? bits: 1;
+}
+
+typedef struct s_chanode {
+	struct s_chanode *next;
+	int icurr;
+} chanode_t;
+
+typedef struct s_chastore {
+	chanode_t *head, *tail;
+	int isize, nsize;
+	chanode_t *ancur;
+	chanode_t *sncur;
+	int scurr;
+} chastore_t;
+
+static void cha_init(chastore_t *cha, int isize, int icount)
+{
+	cha->head = cha->tail = NULL;
+	cha->isize = isize;
+	cha->nsize = icount * isize;
+	cha->ancur = cha->sncur = NULL;
+	cha->scurr = 0;
+}
+
+static void *cha_alloc(chastore_t *cha)
+{
+	chanode_t *ancur;
+	void *data;
+
+	ancur = cha->ancur;
+	if (!ancur || ancur->icurr == cha->nsize) {
+		ancur = malloc(sizeof(chanode_t) + cha->nsize);
+		if (!ancur)
+			return NULL;
+		ancur->icurr = 0;
+		ancur->next = NULL;
+		if (cha->tail)
+			cha->tail->next = ancur;
+		if (!cha->head)
+			cha->head = ancur;
+		cha->tail = ancur;
+		cha->ancur = ancur;
+	}
+
+	data = (void *)ancur + sizeof(chanode_t) + ancur->icurr;
+	ancur->icurr += cha->isize;
+	return data;
+}
+
+static void cha_free(chastore_t *cha)
+{
+	chanode_t *cur = cha->head;
+	while (cur) {
+		chanode_t *tmp = cur;
+		cur = cur->next;
+		free(tmp);
+	}
+}
+
+typedef struct s_bdrecord {
+	struct s_bdrecord *next;
+	unsigned int fp;
+	const unsigned char *ptr;
+} bdrecord_t;
+
+typedef struct s_bdfile {
+	const unsigned char *data, *top;
+	chastore_t cha;
+	unsigned int fphbits;
+	bdrecord_t **fphash;
+} bdfile_t;
+
+static int delta_prepare(const unsigned char *buf, int bufsize, bdfile_t *bdf)
+{
+	unsigned int fphbits;
+	int i, hsize;
+	const unsigned char *base, *data, *top;
+	bdrecord_t *brec;
+	bdrecord_t **fphash;
+
+	fphbits = hashbits(bufsize / BLK_SIZE + 1);
+	hsize = 1 << fphbits;
+	fphash = malloc(hsize * sizeof(bdrecord_t *));
+	if (!fphash)
+		return -1;
+	for (i = 0; i < hsize; i++)
+		fphash[i] = NULL;
+	cha_init(&bdf->cha, sizeof(bdrecord_t), hsize / 4 + 1);
+
+	bdf->data = data = base = buf;
+	bdf->top = top = buf + bufsize;
+	data += (bufsize / BLK_SIZE) * BLK_SIZE;
+	if (data == top)
+		data -= BLK_SIZE;
+
+	for ( ; data >= base; data -= BLK_SIZE) {
+		brec = cha_alloc(&bdf->cha);
+		if (!brec) {
+			cha_free(&bdf->cha);
+			free(fphash);
+			return -1;
+		}
+		brec->fp = adler32(0, data, MIN(BLK_SIZE, top - data));
+		brec->ptr = data;
+		i = HASH(brec->fp, fphbits);
+		brec->next = fphash[i];
+		fphash[i] = brec;
+	}
+
+	bdf->fphbits = fphbits;
+	bdf->fphash = fphash;
+
+	return 0;
+}
+
+static void delta_cleanup(bdfile_t *bdf)
+{
+	free(bdf->fphash);
+	cha_free(&bdf->cha);
+}
+
+#define COPYOP_SIZE(o, s) \
+    (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
+     !!(s & 0xff) + !!(s & 0xff00) + 1)
+
+void *diff_delta(void *from_buf, unsigned long from_size,
+		 void *to_buf, unsigned long to_size,
+		 unsigned long *delta_size)
+{
+	int i, outpos, outsize, inscnt, csize, msize, moff;
+	unsigned int fp;
+	const unsigned char *data, *top, *ptr1, *ptr2;
+	unsigned char *out, *orig;
+	bdrecord_t *brec;
+	bdfile_t bdf;
+
+	if (!from_size || !to_size || delta_prepare(from_buf, from_size, &bdf))
+		return NULL;
+	
+	outpos = 0;
+	outsize = 8192;
+	out = malloc(outsize);
+	if (!out) {
+		delta_cleanup(&bdf);
+		return NULL;
+	}
+
+	data = to_buf;
+	top = to_buf + to_size;
+
+	/* store reference buffer size */
+	orig = out + outpos++;
+	*orig = i = 0;
+	do {
+		if (from_size & 0xff) {
+			*orig |= (1 << i);
+			out[outpos++] = from_size;
+		}
+		i++;
+		from_size >>= 8;
+	} while (from_size);
+
+	/* store target buffer size */
+	orig = out + outpos++;
+	*orig = i = 0;
+	do {
+		if (to_size & 0xff) {
+			*orig |= (1 << i);
+			out[outpos++] = to_size;
+		}
+		i++;
+		to_size >>= 8;
+	} while (to_size);
+
+	inscnt = 0;
+	moff = 0;
+	while (data < top) {
+		msize = 0;
+		fp = adler32(0, data, MIN(top - data, BLK_SIZE));
+		i = HASH(fp, bdf.fphbits);
+		for (brec = bdf.fphash[i]; brec; brec = brec->next) {
+			if (brec->fp == fp) {
+				csize = bdf.top - brec->ptr;
+				if (csize > top - data)
+					csize = top - data;
+				for (ptr1 = brec->ptr, ptr2 = data; 
+				     csize && *ptr1 == *ptr2;
+				     csize--, ptr1++, ptr2++);
+
+				csize = ptr1 - brec->ptr;
+				if (csize > msize) {
+					moff = brec->ptr - bdf.data;
+					msize = csize;
+					if (msize >= 0x10000) {
+						msize = 0x10000;
+						break;
+					}
+				}
+			}
+		}
+
+		if (!msize || msize < COPYOP_SIZE(moff, msize)) {
+			if (!inscnt)
+				outpos++;
+			out[outpos++] = *data++;
+			inscnt++;
+			if (inscnt == 0x7f) {
+				out[outpos - inscnt - 1] = inscnt;
+				inscnt = 0;
+			}
+		} else {
+			if (inscnt) {
+				out[outpos - inscnt - 1] = inscnt;
+				inscnt = 0;
+			}
+
+			data += msize;
+			orig = out + outpos++;
+			i = 0x80;
+
+			if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
+			moff >>= 8;
+			if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
+			moff >>= 8;
+			if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
+			moff >>= 8;
+			if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
+
+			if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
+			msize >>= 8;
+			if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
+
+			*orig = i;
+		}
+
+		/* next time around the largest possible output is 1 + 4 + 3 */
+		if (outpos > outsize - 8) {
+			void *tmp = out;
+			outsize = outsize * 3 / 2;
+			out = realloc(out, outsize);
+			if (!out) {
+				free(tmp);
+				delta_cleanup(&bdf);
+				return NULL;
+			}
+		}
+	}
+
+	if (inscnt)
+		out[outpos - inscnt - 1] = inscnt;
+
+	delta_cleanup(&bdf);
+	*delta_size = outpos;
+	return out;
+}
Index: git/delta.h
===================================================================
--- /dev/null
+++ git/delta.h
@@ -0,0 +1,6 @@
+extern void *diff_delta(void *from_buf, unsigned long from_size,
+			void *to_buf, unsigned long to_size,
+		        unsigned long *delta_size);
+extern void *patch_delta(void *src_buf, unsigned long src_size,
+			 void *delta_buf, unsigned long delta_size,
+			 unsigned long *dst_size);
Index: git/Makefile
===================================================================
--- git.orig/Makefile
+++ git/Makefile
@@ -29,7 +29,7 @@
 	install $(PROG) $(SCRIPTS) $(HOME)/bin/
 
 LIB_OBJS=read-cache.o sha1_file.o usage.o object.o commit.o tree.o blob.o \
-	 tag.o date.o
+	 tag.o date.o diff-delta.o patch-delta.o
 LIB_FILE=libgit.a
 LIB_H=cache.h object.h blob.h tree.h commit.h tag.h
 
@@ -63,6 +63,9 @@
 test-date: test-date.c date.o
 	$(CC) $(CFLAGS) -o $@ test-date.c date.o
 
+test-delta: test-delta.c diff-delta.o patch-delta.o
+	$(CC) $(CFLAGS) -o $@ $^
+
 git-%: %.c $(LIB_FILE)
 	$(CC) $(CFLAGS) -o $@ $(filter %.c,$^) $(LIBS)
 
Index: git/patch-delta.c
===================================================================
--- /dev/null
+++ git/patch-delta.c
@@ -0,0 +1,88 @@
+/*
+ * patch-delta.c:
+ * recreate a buffer from a source and the delta produced by diff-delta.c
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "delta.h"
+
+void *patch_delta(void *src_buf, unsigned long src_size,
+		  void *delta_buf, unsigned long delta_size,
+		  unsigned long *dst_size)
+{
+	const unsigned char *data, *top;
+	unsigned char *dst_buf, *out, cmd;
+	unsigned long size;
+	int i;
+
+	/* the smallest delta size possible is 6 bytes */
+	if (delta_size < 6)
+		return NULL;
+
+	data = delta_buf;
+	top = delta_buf + delta_size;
+
+	/* make sure the orig file size matches what we expect */
+	size = i = 0;
+	cmd = *data++;
+	while (cmd) {
+		if (cmd & 1)
+			size |= *data++ << i;
+		i += 8;
+		cmd >>= 1;
+	}
+	if (size != src_size)
+		return NULL;
+
+	/* now the result size */
+	size = i = 0;
+	cmd = *data++;
+	while (cmd) {
+		if (cmd & 1)
+			size |= *data++ << i;
+		i += 8;
+		cmd >>= 1;
+	}
+	dst_buf = malloc(size);
+	if (!dst_buf)
+		return NULL;
+
+	out = dst_buf;
+	while (data < top) {
+		cmd = *data++;
+		if (cmd & 0x80) {
+			unsigned long cp_off = 0, cp_size = 0;
+			const unsigned char *buf;
+			if (cmd & 0x01) cp_off = *data++;
+			if (cmd & 0x02) cp_off |= (*data++ << 8);
+			if (cmd & 0x04) cp_off |= (*data++ << 16);
+			if (cmd & 0x08) cp_off |= (*data++ << 24);
+			if (cmd & 0x10) cp_size = *data++;
+			if (cmd & 0x20) cp_size |= (*data++ << 8);
+			if (cp_size == 0) cp_size = 0x10000;
+			buf = (cmd & 0x40) ? dst_buf : src_buf;
+			memcpy(out, buf + cp_off, cp_size);
+			out += cp_size;
+		} else {
+			memcpy(out, data, cmd);
+			out += cmd;
+			data += cmd;
+		}
+	}
+
+	/* sanity check */
+	if (data != top || out - dst_buf != size) {
+		free(dst_buf);
+		return NULL;
+	}
+
+	*dst_size = size;
+	return dst_buf;
+}
Index: git/test-delta.c
===================================================================
--- /dev/null
+++ git/test-delta.c
@@ -0,0 +1,79 @@
+/*
+ * test-delta.c: test code to exercise diff-delta.c and patch-delta.c
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include "delta.h"
+
+static const char *usage =
+	"test-delta (-d|-p) <from_file> <data_file> <out_file>";
+
+int main(int argc, char *argv[])
+{
+	int fd;
+	struct stat st;
+	void *from_buf, *data_buf, *out_buf;
+	unsigned long from_size, data_size, out_size;
+
+	if (argc != 5 || (strcmp(argv[1], "-d") && strcmp(argv[1], "-p"))) {
+		fprintf(stderr, "Usage: %s\n", usage);
+		return 1;
+	}
+
+	fd = open(argv[2], O_RDONLY);
+	if (fd < 0 || fstat(fd, &st)) {
+		perror(argv[2]);
+		return 1;
+	}
+	from_size = st.st_size;
+	from_buf = mmap(NULL, from_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (from_buf == MAP_FAILED) {
+		perror(argv[2]);
+		return 1;
+	}
+	close(fd);
+
+	fd = open(argv[3], O_RDONLY);
+	if (fd < 0 || fstat(fd, &st)) {
+		perror(argv[3]);
+		return 1;
+	}
+	data_size = st.st_size;
+	data_buf = mmap(NULL, data_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (data_buf == MAP_FAILED) {
+		perror(argv[3]);
+		return 1;
+	}
+	close(fd);
+
+	if (argv[1][1] == 'd')
+		out_buf = diff_delta(from_buf, from_size,
+				     data_buf, data_size, &out_size);
+	else
+		out_buf = patch_delta(from_buf, from_size,
+				      data_buf, data_size, &out_size);
+	if (!out_buf) {
+		fprintf(stderr, "delta operation failed (returned NULL)\n");
+		return 1;
+	}
+
+	fd = open (argv[4], O_WRONLY|O_CREAT|O_TRUNC, 0666);
+	if (fd < 0 || write(fd, out_buf, out_size) != out_size) {
+		perror(argv[4]);
+		return 1;
+	}
+
+	return 0;
+}
Index: git/sha1_file.c
===================================================================
--- git.orig/sha1_file.c
+++ git/sha1_file.c
@@ -8,6 +8,7 @@
  */
 #include <stdarg.h>
 #include "cache.h"
+#include "delta.h"
 
 #ifndef O_NOATIME
 #if defined(__linux__) && (defined(__i386__) || defined(__PPC__))
@@ -224,6 +225,19 @@
 	if (map) {
 		buf = unpack_sha1_file(map, mapsize, type, size);
 		munmap(map, mapsize);
+		if (buf && !strcmp(type, "delta")) {
+			void *ref = NULL, *delta = buf;
+			unsigned long ref_size, delta_size = *size;
+			buf = NULL;
+			if (delta_size > 20)
+				ref = read_sha1_file(delta, type, &ref_size);
+			if (ref)
+				buf = patch_delta(ref, ref_size,
+						  delta+20, delta_size-20, 
+						  size);
+			free(delta);
+			free(ref);
+		}
 		return buf;
 	}
 	return NULL;
Index: git/Makefile
===================================================================
--- git.orig/Makefile
+++ git/Makefile
@@ -13,7 +13,7 @@
 AR=ar
 
 SCRIPTS=git-apply-patch-script git-merge-one-file-script git-prune-script \
-	git-pull-script git-tag-script git-resolve-script
+	git-pull-script git-tag-script git-resolve-script git-deltafy-script
 
 PROG=   git-update-cache git-diff-files git-init-db git-write-tree \
 	git-read-tree git-commit-tree git-cat-file git-fsck-cache \
@@ -21,7 +21,8 @@
 	git-check-files git-ls-tree git-merge-base git-merge-cache \
 	git-unpack-file git-export git-diff-cache git-convert-cache \
 	git-http-pull git-rpush git-rpull git-rev-list git-mktag \
-	git-diff-tree-helper git-tar-tree git-local-pull git-write-blob
+	git-diff-tree-helper git-tar-tree git-local-pull git-write-blob \
+	git-mkdelta
 
 all: $(PROG)
 
@@ -95,6 +96,7 @@
 git-rpull: rsh.c pull.c
 git-rev-list: rev-list.c
 git-mktag: mktag.c
+git-mkdelta: mkdelta.c
 git-diff-tree-helper: diff-tree-helper.c
 git-tar-tree: tar-tree.c
 git-write-blob: write-blob.c
Index: git/mkdelta.c
===================================================================
--- /dev/null
+++ git/mkdelta.c
@@ -0,0 +1,283 @@
+/*
+ * Deltafication of a GIT database.
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "cache.h"
+#include "delta.h"
+
+static int replace_object(char *buf, unsigned long len, unsigned char *sha1,
+			  char *hdr, int hdrlen)
+{
+	char tmpfile[PATH_MAX];
+	int size;
+	char *compressed;
+	z_stream stream;
+	int fd;
+
+	snprintf(tmpfile, sizeof(tmpfile), "%s/obj_XXXXXX", get_object_directory());
+	fd = mkstemp(tmpfile);
+	if (fd < 0)
+		return error("%s: %s\n", tmpfile, strerror(errno));
+	
+	/* Set it up */
+	memset(&stream, 0, sizeof(stream));
+	deflateInit(&stream, Z_BEST_COMPRESSION);
+	size = deflateBound(&stream, len+hdrlen);
+	compressed = xmalloc(size);
+
+	/* Compress it */
+	stream.next_out = compressed;
+	stream.avail_out = size;
+
+	/* First header.. */
+	stream.next_in = hdr;
+	stream.avail_in = hdrlen;
+	while (deflate(&stream, 0) == Z_OK)
+		/* nothing */;
+
+	/* Then the data itself.. */
+	stream.next_in = buf;
+	stream.avail_in = len;
+	while (deflate(&stream, Z_FINISH) == Z_OK)
+		/* nothing */;
+	deflateEnd(&stream);
+	size = stream.total_out;
+
+	if (write(fd, compressed, size) != size) {
+		perror("unable to write file");
+		close(fd);
+		unlink(tmpfile);
+		return -1;
+	}
+	fchmod(fd, 0444);
+	close(fd);
+
+	if (rename(tmpfile, sha1_file_name(sha1))) {
+		perror("unable to replace original object");
+		unlink(tmpfile);
+		return -1;
+	}
+	return 0;
+}
+
+static int write_delta_file(char *buf, unsigned long len,
+			    unsigned char *sha1_ref, unsigned char *sha1_trg)
+{
+	char hdr[50];
+	int hdrlen;
+
+	/* Generate the header + sha1 of reference for delta */
+	hdrlen = sprintf(hdr, "delta %lu", len+20)+1;
+	memcpy(hdr + hdrlen, sha1_ref, 20);
+	hdrlen += 20;
+
+	return replace_object(buf, len, sha1_trg, hdr, hdrlen);
+}
+
+static int replace_sha1_file(char *buf, unsigned long len,
+			     char *type, unsigned char *sha1)
+{
+	char hdr[50];
+	int hdrlen;
+
+	hdrlen = sprintf(hdr, "%s %lu", type, len)+1;
+	return replace_object(buf, len, sha1, hdr, hdrlen);
+}
+
+static void *get_buffer(unsigned char *sha1, char *type, unsigned long *size)
+{
+	unsigned long mapsize;
+	void *map = map_sha1_file(sha1, &mapsize);
+	if (map) {
+		void *buffer = unpack_sha1_file(map, mapsize, type, size);
+		munmap(map, mapsize);
+		if (buffer)
+			return buffer;
+	}
+	error("unable to get object %s", sha1_to_hex(sha1));
+	return NULL;
+}
+
+static void *expand_delta(void *delta, unsigned long delta_size, char *type,
+			  unsigned long *size, unsigned int *depth, char *head)
+{
+	void *buf = NULL;
+	*depth++;
+	if (delta_size < 20) {
+		error("delta object is bad");
+		free(delta);
+	} else {
+		unsigned long ref_size;
+		void *ref = get_buffer(delta, type, &ref_size);
+		if (ref && !strcmp(type, "delta"))
+			ref = expand_delta(ref, ref_size, type, &ref_size,
+					   depth, head);
+		else
+			memcpy(head, delta, 20);
+		if (ref)
+			buf = patch_delta(ref, ref_size, delta+20,
+					  delta_size-20, size);
+		free(ref);
+		free(delta);
+	}
+	return buf;
+}
+
+static char *mkdelta_usage =
+"mkdelta [ --max-depth=N ] <reference_sha1> <target_sha1> [ <next_sha1> ... ]";
+
+int main(int argc, char **argv)
+{
+	unsigned char sha1_ref[20], sha1_trg[20], head_ref[20], head_trg[20];
+	char type_ref[20], type_trg[20];
+	void *buf_ref, *buf_trg, *buf_delta;
+	unsigned long size_ref, size_trg, size_orig, size_delta;
+	unsigned int depth_ref, depth_trg, depth_max = -1;
+	int i, verbose = 0;
+
+	for (i = 1; i < argc; i++) {
+		if (!strcmp(argv[i], "-v")) {
+			verbose = 1;
+		} else if (!strcmp(argv[i], "-d") && i+1 < argc) {
+			depth_max = atoi(argv[++i]);
+		} else if (!strncmp(argv[i], "--max-depth=", 12)) {
+			depth_max = atoi(argv[i]+12);
+		} else
+			break;
+	}
+
+	if (i + (depth_max != 0) >= argc)
+		usage(mkdelta_usage);
+
+	if (get_sha1(argv[i], sha1_ref))
+		die("bad sha1 %s", argv[i]);
+	depth_ref = 0;
+	buf_ref = get_buffer(sha1_ref, type_ref, &size_ref);
+	if (buf_ref && !strcmp(type_ref, "delta"))
+		buf_ref = expand_delta(buf_ref, size_ref, type_ref,
+				       &size_ref, &depth_ref, head_ref);
+	else
+		memcpy(head_ref, sha1_ref, 20);
+	if (!buf_ref)
+		die("unable to obtain initial object %s", argv[i]);
+
+	if (depth_ref > depth_max) {
+		if (replace_sha1_file(buf_ref, size_ref, type_ref, sha1_ref))
+			die("unable to restore %s", argv[i]);
+		if (verbose)
+			printf("undelta %s (depth was %d)\n", argv[i], depth_ref);
+		depth_ref = 0;
+	}
+
+	while (++i < argc) {
+		if (get_sha1(argv[i], sha1_trg))
+			die("bad sha1 %s", argv[i]);
+		depth_trg = 0;
+		buf_trg = get_buffer(sha1_trg, type_trg, &size_trg);
+		if (buf_trg && !size_trg) {
+			if (verbose)
+				printf("skip    %s (object is empty)\n", argv[i]);
+			continue;
+		}
+		size_orig = size_trg;
+		if (buf_trg && !strcmp(type_trg, "delta")) {
+			if (!memcmp(buf_trg, sha1_ref, 20)) {
+				/* delta already in place */
+				depth_ref++;
+				memcpy(sha1_ref, sha1_trg, 20);
+				buf_ref = patch_delta(buf_ref, size_ref,
+						      buf_trg+20, size_trg-20,
+						      &size_ref);
+				if (!buf_ref)
+					die("unable to apply delta %s", argv[i]);
+				if (depth_ref > depth_max) {
+					if (replace_sha1_file(buf_ref, size_ref,
+							      type_ref, sha1_ref))
+						die("unable to restore %s", argv[i]);
+					if (verbose)
+						printf("undelta %s (depth was %d)\n", argv[i], depth_ref);
+					depth_ref = 0;
+					continue;
+				}
+				if (verbose)
+					printf("skip    %s (delta already in place)\n", argv[i]);
+				continue;
+			}
+			buf_trg = expand_delta(buf_trg, size_trg, type_trg,
+					       &size_trg, &depth_trg, head_trg);
+		} else
+			memcpy(head_trg, sha1_trg, 20);
+		if (!buf_trg)
+			die("unable to read target object %s", argv[i]);
+
+		if (depth_trg > depth_max) {
+			if (replace_sha1_file(buf_trg, size_trg, type_trg, sha1_trg))
+				die("unable to restore %s", argv[i]);
+			if (verbose)
+				printf("undelta %s (depth was %d)\n", argv[i], depth_trg);
+			depth_trg = 0;
+			size_orig = size_trg;
+		}
+
+		if (depth_max == 0)
+			goto skip;
+
+		if (strcmp(type_ref, type_trg))
+			die("type mismatch for object %s", argv[i]);
+
+		if (!size_ref) {
+			if (verbose)
+				printf("skip    %s (initial object is empty)\n", argv[i]);
+			goto skip;
+		}
+		
+		depth_ref++;
+		if (depth_ref > depth_max) {
+			if (verbose)
+				printf("skip    %s (exceeding max link depth)\n", argv[i]);
+			goto skip;
+		}
+
+		if (!memcmp(head_ref, sha1_trg, 20)) {
+			if (verbose)
+				printf("skip    %s (would create a loop)\n", argv[i]);
+			goto skip;
+		}
+
+		buf_delta = diff_delta(buf_ref, size_ref, buf_trg, size_trg, &size_delta);
+		if (!buf_delta)
+			die("out of memory");
+
+		if (size_delta+20 < size_orig) {
+			if (write_delta_file(buf_delta, size_delta,
+					     sha1_ref, sha1_trg))
+				die("unable to write delta for %s", argv[i]);
+			free(buf_delta);
+			if (verbose)
+				printf("delta   %s (size=%ld.%02ld%%, depth=%d)\n",
+				       argv[i], (size_delta+20)*100 / size_trg,
+				       ((size_delta+20)*10000 / size_trg)%100,
+				       depth_ref);
+		} else {
+			free(buf_delta);
+			if (verbose)
+				printf("skip    %s (original is smaller)\n", argv[i]);
+			skip:
+			depth_ref = depth_trg;
+			memcpy(head_ref, head_trg, 20);
+		}
+
+		free(buf_ref);
+		buf_ref = buf_trg;
+		size_ref = size_trg;
+		memcpy(sha1_ref, sha1_trg, 20);
+	}
+
+	return 0;
+}
Index: git/mktag.c
===================================================================
--- git.orig/mktag.c
+++ git/mktag.c
@@ -25,20 +25,14 @@
 static int verify_object(unsigned char *sha1, const char *expected_type)
 {
 	int ret = -1;
-	unsigned long mapsize;
-	void *map = map_sha1_file(sha1, &mapsize);
+	char type[100];
+	unsigned long size;
+	void *buffer = read_sha1_file(sha1, type, &size);
 
-	if (map) {
-		char type[100];
-		unsigned long size;
-		void *buffer = unpack_sha1_file(map, mapsize, type, &size);
-
-		if (buffer) {
-			if (!strcmp(type, expected_type))
-				ret = check_sha1_signature(sha1, buffer, size, type);
-			free(buffer);
-		}
-		munmap(map, mapsize);
+	if (buffer) {
+		if (!strcmp(type, expected_type))
+			ret = check_sha1_signature(sha1, buffer, size, type);
+		free(buffer);
 	}
 	return ret;
 }
Index: git/git-deltafy-script
===================================================================
--- /dev/null
+++ git/git-deltafy-script
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Script to deltafy an entire GIT repository based on the commit list.
+# The most recent version of a file is the reference and the previous version
+# is changed into a delta from that most recent version. And so on for
+# successive versions going back in time.
+#
+# The -d argument allows to provide a limit on the delta chain depth.
+# If 0 is passed then everything is undeltafied.
+
+set -e
+
+depth=
+[ "$1" == "-d" ] && depth="--max-depth=$2" && shift 2
+
+curr_file=""
+
+git-rev-list HEAD |
+git-diff-tree -r --stdin |
+sed -n '/^\*/ s/^.*->\(.\{41\}\)\(.*\)$/\2 \1/p' | sort | uniq |
+while read file sha1; do
+	if [ "$file" == "$curr_file" ]; then
+		list="$list $sha1"
+	else
+		if [ "$list" ]; then
+			echo "Processing $curr_file"
+			echo "$head $list" | xargs git-mkdelta $depth -v
+		fi
+		curr_file="$file"
+		list=""
+		head="$sha1"
+	fi
+done

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-12  3:51 [PATCH] improved delta support for git Nicolas Pitre
@ 2005-05-12  4:36 ` Junio C Hamano
  2005-05-12 14:27   ` Chris Mason
  2005-05-17 18:22 ` Thomas Glanzmann
  2005-05-17 21:43 ` Dan Holmsand
  2 siblings, 1 reply; 17+ messages in thread
From: Junio C Hamano @ 2005-05-12  4:36 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: git

The changes to sha1_file interface seems to be contained to
read_sha1_file() only; which is a very good sign.  You have
already expressed that you are aware that fsck-cache needs to be
taught about the delta objects, so I'd trust that would be what
you will be tackling next.

I started wondering how the delta chains would affect pull.c,
the engine that decides which files under GIT_OBJECT_DIRECTORY
need to be pulled from the remote side in order to construct the
set of objects needed by the given commit ID, under various
combinations of cut-off criteria given with -c, -t, and -a
options.

It appears to me that changes to the make_sure_we_have_it()
routine along the following lines (completely untested) would
suffice.  Instead of just returning success, we first fetch the
named object from the remote side, read it to see if it is
really the object we have asked, or just a delta, and if it is a
delta call itself again on the underlying object that delta
object depends upon.

Signed-off-by: Junio C Hamano <junkio@cox.net>
---
# - git-pb: Fixed a leak in read-tree
# + (working tree)
--- a/pull.c
+++ b/pull.c
@@ -32,11 +32,23 @@ static void report_missing(const char *w
 static int make_sure_we_have_it(const char *what, unsigned char *sha1)
 {
 	int status;
+	unsigned long mapsize;
+	void *map, *buf;
+
 	if (has_sha1_file(sha1))
 		return 0;
 	status = fetch(sha1);
 	if (status && what)
 		report_missing(what, sha1);
+
+	map = map_sha1_file(sha1, &mapsize);
+	if (map) {
+		buf = unpack_sha1_file(map, mapsize, type, size);
+		munmap(map, mapsize);
+		if (buf && !strcmp(type, "delta"))
+			status = make_sure_we_have_it(what, buf);
+		free(buf);
+	}
 	return status;
 }

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-12  4:36 ` Junio C Hamano
@ 2005-05-12 14:27   ` Chris Mason
       [not found]     ` <2cfc403205051207467755cdf@mail.gmail.com>
  0 siblings, 1 reply; 17+ messages in thread
From: Chris Mason @ 2005-05-12 14:27 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nicolas Pitre, git

On Thursday 12 May 2005 00:36, Junio C Hamano wrote:
> It appears to me that changes to the make_sure_we_have_it()
> routine along the following lines (completely untested) would
> suffice.  Instead of just returning success, we first fetch the
> named object from the remote side, read it to see if it is
> really the object we have asked, or just a delta, and if it is a
> delta call itself again on the underlying object that delta
> object depends upon.

If we fetch the named object and it is a delta, the delta will either depend 
on an object we already have or an object that we don't have.  If we don't 
have it, the pull should find it while pulling other commits we don't have.

-chris



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
       [not found]     ` <2cfc403205051207467755cdf@mail.gmail.com>
@ 2005-05-12 14:47       ` Jon Seymour
  2005-05-12 15:18         ` Nicolas Pitre
  0 siblings, 1 reply; 17+ messages in thread
From: Jon Seymour @ 2005-05-12 14:47 UTC (permalink / raw)
  To: Git Mailing List

On 5/13/05, Chris Mason <mason@suse.com> wrote:
> On Thursday 12 May 2005 00:36, Junio C Hamano wrote:
> > It appears to me that changes to the make_sure_we_have_it() ...
>
> If we fetch the named object and it is a delta, the delta will either depend
> on an object we already have or an object that we don't have.  If we don't
> have it, the pull should find it while pulling other commits we don't have.
>

Chris,

Doesn't that assume that the object referenced by the delta is
reachable from the commit being pulled. While that may be true in
practice, I don't think it is a logical certainty.

jon.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-12 14:47       ` Jon Seymour
@ 2005-05-12 15:18         ` Nicolas Pitre
  2005-05-12 17:16           ` Junio C Hamano
  0 siblings, 1 reply; 17+ messages in thread
From: Nicolas Pitre @ 2005-05-12 15:18 UTC (permalink / raw)
  To: jon; +Cc: Git Mailing List

On Fri, 13 May 2005, Jon Seymour wrote:

> On 5/13/05, Chris Mason <mason@suse.com> wrote:
> > On Thursday 12 May 2005 00:36, Junio C Hamano wrote:
> > > It appears to me that changes to the make_sure_we_have_it() ...
> >
> > If we fetch the named object and it is a delta, the delta will either depend
> > on an object we already have or an object that we don't have.  If we don't
> > have it, the pull should find it while pulling other commits we don't have.
> >
> 
> Chris,
> 
> Doesn't that assume that the object referenced by the delta is
> reachable from the commit being pulled. While that may be true in
> practice, I don't think it is a logical certainty.

1) If you happen to already have the referenced object in your local 
   repository then you're done.

2) If not you pull the referenced object from the remote repository, 
   repeat with #1 if it happens to be another delta object.

3) If the remote repository doesn't contain the object referenced by any 
   pulled delta object then that repository is inconsistent just like if 
   a blob object referenced by a tree object was missing.  This 
   therefore should not happen.  git-fsck-cache will flag broken delta 
   links soon.

Nicolas

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-12 15:18         ` Nicolas Pitre
@ 2005-05-12 17:16           ` Junio C Hamano
  2005-05-13 11:44             ` Chris Mason
  0 siblings, 1 reply; 17+ messages in thread
From: Junio C Hamano @ 2005-05-12 17:16 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: jon, Git Mailing List

>>>>> "NP" == Nicolas Pitre <nico@cam.org> writes:

>> On 5/13/05, Chris Mason <mason@suse.com> wrote:
>> > On Thursday 12 May 2005 00:36, Junio C Hamano wrote:
>> > > It appears to me that changes to the make_sure_we_have_it() ...
>> >
>> > If we fetch the named object and it is a delta, the delta will either depend
>> > on an object we already have or an object that we don't have.  If we don't
>> > have it, the pull should find it while pulling other commits we don't have.

NP> 1) If you happen to already have the referenced object in your local 
NP>    repository then you're done.

Yes.

NP> 2) If not you pull the referenced object from the remote repository, 
NP>    repeat with #1 if it happens to be another delta object.

Yes, that is the outline of what my (untested) patch does.

Unless I am grossly mistaken, what Chris says is true only when
we are pulling with -a flag to the git-*-pull family.  If we are
pulling "partially near the tip", we do not necessarily pull
"other commits we don't have", hence detecting delta's
requirement at per-object level and pulling the dependent
becomes necessary, which is essentially what you wrote in (2)
above.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-12 17:16           ` Junio C Hamano
@ 2005-05-13 11:44             ` Chris Mason
  0 siblings, 0 replies; 17+ messages in thread
From: Chris Mason @ 2005-05-13 11:44 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Nicolas Pitre, jon, Git Mailing List

On Thursday 12 May 2005 13:16, Junio C Hamano wrote:
> >>>>> "NP" == Nicolas Pitre <nico@cam.org> writes:
> >>
> >> On 5/13/05, Chris Mason <mason@suse.com> wrote:
> >> > On Thursday 12 May 2005 00:36, Junio C Hamano wrote:
> >> > > It appears to me that changes to the make_sure_we_have_it() ...
> >> >
> >> > If we fetch the named object and it is a delta, the delta will either
> >> > depend on an object we already have or an object that we don't have. 
> >> > If we don't have it, the pull should find it while pulling other
> >> > commits we don't have.
>
> NP> 1) If you happen to already have the referenced object in your local
> NP>    repository then you're done.
>
> Yes.
>
> NP> 2) If not you pull the referenced object from the remote repository,
> NP>    repeat with #1 if it happens to be another delta object.
>
> Yes, that is the outline of what my (untested) patch does.
>
> Unless I am grossly mistaken, what Chris says is true only when
> we are pulling with -a flag to the git-*-pull family.  If we are
> pulling "partially near the tip", we do not necessarily pull
> "other commits we don't have", hence detecting delta's
> requirement at per-object level and pulling the dependent
> becomes necessary, which is essentially what you wrote in (2)
> above.
>
Yes, my post does assume that you're pulling everything and the repo you're 
pulling from has a sane state.  This should be the common case though, so I 
would suggest optimizing things to build a list of the delta objects and 
check them at the end to see if we didn't pull any.

We want the list of delta objects regardless, this way we can warn the user 
that they have pulled in deltas and give them the chance to convert them into 
full files.

-chris

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-12  3:51 [PATCH] improved delta support for git Nicolas Pitre
  2005-05-12  4:36 ` Junio C Hamano
@ 2005-05-17 18:22 ` Thomas Glanzmann
  2005-05-17 19:02   ` Thomas Glanzmann
  2005-05-17 19:10   ` Thomas Glanzmann
  2005-05-17 21:43 ` Dan Holmsand
  2 siblings, 2 replies; 17+ messages in thread
From: Thomas Glanzmann @ 2005-05-17 18:22 UTC (permalink / raw)
  To: git

[-- Attachment #1: Type: text/plain, Size: 828 bytes --]

Hello Nicolas,
I just tried it against Linus git-HEAD. It worked like a charm so far. I
imported mutt-1.5 CVS branch using cvsps (1103 patches).

(medion) [/scratch/mutt/mutt-cvs] git-rev-tree HEAD | wc -l
1103
(medion) [/scratch/mutt/mutt-cvs] du -sh .git/objects/
63M     .git/objects/
(medion) [/scratch/mutt/mutt-cvs] git-deltafy-script -d 2000
...
(medion) [/scratch/mutt/mutt-cvs] du -sh .git/objects/
35M     .git/objects/

Maybe you should add git-deltafy-script and git-mkdelta to the installation
targets (patch attached).

And I wonder why the mutt CVS Repository is still smaller than the
zdelta compressed mutt git repository. And with mutt CVS Repository I
mean every commit since mutt-0.9x not only mutt-1.5 branch?

(faui03) [~] du -sh work/mutt/cvsrepository
34M     work/mutt/cvsrepository

Greetings,
	Thomas

[-- Attachment #2: diff --]
[-- Type: text/plain, Size: 802 bytes --]

[PATCH] Install git-mkdelta and git-deltafy-script

Signed-off-by: Thomas Glanzmann <sithglan@stud.uni-erlangen.de>

--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@
 INSTALL=install

 SCRIPTS=git-apply-patch-script git-merge-one-file-script git-prune-script \
-	git-pull-script git-tag-script git-resolve-script
+	git-pull-script git-tag-script git-resolve-script git-deltafy-script

 PROG=   git-update-cache git-diff-files git-init-db git-write-tree \
 	git-read-tree git-commit-tree git-cat-file git-fsck-cache \
@@ -28,7 +28,7 @@
 	git-unpack-file git-export git-diff-cache git-convert-cache \
 	git-http-pull git-rpush git-rpull git-rev-list git-mktag \
 	git-diff-helper git-tar-tree git-local-pull git-write-blob \
-	git-get-tar-commit-id
+	git-get-tar-commit-id git-mkdelta

 all: $(PROG)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-17 18:22 ` Thomas Glanzmann
@ 2005-05-17 19:02   ` Thomas Glanzmann
  2005-05-17 19:10   ` Thomas Glanzmann
  1 sibling, 0 replies; 17+ messages in thread
From: Thomas Glanzmann @ 2005-05-17 19:02 UTC (permalink / raw)
  To: git

Hello,
btw. 6 Megabyte are spend on commit and tree objects:

(medion) [/scratch/mutt/mutt-cvs] find .git/objects/ -type f | sed 's^.*\(..\)/^\1^' | while read FILE; do echo -n "$FILE " ;git-cat-file -t $FILE; done | egrep 'tree|commit' | awk '{print $1}' | sed 's,^\(..\),.git/objects/\1/,' | xargs ls -l | awk '{sum += $5} END {print sum}'
6179105

I know my shell sux.

	Thomas

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-17 18:22 ` Thomas Glanzmann
  2005-05-17 19:02   ` Thomas Glanzmann
@ 2005-05-17 19:10   ` Thomas Glanzmann
  1 sibling, 0 replies; 17+ messages in thread
From: Thomas Glanzmann @ 2005-05-17 19:10 UTC (permalink / raw)
  To: git

Hi,
okay I got it. Fragmentation:

before:
	(medion) [/scratch/mutt/mutt-cvs] du -sh --apparent-size .git/objects/
	49M     .git/objects/

after:
	(medion) [/scratch/mutt/mutt-cvs] du -sh --apparent-size .git/objects/
	19M     .git/objects/

cvs repository:

	(faui00u) [~/work/git/yagf] du -sh --apparent-size ../../mutt/cvsrepository
	33M     ../../mutt/cvsrepository

Sincerely,
	Thomas

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-12  3:51 [PATCH] improved delta support for git Nicolas Pitre
  2005-05-12  4:36 ` Junio C Hamano
  2005-05-17 18:22 ` Thomas Glanzmann
@ 2005-05-17 21:43 ` Dan Holmsand
  2005-05-18  4:32   ` Nicolas Pitre
  2005-05-18 15:12   ` Linus Torvalds
  2 siblings, 2 replies; 17+ messages in thread
From: Dan Holmsand @ 2005-05-17 21:43 UTC (permalink / raw)
  To: git

[-- Attachment #1: Type: text/plain, Size: 3856 bytes --]


Nicolas (and others),

I've been trying out your delta stuff as well. It was a bit
disappointing at first, but some tweaking payed off in the end...

First, I tried the entire bkcvs history for 2.6, but storing only the
"fs" directory tree in git (hoping that would be representative
enough, since the entire tree gets *big*). I got 4678 commits.

In its original form, it looks like this (first size is "network
size", the last one disk size on ext3. Average size per object in
bytes):

trees:  16M  (15684 files)  avg: 1119, disk:  61M
blobs: 121M  (17200 files)  avg: 7414, disk: 157M
Total: 139M  (37562 files)  avg: 3883, disk: 237M

Using your code, with unlimited delta depth:

trees:  16M  (15684 files)  avg: 1119, disk:  61M
blobs:   9M   (2333 files)  avg: 4491, disk:  15M
deltas: 30M  (14867 files)  avg: 2147, disk:  71M
Total:  83M  (37562 files)  avg: 2334, disk: 188M

Same thing, with a maximum delta depth of 2:

trees:  16M  (15684 files)  avg: 1119, disk:  61M
blobs:  45M   (6940 files)  avg: 6906, disk:  60M
deltas: 20M  (10260 files)  avg: 2086, disk:  48M
Total:  83M  (37562 files)  avg: 2334, disk: 188M

So, total size from a network perspective went from 139M to 83M, which
seemed a little disappointing to me.

I think there are too reasons, as shown by these statistics:

1) Too many deltas get too big and/or compress badly.

2) Trees take up a big chunk of total space.


Therefore, I tried some other approaches. This one seemed to work
best:

1) I limit the maximum size of any delta to 10% of the size of the new
version. That guarantees a big saving, as long as any delta is
produced.

2) If the "previous" version of a blob is a delta, I produce the new
delta form the old deltas base version. This works surprisingly well.
I'm guessing the reason for this is that most changes are really
small, and they tend to be in the same area as a previous change (as
in "Commit new feature. Commit bugfix for new feature. Commit fix for
bugfix of new feature. Delete new feature as it doesn't work...").

3) I use the same method for all tree objects.

This method of "opportunistic delta compression" has some other
advantages: No risk of long delta chains (as the maximum delta depth
is one). It should be disk cache friendly, as many deltas are produced
against the same base version. And this method could easily be used
incrementally, or "on the fly", as forward deltas are used.

Using these tweaks helped a lot, size wise:

trees:   3M   (9746 files)  avg:  380, disk:  38M
blobs:  13M   (3301 files)  avg: 4208, disk:  20M
deltas: 11M  (19837 files)  avg:  586, disk:  78M
Total:  28M  (37562 files)  avg:  799, disk: 155M

As this method turned 139M worth of git repository into 28M, I decided
to try the same method on the entire bkcvs history (28203 commits).

Plain vanilla git looks like this:

trees:  246M  (156812 files)  avg: 1647, disk:  699M
blobs: 1171M  (185458 files)  avg: 6623, disk: 1573M
Total: 1422M  (370473 files)  avg: 4025, disk: 2382M

The delta compressed approach outlined above yields:

trees:   47M   (73519 files)  avg:  672, disk:  289M
blobs:  156M   (49857 files)  avg: 3285, disk:  281M
deltas: 107M  (218894 files)  avg:  515, disk:  863M
Total:  315M  (370473 files)  avg:  892, disk: 1544M

So, 1.4G became 315M. Not too bad, IMHO. Disk size is still big,
of course, but disks are apparently cheap these days.

It could probably be even better, if git didn't produce quite as many
tree objects. Some sort of chunking together of tree objects would
help delta compression a lot (and improve disk size quite a bit in the
process).

Attached is a patch (against current cogito). It is basically the same
as yours, Nicolas, except for some hackery to make the above possible.
I'm sure I've made lots of stupid mistakes in it (and the 10% limit is
hardcoded right now; I'm lazy).

/dan

[-- Attachment #2: delta.patch --]
[-- Type: text/x-patch, Size: 26038 bytes --]

Index: Makefile
===================================================================
--- 4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2/Makefile  (mode:100644)
+++ uncommitted/Makefile  (mode:100644)
@@ -35,7 +35,7 @@
 INSTALL?=install
 
 SCRIPTS=git-apply-patch-script git-merge-one-file-script git-prune-script \
-	git-pull-script git-tag-script git-resolve-script
+	git-pull-script git-tag-script git-resolve-script git-deltafy-script
 
 PROG=   git-update-cache git-diff-files git-init-db git-write-tree \
 	git-read-tree git-commit-tree git-cat-file git-fsck-cache \
@@ -44,7 +44,7 @@
 	git-unpack-file git-export git-diff-cache git-convert-cache \
 	git-http-pull git-rpush git-rpull git-rev-list git-mktag \
 	git-diff-helper git-tar-tree git-local-pull git-write-blob \
-	git-get-tar-commit-id
+	git-get-tar-commit-id git-mkdelta
 
 SCRIPT=	commit-id tree-id parent-id cg-add cg-admin-lsobj cg-admin-uncommit \
 	cg-branch-add cg-branch-ls cg-cancel cg-clone cg-commit cg-diff \
@@ -60,7 +60,7 @@
 COMMON=	read-cache.o
 
 LIB_OBJS=read-cache.o sha1_file.o usage.o object.o commit.o tree.o blob.o \
-	 tag.o date.o
+	 tag.o date.o diff-delta.o patch-delta.o
 LIB_FILE=libgit.a
 LIB_H=cache.h object.h blob.h tree.h commit.h tag.h
 
@@ -94,6 +94,9 @@
 all: $(PROG) $(GEN_SCRIPT)
 
 
+test-delta: test-delta.c diff-delta.o patch-delta.o
+	$(CC) $(CFLAGS) -o $@ $^
+
 git-%: %.c $(LIB_FILE)
 	$(CC) $(CFLAGS) -o $@ $(filter %.c,$^) $(LIBS)
 
Index: delta.h
===================================================================
--- /dev/null  (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2)
+++ uncommitted/delta.h  (mode:100644)
@@ -0,0 +1,6 @@
+extern void *diff_delta(void *from_buf, unsigned long from_size,
+			void *to_buf, unsigned long to_size,
+		        unsigned long *delta_size);
+extern void *patch_delta(void *src_buf, unsigned long src_size,
+			 void *delta_buf, unsigned long delta_size,
+			 unsigned long *dst_size);
Index: diff-delta.c
===================================================================
--- /dev/null  (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2)
+++ uncommitted/diff-delta.c  (mode:100644)
@@ -0,0 +1,330 @@
+/*
+ * diff-delta.c: generate a delta between two buffers
+ *
+ *  Many parts of this file have been lifted from LibXDiff version 0.10.
+ *  http://www.xmailserver.org/xdiff-lib.html
+ *
+ *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2003	Davide Libenzi
+ *
+ *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
+ *
+ *  This file is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ */
+
+#include <stdlib.h>
+#include "delta.h"
+
+
+/* block size: min = 16, max = 64k, power of 2 */
+#define BLK_SIZE 16
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define GR_PRIME 0x9e370001
+#define HASH(v, b) (((unsigned int)(v) * GR_PRIME) >> (32 - (b)))
+	
+/* largest prime smaller than 65536 */
+#define BASE 65521
+
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+#define NMAX 5552
+
+#define DO1(buf, i)  { s1 += buf[i]; s2 += s1; }
+#define DO2(buf, i)  DO1(buf, i); DO1(buf, i + 1);
+#define DO4(buf, i)  DO2(buf, i); DO2(buf, i + 2);
+#define DO8(buf, i)  DO4(buf, i); DO4(buf, i + 4);
+#define DO16(buf)    DO8(buf, 0); DO8(buf, 8);
+
+static unsigned int adler32(unsigned int adler, const unsigned char *buf, int len)
+{
+	int k;
+	unsigned int s1 = adler & 0xffff;
+	unsigned int s2 = adler >> 16;
+
+	while (len > 0) {
+		k = MIN(len, NMAX);
+		len -= k;
+		while (k >= 16) {
+			DO16(buf);
+			buf += 16;
+			k -= 16;
+		}
+		if (k != 0)
+			do {
+				s1 += *buf++;
+				s2 += s1;
+			} while (--k);
+		s1 %= BASE;
+		s2 %= BASE;
+	}
+
+	return (s2 << 16) | s1;
+}
+
+static unsigned int hashbits(unsigned int size)
+{
+	unsigned int val = 1, bits = 0;
+	while (val < size && bits < 32) {
+		val <<= 1;
+	       	bits++;
+	}
+	return bits ? bits: 1;
+}
+
+typedef struct s_chanode {
+	struct s_chanode *next;
+	int icurr;
+} chanode_t;
+
+typedef struct s_chastore {
+	chanode_t *head, *tail;
+	int isize, nsize;
+	chanode_t *ancur;
+	chanode_t *sncur;
+	int scurr;
+} chastore_t;
+
+static void cha_init(chastore_t *cha, int isize, int icount)
+{
+	cha->head = cha->tail = NULL;
+	cha->isize = isize;
+	cha->nsize = icount * isize;
+	cha->ancur = cha->sncur = NULL;
+	cha->scurr = 0;
+}
+
+static void *cha_alloc(chastore_t *cha)
+{
+	chanode_t *ancur;
+	void *data;
+
+	ancur = cha->ancur;
+	if (!ancur || ancur->icurr == cha->nsize) {
+		ancur = malloc(sizeof(chanode_t) + cha->nsize);
+		if (!ancur)
+			return NULL;
+		ancur->icurr = 0;
+		ancur->next = NULL;
+		if (cha->tail)
+			cha->tail->next = ancur;
+		if (!cha->head)
+			cha->head = ancur;
+		cha->tail = ancur;
+		cha->ancur = ancur;
+	}
+
+	data = (void *)ancur + sizeof(chanode_t) + ancur->icurr;
+	ancur->icurr += cha->isize;
+	return data;
+}
+
+static void cha_free(chastore_t *cha)
+{
+	chanode_t *cur = cha->head;
+	while (cur) {
+		chanode_t *tmp = cur;
+		cur = cur->next;
+		free(tmp);
+	}
+}
+
+typedef struct s_bdrecord {
+	struct s_bdrecord *next;
+	unsigned int fp;
+	const unsigned char *ptr;
+} bdrecord_t;
+
+typedef struct s_bdfile {
+	const unsigned char *data, *top;
+	chastore_t cha;
+	unsigned int fphbits;
+	bdrecord_t **fphash;
+} bdfile_t;
+
+static int delta_prepare(const unsigned char *buf, int bufsize, bdfile_t *bdf)
+{
+	unsigned int fphbits;
+	int i, hsize;
+	const unsigned char *base, *data, *top;
+	bdrecord_t *brec;
+	bdrecord_t **fphash;
+
+	fphbits = hashbits(bufsize / BLK_SIZE + 1);
+	hsize = 1 << fphbits;
+	fphash = malloc(hsize * sizeof(bdrecord_t *));
+	if (!fphash)
+		return -1;
+	for (i = 0; i < hsize; i++)
+		fphash[i] = NULL;
+	cha_init(&bdf->cha, sizeof(bdrecord_t), hsize / 4 + 1);
+
+	bdf->data = data = base = buf;
+	bdf->top = top = buf + bufsize;
+	data += (bufsize / BLK_SIZE) * BLK_SIZE;
+	if (data == top)
+		data -= BLK_SIZE;
+
+	for ( ; data >= base; data -= BLK_SIZE) {
+		brec = cha_alloc(&bdf->cha);
+		if (!brec) {
+			cha_free(&bdf->cha);
+			free(fphash);
+			return -1;
+		}
+		brec->fp = adler32(0, data, MIN(BLK_SIZE, top - data));
+		brec->ptr = data;
+		i = HASH(brec->fp, fphbits);
+		brec->next = fphash[i];
+		fphash[i] = brec;
+	}
+
+	bdf->fphbits = fphbits;
+	bdf->fphash = fphash;
+
+	return 0;
+}
+
+static void delta_cleanup(bdfile_t *bdf)
+{
+	free(bdf->fphash);
+	cha_free(&bdf->cha);
+}
+
+#define COPYOP_SIZE(o, s) \
+    (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
+     !!(s & 0xff) + !!(s & 0xff00) + 1)
+
+void *diff_delta(void *from_buf, unsigned long from_size,
+		 void *to_buf, unsigned long to_size,
+		 unsigned long *delta_size)
+{
+	int i, outpos, outsize, inscnt, csize, msize, moff;
+	unsigned int fp;
+	const unsigned char *data, *top, *ptr1, *ptr2;
+	unsigned char *out, *orig;
+	bdrecord_t *brec;
+	bdfile_t bdf;
+
+	if (!from_size || !to_size || delta_prepare(from_buf, from_size, &bdf))
+		return NULL;
+	
+	outpos = 0;
+	outsize = 8192;
+	out = malloc(outsize);
+	if (!out) {
+		delta_cleanup(&bdf);
+		return NULL;
+	}
+
+	data = to_buf;
+	top = to_buf + to_size;
+
+	/* store reference buffer size */
+	orig = out + outpos++;
+	*orig = i = 0;
+	do {
+		if (from_size & 0xff) {
+			*orig |= (1 << i);
+			out[outpos++] = from_size;
+		}
+		i++;
+		from_size >>= 8;
+	} while (from_size);
+
+	/* store target buffer size */
+	orig = out + outpos++;
+	*orig = i = 0;
+	do {
+		if (to_size & 0xff) {
+			*orig |= (1 << i);
+			out[outpos++] = to_size;
+		}
+		i++;
+		to_size >>= 8;
+	} while (to_size);
+
+	inscnt = 0;
+	moff = 0;
+	while (data < top) {
+		msize = 0;
+		fp = adler32(0, data, MIN(top - data, BLK_SIZE));
+		i = HASH(fp, bdf.fphbits);
+		for (brec = bdf.fphash[i]; brec; brec = brec->next) {
+			if (brec->fp == fp) {
+				csize = bdf.top - brec->ptr;
+				if (csize > top - data)
+					csize = top - data;
+				for (ptr1 = brec->ptr, ptr2 = data; 
+				     csize && *ptr1 == *ptr2;
+				     csize--, ptr1++, ptr2++);
+
+				csize = ptr1 - brec->ptr;
+				if (csize > msize) {
+					moff = brec->ptr - bdf.data;
+					msize = csize;
+					if (msize >= 0x10000) {
+						msize = 0x10000;
+						break;
+					}
+				}
+			}
+		}
+
+		if (!msize || msize < COPYOP_SIZE(moff, msize)) {
+			if (!inscnt)
+				outpos++;
+			out[outpos++] = *data++;
+			inscnt++;
+			if (inscnt == 0x7f) {
+				out[outpos - inscnt - 1] = inscnt;
+				inscnt = 0;
+			}
+		} else {
+			if (inscnt) {
+				out[outpos - inscnt - 1] = inscnt;
+				inscnt = 0;
+			}
+
+			data += msize;
+			orig = out + outpos++;
+			i = 0x80;
+
+			if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
+			moff >>= 8;
+			if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
+			moff >>= 8;
+			if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
+			moff >>= 8;
+			if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
+
+			if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
+			msize >>= 8;
+			if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
+
+			*orig = i;
+		}
+
+		/* next time around the largest possible output is 1 + 4 + 3 */
+		if (outpos > outsize - 8) {
+			void *tmp = out;
+			outsize = outsize * 3 / 2;
+			out = realloc(out, outsize);
+			if (!out) {
+				free(tmp);
+				delta_cleanup(&bdf);
+				return NULL;
+			}
+		}
+	}
+
+	if (inscnt)
+		out[outpos - inscnt - 1] = inscnt;
+
+	delta_cleanup(&bdf);
+	*delta_size = outpos;
+	return out;
+}
Index: git-deltafy-script
===================================================================
--- /dev/null  (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2)
+++ uncommitted/git-deltafy-script  (mode:100755)
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Script to deltafy an entire GIT repository based on the commit list.
+
+# git-deltafy-script --pack deltafies a repository, from current HEAD
+# down.
+# git-deltafy-script --unpack undeltafies all objects.
+
+export LANG=C
+
+depth=
+[ "$1" == "-d" ] && depth="--max-depth=$2" && shift 2
+
+prevcommit=
+prevtree=
+
+treeid() {
+	git-cat-file commit "$1" | sed -e 's/tree //;q'
+}
+
+mkdelta() {
+	git-mkdelta --max-depth=1 -o -v "$@" || exit 1
+}
+
+if [ "$1" = --pack ] ; then
+git-rev-list HEAD | tac |
+while read commit; do
+	if [ "$prevcommit" ]; then
+		git-diff-tree -r -z $prevcommit $commit |
+		while IFS=$'\t' read -d $'\0' a1 a2 sha file; do
+			to=${sha#*->}
+			from=${sha%->*}
+			[ "$from" != "$to" ] && mkdelta $from $to 
+		done
+
+		prevdir=
+		prevsha=
+		tree=$(treeid "$commit") 
+		[ "$prevtree" ] || prevtree=$(treeid $prevcommit)
+		[ $prevtree != $tree ] && mkdelta $prevtree $tree 
+
+		( git-ls-tree -r $prevtree; git-ls-tree -r $tree ) |
+		grep $'^[0-9]*\ttree' | sort -k4 -s | uniq -u -s4 |
+		while IFS=$'\t' read a1 a2 sha dir; do
+			if [ "$prevdir" = "$dir" -a "$prevsha" != "$sha" ]; then
+				echo "deltafying tree $dir"
+				mkdelta $prevsha $sha 
+			fi
+			prevdir=$dir
+			prevsha=$sha
+		done
+
+	fi
+	prevcommit=$commit
+	prevtree=$tree
+done
+elif [ "$1" = --unpack ]; then
+	( cd .git/objects && find -type f ) | sed 's,[./],,g' | 
+	xargs git-mkdelta --max-depth=0 -v
+else
+	echo "usage: $(basename "$0") [--pack|--unpack]" >&2
+	exit 1
+fi
+exit 0
Index: mkdelta.c
===================================================================
--- /dev/null  (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2)
+++ uncommitted/mkdelta.c  (mode:100644)
@@ -0,0 +1,306 @@
+/*
+ * Deltafication of a GIT database.
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "cache.h"
+#include "delta.h"
+
+static int replace_object(char *buf, unsigned long len, unsigned char *sha1,
+			  char *hdr, int hdrlen)
+{
+	char tmpfile[PATH_MAX];
+	int size;
+	char *compressed;
+	z_stream stream;
+	int fd;
+
+	snprintf(tmpfile, sizeof(tmpfile), "%s/obj_XXXXXX", get_object_directory());
+	fd = mkstemp(tmpfile);
+	if (fd < 0)
+		return error("%s: %s\n", tmpfile, strerror(errno));
+	
+	/* Set it up */
+	memset(&stream, 0, sizeof(stream));
+	deflateInit(&stream, Z_BEST_COMPRESSION);
+	size = deflateBound(&stream, len+hdrlen);
+	compressed = xmalloc(size);
+
+	/* Compress it */
+	stream.next_out = compressed;
+	stream.avail_out = size;
+
+	/* First header.. */
+	stream.next_in = hdr;
+	stream.avail_in = hdrlen;
+	while (deflate(&stream, 0) == Z_OK)
+		/* nothing */;
+
+	/* Then the data itself.. */
+	stream.next_in = buf;
+	stream.avail_in = len;
+	while (deflate(&stream, Z_FINISH) == Z_OK)
+		/* nothing */;
+	deflateEnd(&stream);
+	size = stream.total_out;
+
+	if (write(fd, compressed, size) != size) {
+		perror("unable to write file");
+		close(fd);
+		unlink(tmpfile);
+		return -1;
+	}
+	fchmod(fd, 0444);
+	close(fd);
+
+	if (rename(tmpfile, sha1_file_name(sha1))) {
+		perror("unable to replace original object");
+		unlink(tmpfile);
+		return -1;
+	}
+	return 0;
+}
+
+static int write_delta_file(char *buf, unsigned long len,
+			    unsigned char *sha1_ref, unsigned char *sha1_trg)
+{
+	char hdr[50];
+	int hdrlen;
+
+	/* Generate the header + sha1 of reference for delta */
+	hdrlen = sprintf(hdr, "delta %lu", len+20)+1;
+	memcpy(hdr + hdrlen, sha1_ref, 20);
+	hdrlen += 20;
+
+	return replace_object(buf, len, sha1_trg, hdr, hdrlen);
+}
+
+static int replace_sha1_file(char *buf, unsigned long len,
+			     char *type, unsigned char *sha1)
+{
+	char hdr[50];
+	int hdrlen;
+
+	hdrlen = sprintf(hdr, "%s %lu", type, len)+1;
+	return replace_object(buf, len, sha1, hdr, hdrlen);
+}
+
+static void *get_buffer(unsigned char *sha1, char *type, unsigned long *size)
+{
+	unsigned long mapsize;
+	void *map = map_sha1_file(sha1, &mapsize);
+	if (map) {
+		void *buffer = unpack_sha1_file(map, mapsize, type, size);
+		munmap(map, mapsize);
+		if (buffer)
+			return buffer;
+	}
+	error("unable to get object %s", sha1_to_hex(sha1));
+	return NULL;
+}
+
+static void *expand_delta(void *delta, unsigned long delta_size, char *type,
+			  unsigned long *size, unsigned int *depth, char *head)
+{
+	void *buf = NULL;
+	*depth++;
+	if (delta_size < 20) {
+		error("delta object is bad");
+		free(delta);
+	} else {
+		unsigned long ref_size;
+		void *ref = get_buffer(delta, type, &ref_size);
+		if (ref && !strcmp(type, "delta"))
+			ref = expand_delta(ref, ref_size, type, &ref_size,
+					   depth, head);
+		else
+			memcpy(head, delta, 20);
+		if (ref)
+			buf = patch_delta(ref, ref_size, delta+20,
+					  delta_size-20, size);
+		free(ref);
+		free(delta);
+	}
+	return buf;
+}
+
+static char *mkdelta_usage =
+"mkdelta [ --max-depth=N ] [ -o ] <reference_sha1> <target_sha1> [ <next_sha1> ... ]";
+
+int main(int argc, char **argv)
+{
+	unsigned char sha1_ref[20], sha1_trg[20], head_ref[20], head_trg[20];
+	char type_ref[20], type_trg[20];
+	void *buf_ref, *buf_trg, *buf_delta;
+	unsigned long size_ref, size_trg, size_orig, size_delta;
+	unsigned int depth_ref, depth_trg, depth_max = -1;
+	int i, verbose = 0, oneparent = 0;
+
+	for (i = 1; i < argc; i++) {
+		if (!strcmp(argv[i], "-v")) {
+			verbose = 1;
+		} else if (!strcmp(argv[i], "-o")) {
+			oneparent = 1;
+		} else if (!strcmp(argv[i], "-d") && i+1 < argc) {
+			depth_max = atoi(argv[++i]);
+		} else if (!strncmp(argv[i], "--max-depth=", 12)) {
+			depth_max = atoi(argv[i]+12);
+		} else
+			break;
+	}
+
+	if (i + (depth_max != 0) >= argc)
+		usage(mkdelta_usage);
+
+	if (get_sha1(argv[i], sha1_ref))
+		die("bad sha1 %s", argv[i]);
+	depth_ref = 0;
+	buf_ref = get_buffer(sha1_ref, type_ref, &size_ref);
+	if (oneparent && depth_max > 0) {
+		while (buf_ref && !strcmp(type_ref, "delta")) {
+			if (size_ref < 20)
+				die("bad delta object");
+			//printf ("getting parent for %s %i\n", 
+					//sha1_to_hex(sha1_ref), depth_max);
+			memcpy(sha1_ref, buf_ref, 20);
+			free(buf_ref);
+			//printf("loading parent %s\n",
+					//sha1_to_hex(sha1_ref));
+			buf_ref = get_buffer(sha1_ref, 
+					type_ref, &size_ref);
+			if (!buf_ref) die("broken get_buffer!");
+		}
+	}
+	if (buf_ref && !strcmp(type_ref, "delta"))
+		buf_ref = expand_delta(buf_ref, size_ref, type_ref,
+				       &size_ref, &depth_ref, head_ref);
+	else
+		memcpy(head_ref, sha1_ref, 20);
+	if (!buf_ref)
+		die("unable to obtain initial object %s", argv[i]);
+
+	if (depth_ref > depth_max) {
+		if (replace_sha1_file(buf_ref, size_ref, type_ref, sha1_ref))
+			die("unable to restore %s", argv[i]);
+		if (verbose)
+			printf("undelta %s (depth was %d)\n", argv[i], depth_ref);
+		depth_ref = 0;
+	}
+
+	while (++i < argc) {
+		if (get_sha1(argv[i], sha1_trg))
+			die("bad sha1 %s", argv[i]);
+		depth_trg = 0;
+		buf_trg = get_buffer(sha1_trg, type_trg, &size_trg);
+		if (buf_trg && !size_trg) {
+			if (verbose)
+				printf("skip    %s (object is empty)\n", argv[i]);
+			continue;
+		}
+		size_orig = size_trg;
+		if (buf_trg && !strcmp(type_trg, "delta")) {
+			if (!memcmp(buf_trg, sha1_ref, 20)) {
+				/* delta already in place */
+				depth_ref++;
+				memcpy(sha1_ref, sha1_trg, 20);
+				buf_ref = patch_delta(buf_ref, size_ref,
+						      buf_trg+20, size_trg-20,
+						      &size_ref);
+				if (!buf_ref)
+					die("unable to apply delta %s", argv[i]);
+				if (depth_ref > depth_max) {
+					if (replace_sha1_file(buf_ref, size_ref,
+							      type_ref, sha1_ref))
+						die("unable to restore %s", argv[i]);
+					if (verbose)
+						printf("undelta %s (depth was %d)\n", argv[i], depth_ref);
+					depth_ref = 0;
+					continue;
+				}
+				if (verbose)
+					printf("skip    %s (delta already in place)\n", argv[i]);
+				continue;
+			}
+			buf_trg = expand_delta(buf_trg, size_trg, type_trg,
+					       &size_trg, &depth_trg, head_trg);
+		} else
+			memcpy(head_trg, sha1_trg, 20);
+		if (!buf_trg)
+			die("unable to read target object %s", argv[i]);
+
+		if (depth_trg > depth_max || depth_max == 0) {
+			if (replace_sha1_file(buf_trg, size_trg, type_trg, sha1_trg))
+				die("unable to restore %s", argv[i]);
+			if (verbose)
+				printf("undelta %s (depth was %d)\n", argv[i], depth_trg);
+			depth_trg = 0;
+			size_orig = size_trg;
+		}
+
+		if (depth_max == 0)
+			goto skip;
+
+		if (strcmp(type_ref, type_trg))
+			die("type mismatch for object %s", argv[i]);
+
+		if (!size_ref) {
+			if (verbose)
+				printf("skip    %s (initial object is empty)\n", argv[i]);
+			goto skip;
+		}
+		
+		depth_ref++;
+		if (depth_ref > depth_max) {
+			if (verbose)
+				printf("skip    %s (exceeding max link depth)\n", argv[i]);
+			goto skip;
+		}
+
+		if (!memcmp(head_ref, sha1_trg, 20)) {
+			if (verbose)
+				printf("skip    %s (would create a loop)\n", argv[i]);
+			goto skip;
+		}
+
+		buf_delta = diff_delta(buf_ref, size_ref, buf_trg, size_trg, &size_delta);
+		if (!buf_delta)
+			die("out of memory");
+
+		//if (size_delta+20 < size_orig) {
+		if ((size_delta+20)*100/size_trg < 10) {
+			if (write_delta_file(buf_delta, size_delta,
+					     sha1_ref, sha1_trg))
+				die("unable to write delta for %s", argv[i]);
+			free(buf_delta);
+			if (verbose)
+				printf("delta   %s (size=%ld.%02ld%%, depth=%d)\n",
+				       argv[i], (size_delta+20)*100 / size_trg,
+				       ((size_delta+20)*10000 / size_trg)%100,
+				       depth_ref);
+		} else {
+			free(buf_delta);
+			if (verbose) {
+				printf("skip    %s (original is smaller)", argv[i]);
+				printf(" (size=%ld.%02ld%%, depth=%d)\n",
+				       (size_delta+20)*100 / size_trg,
+				       ((size_delta+20)*10000 / size_trg)%100,
+				       depth_ref);
+			}
+			skip:
+			depth_ref = depth_trg;
+			memcpy(head_ref, head_trg, 20);
+		}
+
+		free(buf_ref);
+		buf_ref = buf_trg;
+		size_ref = size_trg;
+		memcpy(sha1_ref, sha1_trg, 20);
+	}
+
+	return 0;
+}
Index: mktag.c
===================================================================
--- 4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2/mktag.c  (mode:100644)
+++ uncommitted/mktag.c  (mode:100644)
@@ -25,20 +25,14 @@
 static int verify_object(unsigned char *sha1, const char *expected_type)
 {
 	int ret = -1;
-	unsigned long mapsize;
-	void *map = map_sha1_file(sha1, &mapsize);
+	char type[100];
+	unsigned long size;
+	void *buffer = read_sha1_file(sha1, type, &size);
 
-	if (map) {
-		char type[100];
-		unsigned long size;
-		void *buffer = unpack_sha1_file(map, mapsize, type, &size);
-
-		if (buffer) {
-			if (!strcmp(type, expected_type))
-				ret = check_sha1_signature(sha1, buffer, size, type);
-			free(buffer);
-		}
-		munmap(map, mapsize);
+	if (buffer) {
+		if (!strcmp(type, expected_type))
+			ret = check_sha1_signature(sha1, buffer, size, type);
+		free(buffer);
 	}
 	return ret;
 }
Index: patch-delta.c
===================================================================
--- /dev/null  (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2)
+++ uncommitted/patch-delta.c  (mode:100644)
@@ -0,0 +1,88 @@
+/*
+ * patch-delta.c:
+ * recreate a buffer from a source and the delta produced by diff-delta.c
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "delta.h"
+
+void *patch_delta(void *src_buf, unsigned long src_size,
+		  void *delta_buf, unsigned long delta_size,
+		  unsigned long *dst_size)
+{
+	const unsigned char *data, *top;
+	unsigned char *dst_buf, *out, cmd;
+	unsigned long size;
+	int i;
+
+	/* the smallest delta size possible is 6 bytes */
+	if (delta_size < 6)
+		return NULL;
+
+	data = delta_buf;
+	top = delta_buf + delta_size;
+
+	/* make sure the orig file size matches what we expect */
+	size = i = 0;
+	cmd = *data++;
+	while (cmd) {
+		if (cmd & 1)
+			size |= *data++ << i;
+		i += 8;
+		cmd >>= 1;
+	}
+	if (size != src_size)
+		return NULL;
+
+	/* now the result size */
+	size = i = 0;
+	cmd = *data++;
+	while (cmd) {
+		if (cmd & 1)
+			size |= *data++ << i;
+		i += 8;
+		cmd >>= 1;
+	}
+	dst_buf = malloc(size);
+	if (!dst_buf)
+		return NULL;
+
+	out = dst_buf;
+	while (data < top) {
+		cmd = *data++;
+		if (cmd & 0x80) {
+			unsigned long cp_off = 0, cp_size = 0;
+			const unsigned char *buf;
+			if (cmd & 0x01) cp_off = *data++;
+			if (cmd & 0x02) cp_off |= (*data++ << 8);
+			if (cmd & 0x04) cp_off |= (*data++ << 16);
+			if (cmd & 0x08) cp_off |= (*data++ << 24);
+			if (cmd & 0x10) cp_size = *data++;
+			if (cmd & 0x20) cp_size |= (*data++ << 8);
+			if (cp_size == 0) cp_size = 0x10000;
+			buf = (cmd & 0x40) ? dst_buf : src_buf;
+			memcpy(out, buf + cp_off, cp_size);
+			out += cp_size;
+		} else {
+			memcpy(out, data, cmd);
+			out += cmd;
+			data += cmd;
+		}
+	}
+
+	/* sanity check */
+	if (data != top || out - dst_buf != size) {
+		free(dst_buf);
+		return NULL;
+	}
+
+	*dst_size = size;
+	return dst_buf;
+}
Index: sha1_file.c
===================================================================
--- 4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2/sha1_file.c  (mode:100644)
+++ uncommitted/sha1_file.c  (mode:100644)
@@ -9,6 +9,7 @@
 #include <stdarg.h>
 #include <limits.h>
 #include "cache.h"
+#include "delta.h"
 
 #ifndef O_NOATIME
 #if defined(__linux__) && (defined(__i386__) || defined(__PPC__))
@@ -353,6 +354,19 @@
 	if (map) {
 		buf = unpack_sha1_file(map, mapsize, type, size);
 		munmap(map, mapsize);
+		if (buf && !strcmp(type, "delta")) {
+			void *ref = NULL, *delta = buf;
+			unsigned long ref_size, delta_size = *size;
+			buf = NULL;
+			if (delta_size > 20)
+				ref = read_sha1_file(delta, type, &ref_size);
+			if (ref)
+				buf = patch_delta(ref, ref_size,
+						  delta+20, delta_size-20, 
+						  size);
+			free(delta);
+			free(ref);
+		}
 		return buf;
 	}
 	return NULL;
Index: test-delta.c
===================================================================
--- /dev/null  (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2)
+++ uncommitted/test-delta.c  (mode:100644)
@@ -0,0 +1,79 @@
+/*
+ * test-delta.c: test code to exercise diff-delta.c and patch-delta.c
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include "delta.h"
+
+static const char *usage =
+	"test-delta (-d|-p) <from_file> <data_file> <out_file>";
+
+int main(int argc, char *argv[])
+{
+	int fd;
+	struct stat st;
+	void *from_buf, *data_buf, *out_buf;
+	unsigned long from_size, data_size, out_size;
+
+	if (argc != 5 || (strcmp(argv[1], "-d") && strcmp(argv[1], "-p"))) {
+		fprintf(stderr, "Usage: %s\n", usage);
+		return 1;
+	}
+
+	fd = open(argv[2], O_RDONLY);
+	if (fd < 0 || fstat(fd, &st)) {
+		perror(argv[2]);
+		return 1;
+	}
+	from_size = st.st_size;
+	from_buf = mmap(NULL, from_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (from_buf == MAP_FAILED) {
+		perror(argv[2]);
+		return 1;
+	}
+	close(fd);
+
+	fd = open(argv[3], O_RDONLY);
+	if (fd < 0 || fstat(fd, &st)) {
+		perror(argv[3]);
+		return 1;
+	}
+	data_size = st.st_size;
+	data_buf = mmap(NULL, data_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (data_buf == MAP_FAILED) {
+		perror(argv[3]);
+		return 1;
+	}
+	close(fd);
+
+	if (argv[1][1] == 'd')
+		out_buf = diff_delta(from_buf, from_size,
+				     data_buf, data_size, &out_size);
+	else
+		out_buf = patch_delta(from_buf, from_size,
+				      data_buf, data_size, &out_size);
+	if (!out_buf) {
+		fprintf(stderr, "delta operation failed (returned NULL)\n");
+		return 1;
+	}
+
+	fd = open (argv[4], O_WRONLY|O_CREAT|O_TRUNC, 0666);
+	if (fd < 0 || write(fd, out_buf, out_size) != out_size) {
+		perror(argv[4]);
+		return 1;
+	}
+
+	return 0;
+}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-17 21:43 ` Dan Holmsand
@ 2005-05-18  4:32   ` Nicolas Pitre
  2005-05-18  8:54     ` Dan Holmsand
  2005-05-18 15:12   ` Linus Torvalds
  1 sibling, 1 reply; 17+ messages in thread
From: Nicolas Pitre @ 2005-05-18  4:32 UTC (permalink / raw)
  To: Dan Holmsand; +Cc: git

On Tue, 17 May 2005, Dan Holmsand wrote:

> I've been trying out your delta stuff as well. It was a bit
> disappointing at first, but some tweaking payed off in the end...

Cool!

My goal is to provide the mechanism that can be used by a higher level 
implementing the deltafication policy.  I only provided one script as an 
example, but it turns out that you found a way to achieve better space 
saving.  And I bet you that there is probably ways to do even better 
with more exhaustive delta targets.  For example you could try all 
possible combinations on an object list for each file (and let it run 
overnight).

> 1) Too many deltas get too big and/or compress badly.

One thing I've been wondering about is whether gzipping small deltas is 
actually a gain.  For very small files it seems that gzip is adding more 
overhead making the compressed file actually larger.  Might be worth 
storing some deltas uncompressed if the compressed version turns out to 
be larger.

> 2) Trees take up a big chunk of total space.

Tree objects can be deltafied as well, but I didn't had time to script 
it.  A large space saving can be expected there as well, especially for 
changesets that modify only a few files deep down the tree hierarchy.

> Therefore, I tried some other approaches. This one seemed to work
> best:
> 
> 1) I limit the maximum size of any delta to 10% of the size of the new
> version. That guarantees a big saving, as long as any delta is
> produced.

Well, any delta object smaller than its original object saves space, 
even if it's 75% of the original size. But...

> 2) If the "previous" version of a blob is a delta, I produce the new
> delta form the old deltas base version. This works surprisingly well.
> I'm guessing the reason for this is that most changes are really
> small, and they tend to be in the same area as a previous change (as
> in "Commit new feature. Commit bugfix for new feature. Commit fix for
> bugfix of new feature. Delete new feature as it doesn't work...").

... but then the ultimate solution is to try out all possible references 
within a given list.  My git-deltafy-script already finds out the list 
of objects belonging to the same file.  Maybe git-mkdelta should try 
all combinations between them.  This way a deeper delta chain could be 
allowed for maximum space saving.

> 3) I use the same method for all tree objects.

Yup.

> Attached is a patch (against current cogito). It is basically the same
> as yours, Nicolas, except for some hackery to make the above possible.
> I'm sure I've made lots of stupid mistakes in it (and the 10% limit is
> hardcoded right now; I'm lazy).

I will look at it and merge the good stuff.

Thanks for testing!

Nicolas

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-18  4:32   ` Nicolas Pitre
@ 2005-05-18  8:54     ` Dan Holmsand
  2005-05-18 18:41       ` Nicolas Pitre
  0 siblings, 1 reply; 17+ messages in thread
From: Dan Holmsand @ 2005-05-18  8:54 UTC (permalink / raw)
  To: git

Nicolas Pitre wrote:
> My goal is to provide the mechanism that can be used by a higher level 
> implementing the deltafication policy.  I only provided one script as an 
> example, but it turns out that you found a way to achieve better space 
> saving.  And I bet you that there is probably ways to do even better 
> with more exhaustive delta targets.  For example you could try all 
> possible combinations on an object list for each file (and let it run 
> overnight).

Well, any kind of deltafication of, say, the complete kernel history 
pretty much has to run overnight anyway :-)

> One thing I've been wondering about is whether gzipping small deltas is 
> actually a gain.  For very small files it seems that gzip is adding more 
> overhead making the compressed file actually larger.  Might be worth 
> storing some deltas uncompressed if the compressed version turns out to 
> be larger.

It's probably better to skip deltafication of very small files 
altogether. Big pain for small gain, and all that.

>>1) I limit the maximum size of any delta to 10% of the size of the new
>>version. That guarantees a big saving, as long as any delta is
>>produced.
> 
> 
> Well, any delta object smaller than its original object saves space, 
> even if it's 75% of the original size. But...

That's not true if you want to keep the delta chain length down (and 
thus performance up).

Then, the most efficient approach is to generate many deltas against the 
same base file (otherwise, you only get 50% delta files with a maximum 
delta depth of 1).

But in this case, the trick is to know when to stop deltafying against 
one base file, and start over with another. If you switch to a new 
keyframe too often, you obviously lose some potential savings. But if 
you don't switch often enough, you end up repeating the same data in too 
many delta files.

A maximum delta size of 10% turned out to be ideal for at least the "fs" 
  tree. 8% was significantly worse, as was 15%. (The ideal size depends 
on  how big the average change is: the smaller the average change, the 
smaller the max delta size should be).

> ... but then the ultimate solution is to try out all possible references 
> within a given list.  My git-deltafy-script already finds out the list 
> of objects belonging to the same file.  Maybe git-mkdelta should try 
> all combinations between them.  This way a deeper delta chain could be 
> allowed for maximum space saving.

Yeah. But then you lose the ability to do incremental deltafication, or 
deltafication on-the-fly. And it would be really, really nice to have 
git do deltas at commit time - that way you could keep the very cool 
"immutable objects" property of git, while still saving a lot of space.

> I will look at it and merge the good stuff.

Cool! Thanks!

/dan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-17 21:43 ` Dan Holmsand
  2005-05-18  4:32   ` Nicolas Pitre
@ 2005-05-18 15:12   ` Linus Torvalds
  2005-05-18 17:15     ` Dan Holmsand
  1 sibling, 1 reply; 17+ messages in thread
From: Linus Torvalds @ 2005-05-18 15:12 UTC (permalink / raw)
  To: Dan Holmsand; +Cc: git

On Tue, 17 May 2005, Dan Holmsand wrote:
> 
> Therefore, I tried some other approaches. This one seemed to work
> best:
> 
> 1) I limit the maximum size of any delta to 10% of the size of the new
> version. That guarantees a big saving, as long as any delta is
> produced.
> 
> 2) If the "previous" version of a blob is a delta, I produce the new
> delta form the old deltas base version. This works surprisingly well.
> I'm guessing the reason for this is that most changes are really
> small, and they tend to be in the same area as a previous change (as
> in "Commit new feature. Commit bugfix for new feature. Commit fix for
> bugfix of new feature. Delete new feature as it doesn't work...").
> 
> 3) I use the same method for all tree objects.

Has anybody tried:

 4) don't limit yourself to previous-history-objects

One of the things I liked best about the delta patches was that it is
history-neutral, and can happily delta an object against any other random
object, in the same tree, in a future tree, or in a past tree.

Even without any history at all, there should be a noticeable amount of 
delta opportunities, as different architectures often end up sharing files 
that are quite similar, but not exactly the same.

Now, that's a very expensive thing to do, since it changes the question of
"which object should I delta against" from O(1) to O(n) (where "n" is tyhe
total number of objects), and thus the whole deltafication from O(n) to
O(n**2), but especially together with your "max 10%" rule, you should be
able to limit your choices very effectively: if you know that your delta
should be within 10% of the total size, you can limit your "let's try that
object" search to other objects that are also within 10% of your object.

That doesn't change the basic expense factor much in theory (if sizes were
truly evenly distributed in <n> it might change it, but there's probably
only a few different "classes" of file sizes, much fewer than <n>, so it's
still probably O(n**2)), but it should cut down the work by some
noticeable constant factor, making it a hopefully realistic experiment.

So your first rule makes a global deltafication cheaper, and in fact,
together with your second rule, you might even decide to make the size 
differential depend on the size of the _compressed_ object, since you 
don't care about objects that have already been deltafied, and if they are 
within 10% of each other, then they should also likely compress similarly, 
and it should thus be pretty equivalent to just compare compressed sizes.

Again, that second optimization wouldn't change the O(n**2) nature of the
expense, but should give another nice factor of speedup, maybe making the
exercise possible in the first place.

As to the long-term "O(n**2) deltafication is not practical for big 
projects with lots of history" issue, doing things incrementally should 
hopefully solve that, and turn it into a series of O(n) operations at the 
cost of saying "we'll never re-delta an object against the future once 
we've found a delta in the past or used it as a base for a delta".

The fsck "scan all objects" code could be a good starting point.

Doing this experiment at least once should be interesting. It may turn out
that the incremental space savings aren't all that noticeable, and that
the pure history-based one already finds 90% of all savings, making the
expensive version not worth it. It would be nice to _know_, though.

		Linus

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-18 15:12   ` Linus Torvalds
@ 2005-05-18 17:15     ` Dan Holmsand
  0 siblings, 0 replies; 17+ messages in thread
From: Dan Holmsand @ 2005-05-18 17:15 UTC (permalink / raw)
  To: git

Linus Torvalds wrote:
> Has anybody tried:
> 
>  4) don't limit yourself to previous-history-objects
> 
> One of the things I liked best about the delta patches was that it is
> history-neutral, and can happily delta an object against any other random
> object, in the same tree, in a future tree, or in a past tree.
> 
> Even without any history at all, there should be a noticeable amount of 
> delta opportunities, as different architectures often end up sharing files 
> that are quite similar, but not exactly the same.
> 
> Now, that's a very expensive thing to do, since it changes the question of
> "which object should I delta against" from O(1) to O(n) (where "n" is tyhe
> total number of objects), and thus the whole deltafication from O(n) to
> O(n**2), but especially together with your "max 10%" rule, you should be
> able to limit your choices very effectively: if you know that your delta
> should be within 10% of the total size, you can limit your "let's try that
> object" search to other objects that are also within 10% of your object.

Yeah, that sounds very interresting *and* very expensive...

Ideally, I'd like to find the set of objects that should *not* be 
deltafied (i.e. the ideal "keyframe" objects), but that would generate 
the maximum number of small, depth-one deltas with the least total size. 
But I can't really see how that could be done in a number of 
deltafications significantly less than the number of atoms in the 
universe. Let me think about that some more, though.

I'd like to try a couple of other approaches anyway:

a) Sort all objects by size. Start by biggest (or smallest), and try to 
get as many max-10%-deltas out of that as possible, stopping the search 
when objects get too small (big) according to some size difference 
limit. Cross already deltafied objects off the list, and continue. Might 
work, and might be fast enough with a sufficiently small size limit.

b) Use the same history-based approach as before, and in addition try to 
deltafy any "new" objects against other new objects and previous ones 
(say one or two commits back) in a given size range. That should catch 
renames, copys of the same template, etc. That shouldn't really affect 
performance, as new files are added comparatively seldom.

> Doing this experiment at least once should be interesting. It may turn out
> that the incremental space savings aren't all that noticeable, and that
> the pure history-based one already finds 90% of all savings, making the
> expensive version not worth it. It would be nice to _know_, though.

I definitely agree. And I also agree that the history-based 
deltafication seems less than pure, from a "git, the object store that 
doesn't really care" point of view.

On the other hand, the history-based thing has its advantages. It takes 
advantage of people's hard work to make patches as small as possible. 
It's fast. And (perhaps more importantly), it's deterministic. The 
"ideal" approach could possibly require every single blob to be 
redeltafied when a new object is added, if we want to stay ideal.

And it could be done at commit-time, thus keeping git's nice promise of 
immutable files, while still keeping size requirements down. And as my 
current method gives roughly an 80% size reduction over "plain git", 
that might (by boring, excessively practical people) be considered 
enough :-)

/dan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-18  8:54     ` Dan Holmsand
@ 2005-05-18 18:41       ` Nicolas Pitre
  2005-05-18 19:32         ` Dan Holmsand
  0 siblings, 1 reply; 17+ messages in thread
From: Nicolas Pitre @ 2005-05-18 18:41 UTC (permalink / raw)
  To: Dan Holmsand; +Cc: git

On Wed, 18 May 2005, Dan Holmsand wrote:

> Nicolas Pitre wrote:
> > One thing I've been wondering about is whether gzipping small deltas is
> > actually a gain.  For very small files it seems that gzip is adding more
> > overhead making the compressed file actually larger.  Might be worth storing
> > some deltas uncompressed if the compressed version turns out to be larger.
> 
> It's probably better to skip deltafication of very small files altogether. Big
> pain for small gain, and all that.

No, that's not what I mean.

Suppose a large source file that may change only one line between two 
versions.  The delta may therefore end up being only a few bytes long.  
Compressing a few bytes with zlib creates a _larger_ file than the 
original few bytes.

> > Well, any delta object smaller than its original object saves space, even if
> > it's 75% of the original size. But...
> 
> That's not true if you want to keep the delta chain length down (and thus
> performance up).

Sure.  That's why I added the -d switch to mkdelta.  But if you can fit 
a delta which is 75% the size of its original object size then you still 
save 25% of the space, regardless of the delta chain length.

> But in this case, the trick is to know when to stop deltafying against one
> base file, and start over with another. If you switch to a new keyframe too
> often, you obviously lose some potential savings. But if you don't switch
> often enough, you end up repeating the same data in too many delta files.

That's why multiple combinations should be tried.  And to keep things 
under control then a new argument specifying the delta "distance" might 
limit the number of trials.

> A maximum delta size of 10% turned out to be ideal for at least the "fs"
> tree. 8% was significantly worse, as was 15%. (The ideal size depends on  how
> big the average change is: the smaller the average change, the smaller the max
> delta size should be).

In fact it seems that deltas might be significantly harder to compress.  
Therefore a test on the resulting file should probably be done as well 
to make sure we don't end up with a delta larger than the original 
object.

> > ... but then the ultimate solution is to try out all possible references
> > within a given list.  My git-deltafy-script already finds out the list of
> > objects belonging to the same file.  Maybe git-mkdelta should try all
> > combinations between them.  This way a deeper delta chain could be allowed
> > for maximum space saving.
> 
> Yeah. But then you lose the ability to do incremental deltafication, or
> deltafication on-the-fly.

Not at all.  Nothing prevents you from making the latest revision of a 
file be the reference object and the previous revision turned into a 
delta against that latest revision, even if it was itself a reference 
object before.  The only thing that must be avoided is a delta loop and 
current mkdelta code takes care of that already.

Nicolas

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] improved delta support for git
  2005-05-18 18:41       ` Nicolas Pitre
@ 2005-05-18 19:32         ` Dan Holmsand
  0 siblings, 0 replies; 17+ messages in thread
From: Dan Holmsand @ 2005-05-18 19:32 UTC (permalink / raw)
  To: git

Nicolas Pitre wrote:
> On Wed, 18 May 2005, Dan Holmsand wrote:
>>Nicolas Pitre wrote:
>>It's probably better to skip deltafication of very small files altogether. Big
>>pain for small gain, and all that.
> 
> No, that's not what I mean.
> 
> Suppose a large source file that may change only one line between two 
> versions.  The delta may therefore end up being only a few bytes long.  
> Compressing a few bytes with zlib creates a _larger_ file than the 
> original few bytes.

Oh, I see. You're right, of course. I doubt, however, that there are 
really large gains to be made, measured in bytes; since small files tend 
to be small :-) It might nevertheless be worthwhile to skip processing 
of really small files, though, as there's basically no hope of gaining 
anything by deltafication.

Anyway, I've also noticed that deltas compress a lot worse than regular 
blobs. In particular, "complex deltas" (i.e. small changes on lines 
1,3,5,9,12, etc.), compress poorly. That might explain why my simplistic 
"depth-one-deltas-against-a-common-keyframe" method works comparatively 
well, as that should tend to cleaner deltas from time to time (i.e. 
chunk on lines 1-12 got replaced by new stuff), that ought to be easier 
to compress.

>>>Well, any delta object smaller than its original object saves space, even if
>>>it's 75% of the original size. But...
>>
>>That's not true if you want to keep the delta chain length down (and thus
>>performance up).
> 
> Sure.  That's why I added the -d switch to mkdelta.  But if you can fit 
> a delta which is 75% the size of its original object size then you still 
> save 25% of the space, regardless of the delta chain length.

Ok, I guess we're talking about slightly different things here. I was 
talking about my simple-and-fast method of processing one new version of 
an object at a time, deltafying each new version against the first 
previous non-deltafied one. If you, in that scenario, allow use of up to 
75% sized deltas, on average delta size will probably be something like 37%.

> In fact it seems that deltas might be significantly harder to compress.  
> Therefore a test on the resulting file should probably be done as well 
> to make sure we don't end up with a delta larger than the original 
> object.

Yeah (see above). That's just one of the things I cheated my way out of, 
by limiting delta size to 10% (I trusting that compression doesn't 
increase size by 90%).

>>>... but then the ultimate solution is to try out all possible references
>>>within a given list.  My git-deltafy-script already finds out the list of
>>>objects belonging to the same file.  Maybe git-mkdelta should try all
>>>combinations between them.  This way a deeper delta chain could be allowed
>>>for maximum space saving.
>>
>>Yeah. But then you lose the ability to do incremental deltafication, or
>>deltafication on-the-fly.
> 
> 
> Not at all.  Nothing prevents you from making the latest revision of a 
> file be the reference object and the previous revision turned into a 
> delta against that latest revision, even if it was itself a reference 
> object before.  The only thing that must be avoided is a delta loop and 
> current mkdelta code takes care of that already.

Sure. But there's some downside to modifying already existing objects. 
In particular, downloads using the existing methods (both http and 
rsync) won't get your new, smaller objects. If someone is pulling from a 
deltafied repository often enough, and the objects of the top-most 
commit are always non-deltafied, they will never see any deltas. Object 
immutability is a really good thing.

And there might be some performance issues involved, if you'd like to do 
deltafying at commit-time (not that I've actually tried this, or 
anything)...

Thanks for your comments!

/dan

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2005-05-18 19:35 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-05-12  3:51 [PATCH] improved delta support for git Nicolas Pitre
2005-05-12  4:36 ` Junio C Hamano
2005-05-12 14:27   ` Chris Mason
     [not found]     ` <2cfc403205051207467755cdf@mail.gmail.com>
2005-05-12 14:47       ` Jon Seymour
2005-05-12 15:18         ` Nicolas Pitre
2005-05-12 17:16           ` Junio C Hamano
2005-05-13 11:44             ` Chris Mason
2005-05-17 18:22 ` Thomas Glanzmann
2005-05-17 19:02   ` Thomas Glanzmann
2005-05-17 19:10   ` Thomas Glanzmann
2005-05-17 21:43 ` Dan Holmsand
2005-05-18  4:32   ` Nicolas Pitre
2005-05-18  8:54     ` Dan Holmsand
2005-05-18 18:41       ` Nicolas Pitre
2005-05-18 19:32         ` Dan Holmsand
2005-05-18 15:12   ` Linus Torvalds
2005-05-18 17:15     ` Dan Holmsand

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).