* [PATCH] improved delta support for git
@ 2005-05-12 3:51 Nicolas Pitre
2005-05-12 4:36 ` Junio C Hamano
` (2 more replies)
0 siblings, 3 replies; 17+ messages in thread
From: Nicolas Pitre @ 2005-05-12 3:51 UTC (permalink / raw)
To: git
OK, here's some improved support for delta objects in a git repository.
This patch adds the ability to create and restore delta objects with the
git-mkdelta command. A list of objects is provided and the
corresponding delta chain is created. The maximum depth of a delta
chain can be specified with the -d argument. If a max depth of 0 is
provided then all given objects are undeltafied and replaced by their
original version. With the -v argument a lot of lovely details are
printed out.
Also included is a script to deltafy an entire repository. Simply
execute git-deltafy-script to create deltas of objects corresponding to
successive previous versions of every file. Running
'git-deltafy-script -d 0' will revert everything to non deltafied form.
I've yet to add support to fsck-cache to understand delta objects. It is
advised to undeltafy your repository before running it otherwise you'll
see lots of reported errors. Once undeltafied you should have good
output from fsck-cache again.
Please backup your repository before playing with this for now... just
in case.
If you happen to have the whole kernel history in your repository I'd be
interested to know what the space figure is and how it performs. So far
I tested a tar of the .git/objects directory from git's git repository.
This is to estimate the real data size without the filesystem block
round up. The undeltafied repository created a 1708kb tar file while the
deltafied repository created a 1173kb tar file. The chunking storage
code should be considered for real life usage of course.
There are probably things to experiment in order to save space further,
such as deltafying tree objects, and in the context of Linux, deltafying
files with lots of similarities between content in different include/asm-*
subdirectories.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Index: git/diff-delta.c
===================================================================
--- /dev/null
+++ git/diff-delta.c
@@ -0,0 +1,330 @@
+/*
+ * diff-delta.c: generate a delta between two buffers
+ *
+ * Many parts of this file have been lifted from LibXDiff version 0.10.
+ * http://www.xmailserver.org/xdiff-lib.html
+ *
+ * LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2003 Davide Libenzi
+ *
+ * Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
+ *
+ * This file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ */
+
+#include <stdlib.h>
+#include "delta.h"
+
+
/* block size: min = 16, max = 64k, power of 2 */
#define BLK_SIZE 16

#define MIN(a, b) ((a) < (b) ? (a) : (b))

#define GR_PRIME 0x9e370001
#define HASH(v, b) (((unsigned int)(v) * GR_PRIME) >> (32 - (b)))

/* largest prime smaller than 65536 */
#define BASE 65521

/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552

/*
 * Compute the Adler-32 checksum of buf[0..len-1], folded into a
 * previous checksum value (zlib convention: pass 1 for a fresh run;
 * the fingerprinting code here seeds with 0).
 */
static unsigned int adler32(unsigned int adler, const unsigned char *buf, int len)
{
	unsigned int lo = adler & 0xffff;	/* running byte sum */
	unsigned int hi = adler >> 16;		/* running sum of sums */

	while (len > 0) {
		/* cap each inner run so the 32-bit sums cannot overflow */
		int run = MIN(len, NMAX);
		len -= run;
		while (run--) {
			lo += *buf++;
			hi += lo;
		}
		lo %= BASE;
		hi %= BASE;
	}

	return (hi << 16) | lo;
}
+
/*
 * Number of address bits for a hash table able to index at least
 * "size" entries: the smallest n (1 <= n <= 32) with 2^n >= size.
 */
static unsigned int hashbits(unsigned int size)
{
	unsigned int bits;
	unsigned int span = 1;

	for (bits = 0; bits < 32 && span < size; bits++)
		span <<= 1;
	return bits ? bits : 1;
}
+
/* one allocation chunk: this header followed by nsize bytes of items */
typedef struct s_chanode {
	struct s_chanode *next;
	int icurr;
} chanode_t;

/* chunked allocator state: a linked list of fixed-size chunks */
typedef struct s_chastore {
	chanode_t *head, *tail;
	int isize, nsize;
	chanode_t *ancur;
	chanode_t *sncur;
	int scurr;
} chastore_t;

/*
 * Prepare a chunked allocator handing out items of "isize" bytes,
 * "icount" items per chunk.
 */
static void cha_init(chastore_t *cha, int isize, int icount)
{
	cha->isize = isize;
	cha->nsize = isize * icount;
	cha->head = NULL;
	cha->tail = NULL;
	cha->ancur = NULL;
	cha->sncur = NULL;
	cha->scurr = 0;
}
+
+static void *cha_alloc(chastore_t *cha)
+{
+ chanode_t *ancur;
+ void *data;
+
+ ancur = cha->ancur;
+ if (!ancur || ancur->icurr == cha->nsize) {
+ ancur = malloc(sizeof(chanode_t) + cha->nsize);
+ if (!ancur)
+ return NULL;
+ ancur->icurr = 0;
+ ancur->next = NULL;
+ if (cha->tail)
+ cha->tail->next = ancur;
+ if (!cha->head)
+ cha->head = ancur;
+ cha->tail = ancur;
+ cha->ancur = ancur;
+ }
+
+ data = (void *)ancur + sizeof(chanode_t) + ancur->icurr;
+ ancur->icurr += cha->isize;
+ return data;
+}
+
+static void cha_free(chastore_t *cha)
+{
+ chanode_t *cur = cha->head;
+ while (cur) {
+ chanode_t *tmp = cur;
+ cur = cur->next;
+ free(tmp);
+ }
+}
+
+/* one fingerprinted BLK_SIZE block of the reference buffer */
+typedef struct s_bdrecord {
+ struct s_bdrecord *next;
+ unsigned int fp;
+ const unsigned char *ptr;
+} bdrecord_t;
+
+/* the reference buffer plus its fingerprint hash table */
+typedef struct s_bdfile {
+ const unsigned char *data, *top;
+ chastore_t cha;
+ unsigned int fphbits;
+ bdrecord_t **fphash;
+} bdfile_t;
+
+/*
+ * Index "buf" for delta generation: split it into BLK_SIZE blocks,
+ * fingerprint each block with adler32, and hash the fingerprints into
+ * bdf->fphash so diff_delta() can look up candidate matches.
+ * Returns 0 on success, -1 on memory allocation failure.
+ */
+static int delta_prepare(const unsigned char *buf, int bufsize, bdfile_t *bdf)
+{
+ unsigned int fphbits;
+ int i, hsize;
+ const unsigned char *base, *data, *top;
+ bdrecord_t *brec;
+ bdrecord_t **fphash;
+
+ fphbits = hashbits(bufsize / BLK_SIZE + 1);
+ hsize = 1 << fphbits;
+ fphash = malloc(hsize * sizeof(bdrecord_t *));
+ if (!fphash)
+ return -1;
+ for (i = 0; i < hsize; i++)
+ fphash[i] = NULL;
+ cha_init(&bdf->cha, sizeof(bdrecord_t), hsize / 4 + 1);
+
+ /* start at the last (possibly partial) block... */
+ bdf->data = data = base = buf;
+ bdf->top = top = buf + bufsize;
+ data += (bufsize / BLK_SIZE) * BLK_SIZE;
+ if (data == top)
+ data -= BLK_SIZE;
+
+ /* ...and walk backwards, so the lowest-offset block of each
+ * fingerprint ends up at the head of its hash chain */
+ for ( ; data >= base; data -= BLK_SIZE) {
+ brec = cha_alloc(&bdf->cha);
+ if (!brec) {
+ cha_free(&bdf->cha);
+ free(fphash);
+ return -1;
+ }
+ brec->fp = adler32(0, data, MIN(BLK_SIZE, top - data));
+ brec->ptr = data;
+ i = HASH(brec->fp, fphbits);
+ brec->next = fphash[i];
+ fphash[i] = brec;
+ }
+
+ bdf->fphbits = fphbits;
+ bdf->fphash = fphash;
+
+ return 0;
+}
+
+static void delta_cleanup(bdfile_t *bdf)
+{
+ free(bdf->fphash);
+ cha_free(&bdf->cha);
+}
+
+/*
+ * Encoded size of one copy opcode: one byte per non-zero offset byte
+ * (4 max) and non-zero size byte (2 max), plus the command byte.
+ */
+#define COPYOP_SIZE(o, s) \
+ (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \
+ !!(s & 0xff) + !!(s & 0xff00) + 1)
+
+/*
+ * Compute a delta that rebuilds to_buf from from_buf.  The returned
+ * malloc()ed buffer (length stored in *delta_size) starts with the
+ * encoded sizes of both buffers, followed by a stream of opcodes: a
+ * byte with the high bit set is a copy from from_buf (offset/size
+ * bytes follow, as announced by its low bits), a byte 1..0x7f is a
+ * count of literal bytes inserted verbatim.  Returns NULL when either
+ * buffer is empty or on memory exhaustion.
+ */
+void *diff_delta(void *from_buf, unsigned long from_size,
+ void *to_buf, unsigned long to_size,
+ unsigned long *delta_size)
+{
+ int i, outpos, outsize, inscnt, csize, msize, moff;
+ unsigned int fp;
+ const unsigned char *data, *top, *ptr1, *ptr2;
+ unsigned char *out, *orig;
+ bdrecord_t *brec;
+ bdfile_t bdf;
+
+ if (!from_size || !to_size || delta_prepare(from_buf, from_size, &bdf))
+ return NULL;
+
+ outpos = 0;
+ outsize = 8192;
+ out = malloc(outsize);
+ if (!out) {
+ delta_cleanup(&bdf);
+ return NULL;
+ }
+
+ data = to_buf;
+ top = to_buf + to_size;
+
+ /* store reference buffer size */
+ /* (a bitmap byte telling which little-endian size bytes follow;
+ * zero-valued bytes are skipped and their bitmap bit left clear) */
+ orig = out + outpos++;
+ *orig = i = 0;
+ do {
+ if (from_size & 0xff) {
+ *orig |= (1 << i);
+ out[outpos++] = from_size;
+ }
+ i++;
+ from_size >>= 8;
+ } while (from_size);
+
+ /* store target buffer size */
+ orig = out + outpos++;
+ *orig = i = 0;
+ do {
+ if (to_size & 0xff) {
+ *orig |= (1 << i);
+ out[outpos++] = to_size;
+ }
+ i++;
+ to_size >>= 8;
+ } while (to_size);
+
+ inscnt = 0;
+ moff = 0;
+ while (data < top) {
+ /* find the longest match among reference blocks whose
+ * adler32 fingerprint equals that of the data at hand */
+ msize = 0;
+ fp = adler32(0, data, MIN(top - data, BLK_SIZE));
+ i = HASH(fp, bdf.fphbits);
+ for (brec = bdf.fphash[i]; brec; brec = brec->next) {
+ if (brec->fp == fp) {
+ csize = bdf.top - brec->ptr;
+ if (csize > top - data)
+ csize = top - data;
+ /* extend the candidate match byte by byte */
+ for (ptr1 = brec->ptr, ptr2 = data;
+ csize && *ptr1 == *ptr2;
+ csize--, ptr1++, ptr2++);
+
+ csize = ptr1 - brec->ptr;
+ if (csize > msize) {
+ moff = brec->ptr - bdf.data;
+ msize = csize;
+ /* one copy op expresses at most 64kB */
+ if (msize >= 0x10000) {
+ msize = 0x10000;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!msize || msize < COPYOP_SIZE(moff, msize)) {
+ /* no worthwhile match: emit (or extend) a literal run;
+ * the count byte reserved up front is back-patched when
+ * the run closes or reaches the 0x7f maximum */
+ if (!inscnt)
+ outpos++;
+ out[outpos++] = *data++;
+ inscnt++;
+ if (inscnt == 0x7f) {
+ out[outpos - inscnt - 1] = inscnt;
+ inscnt = 0;
+ }
+ } else {
+ /* close any open literal run, then emit a copy op */
+ if (inscnt) {
+ out[outpos - inscnt - 1] = inscnt;
+ inscnt = 0;
+ }
+
+ data += msize;
+ orig = out + outpos++;
+ i = 0x80;
+
+ if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
+ moff >>= 8;
+ if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
+ moff >>= 8;
+ if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
+ moff >>= 8;
+ if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
+
+ if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
+ msize >>= 8;
+ if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
+
+ *orig = i;
+ }
+
+ /* next time around the largest possible output is 1 + 4 + 3 */
+ if (outpos > outsize - 8) {
+ void *tmp = out;
+ outsize = outsize * 3 / 2;
+ out = realloc(out, outsize);
+ if (!out) {
+ free(tmp);
+ delta_cleanup(&bdf);
+ return NULL;
+ }
+ }
+ }
+
+ /* flush a literal run left open at end of input */
+ if (inscnt)
+ out[outpos - inscnt - 1] = inscnt;
+
+ delta_cleanup(&bdf);
+ *delta_size = outpos;
+ return out;
+}
Index: git/delta.h
===================================================================
--- /dev/null
+++ git/delta.h
@@ -0,0 +1,6 @@
+/*
+ * Generate a delta that rebuilds to_buf from from_buf; returns a
+ * malloc()ed buffer and stores its length in *delta_size (NULL on
+ * failure).
+ */
+extern void *diff_delta(void *from_buf, unsigned long from_size,
+ void *to_buf, unsigned long to_size,
+ unsigned long *delta_size);
+/*
+ * Apply a delta produced by diff_delta() to src_buf; returns a
+ * malloc()ed result buffer and stores its length in *dst_size (NULL
+ * on failure).
+ */
+extern void *patch_delta(void *src_buf, unsigned long src_size,
+ void *delta_buf, unsigned long delta_size,
+ unsigned long *dst_size);
Index: git/Makefile
===================================================================
--- git.orig/Makefile
+++ git/Makefile
@@ -29,7 +29,7 @@
install $(PROG) $(SCRIPTS) $(HOME)/bin/
LIB_OBJS=read-cache.o sha1_file.o usage.o object.o commit.o tree.o blob.o \
- tag.o date.o
+ tag.o date.o diff-delta.o patch-delta.o
LIB_FILE=libgit.a
LIB_H=cache.h object.h blob.h tree.h commit.h tag.h
@@ -63,6 +63,9 @@
test-date: test-date.c date.o
$(CC) $(CFLAGS) -o $@ test-date.c date.o
+test-delta: test-delta.c diff-delta.o patch-delta.o
+ $(CC) $(CFLAGS) -o $@ $^
+
git-%: %.c $(LIB_FILE)
$(CC) $(CFLAGS) -o $@ $(filter %.c,$^) $(LIBS)
Index: git/patch-delta.c
===================================================================
--- /dev/null
+++ git/patch-delta.c
@@ -0,0 +1,88 @@
+/*
+ * patch-delta.c:
+ * recreate a buffer from a source and the delta produced by diff-delta.c
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "delta.h"
+
/*
 * Reconstruct the target buffer from a source buffer and a delta
 * produced by diff_delta().  On success a malloc()ed buffer holding
 * the result is returned and *dst_size is set to its length; NULL is
 * returned when the delta is malformed or memory is exhausted.
 *
 * Delta layout: the source size and the result size (each a bitmap
 * byte followed by the non-zero little-endian size bytes it
 * announces), then a series of opcodes: a byte with the high bit set
 * is a copy from the source (or, with bit 0x40, from the result built
 * so far); a byte with the high bit clear counts literal bytes that
 * follow.
 *
 * Unlike the first cut of this code, every read from the delta and
 * every copy is bounds-checked, so a corrupt delta yields NULL
 * instead of reading or writing out of bounds.
 */
void *patch_delta(void *src_buf, unsigned long src_size,
		  void *delta_buf, unsigned long delta_size,
		  unsigned long *dst_size)
{
	const unsigned char *data, *top;
	unsigned char *dst_buf, *out, cmd;
	unsigned long size;
	int i;

	/* the smallest delta size possible is 6 bytes */
	if (delta_size < 6)
		return NULL;

	data = delta_buf;
	top = (const unsigned char *)delta_buf + delta_size;

	/* make sure the orig file size matches what we expect */
	size = i = 0;
	cmd = *data++;
	while (cmd) {
		if (cmd & 1) {
			if (data >= top)
				return NULL;	/* truncated size field */
			size |= (unsigned long)*data++ << i;
		}
		i += 8;
		cmd >>= 1;
	}
	if (size != src_size)
		return NULL;

	/* now the result size */
	if (data >= top)
		return NULL;
	size = i = 0;
	cmd = *data++;
	while (cmd) {
		if (cmd & 1) {
			if (data >= top)
				return NULL;
			size |= (unsigned long)*data++ << i;
		}
		i += 8;
		cmd >>= 1;
	}
	dst_buf = malloc(size);
	if (!dst_buf)
		return NULL;

	out = dst_buf;
	while (data < top) {
		cmd = *data++;
		if (cmd & 0x80) {
			/* copy op: low bits say which offset/size bytes follow */
			unsigned long cp_off = 0, cp_size = 0;
			const unsigned char *buf;
			unsigned long buf_size;
			if (cmd & 0x01) { if (data >= top) goto bad; cp_off = *data++; }
			if (cmd & 0x02) { if (data >= top) goto bad; cp_off |= (unsigned long)*data++ << 8; }
			if (cmd & 0x04) { if (data >= top) goto bad; cp_off |= (unsigned long)*data++ << 16; }
			if (cmd & 0x08) { if (data >= top) goto bad; cp_off |= (unsigned long)*data++ << 24; }
			if (cmd & 0x10) { if (data >= top) goto bad; cp_size = *data++; }
			if (cmd & 0x20) { if (data >= top) goto bad; cp_size |= (unsigned long)*data++ << 8; }
			if (cp_size == 0)
				cp_size = 0x10000;
			/* 0x40 selects the result produced so far as source */
			if (cmd & 0x40) {
				buf = dst_buf;
				buf_size = out - dst_buf;
			} else {
				buf = src_buf;
				buf_size = src_size;
			}
			/* reject copies outside the source or the result */
			if (cp_off > buf_size ||
			    cp_size > buf_size - cp_off ||
			    cp_size > size - (unsigned long)(out - dst_buf))
				goto bad;
			memcpy(out, buf + cp_off, cp_size);
			out += cp_size;
		} else {
			/* literal insert of "cmd" bytes from the delta */
			if (cmd > top - data ||
			    cmd > size - (unsigned long)(out - dst_buf))
				goto bad;
			memcpy(out, data, cmd);
			out += cmd;
			data += cmd;
		}
	}

	/* sanity check */
	if (data != top || (unsigned long)(out - dst_buf) != size)
		goto bad;

	*dst_size = size;
	return dst_buf;

bad:
	free(dst_buf);
	return NULL;
}
Index: git/test-delta.c
===================================================================
--- /dev/null
+++ git/test-delta.c
@@ -0,0 +1,79 @@
+/*
+ * test-delta.c: test code to exercise diff-delta.c and patch-delta.c
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include "delta.h"
+
+static const char *usage =
+ "test-delta (-d|-p) <from_file> <data_file> <out_file>";
+
+int main(int argc, char *argv[])
+{
+ int fd;
+ struct stat st;
+ void *from_buf, *data_buf, *out_buf;
+ unsigned long from_size, data_size, out_size;
+
+ if (argc != 5 || (strcmp(argv[1], "-d") && strcmp(argv[1], "-p"))) {
+ fprintf(stderr, "Usage: %s\n", usage);
+ return 1;
+ }
+
+ fd = open(argv[2], O_RDONLY);
+ if (fd < 0 || fstat(fd, &st)) {
+ perror(argv[2]);
+ return 1;
+ }
+ from_size = st.st_size;
+ from_buf = mmap(NULL, from_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (from_buf == MAP_FAILED) {
+ perror(argv[2]);
+ return 1;
+ }
+ close(fd);
+
+ fd = open(argv[3], O_RDONLY);
+ if (fd < 0 || fstat(fd, &st)) {
+ perror(argv[3]);
+ return 1;
+ }
+ data_size = st.st_size;
+ data_buf = mmap(NULL, data_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (data_buf == MAP_FAILED) {
+ perror(argv[3]);
+ return 1;
+ }
+ close(fd);
+
+ if (argv[1][1] == 'd')
+ out_buf = diff_delta(from_buf, from_size,
+ data_buf, data_size, &out_size);
+ else
+ out_buf = patch_delta(from_buf, from_size,
+ data_buf, data_size, &out_size);
+ if (!out_buf) {
+ fprintf(stderr, "delta operation failed (returned NULL)\n");
+ return 1;
+ }
+
+ fd = open (argv[4], O_WRONLY|O_CREAT|O_TRUNC, 0666);
+ if (fd < 0 || write(fd, out_buf, out_size) != out_size) {
+ perror(argv[4]);
+ return 1;
+ }
+
+ return 0;
+}
Index: git/sha1_file.c
===================================================================
--- git.orig/sha1_file.c
+++ git/sha1_file.c
@@ -8,6 +8,7 @@
*/
#include <stdarg.h>
#include "cache.h"
+#include "delta.h"
#ifndef O_NOATIME
#if defined(__linux__) && (defined(__i386__) || defined(__PPC__))
@@ -224,6 +225,19 @@
if (map) {
buf = unpack_sha1_file(map, mapsize, type, size);
munmap(map, mapsize);
+ if (buf && !strcmp(type, "delta")) {
+ void *ref = NULL, *delta = buf;
+ unsigned long ref_size, delta_size = *size;
+ buf = NULL;
+ if (delta_size > 20)
+ ref = read_sha1_file(delta, type, &ref_size);
+ if (ref)
+ buf = patch_delta(ref, ref_size,
+ delta+20, delta_size-20,
+ size);
+ free(delta);
+ free(ref);
+ }
return buf;
}
return NULL;
Index: git/Makefile
===================================================================
--- git.orig/Makefile
+++ git/Makefile
@@ -13,7 +13,7 @@
AR=ar
SCRIPTS=git-apply-patch-script git-merge-one-file-script git-prune-script \
- git-pull-script git-tag-script git-resolve-script
+ git-pull-script git-tag-script git-resolve-script git-deltafy-script
PROG= git-update-cache git-diff-files git-init-db git-write-tree \
git-read-tree git-commit-tree git-cat-file git-fsck-cache \
@@ -21,7 +21,8 @@
git-check-files git-ls-tree git-merge-base git-merge-cache \
git-unpack-file git-export git-diff-cache git-convert-cache \
git-http-pull git-rpush git-rpull git-rev-list git-mktag \
- git-diff-tree-helper git-tar-tree git-local-pull git-write-blob
+ git-diff-tree-helper git-tar-tree git-local-pull git-write-blob \
+ git-mkdelta
all: $(PROG)
@@ -95,6 +96,7 @@
git-rpull: rsh.c pull.c
git-rev-list: rev-list.c
git-mktag: mktag.c
+git-mkdelta: mkdelta.c
git-diff-tree-helper: diff-tree-helper.c
git-tar-tree: tar-tree.c
git-write-blob: write-blob.c
Index: git/mkdelta.c
===================================================================
--- /dev/null
+++ git/mkdelta.c
@@ -0,0 +1,283 @@
+/*
+ * Deltafication of a GIT database.
+ *
+ * (C) 2005 Nicolas Pitre <nico@cam.org>
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "cache.h"
+#include "delta.h"
+
+static int replace_object(char *buf, unsigned long len, unsigned char *sha1,
+ char *hdr, int hdrlen)
+{
+ char tmpfile[PATH_MAX];
+ int size;
+ char *compressed;
+ z_stream stream;
+ int fd;
+
+ snprintf(tmpfile, sizeof(tmpfile), "%s/obj_XXXXXX", get_object_directory());
+ fd = mkstemp(tmpfile);
+ if (fd < 0)
+ return error("%s: %s\n", tmpfile, strerror(errno));
+
+ /* Set it up */
+ memset(&stream, 0, sizeof(stream));
+ deflateInit(&stream, Z_BEST_COMPRESSION);
+ size = deflateBound(&stream, len+hdrlen);
+ compressed = xmalloc(size);
+
+ /* Compress it */
+ stream.next_out = compressed;
+ stream.avail_out = size;
+
+ /* First header.. */
+ stream.next_in = hdr;
+ stream.avail_in = hdrlen;
+ while (deflate(&stream, 0) == Z_OK)
+ /* nothing */;
+
+ /* Then the data itself.. */
+ stream.next_in = buf;
+ stream.avail_in = len;
+ while (deflate(&stream, Z_FINISH) == Z_OK)
+ /* nothing */;
+ deflateEnd(&stream);
+ size = stream.total_out;
+
+ if (write(fd, compressed, size) != size) {
+ perror("unable to write file");
+ close(fd);
+ unlink(tmpfile);
+ return -1;
+ }
+ fchmod(fd, 0444);
+ close(fd);
+
+ if (rename(tmpfile, sha1_file_name(sha1))) {
+ perror("unable to replace original object");
+ unlink(tmpfile);
+ return -1;
+ }
+ return 0;
+}
+
/*
 * Store "buf" (a delta payload of "len" bytes) as object "sha1_trg",
 * recording "sha1_ref" as the reference it applies to.  The stored
 * body is "delta <size>\0" + 20-byte reference sha1 + delta data.
 */
static int write_delta_file(char *buf, unsigned long len,
			    unsigned char *sha1_ref, unsigned char *sha1_trg)
{
	char hdr[50];
	int hdrlen;

	/* the advertised size counts the 20-byte reference sha1 too */
	hdrlen = 1 + sprintf(hdr, "delta %lu", len + 20);
	memcpy(hdr + hdrlen, sha1_ref, 20);
	hdrlen += 20;

	return replace_object(buf, len, sha1_trg, hdr, hdrlen);
}
+
/*
 * Re-store a plain (non-delta) object "sha1" with the given type and
 * content, e.g. when undoing deltafication.
 */
static int replace_sha1_file(char *buf, unsigned long len,
			     char *type, unsigned char *sha1)
{
	char hdr[50];
	int n = sprintf(hdr, "%s %lu", type, len) + 1;

	return replace_object(buf, len, sha1, hdr, n);
}
+
+static void *get_buffer(unsigned char *sha1, char *type, unsigned long *size)
+{
+ unsigned long mapsize;
+ void *map = map_sha1_file(sha1, &mapsize);
+ if (map) {
+ void *buffer = unpack_sha1_file(map, mapsize, type, size);
+ munmap(map, mapsize);
+ if (buffer)
+ return buffer;
+ }
+ error("unable to get object %s", sha1_to_hex(sha1));
+ return NULL;
+}
+
/*
 * Recursively expand a "delta" object into its full content.
 *
 * "delta" is the raw delta payload: the 20-byte sha1 of the reference
 * object followed by the delta data.  The reference is loaded (and
 * itself expanded when it is another delta), the delta is applied,
 * and the fully expanded buffer is returned (NULL on error).  The
 * "delta" buffer is always consumed.  *depth is incremented once per
 * delta link traversed, and the sha1 of the plain object found at the
 * bottom of the chain is stored in "head".
 */
static void *expand_delta(void *delta, unsigned long delta_size, char *type,
			  unsigned long *size, unsigned int *depth, char *head)
{
	void *buf = NULL;

	/*
	 * Count this link.  This must be (*depth)++: the original
	 * "*depth++" parsed as *(depth++), stepping the pointer and
	 * leaving the caller's counter untouched, so reported chain
	 * depths were always 0 and the max-depth checks never fired.
	 */
	(*depth)++;
	if (delta_size < 20) {
		error("delta object is bad");
		free(delta);
	} else {
		unsigned long ref_size;
		void *ref = get_buffer(delta, type, &ref_size);
		if (ref && !strcmp(type, "delta"))
			ref = expand_delta(ref, ref_size, type, &ref_size,
					   depth, head);
		else
			memcpy(head, delta, 20);
		if (ref)
			buf = patch_delta(ref, ref_size, delta+20,
					  delta_size-20, size);
		free(ref);
		free(delta);
	}
	return buf;
}
+
+static char *mkdelta_usage =
+"mkdelta [ --max-depth=N ] <reference_sha1> <target_sha1> [ <next_sha1> ... ]";
+
+/*
+ * Walk the given object list in order, turning each object into a
+ * delta against the (expanded) previous one when that saves space.
+ * With a max depth of 0 every listed delta object is restored to its
+ * plain form instead.
+ */
+int main(int argc, char **argv)
+{
+ unsigned char sha1_ref[20], sha1_trg[20], head_ref[20], head_trg[20];
+ char type_ref[20], type_trg[20];
+ void *buf_ref, *buf_trg, *buf_delta;
+ unsigned long size_ref, size_trg, size_orig, size_delta;
+ /* depth_max = -1 wraps to UINT_MAX: "no limit" unless -d is given */
+ unsigned int depth_ref, depth_trg, depth_max = -1;
+ int i, verbose = 0;
+
+ for (i = 1; i < argc; i++) {
+ if (!strcmp(argv[i], "-v")) {
+ verbose = 1;
+ } else if (!strcmp(argv[i], "-d") && i+1 < argc) {
+ depth_max = atoi(argv[++i]);
+ } else if (!strncmp(argv[i], "--max-depth=", 12)) {
+ depth_max = atoi(argv[i]+12);
+ } else
+ break;
+ }
+
+ /* need at least two sha1s to deltafy; one is enough to undeltafy */
+ if (i + (depth_max != 0) >= argc)
+ usage(mkdelta_usage);
+
+ /* load the first object, expanding it fully if it is a delta */
+ if (get_sha1(argv[i], sha1_ref))
+ die("bad sha1 %s", argv[i]);
+ depth_ref = 0;
+ buf_ref = get_buffer(sha1_ref, type_ref, &size_ref);
+ if (buf_ref && !strcmp(type_ref, "delta"))
+ buf_ref = expand_delta(buf_ref, size_ref, type_ref,
+ &size_ref, &depth_ref, head_ref);
+ else
+ memcpy(head_ref, sha1_ref, 20);
+ if (!buf_ref)
+ die("unable to obtain initial object %s", argv[i]);
+
+ /* chain already too deep: store the object back in plain form */
+ if (depth_ref > depth_max) {
+ if (replace_sha1_file(buf_ref, size_ref, type_ref, sha1_ref))
+ die("unable to restore %s", argv[i]);
+ if (verbose)
+ printf("undelta %s (depth was %d)\n", argv[i], depth_ref);
+ depth_ref = 0;
+ }
+
+ while (++i < argc) {
+ if (get_sha1(argv[i], sha1_trg))
+ die("bad sha1 %s", argv[i]);
+ depth_trg = 0;
+ buf_trg = get_buffer(sha1_trg, type_trg, &size_trg);
+ if (buf_trg && !size_trg) {
+ if (verbose)
+ printf("skip %s (object is empty)\n", argv[i]);
+ continue;
+ }
+ size_orig = size_trg;
+ if (buf_trg && !strcmp(type_trg, "delta")) {
+ if (!memcmp(buf_trg, sha1_ref, 20)) {
+ /* delta already in place */
+ depth_ref++;
+ memcpy(sha1_ref, sha1_trg, 20);
+ /* NOTE(review): the previous buf_ref is overwritten
+ * here without a free() -- looks like a leak, and
+ * buf_trg is not freed on the continue paths below */
+ buf_ref = patch_delta(buf_ref, size_ref,
+ buf_trg+20, size_trg-20,
+ &size_ref);
+ if (!buf_ref)
+ die("unable to apply delta %s", argv[i]);
+ if (depth_ref > depth_max) {
+ if (replace_sha1_file(buf_ref, size_ref,
+ type_ref, sha1_ref))
+ die("unable to restore %s", argv[i]);
+ if (verbose)
+ printf("undelta %s (depth was %d)\n", argv[i], depth_ref);
+ depth_ref = 0;
+ continue;
+ }
+ if (verbose)
+ printf("skip %s (delta already in place)\n", argv[i]);
+ continue;
+ }
+ buf_trg = expand_delta(buf_trg, size_trg, type_trg,
+ &size_trg, &depth_trg, head_trg);
+ } else
+ memcpy(head_trg, sha1_trg, 20);
+ if (!buf_trg)
+ die("unable to read target object %s", argv[i]);
+
+ if (depth_trg > depth_max) {
+ if (replace_sha1_file(buf_trg, size_trg, type_trg, sha1_trg))
+ die("unable to restore %s", argv[i]);
+ if (verbose)
+ printf("undelta %s (depth was %d)\n", argv[i], depth_trg);
+ depth_trg = 0;
+ size_orig = size_trg;
+ }
+
+ /* undeltafy-only mode: never create new deltas */
+ if (depth_max == 0)
+ goto skip;
+
+ if (strcmp(type_ref, type_trg))
+ die("type mismatch for object %s", argv[i]);
+
+ if (!size_ref) {
+ if (verbose)
+ printf("skip %s (initial object is empty)\n", argv[i]);
+ goto skip;
+ }
+
+ depth_ref++;
+ if (depth_ref > depth_max) {
+ if (verbose)
+ printf("skip %s (exceeding max link depth)\n", argv[i]);
+ goto skip;
+ }
+
+ /* deltafying against a chain that bottoms out at the target
+ * itself would create a reference loop */
+ if (!memcmp(head_ref, sha1_trg, 20)) {
+ if (verbose)
+ printf("skip %s (would create a loop)\n", argv[i]);
+ goto skip;
+ }
+
+ buf_delta = diff_delta(buf_ref, size_ref, buf_trg, size_trg, &size_delta);
+ if (!buf_delta)
+ die("out of memory");
+
+ /* keep the delta only if it is smaller than the original,
+ * counting the 20-byte reference sha1 overhead */
+ if (size_delta+20 < size_orig) {
+ if (write_delta_file(buf_delta, size_delta,
+ sha1_ref, sha1_trg))
+ die("unable to write delta for %s", argv[i]);
+ free(buf_delta);
+ if (verbose)
+ printf("delta %s (size=%ld.%02ld%%, depth=%d)\n",
+ argv[i], (size_delta+20)*100 / size_trg,
+ ((size_delta+20)*10000 / size_trg)%100,
+ depth_ref);
+ } else {
+ free(buf_delta);
+ if (verbose)
+ printf("skip %s (original is smaller)\n", argv[i]);
+ /* the "skip" label sits inside this branch so every
+ * skip path resets the bookkeeping the same way */
+ skip:
+ depth_ref = depth_trg;
+ memcpy(head_ref, head_trg, 20);
+ }
+
+ /* the target becomes the reference for the next iteration */
+ free(buf_ref);
+ buf_ref = buf_trg;
+ size_ref = size_trg;
+ memcpy(sha1_ref, sha1_trg, 20);
+ }
+
+ return 0;
+}
Index: git/mktag.c
===================================================================
--- git.orig/mktag.c
+++ git/mktag.c
@@ -25,20 +25,14 @@
static int verify_object(unsigned char *sha1, const char *expected_type)
{
int ret = -1;
- unsigned long mapsize;
- void *map = map_sha1_file(sha1, &mapsize);
+ char type[100];
+ unsigned long size;
+ void *buffer = read_sha1_file(sha1, type, &size);
- if (map) {
- char type[100];
- unsigned long size;
- void *buffer = unpack_sha1_file(map, mapsize, type, &size);
-
- if (buffer) {
- if (!strcmp(type, expected_type))
- ret = check_sha1_signature(sha1, buffer, size, type);
- free(buffer);
- }
- munmap(map, mapsize);
+ if (buffer) {
+ if (!strcmp(type, expected_type))
+ ret = check_sha1_signature(sha1, buffer, size, type);
+ free(buffer);
}
return ret;
}
Index: git/git-deltafy-script
===================================================================
--- /dev/null
+++ git/git-deltafy-script
@@ -0,0 +1,33 @@
#!/bin/bash

# Script to deltafy an entire GIT repository based on the commit list.
# The most recent version of a file is the reference and the previous version
# is changed into a delta from that most recent version. And so on for
# successive versions going back in time.
#
# The -d argument provides a limit on the delta chain depth.
# If 0 is passed then everything is undeltafied.

set -e

depth=
[ "$1" == "-d" ] && depth="--max-depth=$2" && shift 2

curr_file=""

git-rev-list HEAD |
git-diff-tree -r --stdin |
sed -n '/^\*/ s/^.*->\(.\{41\}\)\(.*\)$/\2 \1/p' | sort | uniq |
{
	while read file sha1; do
		if [ "$file" == "$curr_file" ]; then
			list="$list $sha1"
		else
			if [ "$list" ]; then
				echo "Processing $curr_file"
				echo "$head $list" | xargs git-mkdelta $depth -v
			fi
			curr_file="$file"
			list=""
			head="$sha1"
		fi
	done
	# Fix: the loop only flushes a file's group when the NEXT file
	# shows up, so the final group was silently dropped and the last
	# file (in sort order) never got deltafied.  Flush it here, in
	# the same subshell so the loop's variables are still visible.
	if [ "$list" ]; then
		echo "Processing $curr_file"
		echo "$head $list" | xargs git-mkdelta $depth -v
	fi
}
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH] improved delta support for git 2005-05-12 3:51 [PATCH] improved delta support for git Nicolas Pitre @ 2005-05-12 4:36 ` Junio C Hamano 2005-05-12 14:27 ` Chris Mason 2005-05-17 18:22 ` Thomas Glanzmann 2005-05-17 21:43 ` Dan Holmsand 2 siblings, 1 reply; 17+ messages in thread From: Junio C Hamano @ 2005-05-12 4:36 UTC (permalink / raw) To: Nicolas Pitre; +Cc: git The changes to sha1_file interface seems to be contained to read_sha1_file() only; which is a very good sign. You have already expressed that you are aware that fsck-cache needs to be taught about the delta objects, so I'd trust that would be what you will be tackling next. I started wondering how the delta chains would affect pull.c, the engine that decides which files under GIT_OBJECT_DIRECTORY need to be pulled from the remote side in order to construct the set of objects needed by the given commit ID, under various combinations of cut-off criteria given with -c, -t, and -a options. It appears to me that changes to the make_sure_we_have_it() routine along the following lines (completely untested) would suffice. Instead of just returning success, we first fetch the named object from the remote side, read it to see if it is really the object we have asked, or just a delta, and if it is a delta call itself again on the underlying object that delta object depends upon. 
Signed-off-by: Junio C Hamano <junkio@cox.net> --- # - git-pb: Fixed a leak in read-tree # + (working tree) --- a/pull.c +++ b/pull.c @@ -32,11 +32,23 @@ static void report_missing(const char *w static int make_sure_we_have_it(const char *what, unsigned char *sha1) { int status; + unsigned long mapsize; + void *map, *buf; + if (has_sha1_file(sha1)) return 0; status = fetch(sha1); if (status && what) report_missing(what, sha1); + + map = map_sha1_file(sha1, &mapsize); + if (map) { + buf = unpack_sha1_file(map, mapsize, type, size); + munmap(map, mapsize); + if (buf && !strcmp(type, "delta")) + status = make_sure_we_have_it(what, buf); + free(buf); + } return status; } ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-12 4:36 ` Junio C Hamano @ 2005-05-12 14:27 ` Chris Mason [not found] ` <2cfc403205051207467755cdf@mail.gmail.com> 0 siblings, 1 reply; 17+ messages in thread From: Chris Mason @ 2005-05-12 14:27 UTC (permalink / raw) To: Junio C Hamano; +Cc: Nicolas Pitre, git On Thursday 12 May 2005 00:36, Junio C Hamano wrote: > It appears to me that changes to the make_sure_we_have_it() > routine along the following lines (completely untested) would > suffice. Instead of just returning success, we first fetch the > named object from the remote side, read it to see if it is > really the object we have asked, or just a delta, and if it is a > delta call itself again on the underlying object that delta > object depends upon. If we fetch the named object and it is a delta, the delta will either depend on an object we already have or an object that we don't have. If we don't have it, the pull should find it while pulling other commits we don't have. -chris ^ permalink raw reply [flat|nested] 17+ messages in thread
[parent not found: <2cfc403205051207467755cdf@mail.gmail.com>]
* Re: [PATCH] improved delta support for git [not found] ` <2cfc403205051207467755cdf@mail.gmail.com> @ 2005-05-12 14:47 ` Jon Seymour 2005-05-12 15:18 ` Nicolas Pitre 0 siblings, 1 reply; 17+ messages in thread From: Jon Seymour @ 2005-05-12 14:47 UTC (permalink / raw) To: Git Mailing List On 5/13/05, Chris Mason <mason@suse.com> wrote: > On Thursday 12 May 2005 00:36, Junio C Hamano wrote: > > It appears to me that changes to the make_sure_we_have_it() ... > > If we fetch the named object and it is a delta, the delta will either depend > on an object we already have or an object that we don't have. If we don't > have it, the pull should find it while pulling other commits we don't have. > Chris, Doesn't that assume that the object referenced by the delta is reachable from the commit being pulled. While that may be true in practice, I don't think it is a logical certainty. jon. ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-12 14:47 ` Jon Seymour @ 2005-05-12 15:18 ` Nicolas Pitre 2005-05-12 17:16 ` Junio C Hamano 0 siblings, 1 reply; 17+ messages in thread From: Nicolas Pitre @ 2005-05-12 15:18 UTC (permalink / raw) To: jon; +Cc: Git Mailing List On Fri, 13 May 2005, Jon Seymour wrote: > On 5/13/05, Chris Mason <mason@suse.com> wrote: > > On Thursday 12 May 2005 00:36, Junio C Hamano wrote: > > > It appears to me that changes to the make_sure_we_have_it() ... > > > > If we fetch the named object and it is a delta, the delta will either depend > > on an object we already have or an object that we don't have. If we don't > > have it, the pull should find it while pulling other commits we don't have. > > > > Chris, > > Doesn't that assume that the object referenced by the delta is > reachable from the commit being pulled. While that may be true in > practice, I don't think it is a logical certainty. 1) If you happen to already have the referenced object in your local repository then you're done. 2) If not you pull the referenced object from the remote repository, repeat with #1 if it happens to be another delta object. 3) If the remote repository doesn't contain the object referenced by any pulled delta object then that repository is inconsistent just like if a blob object referenced by a tree object was missing. This therefore should not happen. git-fsck-cache will flag broken delta links soon. Nicolas ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-12 15:18 ` Nicolas Pitre @ 2005-05-12 17:16 ` Junio C Hamano 2005-05-13 11:44 ` Chris Mason 0 siblings, 1 reply; 17+ messages in thread From: Junio C Hamano @ 2005-05-12 17:16 UTC (permalink / raw) To: Nicolas Pitre; +Cc: jon, Git Mailing List >>>>> "NP" == Nicolas Pitre <nico@cam.org> writes: >> On 5/13/05, Chris Mason <mason@suse.com> wrote: >> > On Thursday 12 May 2005 00:36, Junio C Hamano wrote: >> > > It appears to me that changes to the make_sure_we_have_it() ... >> > >> > If we fetch the named object and it is a delta, the delta will either depend >> > on an object we already have or an object that we don't have. If we don't >> > have it, the pull should find it while pulling other commits we don't have. NP> 1) If you happen to already have the referenced object in your local NP> repository then you're done. Yes. NP> 2) If not you pull the referenced object from the remote repository, NP> repeat with #1 if it happens to be another delta object. Yes, that is the outline of what my (untested) patch does. Unless I am grossly mistaken, what Chris says is true only when we are pulling with -a flag to the git-*-pull family. If we are pulling "partially near the tip", we do not necessarily pull "other commits we don't have", hence detecting delta's requirement at per-object level and pulling the dependent becomes necessary, which is essentially what you wrote in (2) above. ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-12 17:16 ` Junio C Hamano @ 2005-05-13 11:44 ` Chris Mason 0 siblings, 0 replies; 17+ messages in thread From: Chris Mason @ 2005-05-13 11:44 UTC (permalink / raw) To: Junio C Hamano; +Cc: Nicolas Pitre, jon, Git Mailing List On Thursday 12 May 2005 13:16, Junio C Hamano wrote: > >>>>> "NP" == Nicolas Pitre <nico@cam.org> writes: > >> > >> On 5/13/05, Chris Mason <mason@suse.com> wrote: > >> > On Thursday 12 May 2005 00:36, Junio C Hamano wrote: > >> > > It appears to me that changes to the make_sure_we_have_it() ... > >> > > >> > If we fetch the named object and it is a delta, the delta will either > >> > depend on an object we already have or an object that we don't have. > >> > If we don't have it, the pull should find it while pulling other > >> > commits we don't have. > > NP> 1) If you happen to already have the referenced object in your local > NP> repository then you're done. > > Yes. > > NP> 2) If not you pull the referenced object from the remote repository, > NP> repeat with #1 if it happens to be another delta object. > > Yes, that is the outline of what my (untested) patch does. > > Unless I am grossly mistaken, what Chris says is true only when > we are pulling with -a flag to the git-*-pull family. If we are > pulling "partially near the tip", we do not necessarily pull > "other commits we don't have", hence detecting delta's > requirement at per-object level and pulling the dependent > becomes necessary, which is essentially what you wrote in (2) > above. > Yes, my post does assume that you're pulling everything and the repo you're pulling from has a sane state. This should be the common case though, so I would suggest optimizing things to build a list of the delta objects and check them at the end to see if we didn't pull any. We want the list of delta objects regardless, this way we can warn the user that they have pulled in deltas and give them the chance to convert them into full files. 
-chris ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-12 3:51 [PATCH] improved delta support for git Nicolas Pitre 2005-05-12 4:36 ` Junio C Hamano @ 2005-05-17 18:22 ` Thomas Glanzmann 2005-05-17 19:02 ` Thomas Glanzmann 2005-05-17 19:10 ` Thomas Glanzmann 2005-05-17 21:43 ` Dan Holmsand 2 siblings, 2 replies; 17+ messages in thread From: Thomas Glanzmann @ 2005-05-17 18:22 UTC (permalink / raw) To: git [-- Attachment #1: Type: text/plain, Size: 828 bytes --] Hello Nicolas, I just tried it against Linus git-HEAD. It worked like a charm so far. I imported mutt-1.5 CVS branch using cvsps (1103 patches). (medion) [/scratch/mutt/mutt-cvs] git-rev-tree HEAD | wc -l 1103 (medion) [/scratch/mutt/mutt-cvs] du -sh .git/objects/ 63M .git/objects/ (medion) [/scratch/mutt/mutt-cvs] git-deltafy-script -d 2000 ... (medion) [/scratch/mutt/mutt-cvs] du -sh .git/objects/ 35M .git/objects/ Maybe you should add git-deltafy-script and git-mkdelta to the installation targets (patch attached). And I wonder why the mutt CVS Repository is still smaller than the zdelta compressed mutt git repository. And with mutt CVS Repository I mean every commit since mutt-0.9x not only mutt-1.5 branch? 
(faui03) [~] du -sh work/mutt/cvsrepository 34M work/mutt/cvsrepository Greetings, Thomas [-- Attachment #2: diff --] [-- Type: text/plain, Size: 802 bytes --] [PATCH] Install git-mkdelta and git-deltafy-script Signed-off-by: Thomas Glanzmann <sithglan@stud.uni-erlangen.de> --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ INSTALL=install SCRIPTS=git-apply-patch-script git-merge-one-file-script git-prune-script \ - git-pull-script git-tag-script git-resolve-script + git-pull-script git-tag-script git-resolve-script git-deltafy-script PROG= git-update-cache git-diff-files git-init-db git-write-tree \ git-read-tree git-commit-tree git-cat-file git-fsck-cache \ @@ -28,7 +28,7 @@ git-unpack-file git-export git-diff-cache git-convert-cache \ git-http-pull git-rpush git-rpull git-rev-list git-mktag \ git-diff-helper git-tar-tree git-local-pull git-write-blob \ - git-get-tar-commit-id + git-get-tar-commit-id git-mkdelta all: $(PROG) ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-17 18:22 ` Thomas Glanzmann @ 2005-05-17 19:02 ` Thomas Glanzmann 2005-05-17 19:10 ` Thomas Glanzmann 1 sibling, 0 replies; 17+ messages in thread From: Thomas Glanzmann @ 2005-05-17 19:02 UTC (permalink / raw) To: git Hello, btw. 6 Megabyte are spent on commit and tree objects: (medion) [/scratch/mutt/mutt-cvs] find .git/objects/ -type f | sed 's^.*\(..\)/^\1^' | while read FILE; do echo -n "$FILE " ;git-cat-file -t $FILE; done | egrep 'tree|commit' | awk '{print $1}' | sed 's,^\(..\),.git/objects/\1/,' | xargs ls -l | awk '{sum += $5} END {print sum}' 6179105 I know my shell sux. Thomas ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-17 18:22 ` Thomas Glanzmann 2005-05-17 19:02 ` Thomas Glanzmann @ 2005-05-17 19:10 ` Thomas Glanzmann 1 sibling, 0 replies; 17+ messages in thread From: Thomas Glanzmann @ 2005-05-17 19:10 UTC (permalink / raw) To: git Hi, okay I got it. Fragmentation: before: (medion) [/scratch/mutt/mutt-cvs] du -sh --apparent-size .git/objects/ 49M .git/objects/ after: (medion) [/scratch/mutt/mutt-cvs] du -sh --apparent-size .git/objects/ 19M .git/objects/ cvs repository: (faui00u) [~/work/git/yagf] du -sh --apparent-size ../../mutt/cvsrepository 33M ../../mutt/cvsrepository Sincerely, Thomas ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-12 3:51 [PATCH] improved delta support for git Nicolas Pitre 2005-05-12 4:36 ` Junio C Hamano 2005-05-17 18:22 ` Thomas Glanzmann @ 2005-05-17 21:43 ` Dan Holmsand 2005-05-18 4:32 ` Nicolas Pitre 2005-05-18 15:12 ` Linus Torvalds 2 siblings, 2 replies; 17+ messages in thread From: Dan Holmsand @ 2005-05-17 21:43 UTC (permalink / raw) To: git [-- Attachment #1: Type: text/plain, Size: 3856 bytes --] Nicolas (and others), I've been trying out your delta stuff as well. It was a bit disappointing at first, but some tweaking paid off in the end... First, I tried the entire bkcvs history for 2.6, but storing only the "fs" directory tree in git (hoping that would be representative enough, since the entire tree gets *big*). I got 4678 commits. In its original form, it looks like this (first size is "network size", the last one disk size on ext3. Average size per object in bytes): trees: 16M (15684 files) avg: 1119, disk: 61M blobs: 121M (17200 files) avg: 7414, disk: 157M Total: 139M (37562 files) avg: 3883, disk: 237M Using your code, with unlimited delta depth: trees: 16M (15684 files) avg: 1119, disk: 61M blobs: 9M (2333 files) avg: 4491, disk: 15M deltas: 30M (14867 files) avg: 2147, disk: 71M Total: 83M (37562 files) avg: 2334, disk: 188M Same thing, with a maximum delta depth of 2: trees: 16M (15684 files) avg: 1119, disk: 61M blobs: 45M (6940 files) avg: 6906, disk: 60M deltas: 20M (10260 files) avg: 2086, disk: 48M Total: 83M (37562 files) avg: 2334, disk: 188M So, total size from a network perspective went from 139M to 83M, which seemed a little disappointing to me. I think there are two reasons, as shown by these statistics: 1) Too many deltas get too big and/or compress badly. 2) Trees take up a big chunk of total space. Therefore, I tried some other approaches. This one seemed to work best: 1) I limit the maximum size of any delta to 10% of the size of the new version. 
That guarantees a big saving, as long as any delta is produced. 2) If the "previous" version of a blob is a delta, I produce the new delta from the old delta's base version. This works surprisingly well. I'm guessing the reason for this is that most changes are really small, and they tend to be in the same area as a previous change (as in "Commit new feature. Commit bugfix for new feature. Commit fix for bugfix of new feature. Delete new feature as it doesn't work..."). 3) I use the same method for all tree objects. This method of "opportunistic delta compression" has some other advantages: No risk of long delta chains (as the maximum delta depth is one). It should be disk cache friendly, as many deltas are produced against the same base version. And this method could easily be used incrementally, or "on the fly", as forward deltas are used. Using these tweaks helped a lot, size wise: trees: 3M (9746 files) avg: 380, disk: 38M blobs: 13M (3301 files) avg: 4208, disk: 20M deltas: 11M (19837 files) avg: 586, disk: 78M Total: 28M (37562 files) avg: 799, disk: 155M As this method turned 139M worth of git repository into 28M, I decided to try the same method on the entire bkcvs history (28203 commits). Plain vanilla git looks like this: trees: 246M (156812 files) avg: 1647, disk: 699M blobs: 1171M (185458 files) avg: 6623, disk: 1573M Total: 1422M (370473 files) avg: 4025, disk: 2382M The delta compressed approach outlined above yields: trees: 47M (73519 files) avg: 672, disk: 289M blobs: 156M (49857 files) avg: 3285, disk: 281M deltas: 107M (218894 files) avg: 515, disk: 863M Total: 315M (370473 files) avg: 892, disk: 1544M So, 1.4G became 315M. Not too bad, IMHO. Disk size is still big, of course, but disks are apparently cheap these days. It could probably be even better, if git didn't produce quite as many tree objects. Some sort of chunking together of tree objects would help delta compression a lot (and improve disk size quite a bit in the process). 
Attached is a patch (against current cogito). It is basically the same as yours, Nicolas, except for some hackery to make the above possible. I'm sure I've made lots of stupid mistakes in it (and the 10% limit is hardcoded right now; I'm lazy). /dan [-- Attachment #2: delta.patch --] [-- Type: text/x-patch, Size: 26038 bytes --] Index: Makefile =================================================================== --- 4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2/Makefile (mode:100644) +++ uncommitted/Makefile (mode:100644) @@ -35,7 +35,7 @@ INSTALL?=install SCRIPTS=git-apply-patch-script git-merge-one-file-script git-prune-script \ - git-pull-script git-tag-script git-resolve-script + git-pull-script git-tag-script git-resolve-script git-deltafy-script PROG= git-update-cache git-diff-files git-init-db git-write-tree \ git-read-tree git-commit-tree git-cat-file git-fsck-cache \ @@ -44,7 +44,7 @@ git-unpack-file git-export git-diff-cache git-convert-cache \ git-http-pull git-rpush git-rpull git-rev-list git-mktag \ git-diff-helper git-tar-tree git-local-pull git-write-blob \ - git-get-tar-commit-id + git-get-tar-commit-id git-mkdelta SCRIPT= commit-id tree-id parent-id cg-add cg-admin-lsobj cg-admin-uncommit \ cg-branch-add cg-branch-ls cg-cancel cg-clone cg-commit cg-diff \ @@ -60,7 +60,7 @@ COMMON= read-cache.o LIB_OBJS=read-cache.o sha1_file.o usage.o object.o commit.o tree.o blob.o \ - tag.o date.o + tag.o date.o diff-delta.o patch-delta.o LIB_FILE=libgit.a LIB_H=cache.h object.h blob.h tree.h commit.h tag.h @@ -94,6 +94,9 @@ all: $(PROG) $(GEN_SCRIPT) +test-delta: test-delta.c diff-delta.o patch-delta.o + $(CC) $(CFLAGS) -o $@ $^ + git-%: %.c $(LIB_FILE) $(CC) $(CFLAGS) -o $@ $(filter %.c,$^) $(LIBS) Index: delta.h =================================================================== --- /dev/null (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2) +++ uncommitted/delta.h (mode:100644) @@ -0,0 +1,6 @@ +extern void *diff_delta(void *from_buf, unsigned long from_size, + void 
*to_buf, unsigned long to_size, + unsigned long *delta_size); +extern void *patch_delta(void *src_buf, unsigned long src_size, + void *delta_buf, unsigned long delta_size, + unsigned long *dst_size); Index: diff-delta.c =================================================================== --- /dev/null (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2) +++ uncommitted/diff-delta.c (mode:100644) @@ -0,0 +1,330 @@ +/* + * diff-delta.c: generate a delta between two buffers + * + * Many parts of this file have been lifted from LibXDiff version 0.10. + * http://www.xmailserver.org/xdiff-lib.html + * + * LibXDiff was written by Davide Libenzi <davidel@xmailserver.org> + * Copyright (C) 2003 Davide Libenzi + * + * Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005. + * + * This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + */ + +#include <stdlib.h> +#include "delta.h" + + +/* block size: min = 16, max = 64k, power of 2 */ +#define BLK_SIZE 16 + +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +#define GR_PRIME 0x9e370001 +#define HASH(v, b) (((unsigned int)(v) * GR_PRIME) >> (32 - (b))) + +/* largest prime smaller than 65536 */ +#define BASE 65521 + +/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ +#define NMAX 5552 + +#define DO1(buf, i) { s1 += buf[i]; s2 += s1; } +#define DO2(buf, i) DO1(buf, i); DO1(buf, i + 1); +#define DO4(buf, i) DO2(buf, i); DO2(buf, i + 2); +#define DO8(buf, i) DO4(buf, i); DO4(buf, i + 4); +#define DO16(buf) DO8(buf, 0); DO8(buf, 8); + +static unsigned int adler32(unsigned int adler, const unsigned char *buf, int len) +{ + int k; + unsigned int s1 = adler & 0xffff; + unsigned int s2 = adler >> 16; + + while (len > 0) { + k = MIN(len, NMAX); + len -= k; + while (k >= 16) { + DO16(buf); + buf += 16; + k -= 16; + } + if (k != 0) + do { + s1 += *buf++; + s2 += s1; + } while (--k); + s1 %= BASE; + s2 %= BASE; + } + + return (s2 << 16) | s1; +} + +static unsigned int hashbits(unsigned int size) +{ + unsigned int val = 1, bits = 0; + while (val < size && bits < 32) { + val <<= 1; + bits++; + } + return bits ? 
bits: 1; +} + +typedef struct s_chanode { + struct s_chanode *next; + int icurr; +} chanode_t; + +typedef struct s_chastore { + chanode_t *head, *tail; + int isize, nsize; + chanode_t *ancur; + chanode_t *sncur; + int scurr; +} chastore_t; + +static void cha_init(chastore_t *cha, int isize, int icount) +{ + cha->head = cha->tail = NULL; + cha->isize = isize; + cha->nsize = icount * isize; + cha->ancur = cha->sncur = NULL; + cha->scurr = 0; +} + +static void *cha_alloc(chastore_t *cha) +{ + chanode_t *ancur; + void *data; + + ancur = cha->ancur; + if (!ancur || ancur->icurr == cha->nsize) { + ancur = malloc(sizeof(chanode_t) + cha->nsize); + if (!ancur) + return NULL; + ancur->icurr = 0; + ancur->next = NULL; + if (cha->tail) + cha->tail->next = ancur; + if (!cha->head) + cha->head = ancur; + cha->tail = ancur; + cha->ancur = ancur; + } + + data = (void *)ancur + sizeof(chanode_t) + ancur->icurr; + ancur->icurr += cha->isize; + return data; +} + +static void cha_free(chastore_t *cha) +{ + chanode_t *cur = cha->head; + while (cur) { + chanode_t *tmp = cur; + cur = cur->next; + free(tmp); + } +} + +typedef struct s_bdrecord { + struct s_bdrecord *next; + unsigned int fp; + const unsigned char *ptr; +} bdrecord_t; + +typedef struct s_bdfile { + const unsigned char *data, *top; + chastore_t cha; + unsigned int fphbits; + bdrecord_t **fphash; +} bdfile_t; + +static int delta_prepare(const unsigned char *buf, int bufsize, bdfile_t *bdf) +{ + unsigned int fphbits; + int i, hsize; + const unsigned char *base, *data, *top; + bdrecord_t *brec; + bdrecord_t **fphash; + + fphbits = hashbits(bufsize / BLK_SIZE + 1); + hsize = 1 << fphbits; + fphash = malloc(hsize * sizeof(bdrecord_t *)); + if (!fphash) + return -1; + for (i = 0; i < hsize; i++) + fphash[i] = NULL; + cha_init(&bdf->cha, sizeof(bdrecord_t), hsize / 4 + 1); + + bdf->data = data = base = buf; + bdf->top = top = buf + bufsize; + data += (bufsize / BLK_SIZE) * BLK_SIZE; + if (data == top) + data -= BLK_SIZE; + + for ( 
; data >= base; data -= BLK_SIZE) { + brec = cha_alloc(&bdf->cha); + if (!brec) { + cha_free(&bdf->cha); + free(fphash); + return -1; + } + brec->fp = adler32(0, data, MIN(BLK_SIZE, top - data)); + brec->ptr = data; + i = HASH(brec->fp, fphbits); + brec->next = fphash[i]; + fphash[i] = brec; + } + + bdf->fphbits = fphbits; + bdf->fphash = fphash; + + return 0; +} + +static void delta_cleanup(bdfile_t *bdf) +{ + free(bdf->fphash); + cha_free(&bdf->cha); +} + +#define COPYOP_SIZE(o, s) \ + (!!(o & 0xff) + !!(o & 0xff00) + !!(o & 0xff0000) + !!(o & 0xff000000) + \ + !!(s & 0xff) + !!(s & 0xff00) + 1) + +void *diff_delta(void *from_buf, unsigned long from_size, + void *to_buf, unsigned long to_size, + unsigned long *delta_size) +{ + int i, outpos, outsize, inscnt, csize, msize, moff; + unsigned int fp; + const unsigned char *data, *top, *ptr1, *ptr2; + unsigned char *out, *orig; + bdrecord_t *brec; + bdfile_t bdf; + + if (!from_size || !to_size || delta_prepare(from_buf, from_size, &bdf)) + return NULL; + + outpos = 0; + outsize = 8192; + out = malloc(outsize); + if (!out) { + delta_cleanup(&bdf); + return NULL; + } + + data = to_buf; + top = to_buf + to_size; + + /* store reference buffer size */ + orig = out + outpos++; + *orig = i = 0; + do { + if (from_size & 0xff) { + *orig |= (1 << i); + out[outpos++] = from_size; + } + i++; + from_size >>= 8; + } while (from_size); + + /* store target buffer size */ + orig = out + outpos++; + *orig = i = 0; + do { + if (to_size & 0xff) { + *orig |= (1 << i); + out[outpos++] = to_size; + } + i++; + to_size >>= 8; + } while (to_size); + + inscnt = 0; + moff = 0; + while (data < top) { + msize = 0; + fp = adler32(0, data, MIN(top - data, BLK_SIZE)); + i = HASH(fp, bdf.fphbits); + for (brec = bdf.fphash[i]; brec; brec = brec->next) { + if (brec->fp == fp) { + csize = bdf.top - brec->ptr; + if (csize > top - data) + csize = top - data; + for (ptr1 = brec->ptr, ptr2 = data; + csize && *ptr1 == *ptr2; + csize--, ptr1++, ptr2++); + + 
csize = ptr1 - brec->ptr; + if (csize > msize) { + moff = brec->ptr - bdf.data; + msize = csize; + if (msize >= 0x10000) { + msize = 0x10000; + break; + } + } + } + } + + if (!msize || msize < COPYOP_SIZE(moff, msize)) { + if (!inscnt) + outpos++; + out[outpos++] = *data++; + inscnt++; + if (inscnt == 0x7f) { + out[outpos - inscnt - 1] = inscnt; + inscnt = 0; + } + } else { + if (inscnt) { + out[outpos - inscnt - 1] = inscnt; + inscnt = 0; + } + + data += msize; + orig = out + outpos++; + i = 0x80; + + if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; } + moff >>= 8; + if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; } + moff >>= 8; + if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; } + moff >>= 8; + if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; } + + if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; } + msize >>= 8; + if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; } + + *orig = i; + } + + /* next time around the largest possible output is 1 + 4 + 3 */ + if (outpos > outsize - 8) { + void *tmp = out; + outsize = outsize * 3 / 2; + out = realloc(out, outsize); + if (!out) { + free(tmp); + delta_cleanup(&bdf); + return NULL; + } + } + } + + if (inscnt) + out[outpos - inscnt - 1] = inscnt; + + delta_cleanup(&bdf); + *delta_size = outpos; + return out; +} Index: git-deltafy-script =================================================================== --- /dev/null (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2) +++ uncommitted/git-deltafy-script (mode:100755) @@ -0,0 +1,64 @@ +#!/bin/bash + +# Script to deltafy an entire GIT repository based on the commit list. + +# git-deltafy-script --pack deltafies a repository, from current HEAD +# down. +# git-deltafy-script --unpack undeltafies all objects. 
+ +export LANG=C + +depth= +[ "$1" == "-d" ] && depth="--max-depth=$2" && shift 2 + +prevcommit= +prevtree= + +treeid() { + git-cat-file commit "$1" | sed -e 's/tree //;q' +} + +mkdelta() { + git-mkdelta --max-depth=1 -o -v "$@" || exit 1 +} + +if [ "$1" = --pack ] ; then +git-rev-list HEAD | tac | +while read commit; do + if [ "$prevcommit" ]; then + git-diff-tree -r -z $prevcommit $commit | + while IFS=$'\t' read -d $'\0' a1 a2 sha file; do + to=${sha#*->} + from=${sha%->*} + [ "$from" != "$to" ] && mkdelta $from $to + done + + prevdir= + prevsha= + tree=$(treeid "$commit") + [ "$prevtree" ] || prevtree=$(treeid $prevcommit) + [ $prevtree != $tree ] && mkdelta $prevtree $tree + + ( git-ls-tree -r $prevtree; git-ls-tree -r $tree ) | + grep $'^[0-9]*\ttree' | sort -k4 -s | uniq -u -s4 | + while IFS=$'\t' read a1 a2 sha dir; do + if [ "$prevdir" = "$dir" -a "$prevsha" != "$sha" ]; then + echo "deltafying tree $dir" + mkdelta $prevsha $sha + fi + prevdir=$dir + prevsha=$sha + done + + fi + prevcommit=$commit + prevtree=$tree +done +elif [ "$1" = --unpack ]; then + ( cd .git/objects && find -type f ) | sed 's,[./],,g' | + xargs git-mkdelta --max-depth=0 -v +else + echo "usage: $(basename "$0") [--pack|--unpack]" >&2 + exit 1 +fi +exit 0 Index: mkdelta.c =================================================================== --- /dev/null (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2) +++ uncommitted/mkdelta.c (mode:100644) @@ -0,0 +1,306 @@ +/* + * Deltafication of a GIT database. + * + * (C) 2005 Nicolas Pitre <nico@cam.org> + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include "cache.h" +#include "delta.h" + +static int replace_object(char *buf, unsigned long len, unsigned char *sha1, + char *hdr, int hdrlen) +{ + char tmpfile[PATH_MAX]; + int size; + char *compressed; + z_stream stream; + int fd; + + snprintf(tmpfile, sizeof(tmpfile), "%s/obj_XXXXXX", get_object_directory()); + fd = mkstemp(tmpfile); + if (fd < 0) + return error("%s: %s\n", tmpfile, strerror(errno)); + + /* Set it up */ + memset(&stream, 0, sizeof(stream)); + deflateInit(&stream, Z_BEST_COMPRESSION); + size = deflateBound(&stream, len+hdrlen); + compressed = xmalloc(size); + + /* Compress it */ + stream.next_out = compressed; + stream.avail_out = size; + + /* First header.. */ + stream.next_in = hdr; + stream.avail_in = hdrlen; + while (deflate(&stream, 0) == Z_OK) + /* nothing */; + + /* Then the data itself.. */ + stream.next_in = buf; + stream.avail_in = len; + while (deflate(&stream, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&stream); + size = stream.total_out; + + if (write(fd, compressed, size) != size) { + perror("unable to write file"); + close(fd); + unlink(tmpfile); + return -1; + } + fchmod(fd, 0444); + close(fd); + + if (rename(tmpfile, sha1_file_name(sha1))) { + perror("unable to replace original object"); + unlink(tmpfile); + return -1; + } + return 0; +} + +static int write_delta_file(char *buf, unsigned long len, + unsigned char *sha1_ref, unsigned char *sha1_trg) +{ + char hdr[50]; + int hdrlen; + + /* Generate the header + sha1 of reference for delta */ + hdrlen = sprintf(hdr, "delta %lu", len+20)+1; + memcpy(hdr + hdrlen, sha1_ref, 20); + hdrlen += 20; + + return replace_object(buf, len, sha1_trg, hdr, hdrlen); +} + +static int replace_sha1_file(char *buf, unsigned long len, + char *type, unsigned char *sha1) +{ + char hdr[50]; + int hdrlen; + + hdrlen = sprintf(hdr, "%s %lu", type, len)+1; + return replace_object(buf, len, sha1, hdr, hdrlen); +} + +static void *get_buffer(unsigned char *sha1, char *type, unsigned long *size) 
+{ + unsigned long mapsize; + void *map = map_sha1_file(sha1, &mapsize); + if (map) { + void *buffer = unpack_sha1_file(map, mapsize, type, size); + munmap(map, mapsize); + if (buffer) + return buffer; + } + error("unable to get object %s", sha1_to_hex(sha1)); + return NULL; +} + +static void *expand_delta(void *delta, unsigned long delta_size, char *type, + unsigned long *size, unsigned int *depth, char *head) +{ + void *buf = NULL; + *depth++; + if (delta_size < 20) { + error("delta object is bad"); + free(delta); + } else { + unsigned long ref_size; + void *ref = get_buffer(delta, type, &ref_size); + if (ref && !strcmp(type, "delta")) + ref = expand_delta(ref, ref_size, type, &ref_size, + depth, head); + else + memcpy(head, delta, 20); + if (ref) + buf = patch_delta(ref, ref_size, delta+20, + delta_size-20, size); + free(ref); + free(delta); + } + return buf; +} + +static char *mkdelta_usage = +"mkdelta [ --max-depth=N ] [ -o ] <reference_sha1> <target_sha1> [ <next_sha1> ... ]"; + +int main(int argc, char **argv) +{ + unsigned char sha1_ref[20], sha1_trg[20], head_ref[20], head_trg[20]; + char type_ref[20], type_trg[20]; + void *buf_ref, *buf_trg, *buf_delta; + unsigned long size_ref, size_trg, size_orig, size_delta; + unsigned int depth_ref, depth_trg, depth_max = -1; + int i, verbose = 0, oneparent = 0; + + for (i = 1; i < argc; i++) { + if (!strcmp(argv[i], "-v")) { + verbose = 1; + } else if (!strcmp(argv[i], "-o")) { + oneparent = 1; + } else if (!strcmp(argv[i], "-d") && i+1 < argc) { + depth_max = atoi(argv[++i]); + } else if (!strncmp(argv[i], "--max-depth=", 12)) { + depth_max = atoi(argv[i]+12); + } else + break; + } + + if (i + (depth_max != 0) >= argc) + usage(mkdelta_usage); + + if (get_sha1(argv[i], sha1_ref)) + die("bad sha1 %s", argv[i]); + depth_ref = 0; + buf_ref = get_buffer(sha1_ref, type_ref, &size_ref); + if (oneparent && depth_max > 0) { + while (buf_ref && !strcmp(type_ref, "delta")) { + if (size_ref < 20) + die("bad delta object"); + 
//printf ("getting parent for %s %i\n", + //sha1_to_hex(sha1_ref), depth_max); + memcpy(sha1_ref, buf_ref, 20); + free(buf_ref); + //printf("loading parent %s\n", + //sha1_to_hex(sha1_ref)); + buf_ref = get_buffer(sha1_ref, + type_ref, &size_ref); + if (!buf_ref) die("broken get_buffer!"); + } + } + if (buf_ref && !strcmp(type_ref, "delta")) + buf_ref = expand_delta(buf_ref, size_ref, type_ref, + &size_ref, &depth_ref, head_ref); + else + memcpy(head_ref, sha1_ref, 20); + if (!buf_ref) + die("unable to obtain initial object %s", argv[i]); + + if (depth_ref > depth_max) { + if (replace_sha1_file(buf_ref, size_ref, type_ref, sha1_ref)) + die("unable to restore %s", argv[i]); + if (verbose) + printf("undelta %s (depth was %d)\n", argv[i], depth_ref); + depth_ref = 0; + } + + while (++i < argc) { + if (get_sha1(argv[i], sha1_trg)) + die("bad sha1 %s", argv[i]); + depth_trg = 0; + buf_trg = get_buffer(sha1_trg, type_trg, &size_trg); + if (buf_trg && !size_trg) { + if (verbose) + printf("skip %s (object is empty)\n", argv[i]); + continue; + } + size_orig = size_trg; + if (buf_trg && !strcmp(type_trg, "delta")) { + if (!memcmp(buf_trg, sha1_ref, 20)) { + /* delta already in place */ + depth_ref++; + memcpy(sha1_ref, sha1_trg, 20); + buf_ref = patch_delta(buf_ref, size_ref, + buf_trg+20, size_trg-20, + &size_ref); + if (!buf_ref) + die("unable to apply delta %s", argv[i]); + if (depth_ref > depth_max) { + if (replace_sha1_file(buf_ref, size_ref, + type_ref, sha1_ref)) + die("unable to restore %s", argv[i]); + if (verbose) + printf("undelta %s (depth was %d)\n", argv[i], depth_ref); + depth_ref = 0; + continue; + } + if (verbose) + printf("skip %s (delta already in place)\n", argv[i]); + continue; + } + buf_trg = expand_delta(buf_trg, size_trg, type_trg, + &size_trg, &depth_trg, head_trg); + } else + memcpy(head_trg, sha1_trg, 20); + if (!buf_trg) + die("unable to read target object %s", argv[i]); + + if (depth_trg > depth_max || depth_max == 0) { + if 
(replace_sha1_file(buf_trg, size_trg, type_trg, sha1_trg)) + die("unable to restore %s", argv[i]); + if (verbose) + printf("undelta %s (depth was %d)\n", argv[i], depth_trg); + depth_trg = 0; + size_orig = size_trg; + } + + if (depth_max == 0) + goto skip; + + if (strcmp(type_ref, type_trg)) + die("type mismatch for object %s", argv[i]); + + if (!size_ref) { + if (verbose) + printf("skip %s (initial object is empty)\n", argv[i]); + goto skip; + } + + depth_ref++; + if (depth_ref > depth_max) { + if (verbose) + printf("skip %s (exceeding max link depth)\n", argv[i]); + goto skip; + } + + if (!memcmp(head_ref, sha1_trg, 20)) { + if (verbose) + printf("skip %s (would create a loop)\n", argv[i]); + goto skip; + } + + buf_delta = diff_delta(buf_ref, size_ref, buf_trg, size_trg, &size_delta); + if (!buf_delta) + die("out of memory"); + + //if (size_delta+20 < size_orig) { + if ((size_delta+20)*100/size_trg < 10) { + if (write_delta_file(buf_delta, size_delta, + sha1_ref, sha1_trg)) + die("unable to write delta for %s", argv[i]); + free(buf_delta); + if (verbose) + printf("delta %s (size=%ld.%02ld%%, depth=%d)\n", + argv[i], (size_delta+20)*100 / size_trg, + ((size_delta+20)*10000 / size_trg)%100, + depth_ref); + } else { + free(buf_delta); + if (verbose) { + printf("skip %s (original is smaller)", argv[i]); + printf(" (size=%ld.%02ld%%, depth=%d)\n", + (size_delta+20)*100 / size_trg, + ((size_delta+20)*10000 / size_trg)%100, + depth_ref); + } + skip: + depth_ref = depth_trg; + memcpy(head_ref, head_trg, 20); + } + + free(buf_ref); + buf_ref = buf_trg; + size_ref = size_trg; + memcpy(sha1_ref, sha1_trg, 20); + } + + return 0; +} Index: mktag.c =================================================================== --- 4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2/mktag.c (mode:100644) +++ uncommitted/mktag.c (mode:100644) @@ -25,20 +25,14 @@ static int verify_object(unsigned char *sha1, const char *expected_type) { int ret = -1; - unsigned long mapsize; - void *map = 
map_sha1_file(sha1, &mapsize); + char type[100]; + unsigned long size; + void *buffer = read_sha1_file(sha1, type, &size); - if (map) { - char type[100]; - unsigned long size; - void *buffer = unpack_sha1_file(map, mapsize, type, &size); - - if (buffer) { - if (!strcmp(type, expected_type)) - ret = check_sha1_signature(sha1, buffer, size, type); - free(buffer); - } - munmap(map, mapsize); + if (buffer) { + if (!strcmp(type, expected_type)) + ret = check_sha1_signature(sha1, buffer, size, type); + free(buffer); } return ret; } Index: patch-delta.c =================================================================== --- /dev/null (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2) +++ uncommitted/patch-delta.c (mode:100644) @@ -0,0 +1,88 @@ +/* + * patch-delta.c: + * recreate a buffer from a source and the delta produced by diff-delta.c + * + * (C) 2005 Nicolas Pitre <nico@cam.org> + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <stdlib.h> +#include <string.h> +#include "delta.h" + +void *patch_delta(void *src_buf, unsigned long src_size, + void *delta_buf, unsigned long delta_size, + unsigned long *dst_size) +{ + const unsigned char *data, *top; + unsigned char *dst_buf, *out, cmd; + unsigned long size; + int i; + + /* the smallest delta size possible is 6 bytes */ + if (delta_size < 6) + return NULL; + + data = delta_buf; + top = delta_buf + delta_size; + + /* make sure the orig file size matches what we expect */ + size = i = 0; + cmd = *data++; + while (cmd) { + if (cmd & 1) + size |= *data++ << i; + i += 8; + cmd >>= 1; + } + if (size != src_size) + return NULL; + + /* now the result size */ + size = i = 0; + cmd = *data++; + while (cmd) { + if (cmd & 1) + size |= *data++ << i; + i += 8; + cmd >>= 1; + } + dst_buf = malloc(size); + if (!dst_buf) + return NULL; + + out = dst_buf; + while (data < top) { + cmd = *data++; + if (cmd & 0x80) { + unsigned long cp_off = 0, cp_size = 0; + const unsigned char *buf; + if (cmd & 0x01) cp_off = *data++; + if (cmd & 0x02) cp_off |= (*data++ << 8); + if (cmd & 0x04) cp_off |= (*data++ << 16); + if (cmd & 0x08) cp_off |= (*data++ << 24); + if (cmd & 0x10) cp_size = *data++; + if (cmd & 0x20) cp_size |= (*data++ << 8); + if (cp_size == 0) cp_size = 0x10000; + buf = (cmd & 0x40) ? 
dst_buf : src_buf; + memcpy(out, buf + cp_off, cp_size); + out += cp_size; + } else { + memcpy(out, data, cmd); + out += cmd; + data += cmd; + } + } + + /* sanity check */ + if (data != top || out - dst_buf != size) { + free(dst_buf); + return NULL; + } + + *dst_size = size; + return dst_buf; +} Index: sha1_file.c =================================================================== --- 4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2/sha1_file.c (mode:100644) +++ uncommitted/sha1_file.c (mode:100644) @@ -9,6 +9,7 @@ #include <stdarg.h> #include <limits.h> #include "cache.h" +#include "delta.h" #ifndef O_NOATIME #if defined(__linux__) && (defined(__i386__) || defined(__PPC__)) @@ -353,6 +354,19 @@ if (map) { buf = unpack_sha1_file(map, mapsize, type, size); munmap(map, mapsize); + if (buf && !strcmp(type, "delta")) { + void *ref = NULL, *delta = buf; + unsigned long ref_size, delta_size = *size; + buf = NULL; + if (delta_size > 20) + ref = read_sha1_file(delta, type, &ref_size); + if (ref) + buf = patch_delta(ref, ref_size, + delta+20, delta_size-20, + size); + free(delta); + free(ref); + } return buf; } return NULL; Index: test-delta.c =================================================================== --- /dev/null (tree:4ef3de6ae44888d83e8c00326ddcc9f40cbd12e2) +++ uncommitted/test-delta.c (mode:100644) @@ -0,0 +1,79 @@ +/* + * test-delta.c: test code to exercise diff-delta.c and patch-delta.c + * + * (C) 2005 Nicolas Pitre <nico@cam.org> + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include "delta.h" + +static const char *usage = + "test-delta (-d|-p) <from_file> <data_file> <out_file>"; + +int main(int argc, char *argv[]) +{ + int fd; + struct stat st; + void *from_buf, *data_buf, *out_buf; + unsigned long from_size, data_size, out_size; + + if (argc != 5 || (strcmp(argv[1], "-d") && strcmp(argv[1], "-p"))) { + fprintf(stderr, "Usage: %s\n", usage); + return 1; + } + + fd = open(argv[2], O_RDONLY); + if (fd < 0 || fstat(fd, &st)) { + perror(argv[2]); + return 1; + } + from_size = st.st_size; + from_buf = mmap(NULL, from_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (from_buf == MAP_FAILED) { + perror(argv[2]); + return 1; + } + close(fd); + + fd = open(argv[3], O_RDONLY); + if (fd < 0 || fstat(fd, &st)) { + perror(argv[3]); + return 1; + } + data_size = st.st_size; + data_buf = mmap(NULL, data_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (data_buf == MAP_FAILED) { + perror(argv[3]); + return 1; + } + close(fd); + + if (argv[1][1] == 'd') + out_buf = diff_delta(from_buf, from_size, + data_buf, data_size, &out_size); + else + out_buf = patch_delta(from_buf, from_size, + data_buf, data_size, &out_size); + if (!out_buf) { + fprintf(stderr, "delta operation failed (returned NULL)\n"); + return 1; + } + + fd = open (argv[4], O_WRONLY|O_CREAT|O_TRUNC, 0666); + if (fd < 0 || write(fd, out_buf, out_size) != out_size) { + perror(argv[4]); + return 1; + } + + return 0; +} ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-17 21:43 ` Dan Holmsand @ 2005-05-18 4:32 ` Nicolas Pitre 2005-05-18 8:54 ` Dan Holmsand 2005-05-18 15:12 ` Linus Torvalds 1 sibling, 1 reply; 17+ messages in thread From: Nicolas Pitre @ 2005-05-18 4:32 UTC (permalink / raw) To: Dan Holmsand; +Cc: git On Tue, 17 May 2005, Dan Holmsand wrote: > I've been trying out your delta stuff as well. It was a bit > disappointing at first, but some tweaking paid off in the end... Cool! My goal is to provide the mechanism that can be used by a higher level implementing the deltafication policy. I only provided one script as an example, but it turns out that you found a way to achieve better space saving. And I bet you that there are probably ways to do even better with more exhaustive delta targets. For example you could try all possible combinations on an object list for each file (and let it run overnight). > 1) Too many deltas get too big and/or compress badly. One thing I've been wondering about is whether gzipping small deltas is actually a gain. For very small files it seems that gzip is adding more overhead making the compressed file actually larger. Might be worth storing some deltas uncompressed if the compressed version turns out to be larger. > 2) Trees take up a big chunk of total space. Tree objects can be deltafied as well, but I didn't have time to script it. A large space saving can be expected there as well, especially for changesets that modify only a few files deep down the tree hierarchy. > Therefore, I tried some other approaches. This one seemed to work > best: > > 1) I limit the maximum size of any delta to 10% of the size of the new > version. That guarantees a big saving, as long as any delta is > produced. Well, any delta object smaller than its original object saves space, even if it's 75% of the original size. But... > 2) If the "previous" version of a blob is a delta, I produce the new > delta from the old delta's base version. 
This works surprisingly well. > I'm guessing the reason for this is that most changes are really > small, and they tend to be in the same area as a previous change (as > in "Commit new feature. Commit bugfix for new feature. Commit fix for > bugfix of new feature. Delete new feature as it doesn't work..."). ... but then the ultimate solution is to try out all possible references within a given list. My git-deltafy-script already finds out the list of objects belonging to the same file. Maybe git-mkdelta should try all combinations between them. This way a deeper delta chain could be allowed for maximum space saving. > 3) I use the same method for all tree objects. Yup. > Attached is a patch (against current cogito). It is basically the same > as yours, Nicolas, except for some hackery to make the above possible. > I'm sure I've made lots of stupid mistakes in it (and the 10% limit is > hardcoded right now; I'm lazy). I will look at it and merge the good stuff. Thanks for testing! Nicolas ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-18 4:32 ` Nicolas Pitre @ 2005-05-18 8:54 ` Dan Holmsand 2005-05-18 18:41 ` Nicolas Pitre 0 siblings, 1 reply; 17+ messages in thread From: Dan Holmsand @ 2005-05-18 8:54 UTC (permalink / raw) To: git Nicolas Pitre wrote: > My goal is to provide the mechanism that can be used by a higher level > implementing the deltafication policy. I only provided one script as an > example, but it turns out that you found a way to achieve better space > saving. And I bet you that there is probably ways to do even better > with more exhaustive delta targets. For example you could try all > possible combinations on an object list for each file (and let it run > overnight). Well, any kind of deltafication of, say, the complete kernel history pretty much has to run overnight anyway :-) > One thing I've been wondering about is whether gzipping small deltas is > actually a gain. For very small files it seems that gzip is adding more > overhead making the compressed file actually larger. Might be worth > storing some deltas uncompressed if the compressed version turns out to > be larger. It's probably better to skip deltafication of very small files altogether. Big pain for small gain, and all that. >>1) I limit the maximum size of any delta to 10% of the size of the new >>version. That guarantees a big saving, as long as any delta is >>produced. > > > Well, any delta object smaller than its original object saves space, > even if it's 75% of the original size. But... That's not true if you want to keep the delta chain length down (and thus performance up). Then, the most efficient approach is to generate many deltas against the same base file (otherwise, you only get 50% delta files with a maximum delta depth of 1). But in this case, the trick is to know when to stop deltafying against one base file, and start over with another. If you switch to a new keyframe too often, you obviously lose some potential savings. 
But if you don't switch often enough, you end up repeating the same data in too many delta files. A maximum delta size of 10% turned out to be ideal for at least the "fs" tree. 8% was significantly worse, as was 15%. (The ideal size depends on how big the average change is: the smaller the average change, the smaller the max delta size should be). > ... but then the ultimate solution is to try out all possible references > within a given list. My git-deltafy-script already finds out the list > of objects belonging to the same file. Maybe git-mkdelta should try > all combinations between them. This way a deeper delta chain could be > allowed for maximum space saving. Yeah. But then you lose the ability to do incremental deltafication, or deltafication on-the-fly. And it would be really, really nice to have git do deltas at commit time - that way you could keep the very cool "immutable objects" property of git, while still saving a lot of space. > I will look at it and merge the good stuff. Cool! Thanks! /dan ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-18 8:54 ` Dan Holmsand @ 2005-05-18 18:41 ` Nicolas Pitre 2005-05-18 19:32 ` Dan Holmsand 0 siblings, 1 reply; 17+ messages in thread From: Nicolas Pitre @ 2005-05-18 18:41 UTC (permalink / raw) To: Dan Holmsand; +Cc: git On Wed, 18 May 2005, Dan Holmsand wrote: > Nicolas Pitre wrote: > > One thing I've been wondering about is whether gzipping small deltas is > > actually a gain. For very small files it seems that gzip is adding more > > overhead making the compressed file actually larger. Might be worth storing > > some deltas uncompressed if the compressed version turns out to be larger. > > It's probably better to skip deltafication of very small files altogether. Big > pain for small gain, and all that. No, that's not what I mean. Suppose a large source file that may change only one line between two versions. The delta may therefore end up being only a few bytes long. Compressing a few bytes with zlib creates a _larger_ file than the original few bytes. > > Well, any delta object smaller than its original object saves space, even if > > it's 75% of the original size. But... > > That's not true if you want to keep the delta chain length down (and thus > performance up). Sure. That's why I added the -d switch to mkdelta. But if you can fit a delta which is 75% the size of its original object size then you still save 25% of the space, regardless of the delta chain length. > But in this case, the trick is to know when to stop deltafying against one > base file, and start over with another. If you switch to a new keyframe too > often, you obviously lose some potential savings. But if you don't switch > often enough, you end up repeating the same data in too many delta files. That's why multiple combinations should be tried. And to keep things under control then a new argument specifying the delta "distance" might limit the number of trials. 
> A maximum delta size of 10% turned out to be ideal for at least the "fs" > tree. 8% was significantly worse, as was 15%. (The ideal size depends on how > big the average change is: the smaller the average change, the smaller the max > delta size should be). In fact it seems that deltas might be significantly harder to compress. Therefore a test on the resulting file should probably be done as well to make sure we don't end up with a delta larger than the original object. > > ... but then the ultimate solution is to try out all possible references > > within a given list. My git-deltafy-script already finds out the list of > > objects belonging to the same file. Maybe git-mkdelta should try all > > combinations between them. This way a deeper delta chain could be allowed > > for maximum space saving. > > Yeah. But then you lose the ability to do incremental deltafication, or > deltafication on-the-fly. Not at all. Nothing prevents you from making the latest revision of a file be the reference object and the previous revision turned into a delta against that latest revision, even if it was itself a reference object before. The only thing that must be avoided is a delta loop and current mkdelta code takes care of that already. Nicolas ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-18 18:41 ` Nicolas Pitre @ 2005-05-18 19:32 ` Dan Holmsand 0 siblings, 0 replies; 17+ messages in thread From: Dan Holmsand @ 2005-05-18 19:32 UTC (permalink / raw) To: git Nicolas Pitre wrote: > On Wed, 18 May 2005, Dan Holmsand wrote: >>Nicolas Pitre wrote: >>It's probably better to skip deltafication of very small files altogether. Big >>pain for small gain, and all that. > > No, that's not what I mean. > > Suppose a large source file that may change only one line between two > versions. The delta may therefore end up being only a few bytes long. > Compressing a few bytes with zlib creates a _larger_ file than the > original few bytes. Oh, I see. You're right, of course. I doubt, however, that there are really large gains to be made, measured in bytes; since small files tend to be small :-) It might nevertheless be worthwhile to skip processing of really small files, though, as there's basically no hope of gaining anything by deltafication. Anyway, I've also noticed that deltas compress a lot worse than regular blobs. In particular, "complex deltas" (i.e. small changes on lines 1,3,5,9,12, etc.), compress poorly. That might explain why my simplistic "depth-one-deltas-against-a-common-keyframe" method works comparatively well, as that should tend to cleaner deltas from time to time (i.e. chunk on lines 1-12 got replaced by new stuff), that ought to be easier to compress. >>>Well, any delta object smaller than its original object saves space, even if >>>it's 75% of the original size. But... >> >>That's not true if you want to keep the delta chain length down (and thus >>performance up). > > Sure. That's why I added the -d switch to mkdelta. But if you can fit > a delta which is 75% the size of its original object size then you still > save 25% of the space, regardless of the delta chain length. Ok, I guess we're talking about slightly different things here. 
I was talking about my simple-and-fast method of processing one new version of an object at a time, deltafying each new version against the first previous non-deltafied one. If you, in that scenario, allow use of up to 75% sized deltas, on average delta size will probably be something like 37%. > In fact it seems that deltas might be significantly harder to compress. > Therefore a test on the resulting file should probably be done as well > to make sure we don't end up with a delta larger than the original > object. Yeah (see above). That's just one of the things I cheated my way out of, by limiting delta size to 10% (I'm trusting that compression doesn't increase size by 90%). >>>... but then the ultimate solution is to try out all possible references >>>within a given list. My git-deltafy-script already finds out the list of >>>objects belonging to the same file. Maybe git-mkdelta should try all >>>combinations between them. This way a deeper delta chain could be allowed >>>for maximum space saving. >> >>Yeah. But then you lose the ability to do incremental deltafication, or >>deltafication on-the-fly. > > > Not at all. Nothing prevents you from making the latest revision of a > file be the reference object and the previous revision turned into a > delta against that latest revision, even if it was itself a reference > object before. The only thing that must be avoided is a delta loop and > current mkdelta code takes care of that already. Sure. But there's some downside to modifying already existing objects. In particular, downloads using the existing methods (both http and rsync) won't get your new, smaller objects. If someone is pulling from a deltafied repository often enough, and the objects of the top-most commit are always non-deltafied, they will never see any deltas. Object immutability is a really good thing. 
And there might be some performance issues involved, if you'd like to do deltafying at commit-time (not that I've actually tried this, or anything)... Thanks for your comments! /dan ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-17 21:43 ` Dan Holmsand 2005-05-18 4:32 ` Nicolas Pitre @ 2005-05-18 15:12 ` Linus Torvalds 2005-05-18 17:15 ` Dan Holmsand 1 sibling, 1 reply; 17+ messages in thread From: Linus Torvalds @ 2005-05-18 15:12 UTC (permalink / raw) To: Dan Holmsand; +Cc: git On Tue, 17 May 2005, Dan Holmsand wrote: > > Therefore, I tried some other approaches. This one seemed to work > best: > > 1) I limit the maximum size of any delta to 10% of the size of the new > version. That guarantees a big saving, as long as any delta is > produced. > > 2) If the "previous" version of a blob is a delta, I produce the new > delta form the old deltas base version. This works surprisingly well. > I'm guessing the reason for this is that most changes are really > small, and they tend to be in the same area as a previous change (as > in "Commit new feature. Commit bugfix for new feature. Commit fix for > bugfix of new feature. Delete new feature as it doesn't work..."). > > 3) I use the same method for all tree objects. Has anybody tried: 4) don't limit yourself to previous-history-objects One of the things I liked best about the delta patches was that it is history-neutral, and can happily delta an object against any other random object, in the same tree, in a future tree, or in a past tree. Even without any history at all, there should be a noticeable amount of delta opportunities, as different architectures often end up sharing files that are quite similar, but not exactly the same. 
Now, that's a very expensive thing to do, since it changes the question of "which object should I delta against" from O(1) to O(n) (where "n" is the total number of objects), and thus the whole deltafication from O(n) to O(n**2), but especially together with your "max 10%" rule, you should be able to limit your choices very effectively: if you know that your delta should be within 10% of the total size, you can limit your "let's try that object" search to other objects that are also within 10% of your object. That doesn't change the basic expense factor much in theory (if sizes were truly evenly distributed in <n> it might change it, but there's probably only a few different "classes" of file sizes, much fewer than <n>, so it's still probably O(n**2)), but it should cut down the work by some noticeable constant factor, making it a hopefully realistic experiment. So your first rule makes a global deltafication cheaper, and in fact, together with your second rule, you might even decide to make the size differential depend on the size of the _compressed_ object, since you don't care about objects that have already been deltafied, and if they are within 10% of each other, then they should also likely compress similarly, and it should thus be pretty equivalent to just compare compressed sizes. Again, that second optimization wouldn't change the O(n**2) nature of the expense, but should give another nice factor of speedup, maybe making the exercise possible in the first place. As to the long-term "O(n**2) deltafication is not practical for big projects with lots of history" issue, doing things incrementally should hopefully solve that, and turn it into a series of O(n) operations at the cost of saying "we'll never re-delta an object against the future once we've found a delta in the past or used it as a base for a delta". The fsck "scan all objects" code could be a good starting point. Doing this experiment at least once should be interesting. 
It may turn out that the incremental space savings aren't all that noticeable, and that the pure history-based one already finds 90% of all savings, making the expensive version not worth it. It would be nice to _know_, though. Linus ^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] improved delta support for git 2005-05-18 15:12 ` Linus Torvalds @ 2005-05-18 17:15 ` Dan Holmsand 0 siblings, 0 replies; 17+ messages in thread From: Dan Holmsand @ 2005-05-18 17:15 UTC (permalink / raw) To: git Linus Torvalds wrote: > Has anybody tried: > > 4) don't limit yourself to previous-history-objects > > One of the things I liked best about the delta patches was that it is > history-neutral, and can happily delta an object against any other random > object, in the same tree, in a future tree, or in a past tree. > > Even without any history at all, there should be a noticeable amount of > delta opportunities, as different architectures often end up sharing files > that are quite similar, but not exactly the same. > > Now, that's a very expensive thing to do, since it changes the question of > "which object should I delta against" from O(1) to O(n) (where "n" is the > total number of objects), and thus the whole deltafication from O(n) to > O(n**2), but especially together with your "max 10%" rule, you should be > able to limit your choices very effectively: if you know that your delta > should be within 10% of the total size, you can limit your "let's try that > object" search to other objects that are also within 10% of your object. Yeah, that sounds very interesting *and* very expensive... Ideally, I'd like to find the set of objects that should *not* be deltafied (i.e. the ideal "keyframe" objects), but that would generate the maximum number of small, depth-one deltas with the least total size. But I can't really see how that could be done in a number of deltafications significantly less than the number of atoms in the universe. Let me think about that some more, though. I'd like to try a couple of other approaches anyway: a) Sort all objects by size. Start by biggest (or smallest), and try to get as many max-10%-deltas out of that as possible, stopping the search when objects get too small (big) according to some size difference limit. 
Cross already deltafied objects off the list, and continue. Might work, and might be fast enough with a sufficiently small size limit. b) Use the same history-based approach as before, and in addition try to deltafy any "new" objects against other new objects and previous ones (say one or two commits back) in a given size range. That should catch renames, copies of the same template, etc. That shouldn't really affect performance, as new files are added comparatively seldom. > Doing this experiment at least once should be interesting. It may turn out > that the incremental space savings aren't all that noticeable, and that > the pure history-based one already finds 90% of all savings, making the > expensive version not worth it. It would be nice to _know_, though. I definitely agree. And I also agree that the history-based deltafication seems less than pure, from a "git, the object store that doesn't really care" point of view. On the other hand, the history-based thing has its advantages. It takes advantage of people's hard work to make patches as small as possible. It's fast. And (perhaps more importantly), it's deterministic. The "ideal" approach could possibly require every single blob to be redeltafied when a new object is added, if we want to stay ideal. And it could be done at commit-time, thus keeping git's nice promise of immutable files, while still keeping size requirements down. And as my current method gives roughly an 80% size reduction over "plain git", that might (by boring, excessively practical people) be considered enough :-) /dan ^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2005-05-18 19:35 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-05-12 3:51 [PATCH] improved delta support for git Nicolas Pitre
2005-05-12 4:36 ` Junio C Hamano
2005-05-12 14:27 ` Chris Mason
[not found] ` <2cfc403205051207467755cdf@mail.gmail.com>
2005-05-12 14:47 ` Jon Seymour
2005-05-12 15:18 ` Nicolas Pitre
2005-05-12 17:16 ` Junio C Hamano
2005-05-13 11:44 ` Chris Mason
2005-05-17 18:22 ` Thomas Glanzmann
2005-05-17 19:02 ` Thomas Glanzmann
2005-05-17 19:10 ` Thomas Glanzmann
2005-05-17 21:43 ` Dan Holmsand
2005-05-18 4:32 ` Nicolas Pitre
2005-05-18 8:54 ` Dan Holmsand
2005-05-18 18:41 ` Nicolas Pitre
2005-05-18 19:32 ` Dan Holmsand
2005-05-18 15:12 ` Linus Torvalds
2005-05-18 17:15 ` Dan Holmsand
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).