* [PATCH 6/6] make git-pack-objects able to create deltas with offset to base
@ 2006-09-21 4:09 Nicolas Pitre
2006-09-23 1:25 ` [PATCH 7/6] make pack data reuse compatible with both delta types Nicolas Pitre
0 siblings, 1 reply; 2+ messages in thread
From: Nicolas Pitre @ 2006-09-21 4:09 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
This is enabled with --delta-base-offset only, and doesn't work with
pack data reuse yet.
The idea is to allow for the fetch protocol to use an extension flag
to notify the remote end that --delta-base-offset can be used with
git-pack-objects. Eventually git-repack will always provide this flag.
With this, all delta base objects are now pushed before deltas that depend
on them. This is a requirements for OBJ_OFS_DELTA. This is not a
requirement for OBJ_REF_DELTA but always doing so makes the code simpler.
Signed-off-by: Nicolas Pitre <nico@cam.org>
---
Delta data reuse is possible, of course, but that will come through a
separate patch. I need to think about it some more in order to implement
it as efficiently as possible. The issue is to get back to the
corresponding object entry given an object offset in an existing pack.
I guess I could use the pack offset to find the corresponding sha1 in the pack
index, and then use locate_object_entry_hash(), but that would require a pack
index sorted by offset to be efficient... which means yet more memory usage.
Well, I need to look at that revindex more closely I guess...
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index c62734a..ec57536 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -60,6 +60,8 @@ static int non_empty;
static int no_reuse_delta;
static int local;
static int incremental;
+static int allow_ofs_delta;
+
static struct object_entry **sorted_by_sha, **sorted_by_type;
static struct object_entry *objects;
static int nr_objects, nr_alloc, nr_result;
@@ -334,9 +336,6 @@ static unsigned long write_object(struct
enum object_type obj_type;
int to_reuse = 0;
- if (entry->preferred_base)
- return 0;
-
obj_type = entry->type;
if (! entry->in_pack)
to_reuse = 0; /* can't reuse what we don't have */
@@ -380,18 +379,35 @@ static unsigned long write_object(struct
if (entry->delta) {
buf = delta_against(buf, size, entry);
size = entry->delta_size;
- obj_type = OBJ_REF_DELTA;
+ obj_type = (allow_ofs_delta && entry->delta->offset) ?
+ OBJ_OFS_DELTA : OBJ_REF_DELTA;
}
/*
* The object header is a byte of 'type' followed by zero or
- * more bytes of length. For deltas, the 20 bytes of delta
- * sha1 follows that.
+ * more bytes of length.
*/
hdrlen = encode_header(obj_type, size, header);
sha1write(f, header, hdrlen);
- if (entry->delta) {
- sha1write(f, entry->delta, 20);
+ if (obj_type == OBJ_OFS_DELTA) {
+ /*
+ * Deltas with relative base contain an additional
+ * encoding of the relative offset for the delta
+ * base from this object's position in the pack.
+ */
+ unsigned long ofs = entry->offset - entry->delta->offset;
+ unsigned pos = sizeof(header) - 1;
+ header[pos] = ofs & 127;
+ while (ofs >>= 7)
+ header[--pos] = 128 | (--ofs & 127);
+ sha1write(f, header + pos, sizeof(header) - pos);
+ hdrlen += sizeof(header) - pos;
+ } else if (obj_type == OBJ_REF_DELTA) {
+ /*
+ * Deltas with a base reference contain
+ * an additional 20 bytes for the base sha1.
+ */
+ sha1write(f, entry->delta->sha1, 20);
hdrlen += 20;
}
datalen = sha1write_compressed(f, buf, size);
@@ -413,7 +429,7 @@ static unsigned long write_object(struct
reused_delta++;
reused++;
}
- if (obj_type == OBJ_REF_DELTA)
+ if (entry->delta)
written_delta++;
written++;
return hdrlen + datalen;
@@ -423,17 +439,16 @@ static unsigned long write_one(struct sh
struct object_entry *e,
unsigned long offset)
{
- if (e->offset)
+ if (e->offset || e->preferred_base)
/* offset starts from header size and cannot be zero
* if it is written already.
*/
return offset;
- e->offset = offset;
- offset += write_object(f, e);
- /* if we are deltified, write out its base object. */
+ /* if we are deltified, write out its base object first. */
if (e->delta)
offset = write_one(f, e->delta, offset);
- return offset;
+ e->offset = offset;
+ return offset + write_object(f, e);
}
static void write_pack_file(void)
@@ -1484,6 +1499,10 @@ int cmd_pack_objects(int argc, const cha
no_reuse_delta = 1;
continue;
}
+ if (!strcmp("--delta-base-offset", arg)) {
+ allow_ofs_delta = no_reuse_delta = 1;
+ continue;
+ }
if (!strcmp("--stdout", arg)) {
pack_to_stdout = 1;
continue;
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH 7/6] make pack data reuse compatible with both delta types
2006-09-21 4:09 [PATCH 6/6] make git-pack-objects able to create deltas with offset to base Nicolas Pitre
@ 2006-09-23 1:25 ` Nicolas Pitre
0 siblings, 0 replies; 2+ messages in thread
From: Nicolas Pitre @ 2006-09-23 1:25 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
This is the missing part to git-pack-objects allowing it to reuse delta
data to/from any of the two delta types. It can reuse delta from any
type, and it outputs base offsets when --allow-delta-base-offset is
provided and the base is also included in the pack. Otherwise it
outputs base sha1 references just like it always did.
Signed-off-by: Nicolas Pitre <nico@cam.org>
---
On Thu, 21 Sep 2006, Nicolas Pitre wrote:
> Delta data reuse is possible, of course, but that will come through a
> separate patch. I need to think about it some more in order to implement
> it as efficiently as possible. The issue is to get back to the
> corresponding object entry given an object offset in an existing pack.
This was done with the addition of a sequence number in the revindex
entry which is 4 additional bytes per object that has to live in memory.
If ever this is considered too much memory usage then there is always
the possibility of using an array of pointers to pack index entries
instead, but then the runtime cost might be higher since the sorting and
searching will have to dereference those pointers to get to the offset
and endian swap it every time. I think that memory usage should not be
an issue in most cases, and if it is then this 4 bytes per object is
probably not going to save much anyway.
So I think support for delta base offset is now complete.
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 2212649..6db97b6 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -29,6 +29,7 @@ struct object_entry {
enum object_type type;
enum object_type in_pack_type; /* could be delta */
unsigned long delta_size; /* delta data size (uncompressed) */
+#define in_pack_header_size delta_size /* only when reusing pack data */
struct object_entry *delta; /* delta base object */
struct packed_git *in_pack; /* already in pack */
unsigned int in_pack_offset;
@@ -86,17 +87,25 @@ static int object_ix_hashsz;
* Pack index for existing packs give us easy access to the offsets into
* corresponding pack file where each object's data starts, but the entries
* do not store the size of the compressed representation (uncompressed
- * size is easily available by examining the pack entry header). We build
- * a hashtable of existing packs (pack_revindex), and keep reverse index
- * here -- pack index file is sorted by object name mapping to offset; this
- * pack_revindex[].revindex array is an ordered list of offsets, so if you
- * know the offset of an object, next offset is where its packed
- * representation ends.
+ * size is easily available by examining the pack entry header). It is
+ * also rather expensive to find the sha1 for an object given its offset.
+ *
+ * We build a hashtable of existing packs (pack_revindex), and keep reverse
+ * index here -- pack index file is sorted by object name mapping to offset;
+ * this pack_revindex[].revindex array is a list of offset/index_nr pairs
+ * ordered by offset, so if you know the offset of an object, next offset
+ * is where its packed representation ends and the index_nr can be used to
+ * get the object sha1 from the main index.
*/
+struct revindex_entry {
+ unsigned int offset;
+ unsigned int nr;
+};
struct pack_revindex {
struct packed_git *p;
- unsigned long *revindex;
-} *pack_revindex = NULL;
+ struct revindex_entry *revindex;
+};
+static struct pack_revindex *pack_revindex;
static int pack_revindex_hashsz;
/*
@@ -143,14 +152,9 @@ static void prepare_pack_ix(void)
static int cmp_offset(const void *a_, const void *b_)
{
- unsigned long a = *(unsigned long *) a_;
- unsigned long b = *(unsigned long *) b_;
- if (a < b)
- return -1;
- else if (a == b)
- return 0;
- else
- return 1;
+ const struct revindex_entry *a = a_;
+ const struct revindex_entry *b = b_;
+ return (a->offset < b->offset) ? -1 : (a->offset > b->offset) ? 1 : 0;
}
/*
@@ -163,25 +167,27 @@ static void prepare_pack_revindex(struct
int i;
void *index = p->index_base + 256;
- rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1));
+ rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1));
for (i = 0; i < num_ent; i++) {
unsigned int hl = *((unsigned int *)((char *) index + 24*i));
- rix->revindex[i] = ntohl(hl);
+ rix->revindex[i].offset = ntohl(hl);
+ rix->revindex[i].nr = i;
}
/* This knows the pack format -- the 20-byte trailer
* follows immediately after the last object data.
*/
- rix->revindex[num_ent] = p->pack_size - 20;
- qsort(rix->revindex, num_ent, sizeof(unsigned long), cmp_offset);
+ rix->revindex[num_ent].offset = p->pack_size - 20;
+ rix->revindex[num_ent].nr = -1;
+ qsort(rix->revindex, num_ent, sizeof(*rix->revindex), cmp_offset);
}
-static unsigned long find_packed_object_size(struct packed_git *p,
- unsigned long ofs)
+static struct revindex_entry * find_packed_object(struct packed_git *p,
+ unsigned int ofs)
{
int num;
int lo, hi;
struct pack_revindex *rix;
- unsigned long *revindex;
+ struct revindex_entry *revindex;
num = pack_revindex_ix(p);
if (num < 0)
die("internal error: pack revindex uninitialized");
@@ -193,10 +199,10 @@ static unsigned long find_packed_object_
hi = num_packed_objects(p) + 1;
do {
int mi = (lo + hi) / 2;
- if (revindex[mi] == ofs) {
- return revindex[mi+1] - ofs;
+ if (revindex[mi].offset == ofs) {
+ return revindex + mi;
}
- else if (ofs < revindex[mi])
+ else if (ofs < revindex[mi].offset)
hi = mi;
else
lo = mi + 1;
@@ -204,6 +210,20 @@ static unsigned long find_packed_object_
die("internal error: pack revindex corrupt");
}
+static unsigned long find_packed_object_size(struct packed_git *p,
+ unsigned long ofs)
+{
+ struct revindex_entry *entry = find_packed_object(p, ofs);
+ return entry[1].offset - ofs;
+}
+
+static unsigned char *find_packed_object_name(struct packed_git *p,
+ unsigned long ofs)
+{
+ struct revindex_entry *entry = find_packed_object(p, ofs);
+ return (unsigned char *)(p->index_base + 256) + 24 * entry->nr + 4;
+}
+
static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
{
unsigned long othersize, delta_size;
@@ -249,6 +269,10 @@ static int encode_header(enum object_typ
return n;
}
+/*
+ * we are going to reuse the existing object data as is. make
+ * sure it is not corrupt.
+ */
static int check_inflate(unsigned char *data, unsigned long len, unsigned long expect)
{
z_stream stream;
@@ -280,32 +304,6 @@ static int check_inflate(unsigned char *
return st;
}
-/*
- * we are going to reuse the existing pack entry data. make
- * sure it is not corrupt.
- */
-static int revalidate_pack_entry(struct object_entry *entry, unsigned char *data, unsigned long len)
-{
- enum object_type type;
- unsigned long size, used;
-
- if (pack_to_stdout)
- return 0;
-
- /* the caller has already called use_packed_git() for us,
- * so it is safe to access the pack data from mmapped location.
- * make sure the entry inflates correctly.
- */
- used = unpack_object_header_gently(data, len, &type, &size);
- if (!used)
- return -1;
- if (type == OBJ_REF_DELTA)
- used += 20; /* skip base object name */
- data += used;
- len -= used;
- return check_inflate(data, len, entry->size);
-}
-
static int revalidate_loose_object(struct object_entry *entry,
unsigned char *map,
unsigned long mapsize)
@@ -339,7 +337,7 @@ static unsigned long write_object(struct
obj_type = entry->type;
if (! entry->in_pack)
to_reuse = 0; /* can't reuse what we don't have */
- else if (obj_type == OBJ_REF_DELTA)
+ else if (obj_type == OBJ_REF_DELTA || obj_type == OBJ_OFS_DELTA)
to_reuse = 1; /* check_object() decided it for us */
else if (obj_type != entry->in_pack_type)
to_reuse = 0; /* pack has delta which is unusable */
@@ -415,18 +413,38 @@ static unsigned long write_object(struct
}
else {
struct packed_git *p = entry->in_pack;
- use_packed_git(p);
- datalen = find_packed_object_size(p, entry->in_pack_offset);
- buf = (char *) p->pack_base + entry->in_pack_offset;
+ if (entry->delta) {
+ obj_type = (allow_ofs_delta && entry->delta->offset) ?
+ OBJ_OFS_DELTA : OBJ_REF_DELTA;
+ reused_delta++;
+ }
+ hdrlen = encode_header(obj_type, entry->size, header);
+ sha1write(f, header, hdrlen);
+ if (obj_type == OBJ_OFS_DELTA) {
+ unsigned long ofs = entry->offset - entry->delta->offset;
+ unsigned pos = sizeof(header) - 1;
+ header[pos] = ofs & 127;
+ while (ofs >>= 7)
+ header[--pos] = 128 | (--ofs & 127);
+ sha1write(f, header + pos, sizeof(header) - pos);
+ hdrlen += sizeof(header) - pos;
+ } else if (obj_type == OBJ_REF_DELTA) {
+ sha1write(f, entry->delta->sha1, 20);
+ hdrlen += 20;
+ }
- if (revalidate_pack_entry(entry, buf, datalen))
+ use_packed_git(p);
+ buf = (char *) p->pack_base
+ + entry->in_pack_offset
+ + entry->in_pack_header_size;
+ datalen = find_packed_object_size(p, entry->in_pack_offset)
+ - entry->in_pack_header_size;
+//fprintf(stderr, "reusing %d at %d header %d size %d\n", obj_type, entry->in_pack_offset, entry->in_pack_header_size, datalen);
+ if (!pack_to_stdout && check_inflate(buf, datalen, entry->size))
die("corrupt delta in pack %s", sha1_to_hex(entry->sha1));
sha1write(f, buf, datalen);
unuse_packed_git(p);
- hdrlen = 0; /* not really */
- if (obj_type == OBJ_REF_DELTA)
- reused_delta++;
reused++;
}
if (entry->delta)
@@ -914,26 +932,64 @@ static void check_object(struct object_e
char type[20];
if (entry->in_pack && !entry->preferred_base) {
- unsigned char base[20];
- unsigned long size;
- struct object_entry *base_entry;
+ struct packed_git *p = entry->in_pack;
+ unsigned long left = p->pack_size - entry->in_pack_offset;
+ unsigned long size, used;
+ unsigned char *buf;
+ struct object_entry *base_entry = NULL;
+
+ use_packed_git(p);
+ buf = p->pack_base;
+ buf += entry->in_pack_offset;
/* We want in_pack_type even if we do not reuse delta.
* There is no point not reusing non-delta representations.
*/
- check_reuse_pack_delta(entry->in_pack,
- entry->in_pack_offset,
- base, &size,
- &entry->in_pack_type);
+ used = unpack_object_header_gently(buf, left,
+ &entry->in_pack_type, &size);
+ if (!used || left - used <= 20)
+ die("corrupt pack for %s", sha1_to_hex(entry->sha1));
/* Check if it is delta, and the base is also an object
* we are going to pack. If so we will reuse the existing
* delta.
*/
- if (!no_reuse_delta &&
- entry->in_pack_type == OBJ_REF_DELTA &&
- (base_entry = locate_object_entry(base)) &&
- (!base_entry->preferred_base)) {
+ if (!no_reuse_delta) {
+ unsigned char c, *base_name;
+ unsigned long ofs;
+ /* there is at least 20 bytes left in the pack */
+ switch (entry->in_pack_type) {
+ case OBJ_REF_DELTA:
+ base_name = buf + used;
+ used += 20;
+ break;
+ case OBJ_OFS_DELTA:
+ c = buf[used++];
+ ofs = c & 127;
+ while (c & 128) {
+ ofs += 1;
+ if (!ofs || ofs & ~(~0UL >> 7))
+ die("delta base offset overflow in pack for %s",
+ sha1_to_hex(entry->sha1));
+ c = buf[used++];
+ ofs = (ofs << 7) + (c & 127);
+ }
+ if (ofs >= entry->in_pack_offset)
+ die("delta base offset out of bound for %s",
+ sha1_to_hex(entry->sha1));
+ ofs = entry->in_pack_offset - ofs;
+ base_name = find_packed_object_name(p, ofs);
+ break;
+ default:
+ base_name = NULL;
+ }
+ if (base_name)
+ base_entry = locate_object_entry(base_name);
+ }
+ unuse_packed_git(p);
+ entry->in_pack_header_size = used;
+
+ if (base_entry && !base_entry->preferred_base) {
/* Depth value does not matter - find_deltas()
* will never consider reused delta as the
@@ -942,9 +998,9 @@ static void check_object(struct object_e
*/
/* uncompressed size of the delta data */
- entry->size = entry->delta_size = size;
+ entry->size = size;
entry->delta = base_entry;
- entry->type = OBJ_REF_DELTA;
+ entry->type = entry->in_pack_type;
entry->delta_sibling = base_entry->delta_child;
base_entry->delta_child = entry;
@@ -1500,7 +1556,7 @@ int cmd_pack_objects(int argc, const cha
continue;
}
if (!strcmp("--delta-base-offset", arg)) {
- allow_ofs_delta = no_reuse_delta = 1;
+ allow_ofs_delta = 1;
continue;
}
if (!strcmp("--stdout", arg)) {
diff --git a/pack.h b/pack.h
index 05557da..4c9bddd 100644
--- a/pack.h
+++ b/pack.h
@@ -16,7 +16,4 @@ struct pack_header {
};
extern int verify_pack(struct packed_git *, int);
-extern int check_reuse_pack_delta(struct packed_git *, unsigned long,
- unsigned char *, unsigned long *,
- enum object_type *);
#endif
diff --git a/sha1_file.c b/sha1_file.c
index 66ebdde..e4bc4ae 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1004,25 +1004,6 @@ static unsigned long unpack_object_heade
return offset + used;
}
-int check_reuse_pack_delta(struct packed_git *p, unsigned long offset,
- unsigned char *base, unsigned long *sizep,
- enum object_type *kindp)
-{
- unsigned long ptr;
- int status = -1;
-
- use_packed_git(p);
- ptr = offset;
- ptr = unpack_object_header(p, ptr, kindp, sizep);
- if (*kindp != OBJ_REF_DELTA)
- goto done;
- hashcpy(base, (unsigned char *) p->pack_base + ptr);
- status = 0;
- done:
- unuse_packed_git(p);
- return status;
-}
-
void packed_object_info_detail(struct packed_git *p,
unsigned long offset,
char *type,
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2006-09-23 1:25 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-09-21 4:09 [PATCH 6/6] make git-pack-objects able to create deltas with offset to base Nicolas Pitre
2006-09-23 1:25 ` [PATCH 7/6] make pack data reuse compatible with both delta types Nicolas Pitre
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).