git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] Using index-pack in place of verify-pack
@ 2011-02-28  9:49 Junio C Hamano
  2011-02-28  9:49 ` [PATCH 1/5] index-pack: group the delta-base array entries also by type Junio C Hamano
                   ` (5 more replies)
  0 siblings, 6 replies; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28  9:49 UTC (permalink / raw)
  To: git; +Cc: Shawn O. Pearce

The verify-pack command walks objects in a pack for contents verification
in a very naive way, namely, in the object name order.  It is inefficient
compared to the way index-pack does, which verifies contents of the
objects in the order delta-chain is applied.  As the result, the former
command would end up reconstituting the same intermediate object in a
delta chain number of times while the latter would recreate an object only
once.

This series, which is still a WIP, is to teach index-pack "--verify"
option, in order to eventually replace verify-pack with it.

The basic idea, which comes from Shawn Pearce, is to run index-pack on a
packfile being verified, and compare the resulting .idx file with the
existing one, and make sure they match.

Junio C Hamano (5):
  index-pack: group the delta-base array entries also by type
  write_idx_file: introduce a struct to hold idx customization options
  index-pack: --verify
  write_idx_file: need_large_offset() helper function
  index-pack --verify: read anomalous offsets from v2 idx file

 builtin/index-pack.c   |  178 ++++++++++++++++++++++++++++++++++++++----------
 builtin/pack-objects.c |   20 +++---
 csum-file.c            |   46 ++++++++++++-
 csum-file.h            |    2 +
 fast-import.c          |   10 ++-
 pack-write.c           |   82 ++++++++++++++++------
 pack.h                 |   23 +++++-
 t/t5302-pack-index.sh  |   18 +++++
 8 files changed, 301 insertions(+), 78 deletions(-)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/5] index-pack: group the delta-base array entries also by type
  2011-02-28  9:49 [PATCH 0/5] Using index-pack in place of verify-pack Junio C Hamano
@ 2011-02-28  9:49 ` Junio C Hamano
  2011-02-28  9:49 ` [PATCH 2/5] write_idx_file: introduce a struct to hold idx customization options Junio C Hamano
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28  9:49 UTC (permalink / raw)
  To: git; +Cc: Shawn O. Pearce

Entries in the delta_base array are only grouped by the bytepattern in
the delta_base union, some of which have 20-byte object name of the base
object (i.e. base for REF_DELTA objects), while others have sizeof(off_t)
bytes followed by enough NULs to fill 20-byte.  The loops to iterate
through a range inside this array still needs to inspect the type of the
delta, and skip over false hits.

Group the entries also by type to eliminate the potential of false hits.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 * This is a resend of an earlier patch we already discussed; it is not a
   wrong thing to do per-se, but is somewhat difficult to trigger.  You
   would need to come up with an object name that consists of almost all
   NULs and other bytes that happen to be the same as an offset of some
   other object.

 builtin/index-pack.c |   61 ++++++++++++++++++++++++++++++++-----------------
 1 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 8dc5c0b..1b5d83a 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -391,7 +391,18 @@ static void *get_data_from_pack(struct object_entry *obj)
 	return data;
 }
 
-static int find_delta(const union delta_base *base)
+static int compare_delta_bases(const union delta_base *base1,
+			       const union delta_base *base2,
+			       enum object_type type1,
+			       enum object_type type2)
+{
+	int cmp = type1 - type2;
+	if (cmp)
+		return cmp;
+	return memcmp(base1, base2, UNION_BASE_SZ);
+}
+
+static int find_delta(const union delta_base *base, enum object_type type)
 {
 	int first = 0, last = nr_deltas;
 
@@ -400,7 +411,8 @@ static int find_delta(const union delta_base *base)
                 struct delta_entry *delta = &deltas[next];
                 int cmp;
 
-                cmp = memcmp(base, &delta->base, UNION_BASE_SZ);
+		cmp = compare_delta_bases(base, &delta->base,
+					  type, objects[delta->obj_no].type);
                 if (!cmp)
                         return next;
                 if (cmp < 0) {
@@ -413,9 +425,10 @@ static int find_delta(const union delta_base *base)
 }
 
 static void find_delta_children(const union delta_base *base,
-				int *first_index, int *last_index)
+				int *first_index, int *last_index,
+				enum object_type type)
 {
-	int first = find_delta(base);
+	int first = find_delta(base, type);
 	int last = first;
 	int end = nr_deltas - 1;
 
@@ -543,11 +556,13 @@ static void find_unresolved_deltas(struct base_data *base,
 		union delta_base base_spec;
 
 		hashcpy(base_spec.sha1, base->obj->idx.sha1);
-		find_delta_children(&base_spec, &ref_first, &ref_last);
+		find_delta_children(&base_spec,
+				    &ref_first, &ref_last, OBJ_REF_DELTA);
 
 		memset(&base_spec, 0, sizeof(base_spec));
 		base_spec.offset = base->obj->idx.offset;
-		find_delta_children(&base_spec, &ofs_first, &ofs_last);
+		find_delta_children(&base_spec,
+				    &ofs_first, &ofs_last, OBJ_OFS_DELTA);
 	}
 
 	if (ref_last == -1 && ofs_last == -1) {
@@ -559,24 +574,24 @@ static void find_unresolved_deltas(struct base_data *base,
 
 	for (i = ref_first; i <= ref_last; i++) {
 		struct object_entry *child = objects + deltas[i].obj_no;
-		if (child->real_type == OBJ_REF_DELTA) {
-			struct base_data result;
-			resolve_delta(child, base, &result);
-			if (i == ref_last && ofs_last == -1)
-				free_base_data(base);
-			find_unresolved_deltas(&result, base);
-		}
+		struct base_data result;
+
+		assert(child->real_type == OBJ_REF_DELTA);
+		resolve_delta(child, base, &result);
+		if (i == ref_last && ofs_last == -1)
+			free_base_data(base);
+		find_unresolved_deltas(&result, base);
 	}
 
 	for (i = ofs_first; i <= ofs_last; i++) {
 		struct object_entry *child = objects + deltas[i].obj_no;
-		if (child->real_type == OBJ_OFS_DELTA) {
-			struct base_data result;
-			resolve_delta(child, base, &result);
-			if (i == ofs_last)
-				free_base_data(base);
-			find_unresolved_deltas(&result, base);
-		}
+		struct base_data result;
+
+		assert(child->real_type == OBJ_OFS_DELTA);
+		resolve_delta(child, base, &result);
+		if (i == ofs_last)
+			free_base_data(base);
+		find_unresolved_deltas(&result, base);
 	}
 
 	unlink_base_data(base);
@@ -586,7 +601,11 @@ static int compare_delta_entry(const void *a, const void *b)
 {
 	const struct delta_entry *delta_a = a;
 	const struct delta_entry *delta_b = b;
-	return memcmp(&delta_a->base, &delta_b->base, UNION_BASE_SZ);
+
+	/* group by type (ref vs ofs) and then by value (sha-1 or offset) */
+	return compare_delta_bases(&delta_a->base, &delta_b->base,
+				   objects[delta_a->obj_no].type,
+				   objects[delta_b->obj_no].type);
 }
 
 /* Parse all objects and return the pack content SHA1 hash */
-- 
1.7.4.1.249.g4aa72

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/5] write_idx_file: introduce a struct to hold idx customization options
  2011-02-28  9:49 [PATCH 0/5] Using index-pack in place of verify-pack Junio C Hamano
  2011-02-28  9:49 ` [PATCH 1/5] index-pack: group the delta-base array entries also by type Junio C Hamano
@ 2011-02-28  9:49 ` Junio C Hamano
  2011-02-28  9:49 ` [PATCH 3/5] index-pack: --verify Junio C Hamano
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28  9:49 UTC (permalink / raw)
  To: git; +Cc: Shawn O. Pearce

Remove two globals, pack_idx_default version and pack_idx_off32_limit,
and place them in a pack_idx_option structure.  Allow callers to pass
it to write_idx_file() as a parameter.

Adjust all callers to the API change.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 builtin/index-pack.c   |   23 +++++++++++++----------
 builtin/pack-objects.c |   20 +++++++++++---------
 fast-import.c          |   10 ++++++----
 pack-write.c           |   17 +++++++++++------
 pack.h                 |   11 +++++++----
 5 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 1b5d83a..4df6818 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -880,11 +880,12 @@ static void final(const char *final_pack_name, const char *curr_pack_name,
 
 static int git_index_pack_config(const char *k, const char *v, void *cb)
 {
+	struct pack_idx_option *opts = cb;
+
 	if (!strcmp(k, "pack.indexversion")) {
-		pack_idx_default_version = git_config_int(k, v);
-		if (pack_idx_default_version > 2)
-			die("bad pack.indexversion=%"PRIu32,
-				pack_idx_default_version);
+		opts->version = git_config_int(k, v);
+		if (opts->version > 2)
+			die("bad pack.indexversion=%"PRIu32, opts->version);
 		return 0;
 	}
 	return git_default_config(k, v, cb);
@@ -898,6 +899,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	const char *keep_name = NULL, *keep_msg = NULL;
 	char *index_name_buf = NULL, *keep_name_buf = NULL;
 	struct pack_idx_entry **idx_objects;
+	struct pack_idx_option opts;
 	unsigned char pack_sha1[20];
 
 	if (argc == 2 && !strcmp(argv[1], "-h"))
@@ -905,7 +907,8 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 
 	read_replace_refs = 0;
 
-	git_config(git_index_pack_config, NULL);
+	reset_pack_idx_option(&opts);
+	git_config(git_index_pack_config, &opts);
 	if (prefix && chdir(prefix))
 		die("Cannot come back to cwd");
 
@@ -944,12 +947,12 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 				index_name = argv[++i];
 			} else if (!prefixcmp(arg, "--index-version=")) {
 				char *c;
-				pack_idx_default_version = strtoul(arg + 16, &c, 10);
-				if (pack_idx_default_version > 2)
+				opts.version = strtoul(arg + 16, &c, 10);
+				if (opts.version > 2)
 					die("bad %s", arg);
 				if (*c == ',')
-					pack_idx_off32_limit = strtoul(c+1, &c, 0);
-				if (*c || pack_idx_off32_limit & 0x80000000)
+					opts.off32_limit = strtoul(c+1, &c, 0);
+				if (*c || opts.off32_limit & 0x80000000)
 					die("bad %s", arg);
 			} else
 				usage(index_pack_usage);
@@ -1032,7 +1035,7 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
 	for (i = 0; i < nr_objects; i++)
 		idx_objects[i] = &objects[i].idx;
-	curr_index = write_idx_file(index_name, idx_objects, nr_objects, pack_sha1);
+	curr_index = write_idx_file(index_name, idx_objects, nr_objects, &opts, pack_sha1);
 	free(idx_objects);
 
 	final(pack_name, curr_pack,
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b0503b2..dc471b7 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -70,6 +70,7 @@ static int local;
 static int incremental;
 static int ignore_packed_keep;
 static int allow_ofs_delta;
+static struct pack_idx_option pack_idx_opts;
 static const char *base_name;
 static int progress = 1;
 static int window = 10;
@@ -493,8 +494,8 @@ static void write_pack_file(void)
 			const char *idx_tmp_name;
 			char tmpname[PATH_MAX];
 
-			idx_tmp_name = write_idx_file(NULL, written_list,
-						      nr_written, sha1);
+			idx_tmp_name = write_idx_file(NULL, written_list, nr_written,
+						      &pack_idx_opts, sha1);
 
 			snprintf(tmpname, sizeof(tmpname), "%s-%s.pack",
 				 base_name, sha1_to_hex(sha1));
@@ -1880,10 +1881,10 @@ static int git_pack_config(const char *k, const char *v, void *cb)
 		return 0;
 	}
 	if (!strcmp(k, "pack.indexversion")) {
-		pack_idx_default_version = git_config_int(k, v);
-		if (pack_idx_default_version > 2)
+		pack_idx_opts.version = git_config_int(k, v);
+		if (pack_idx_opts.version > 2)
 			die("bad pack.indexversion=%"PRIu32,
-				pack_idx_default_version);
+			    pack_idx_opts.version);
 		return 0;
 	}
 	if (!strcmp(k, "pack.packsizelimit")) {
@@ -2130,6 +2131,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	rp_av[1] = "--objects"; /* --thin will make it --objects-edge */
 	rp_ac = 2;
 
+	reset_pack_idx_option(&pack_idx_opts);
 	git_config(git_pack_config, NULL);
 	if (!pack_compression_seen && core_compression_seen)
 		pack_compression_level = core_compression_level;
@@ -2274,12 +2276,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		}
 		if (!prefixcmp(arg, "--index-version=")) {
 			char *c;
-			pack_idx_default_version = strtoul(arg + 16, &c, 10);
-			if (pack_idx_default_version > 2)
+			pack_idx_opts.version = strtoul(arg + 16, &c, 10);
+			if (pack_idx_opts.version > 2)
 				die("bad %s", arg);
 			if (*c == ',')
-				pack_idx_off32_limit = strtoul(c+1, &c, 0);
-			if (*c || pack_idx_off32_limit & 0x80000000)
+				pack_idx_opts.off32_limit = strtoul(c+1, &c, 0);
+			if (*c || pack_idx_opts.off32_limit & 0x80000000)
 				die("bad %s", arg);
 			continue;
 		}
diff --git a/fast-import.c b/fast-import.c
index 3886a1b..91e936d 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -315,6 +315,7 @@ static unsigned int atom_cnt;
 static struct atom_str **atom_table;
 
 /* The .pack file being generated */
+static struct pack_idx_option pack_idx_opts;
 static unsigned int pack_id;
 static struct sha1file *pack_file;
 static struct packed_git *pack_data;
@@ -905,7 +906,7 @@ static const char *create_index(void)
 	if (c != last)
 		die("internal consistency error creating the index");
 
-	tmpfile = write_idx_file(NULL, idx, object_count, pack_data->sha1);
+	tmpfile = write_idx_file(NULL, idx, object_count, &pack_idx_opts, pack_data->sha1);
 	free(idx);
 	return tmpfile;
 }
@@ -3055,10 +3056,10 @@ static int git_pack_config(const char *k, const char *v, void *cb)
 		return 0;
 	}
 	if (!strcmp(k, "pack.indexversion")) {
-		pack_idx_default_version = git_config_int(k, v);
-		if (pack_idx_default_version > 2)
+		pack_idx_opts.version = git_config_int(k, v);
+		if (pack_idx_opts.version > 2)
 			die("bad pack.indexversion=%"PRIu32,
-			    pack_idx_default_version);
+			    pack_idx_opts.version);
 		return 0;
 	}
 	if (!strcmp(k, "pack.packsizelimit")) {
@@ -3116,6 +3117,7 @@ int main(int argc, const char **argv)
 		usage(fast_import_usage);
 
 	setup_git_directory();
+	reset_pack_idx_option(&pack_idx_opts);
 	git_config(git_pack_config, NULL);
 	if (!pack_compression_seen && core_compression_seen)
 		pack_compression_level = core_compression_level;
diff --git a/pack-write.c b/pack-write.c
index a905ca4..f739a0f 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -2,8 +2,12 @@
 #include "pack.h"
 #include "csum-file.h"
 
-uint32_t pack_idx_default_version = 2;
-uint32_t pack_idx_off32_limit = 0x7fffffff;
+void reset_pack_idx_option(struct pack_idx_option *opts)
+{
+	memset(opts, 0, sizeof(*opts));
+	opts->version = 2;
+	opts->off32_limit = 0x7fffffff;
+}
 
 static int sha1_compare(const void *_a, const void *_b)
 {
@@ -18,7 +22,8 @@ static int sha1_compare(const void *_a, const void *_b)
  * will be sorted by SHA1 on exit.
  */
 const char *write_idx_file(const char *index_name, struct pack_idx_entry **objects,
-			   int nr_objects, unsigned char *sha1)
+			   int nr_objects, const struct pack_idx_option *opts,
+			   unsigned char *sha1)
 {
 	struct sha1file *f;
 	struct pack_idx_entry **sorted_by_sha, **list, **last;
@@ -55,7 +60,7 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 	f = sha1fd(fd, index_name);
 
 	/* if last object's offset is >= 2^31 we should use index V2 */
-	index_version = (last_obj_offset >> 31) ? 2 : pack_idx_default_version;
+	index_version = (last_obj_offset >> 31) ? 2 : opts->version;
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -115,7 +120,7 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 		list = sorted_by_sha;
 		for (i = 0; i < nr_objects; i++) {
 			struct pack_idx_entry *obj = *list++;
-			uint32_t offset = (obj->offset <= pack_idx_off32_limit) ?
+			uint32_t offset = (obj->offset <= opts->off32_limit) ?
 				obj->offset : (0x80000000 | nr_large_offset++);
 			offset = htonl(offset);
 			sha1write(f, &offset, 4);
@@ -126,7 +131,7 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 		while (nr_large_offset) {
 			struct pack_idx_entry *obj = *list++;
 			uint64_t offset = obj->offset;
-			if (offset > pack_idx_off32_limit) {
+			if (offset > opts->off32_limit) {
 				uint32_t split[2];
 				split[0] = htonl(offset >> 32);
 				split[1] = htonl(offset & 0xffffffff);
diff --git a/pack.h b/pack.h
index bb27576..953f57e 100644
--- a/pack.h
+++ b/pack.h
@@ -34,9 +34,12 @@ struct pack_header {
  */
 #define PACK_IDX_SIGNATURE 0xff744f63	/* "\377tOc" */
 
-/* These may be overridden by command-line parameters */
-extern uint32_t pack_idx_default_version;
-extern uint32_t pack_idx_off32_limit;
+struct pack_idx_option {
+	uint32_t version;
+	uint32_t off32_limit;
+};
+
+extern void reset_pack_idx_option(struct pack_idx_option *);
 
 /*
  * Packed object index header
@@ -55,7 +58,7 @@ struct pack_idx_entry {
 	off_t offset;
 };
 
-extern const char *write_idx_file(const char *index_name, struct pack_idx_entry **objects, int nr_objects, unsigned char *sha1);
+extern const char *write_idx_file(const char *index_name, struct pack_idx_entry **objects, int nr_objects, const struct pack_idx_option *, unsigned char *sha1);
 extern int check_pack_crc(struct packed_git *p, struct pack_window **w_curs, off_t offset, off_t len, unsigned int nr);
 extern int verify_pack_index(struct packed_git *);
 extern int verify_pack(struct packed_git *);
-- 
1.7.4.1.249.g4aa72

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 3/5] index-pack: --verify
  2011-02-28  9:49 [PATCH 0/5] Using index-pack in place of verify-pack Junio C Hamano
  2011-02-28  9:49 ` [PATCH 1/5] index-pack: group the delta-base array entries also by type Junio C Hamano
  2011-02-28  9:49 ` [PATCH 2/5] write_idx_file: introduce a struct to hold idx customization options Junio C Hamano
@ 2011-02-28  9:49 ` Junio C Hamano
  2011-02-28 17:48   ` Shawn O. Pearce
  2011-02-28  9:49 ` [PATCH 4/5] write_idx_file: need_large_offset() helper function Junio C Hamano
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28  9:49 UTC (permalink / raw)
  To: git; +Cc: Shawn O. Pearce

Given an existing .pack file and the .idx file that describes it,
this new mode of operation reads and re-index the packfile and makes
sure the existing .idx file matches the result byte-for-byte.

All the objects in the .pack file are validated during this operation as
well.  Unlike verify-pack, which visits each object described in the .idx
file in the SHA-1 order, index-pack efficiently exploits the delta-chain
to avoid rebuilding the objects that are used as the base of deltified
objects over and over again while validating the objects.  This should
result in much quicker verification of the .pack file and its .idx file.

This version however cannot verify a .pack/.idx pair with a handcrafted v2
index that uses 64-bit offset representation for offsets that would fit
within 31-bit. You can create such an .idx file by giving a custom offset
to --index-version option to the command.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---

 * This does not make much of a difference in a small project like git.git
   itself, especially when the repository is packed with not very big depth
   value.

 builtin/index-pack.c  |   46 ++++++++++++++++++++++++++++++++++++++++------
 csum-file.c           |   46 +++++++++++++++++++++++++++++++++++++++++++++-
 csum-file.h           |    2 ++
 pack-write.c          |   26 ++++++++++++++++----------
 pack.h                |    4 ++++
 t/t5302-pack-index.sh |   18 ++++++++++++++++++
 6 files changed, 125 insertions(+), 17 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 4df6818..24a9a16 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -11,7 +11,7 @@
 #include "exec_cmd.h"
 
 static const char index_pack_usage[] =
-"git index-pack [-v] [-o <index-file>] [ --keep | --keep=<msg> ] [--strict] (<pack-file> | --stdin [--fix-thin] [<pack-file>])";
+"git index-pack [-v] [-o <index-file>] [--keep | --keep=<msg>] [--verify] [--strict] (<pack-file> | --stdin [--fix-thin] [<pack-file>])";
 
 struct object_entry
 {
@@ -891,9 +891,32 @@ static int git_index_pack_config(const char *k, const char *v, void *cb)
 	return git_default_config(k, v, cb);
 }
 
+static void read_idx_option(struct pack_idx_option *opts, const char *pack_name)
+{
+	struct packed_git *p = add_packed_git(pack_name, strlen(pack_name), 1);
+
+	if (!p)
+		die("Cannot open existing pack file '%s'", pack_name);
+	if (open_pack_index(p))
+		die("Cannot open existing pack idx file for '%s'", pack_name);
+
+	/* Read the attributes from the existing idx file */
+	opts->version = p->index_version;
+
+	/*
+	 * Get rid of the idx file as we do not need it anymore.
+	 * NEEDSWORK: extract this bit from free_pack_by_name() in
+	 * sha1_file.c, perhaps?  It shouldn't matter very much as we
+	 * know we haven't installed this pack (hence we never have
+	 * read anything from it).
+	 */
+	close_pack_index(p);
+	free(p);
+}
+
 int cmd_index_pack(int argc, const char **argv, const char *prefix)
 {
-	int i, fix_thin_pack = 0;
+	int i, fix_thin_pack = 0, verify = 0;
 	const char *curr_pack, *curr_index;
 	const char *index_name = NULL, *pack_name = NULL;
 	const char *keep_name = NULL, *keep_msg = NULL;
@@ -922,6 +945,8 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 				fix_thin_pack = 1;
 			} else if (!strcmp(arg, "--strict")) {
 				strict = 1;
+			} else if (!strcmp(arg, "--verify")) {
+				verify = 1;
 			} else if (!strcmp(arg, "--keep")) {
 				keep_msg = "";
 			} else if (!prefixcmp(arg, "--keep=")) {
@@ -988,6 +1013,12 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 		strcpy(keep_name_buf + len - 5, ".keep");
 		keep_name = keep_name_buf;
 	}
+	if (verify) {
+		if (!index_name)
+			die("--verify with no packfile name given");
+		read_idx_option(&opts, index_name);
+		opts.flags |= WRITE_IDX_VERIFY;
+	}
 
 	curr_pack = open_pack_file(pack_name);
 	parse_pack_header();
@@ -1038,10 +1069,13 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 	curr_index = write_idx_file(index_name, idx_objects, nr_objects, &opts, pack_sha1);
 	free(idx_objects);
 
-	final(pack_name, curr_pack,
-		index_name, curr_index,
-		keep_name, keep_msg,
-		pack_sha1);
+	if (!verify)
+		final(pack_name, curr_pack,
+		      index_name, curr_index,
+		      keep_name, keep_msg,
+		      pack_sha1);
+	else
+		close(input_fd);
 	free(objects);
 	free(index_name_buf);
 	free(keep_name_buf);
diff --git a/csum-file.c b/csum-file.c
index 4d50cc5..f70e3dd 100644
--- a/csum-file.c
+++ b/csum-file.c
@@ -11,8 +11,20 @@
 #include "progress.h"
 #include "csum-file.h"
 
-static void flush(struct sha1file *f, void * buf, unsigned int count)
+static void flush(struct sha1file *f, void *buf, unsigned int count)
 {
+	if (0 <= f->check_fd && count)  {
+		unsigned char check_buffer[8192];
+		ssize_t ret = read_in_full(f->check_fd, check_buffer, count);
+
+		if (ret < 0)
+			die_errno("%s: sha1 file read error", f->name);
+		if (ret < count)
+			die("%s: sha1 file truncated", f->name);
+		if (memcmp(buf, check_buffer, count))
+			die("sha1 file '%s' validation error", f->name);
+	}
+
 	for (;;) {
 		int ret = xwrite(f->fd, buf, count);
 		if (ret > 0) {
@@ -59,6 +71,17 @@ int sha1close(struct sha1file *f, unsigned char *result, unsigned int flags)
 		fd = 0;
 	} else
 		fd = f->fd;
+	if (0 <= f->check_fd) {
+		char discard;
+		int cnt = read_in_full(f->check_fd, &discard, 1);
+		if (cnt < 0)
+			die_errno("%s: error when reading the tail of sha1 file",
+				  f->name);
+		if (cnt)
+			die("%s: sha1 file has trailing garbage", f->name);
+		if (close(f->check_fd))
+			die_errno("%s: sha1 file error on close", f->name);
+	}
 	free(f);
 	return fd;
 }
@@ -101,10 +124,31 @@ struct sha1file *sha1fd(int fd, const char *name)
 	return sha1fd_throughput(fd, name, NULL);
 }
 
+struct sha1file *sha1fd_check(const char *name)
+{
+	int sink, check;
+	struct sha1file *f;
+
+	sink = open("/dev/null", O_WRONLY);
+	if (sink < 0)
+		return NULL;
+	check = open(name, O_RDONLY);
+	if (check < 0) {
+		int saved_errno = errno;
+		close(sink);
+		errno = saved_errno;
+		return NULL;
+	}
+	f = sha1fd(sink, name);
+	f->check_fd = check;
+	return f;
+}
+
 struct sha1file *sha1fd_throughput(int fd, const char *name, struct progress *tp)
 {
 	struct sha1file *f = xmalloc(sizeof(*f));
 	f->fd = fd;
+	f->check_fd = -1;
 	f->offset = 0;
 	f->total = 0;
 	f->tp = tp;
diff --git a/csum-file.h b/csum-file.h
index 294add2..6a7967c 100644
--- a/csum-file.h
+++ b/csum-file.h
@@ -6,6 +6,7 @@ struct progress;
 /* A SHA1-protected file */
 struct sha1file {
 	int fd;
+	int check_fd;
 	unsigned int offset;
 	git_SHA_CTX ctx;
 	off_t total;
@@ -21,6 +22,7 @@ struct sha1file {
 #define CSUM_FSYNC	2
 
 extern struct sha1file *sha1fd(int fd, const char *name);
+extern struct sha1file *sha1fd_check(const char *name);
 extern struct sha1file *sha1fd_throughput(int fd, const char *name, struct progress *tp);
 extern int sha1close(struct sha1file *, unsigned char *, unsigned int);
 extern int sha1write(struct sha1file *, void *, unsigned int);
diff --git a/pack-write.c b/pack-write.c
index f739a0f..16529c3 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -47,17 +47,22 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 	else
 		sorted_by_sha = list = last = NULL;
 
-	if (!index_name) {
-		static char tmpfile[PATH_MAX];
-		fd = odb_mkstemp(tmpfile, sizeof(tmpfile), "pack/tmp_idx_XXXXXX");
-		index_name = xstrdup(tmpfile);
+	if (opts->flags & WRITE_IDX_VERIFY) {
+		assert(index_name);
+		f = sha1fd_check(index_name);
 	} else {
-		unlink(index_name);
-		fd = open(index_name, O_CREAT|O_EXCL|O_WRONLY, 0600);
+		if (!index_name) {
+			static char tmpfile[PATH_MAX];
+			fd = odb_mkstemp(tmpfile, sizeof(tmpfile), "pack/tmp_idx_XXXXXX");
+			index_name = xstrdup(tmpfile);
+		} else {
+			unlink(index_name);
+			fd = open(index_name, O_CREAT|O_EXCL|O_WRONLY, 0600);
+		}
+		if (fd < 0)
+			die_errno("unable to create '%s'", index_name);
+		f = sha1fd(fd, index_name);
 	}
-	if (fd < 0)
-		die_errno("unable to create '%s'", index_name);
-	f = sha1fd(fd, index_name);
 
 	/* if last object's offset is >= 2^31 we should use index V2 */
 	index_version = (last_obj_offset >> 31) ? 2 : opts->version;
@@ -142,7 +147,8 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 	}
 
 	sha1write(f, sha1, 20);
-	sha1close(f, NULL, CSUM_FSYNC);
+	sha1close(f, NULL, ((opts->flags & WRITE_IDX_VERIFY)
+			    ? CSUM_CLOSE : CSUM_FSYNC));
 	git_SHA1_Final(sha1, &ctx);
 	return index_name;
 }
diff --git a/pack.h b/pack.h
index 953f57e..dddafdd 100644
--- a/pack.h
+++ b/pack.h
@@ -35,6 +35,10 @@ struct pack_header {
 #define PACK_IDX_SIGNATURE 0xff744f63	/* "\377tOc" */
 
 struct pack_idx_option {
+	unsigned flags;
+	/* flag bits */
+#define WRITE_IDX_VERIFY 01
+
 	uint32_t version;
 	uint32_t off32_limit;
 };
diff --git a/t/t5302-pack-index.sh b/t/t5302-pack-index.sh
index b34ea93..7c5fa03 100755
--- a/t/t5302-pack-index.sh
+++ b/t/t5302-pack-index.sh
@@ -65,6 +65,14 @@ test_expect_success \
     'cmp "test-1-${pack1}.idx" "1.idx" &&
      cmp "test-2-${pack2}.idx" "2.idx"'
 
+test_expect_success 'index-pack --verify on index version 1' '
+	git index-pack --verify "test-1-${pack1}.pack"
+'
+
+test_expect_success 'index-pack --verify on index version 2' '
+	git index-pack --verify "test-2-${pack2}.pack"
+'
+
 test_expect_success \
     'index v2: force some 64-bit offsets with pack-objects' \
     'pack3=$(git pack-objects --index-version=2,0x40000 test-3 <obj-list)'
@@ -93,6 +101,16 @@ test_expect_success OFF64_T \
     '64-bit offsets: index-pack result should match pack-objects one' \
     'cmp "test-3-${pack3}.idx" "3.idx"'
 
+test_expect_success OFF64_T 'index-pack --verify on 64-bit offset v2 (cheat)' '
+	# This cheats by knowing which lower offset should still be encoded
+	# in 64-bit representation.
+	git index-pack --verify --index-version=2,0x40000 "test-3-${pack3}.pack"
+'
+
+test_expect_failure OFF64_T 'index-pack --verify on 64-bit offset v2' '
+	git index-pack --verify "test-3-${pack3}.pack"
+'
+
 # returns the object number for given object in given pack index
 index_obj_nr()
 {
-- 
1.7.4.1.249.g4aa72

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 4/5] write_idx_file: need_large_offset() helper function
  2011-02-28  9:49 [PATCH 0/5] Using index-pack in place of verify-pack Junio C Hamano
                   ` (2 preceding siblings ...)
  2011-02-28  9:49 ` [PATCH 3/5] index-pack: --verify Junio C Hamano
@ 2011-02-28  9:49 ` Junio C Hamano
  2011-02-28  9:49 ` [PATCH 5/5] index-pack --verify: read anomalous offsets from v2 idx file Junio C Hamano
  2011-02-28 13:07 ` [PATCH 0/5] Using index-pack in place of verify-pack Sverre Rabbelier
  5 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28  9:49 UTC (permalink / raw)
  To: git; +Cc: Shawn O. Pearce

Extract the logic to determine if a given "offset" value needs to be
represented as a large offset in the .idx file into a separate helper
function.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---

 * This incidentally "fixes" the initial determination of index_version.
   Earlier it didn't take the off32-limit given from the command line into
   account when inspecting the offset of the last object in the pack, but
   this version does.  As off32-limit is primarily a debugging feature,
   this does not matter much, but as long as we are touching the
   surrounding code, it would be better to fix it.

 pack-write.c |   29 +++++++++++++++++++----------
 1 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/pack-write.c b/pack-write.c
index 16529c3..92e7eef 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -16,6 +16,11 @@ static int sha1_compare(const void *_a, const void *_b)
 	return hashcmp(a->sha1, b->sha1);
 }
 
+static int need_large_offset(off_t offset, const struct pack_idx_option *opts)
+{
+	return (offset >> 31) || (opts->off32_limit < offset);
+}
+
 /*
  * On entry *sha1 contains the pack content SHA1 hash, on exit it is
  * the SHA1 hash of sorted object names. The objects array passed in
@@ -65,7 +70,7 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 	}
 
 	/* if last object's offset is >= 2^31 we should use index V2 */
-	index_version = (last_obj_offset >> 31) ? 2 : opts->version;
+	index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version;
 
 	/* index versions 2 and above need a header */
 	if (index_version >= 2) {
@@ -125,8 +130,11 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 		list = sorted_by_sha;
 		for (i = 0; i < nr_objects; i++) {
 			struct pack_idx_entry *obj = *list++;
-			uint32_t offset = (obj->offset <= opts->off32_limit) ?
-				obj->offset : (0x80000000 | nr_large_offset++);
+			uint32_t offset;
+
+			offset = (need_large_offset(obj->offset, opts)
+				  ? (0x80000000 | nr_large_offset++)
+				  : obj->offset);
 			offset = htonl(offset);
 			sha1write(f, &offset, 4);
 		}
@@ -136,13 +144,14 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec
 		while (nr_large_offset) {
 			struct pack_idx_entry *obj = *list++;
 			uint64_t offset = obj->offset;
-			if (offset > opts->off32_limit) {
-				uint32_t split[2];
-				split[0] = htonl(offset >> 32);
-				split[1] = htonl(offset & 0xffffffff);
-				sha1write(f, split, 8);
-				nr_large_offset--;
-			}
+			uint32_t split[2];
+
+			if (!need_large_offset(offset, opts))
+				continue;
+			split[0] = htonl(offset >> 32);
+			split[1] = htonl(offset & 0xffffffff);
+			sha1write(f, split, 8);
+			nr_large_offset--;
 		}
 	}
 
-- 
1.7.4.1.249.g4aa72

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 5/5] index-pack --verify: read anomalous offsets from v2 idx file
  2011-02-28  9:49 [PATCH 0/5] Using index-pack in place of verify-pack Junio C Hamano
                   ` (3 preceding siblings ...)
  2011-02-28  9:49 ` [PATCH 4/5] write_idx_file: need_large_offset() helper function Junio C Hamano
@ 2011-02-28  9:49 ` Junio C Hamano
  2011-02-28 13:07 ` [PATCH 0/5] Using index-pack in place of verify-pack Sverre Rabbelier
  5 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28  9:49 UTC (permalink / raw)
  To: git; +Cc: Shawn O. Pearce

A pack v2 .idx file usually records offset using 64-bit representation
only when the offset does not fit within 31-bit, but you can handcraft
your .idx file to record smaller offset using 64-bit, storing all zero
in the upper 4-byte.  By inspecting the original idx file when running
index-pack --verify, encode such low offsets that do not need to be in
64-bit but are encoded using 64-bit just like the original idx file so
that we can still validate the pack/idx pair by comparing the idx file
recomputed with the original.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
---

 * This "fixes" the shortcoming of 3/5 that breaks on a handcrafted .idx
   in a way that does not even depend on the off32_limit logic.

 builtin/index-pack.c  |   48 ++++++++++++++++++++++++++++++++++++++++++++++++
 pack-write.c          |   18 +++++++++++++++++-
 pack.h                |    8 ++++++++
 t/t5302-pack-index.sh |    2 +-
 4 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 24a9a16..513fbbc 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -891,6 +891,51 @@ static int git_index_pack_config(const char *k, const char *v, void *cb)
 	return git_default_config(k, v, cb);
 }
 
+static int cmp_uint32(const void *a_, const void *b_)
+{
+	uint32_t a = *((uint32_t *)a_);
+	uint32_t b = *((uint32_t *)b_);
+
+	return (a < b) ? -1 : (a != b);
+}
+
+static void read_v2_anomalous_offsets(struct packed_git *p,
+				      struct pack_idx_option *opts)
+{
+	const uint32_t *idx1, *idx2;
+	uint32_t i;
+
+	/* The address of the 4-byte offset table */
+	idx1 = (((const uint32_t *)p->index_data)
+		+ 2 /* 8-byte header */
+		+ 256 /* fan out */
+		+ 5 * p->num_objects /* 20-byte SHA-1 table */
+		+ p->num_objects /* CRC32 table */
+		);
+
+	/* The address of the 8-byte offset table */
+	idx2 = idx1 + p->num_objects;
+
+	for (i = 0; i < p->num_objects; i++) {
+		uint32_t off = ntohl(idx1[i]);
+		if (!(off & 0x80000000))
+			continue;
+		off = off & 0x7fffffff;
+		if (idx2[off * 2])
+			continue;
+		/*
+		 * The real offset is ntohl(idx2[off * 2]) in high 4
+		 * octets, and ntohl(idx2[off * 2 + 1]) in low 4
+		 * octets.  But idx2[off * 2] is Zero!!!
+		 */
+		ALLOC_GROW(opts->anomaly, opts->anomaly_nr + 1, opts->anomaly_alloc);
+		opts->anomaly[opts->anomaly_nr++] = ntohl(idx2[off * 2 + 1]);
+	}
+
+	if (1 < opts->anomaly_nr)
+		qsort(opts->anomaly, opts->anomaly_nr, sizeof(uint32_t), cmp_uint32);
+}
+
 static void read_idx_option(struct pack_idx_option *opts, const char *pack_name)
 {
 	struct packed_git *p = add_packed_git(pack_name, strlen(pack_name), 1);
@@ -903,6 +948,9 @@ static void read_idx_option(struct pack_idx_option *opts, const char *pack_name)
 	/* Read the attributes from the existing idx file */
 	opts->version = p->index_version;
 
+	if (opts->version == 2)
+		read_v2_anomalous_offsets(p, opts);
+
 	/*
 	 * Get rid of the idx file as we do not need it anymore.
 	 * NEEDSWORK: extract this bit from free_pack_by_name() in
diff --git a/pack-write.c b/pack-write.c
index 92e7eef..9cd3bfb 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -16,9 +16,25 @@ static int sha1_compare(const void *_a, const void *_b)
 	return hashcmp(a->sha1, b->sha1);
 }
 
+static int cmp_uint32(const void *a_, const void *b_)
+{
+	uint32_t a = *((uint32_t *)a_);
+	uint32_t b = *((uint32_t *)b_);
+
+	return (a < b) ? -1 : (a != b);
+}
+
 static int need_large_offset(off_t offset, const struct pack_idx_option *opts)
 {
-	return (offset >> 31) || (opts->off32_limit < offset);
+	uint32_t ofsval;
+
+	if ((offset >> 31) || (opts->off32_limit < offset))
+		return 1;
+	if (!opts->anomaly_nr)
+		return 0;
+	ofsval = offset;
+	return !!bsearch(&ofsval, opts->anomaly, opts->anomaly_nr,
+			 sizeof(ofsval), cmp_uint32);
 }
 
 /*
diff --git a/pack.h b/pack.h
index dddafdd..722a54e 100644
--- a/pack.h
+++ b/pack.h
@@ -41,6 +41,14 @@ struct pack_idx_option {
 
 	uint32_t version;
 	uint32_t off32_limit;
+
+	/*
+	 * List of offsets that would fit within off32_limit but
+	 * need to be written out as 64-bit entity for byte-for-byte
+	 * verification.
+	 */
+	int anomaly_alloc, anomaly_nr;
+	uint32_t *anomaly;
 };
 
 extern void reset_pack_idx_option(struct pack_idx_option *);
diff --git a/t/t5302-pack-index.sh b/t/t5302-pack-index.sh
index 7c5fa03..76bcaca 100755
--- a/t/t5302-pack-index.sh
+++ b/t/t5302-pack-index.sh
@@ -107,7 +107,7 @@ test_expect_success OFF64_T 'index-pack --verify on 64-bit offset v2 (cheat)' '
 	git index-pack --verify --index-version=2,0x40000 "test-3-${pack3}.pack"
 '
 
-test_expect_failure OFF64_T 'index-pack --verify on 64-bit offset v2' '
+test_expect_success OFF64_T 'index-pack --verify on 64-bit offset v2' '
 	git index-pack --verify "test-3-${pack3}.pack"
 '
 
-- 
1.7.4.1.249.g4aa72

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] Using index-pack in place of verify-pack
  2011-02-28  9:49 [PATCH 0/5] Using index-pack in place of verify-pack Junio C Hamano
                   ` (4 preceding siblings ...)
  2011-02-28  9:49 ` [PATCH 5/5] index-pack --verify: read anomalous offsets from v2 idx file Junio C Hamano
@ 2011-02-28 13:07 ` Sverre Rabbelier
  2011-02-28 16:16   ` Junio C Hamano
  5 siblings, 1 reply; 15+ messages in thread
From: Sverre Rabbelier @ 2011-02-28 13:07 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Shawn O. Pearce

Heya,

On Mon, Feb 28, 2011 at 10:49, Junio C Hamano <gitster@pobox.com> wrote:
> The basic idea, which comes from Shawn Pearce, is to run index-pack on a
> packfile being verified, and compare the resulting .idx file with the
> existing one, and make sure they match.

Do you have any timings, on say, the kernel repo?

-- 
Cheers,

Sverre Rabbelier

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] Using index-pack in place of verify-pack
  2011-02-28 13:07 ` [PATCH 0/5] Using index-pack in place of verify-pack Sverre Rabbelier
@ 2011-02-28 16:16   ` Junio C Hamano
  2011-02-28 16:46     ` Sverre Rabbelier
  0 siblings, 1 reply; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28 16:16 UTC (permalink / raw)
  To: Sverre Rabbelier; +Cc: git, Shawn O. Pearce

Sverre Rabbelier <srabbelier@gmail.com> writes:

> On Mon, Feb 28, 2011 at 10:49, Junio C Hamano <gitster@pobox.com> wrote:
>> The basic idea, which comes from Shawn Pearce, is to run index-pack on a
>> packfile being verified, and compare the resulting .idx file with the
>> existing one, and make sure they match.
>
> Do you have any timings, on say, the kernel repo?

Not yet; the code has just become in good enough shape to be built and
measured by anybody interested --- didn't I say WIP somewhere?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] Using index-pack in place of verify-pack
  2011-02-28 16:16   ` Junio C Hamano
@ 2011-02-28 16:46     ` Sverre Rabbelier
  2011-02-28 16:58       ` Jeff King
  0 siblings, 1 reply; 15+ messages in thread
From: Sverre Rabbelier @ 2011-02-28 16:46 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git, Shawn O. Pearce

Heya,

On Mon, Feb 28, 2011 at 17:16, Junio C Hamano <gitster@pobox.com> wrote:
> Not yet; the code has just become in good enough shape to be built and
> measured by anybody interested --- didn't I say WIP somewhere?

Yes, but I figured that we'd want to see if it's worth pursuing at
all, e.g., whether we should try and get this in decent shape or not?

-- 
Cheers,

Sverre Rabbelier

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] Using index-pack in place of verify-pack
  2011-02-28 16:46     ` Sverre Rabbelier
@ 2011-02-28 16:58       ` Jeff King
  2011-02-28 17:01         ` Sverre Rabbelier
                           ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Jeff King @ 2011-02-28 16:58 UTC (permalink / raw)
  To: Sverre Rabbelier; +Cc: Junio C Hamano, git, Shawn O. Pearce

On Mon, Feb 28, 2011 at 05:46:19PM +0100, Sverre Rabbelier wrote:

> On Mon, Feb 28, 2011 at 17:16, Junio C Hamano <gitster@pobox.com> wrote:
> > Not yet; the code has just become in good enough shape to be built and
> > measured by anybody interested --- didn't I say WIP somewhere?
> 
> Yes, but I figured that we'd want to see if it's worth pursuing at
> all, e.g., whether we should try and get this in decent shape or not?

I'm not at all interested in this topic, but in the time it took the two
of you to write your emails, I did this:

  $ cd linux-2.6

  $ time git verify-pack \
      .git/objects/pack/pack-36c50f6082df404c26a699f2702946e0cce6208f.idx
  real    2m37.338s
  user    2m35.874s
  sys     0m1.348s

  $ time git index-pack --verify \
      .git/objects/pack/pack-36c50f6082df404c26a699f2702946e0cce6208f.pack
  real    1m37.208s
  user    1m36.106s
  sys     0m1.048s


OK, with the CPU time it probably took longer than your emails. But I
ate some ice cream while it computed. :)

-Peff

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] Using index-pack in place of verify-pack
  2011-02-28 16:58       ` Jeff King
@ 2011-02-28 17:01         ` Sverre Rabbelier
  2011-02-28 17:25         ` Shawn O. Pearce
  2011-02-28 19:50         ` Junio C Hamano
  2 siblings, 0 replies; 15+ messages in thread
From: Sverre Rabbelier @ 2011-02-28 17:01 UTC (permalink / raw)
  To: Jeff King; +Cc: Junio C Hamano, git, Shawn O. Pearce

Heya,

On Mon, Feb 28, 2011 at 17:58, Jeff King <peff@peff.net> wrote:
>  $ time git verify-pack \
>      .git/objects/pack/pack-36c50f6082df404c26a699f2702946e0cce6208f.idx
>  real    2m37.338s
>  user    2m35.874s
>  sys     0m1.348s
>
>  $ time git index-pack --verify \
>      .git/objects/pack/pack-36c50f6082df404c26a699f2702946e0cce6208f.pack
>  real    1m37.208s
>  user    1m36.106s
>  sys     0m1.048s

Wow, impressive results, a full minute shaved off.

> OK, with the CPU time it probably took longer than your emails. But I
> ate some ice cream while it computed. :)

You lucky man ;)

-- 
Cheers,

Sverre Rabbelier

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] Using index-pack in place of verify-pack
  2011-02-28 16:58       ` Jeff King
  2011-02-28 17:01         ` Sverre Rabbelier
@ 2011-02-28 17:25         ` Shawn O. Pearce
  2011-02-28 19:50         ` Junio C Hamano
  2 siblings, 0 replies; 15+ messages in thread
From: Shawn O. Pearce @ 2011-02-28 17:25 UTC (permalink / raw)
  To: Jeff King; +Cc: Sverre Rabbelier, Junio C Hamano, git

Jeff King <peff@peff.net> wrote:
>   $ cd linux-2.6
> 
>   $ time git verify-pack \
>       .git/objects/pack/pack-36c50f6082df404c26a699f2702946e0cce6208f.idx
>   real    2m37.338s
>   user    2m35.874s
>   sys     0m1.348s
> 
>   $ time git index-pack --verify \
>       .git/objects/pack/pack-36c50f6082df404c26a699f2702946e0cce6208f.pack
>   real    1m37.208s
>   user    1m36.106s
>   sys     0m1.048s

Not the improvement I had hoped. Your verify-pack ran much more
quickly than the fsck I was seeing that led me to suggest this to
Junio a few weeks ago. But saving a minute is still pretty good.

That index-pack run is basically what we get during clone, so
I don't really expect to improve on that. (However I have been
contemplating that the locality within the pack isn't optimal and
we can do better placement.)

-- 
Shawn.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/5] index-pack: --verify
  2011-02-28  9:49 ` [PATCH 3/5] index-pack: --verify Junio C Hamano
@ 2011-02-28 17:48   ` Shawn O. Pearce
  2011-02-28 18:54     ` Junio C Hamano
  0 siblings, 1 reply; 15+ messages in thread
From: Shawn O. Pearce @ 2011-02-28 17:48 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano <gitster@pobox.com> wrote:
> Given an existing .pack file and the .idx file that describes it,
> this new mode of operation reads and re-index the packfile and makes
> sure the existing .idx file matches the result byte-for-byte.
> 
> All the objects in the .pack file are validated during this operation as
> well.  Unlike verify-pack, which visits each object described in the .idx
> file in the SHA-1 order, index-pack efficiently exploits the delta-chain
> to avoid rebuilding the objects that are used as the base of deltified
> objects over and over again while validating the objects.  This should
> result in much quicker verification of the .pack file and its .idx file.
> 
> This version however cannot verify a .pack/.idx pair with a handcrafted v2
> index that uses 64-bit offset representation for offsets that would fit
> within 31-bit. You can create such an .idx file by giving a custom offset
> to --index-version option to the command.
> 
> Signed-off-by: Junio C Hamano <gitster@pobox.com>
> ---
> 
>  * This does not make much of a difference in a small project like git.git
>    itself, especially when the repository is packed with not very big depth
>    value.

However it is useful on larger repositories, like linux-2.6. I'd like
to see this series cleaned up and submitted, because as Peff's tests
shows, it shaves 1 minute off the linux-2.6 fsck.

Skim reading the code it mostly looked OK, though the NEEDSWORK
stuff has to be cleaned up. And I wonder if the series cannot be
flipped around a bit to put the 4/5 earlier and try to avoid a
stage where `index-pack --verify` doesn't do the right thing on
the hand-rolled lower 32 bit index limit.

In this patch I'm not happy about csum-file having this check_fd
part of its contents. But its probably the shortest way to inject
validation into index-pack without butchering a large part of its
index generation function.

-- 
Shawn.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/5] index-pack: --verify
  2011-02-28 17:48   ` Shawn O. Pearce
@ 2011-02-28 18:54     ` Junio C Hamano
  0 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28 18:54 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: git

"Shawn O. Pearce" <spearce@spearce.org> writes:

> Skim reading the code it mostly looked OK, though the NEEDSWORK
> stuff has to be cleaned up.

Yeah, the part that needs working is about thinking if such a clean-up
(i.e. extracting small piece of code from free-pack-by-name to make an
official API function to free storage for a packed_git while closing the
pack_index and its associated resource) is necessary.  There are other
remaining tasks such as integrating this into verify_pack() function,
which in turn requires to teach index-pack to produce the delta-chain
histograms (i.e. "verify-pack --verbose").  Also the codepath needs to be
taught about the plan for narrow clones---there are places that assume
everything referred to from objects need to exist.  These have higher
priority than this NEEDSWORK comment.

> ... And I wonder if the series cannot be
> flipped around a bit to put the 4/5 earlier and try to avoid a
> stage where `index-pack --verify` doesn't do the right thing on
> the hand-rolled lower 32 bit index limit.

Oh, I fully agree that 4/5 should be squashed into this in the final
round.  I preserved the progression of thought in this WIP as it
illustrated pitfalls to avoid rather nicely (iow, hoping to show others a
BCP of building a series by example).  The separation would also hopefully
have made the series easier to review.

> In this patch I'm not happy about csum-file having this check_fd
> part of its contents. But its probably the shortest way to inject
> validation into index-pack without butchering a large part of its
> index generation function.

Yeah, I am only 70% happy about the approach myself.  Patches and
improvements are welcome, of course.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] Using index-pack in place of verify-pack
  2011-02-28 16:58       ` Jeff King
  2011-02-28 17:01         ` Sverre Rabbelier
  2011-02-28 17:25         ` Shawn O. Pearce
@ 2011-02-28 19:50         ` Junio C Hamano
  2 siblings, 0 replies; 15+ messages in thread
From: Junio C Hamano @ 2011-02-28 19:50 UTC (permalink / raw)
  To: Jeff King; +Cc: Sverre Rabbelier, git, Shawn O. Pearce

Jeff King <peff@peff.net> writes:

> I'm not at all interested in this topic, but in the time it took the two
> of you to write your emails, I did this:
>
>   $ cd linux-2.6
>
>   $ time git verify-pack \
>       .git/objects/pack/pack-36c50f6082df404c26a699f2702946e0cce6208f.idx
>   real    2m37.338s
>   user    2m35.874s
>   sys     0m1.348s
>
>   $ time git index-pack --verify \
>       .git/objects/pack/pack-36c50f6082df404c26a699f2702946e0cce6208f.pack
>   real    1m37.208s
>   user    1m36.106s
>   sys     0m1.048s
>
>
> OK, with the CPU time it probably took longer than your emails. But I
> ate some ice cream while it computed. :)

I guess my box is showing its age.  It is 2m34s vs 3m25s for the kernel
repository packed with depth=50 (the default).

A pack created with insanely deep depth of course shows larger difference
(2m31s vs 4m38s for the same repository repacked with depth=8192).

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2011-02-28 19:50 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-02-28  9:49 [PATCH 0/5] Using index-pack in place of verify-pack Junio C Hamano
2011-02-28  9:49 ` [PATCH 1/5] index-pack: group the delta-base array entries also by type Junio C Hamano
2011-02-28  9:49 ` [PATCH 2/5] write_idx_file: introduce a struct to hold idx customization options Junio C Hamano
2011-02-28  9:49 ` [PATCH 3/5] index-pack: --verify Junio C Hamano
2011-02-28 17:48   ` Shawn O. Pearce
2011-02-28 18:54     ` Junio C Hamano
2011-02-28  9:49 ` [PATCH 4/5] write_idx_file: need_large_offset() helper function Junio C Hamano
2011-02-28  9:49 ` [PATCH 5/5] index-pack --verify: read anomalous offsets from v2 idx file Junio C Hamano
2011-02-28 13:07 ` [PATCH 0/5] Using index-pack in place of verify-pack Sverre Rabbelier
2011-02-28 16:16   ` Junio C Hamano
2011-02-28 16:46     ` Sverre Rabbelier
2011-02-28 16:58       ` Jeff King
2011-02-28 17:01         ` Sverre Rabbelier
2011-02-28 17:25         ` Shawn O. Pearce
2011-02-28 19:50         ` Junio C Hamano

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).