git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
Date: Wed, 14 Mar 2012 21:40:55 +0700	[thread overview]
Message-ID: <1331736055-21019-1-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <7vfwdq8914.fsf@alter.siamese.dyndns.org>

When we fetch or push, usually "git rev-list --verify-objects --not
--all --stdin" is used to make sure that all objects between existing
refs and new refs are good. This means no gaps in between, all objects
are well-formed, object content agrees with its sha-1 signature.

For the last one, --verify-objects calls check_sha1_signature() via
parse_object(). check_sha1_signature() is an expensive operation,
especially when new refs are far away from existing ones because all
objects in between are re-hashed. Because objects coming from the new
pack are already hashed by index-pack, we can trust their
integrity. The only objects left to check are existing ones in repo
but has no connection to any current refs.

Pass the new pack id down to--verify-objects and skip
check_sha1_signature() on objects from that pack.

As an (extreme) example, a repository is created with only one commit:
e83c516 (Initial revision of "git", the information manager from hell
- 2005-04-07). The rest of git.git is fetched on top. Without the
patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125656, done.
remote: Compressing objects: 100% (33280/33280), done.
remote: Total 125656 (delta 92585), reused 123464 (delta 90682)
Receiving objects: 100% (125656/125656), 34.60 MiB | 8.47 MiB/s, done.
Resolving deltas: 100% (92585/92585), done.
From file:///home/pclouds/t/test/
 * branch            HEAD       -> FETCH_HEAD

real    1m30.437s
user    1m31.338s
sys     0m1.687s

With the patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125656, done.
remote: Compressing objects: 100% (33280/33280), done.
remote: Total 125656 (delta 92585), reused 123464 (delta 90682)
Receiving objects: 100% (125656/125656), 34.60 MiB | 7.86 MiB/s, done.
Resolving deltas: 100% (92585/92585), done.
From file:///home/pclouds/t/test/
 * branch            HEAD       -> FETCH_HEAD

real    0m52.182s
user    0m53.151s
sys     0m1.465s

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Another try. Exclude all objects reachable from refs and the new pack
 from --verify-objects tests.
 
 I keep CHECK_CONNECT_QUIET change because it seems a good change
 anyway. transport.[ch] changes are dropped because we can detect pack
 case based on pack_lockfile.

 builtin/fetch.c        |   12 +++++++-----
 builtin/receive-pack.c |    7 +++----
 builtin/rev-list.c     |   20 ++++++++++++++++++--
 connected.c            |   31 +++++++++++++++++++++++--------
 connected.h            |    5 ++++-
 revision.c             |    5 +++++
 revision.h             |    3 +++
 7 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 65f5f9b..cc84d04 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -368,7 +368,7 @@ static int iterate_ref_map(void *cb_data, unsigned char sha1[20])
 }
 
 static int store_updated_refs(const char *raw_url, const char *remote_name,
-		struct ref *ref_map)
+			      struct ref *ref_map, const char *pack_lockfile)
 {
 	FILE *fp;
 	struct commit *commit;
@@ -389,7 +389,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name,
 		url = xstrdup("foreign");
 
 	rm = ref_map;
-	if (check_everything_connected(iterate_ref_map, 0, &rm)) {
+	if (check_everything_connected(iterate_ref_map, 0, pack_lockfile, &rm)) {
 		rc = error(_("%s did not send all necessary objects\n"), url);
 		goto abort;
 	}
@@ -516,7 +516,8 @@ static int quickfetch(struct ref *ref_map)
 	 */
 	if (depth)
 		return -1;
-	return check_everything_connected(iterate_ref_map, 1, &rm);
+	return check_everything_connected(iterate_ref_map,
+					  CHECK_CONNECT_QUIET, NULL, &rm);
 }
 
 static int fetch_refs(struct transport *transport, struct ref *ref_map)
@@ -526,8 +527,9 @@ static int fetch_refs(struct transport *transport, struct ref *ref_map)
 		ret = transport_fetch_refs(transport, ref_map);
 	if (!ret)
 		ret |= store_updated_refs(transport->url,
-				transport->remote->name,
-				ref_map);
+					  transport->remote->name,
+					  ref_map,
+					  transport->pack_lockfile);
 	transport_unlock_pack(transport);
 	return ret;
 }
diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
index 0afb8b2..01d37f6 100644
--- a/builtin/receive-pack.c
+++ b/builtin/receive-pack.c
@@ -40,6 +40,7 @@ static int auto_gc = 1;
 static const char *head_name;
 static void *head_name_to_free;
 static int sent_capabilities;
+static const char *pack_lockfile;
 
 static enum deny_action parse_deny_action(const char *var, const char *value)
 {
@@ -669,7 +670,7 @@ static void set_connectivity_errors(struct command *commands)
 	for (cmd = commands; cmd; cmd = cmd->next) {
 		struct command *singleton = cmd;
 		if (!check_everything_connected(command_singleton_iterator,
-						0, &singleton))
+						0, pack_lockfile, &singleton))
 			continue;
 		cmd->error_string = "missing necessary objects";
 	}
@@ -705,7 +706,7 @@ static void execute_commands(struct command *commands, const char *unpacker_erro
 
 	cmd = commands;
 	if (check_everything_connected(iterate_receive_command_list,
-				       0, &cmd))
+				       0, pack_lockfile, &cmd))
 		set_connectivity_errors(commands);
 
 	if (run_receive_hook(commands, pre_receive_hook, 0)) {
@@ -797,8 +798,6 @@ static const char *parse_pack_header(struct pack_header *hdr)
 	}
 }
 
-static const char *pack_lockfile;
-
 static const char *unpack(void)
 {
 	struct pack_header hdr;
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index 4c4d404..21d714b 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -180,8 +180,24 @@ static void finish_object(struct object *obj,
 	struct rev_list_info *info = cb_data;
 	if (obj->type == OBJ_BLOB && !has_sha1_file(obj->sha1))
 		die("missing blob object '%s'", sha1_to_hex(obj->sha1));
-	if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
-		parse_object(obj->sha1);
+	if (info->revs->verify_objects &&
+	    !obj->parsed && obj->type != OBJ_COMMIT) {
+		const char *safe_pack = info->revs->safe_pack;
+		struct object_info oi;
+		int safe = 0;
+		memset(&oi, 0, sizeof(oi));
+		if (*safe_pack &&
+		    sha1_object_info_extended(obj->sha1, &oi) >= 0 &&
+		    oi.whence == OI_PACKED) {
+			const char *pack = oi.u.packed.pack->pack_name;
+			int len = strlen(pack);
+			assert(strncmp(pack + len - 51, "/pack-", 6) == 0);
+			assert(strcmp(pack + len - 5, ".pack") == 0);
+			safe = !memcmp(safe_pack, pack + len - 45, 40);
+		}
+		if (!safe)
+			parse_object(obj->sha1);
+	}
 }
 
 static void show_object(struct object *obj,
diff --git a/connected.c b/connected.c
index d762423..af81049 100644
--- a/connected.c
+++ b/connected.c
@@ -14,28 +14,43 @@
  *
  * Returns 0 if everything is connected, non-zero otherwise.
  */
-int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
+int check_everything_connected(sha1_iterate_fn fn, unsigned int flags,
+			       const char *pack_lockfile, void *cb_data)
 {
 	struct child_process rev_list;
-	const char *argv[] = {"rev-list", "--verify-objects",
-			      "--stdin", "--not", "--all", NULL, NULL};
+	const char *argv[] = {"rev-list", "--verify-objects", "--stdin",
+			      "--not", "--all", NULL, NULL, NULL, NULL };
 	char commit[41];
 	unsigned char sha1[20];
-	int err = 0;
+	int err = 0, ac = 5;
+	struct strbuf packfile = STRBUF_INIT;
 
 	if (fn(cb_data, sha1))
 		return err;
 
-	if (quiet)
-		argv[5] = "--quiet";
+	if (flags & CHECK_CONNECT_QUIET)
+		argv[ac++] = "--quiet";
+	if (pack_lockfile) {
+		strbuf_addstr(&packfile, pack_lockfile);
+		/* xxx/pack-%40s.keep */
+		assert(strcmp(packfile.buf + packfile.len - 5, ".keep") == 0);
+		assert(strncmp(packfile.buf + packfile.len - 51, "/pack-", 6) == 0);
+		strbuf_setlen(&packfile, packfile.len - 5);
+		strbuf_remove(&packfile, 0, packfile.len - 40);
+		argv[ac++] = "--safe-pack";
+		argv[ac++] = packfile.buf;
+	}
+	assert(ac < ARRAY_SIZE(argv) && argv[ac] == NULL);
 
 	memset(&rev_list, 0, sizeof(rev_list));
 	rev_list.argv = argv;
 	rev_list.git_cmd = 1;
 	rev_list.in = -1;
 	rev_list.no_stdout = 1;
-	rev_list.no_stderr = quiet;
-	if (start_command(&rev_list))
+	rev_list.no_stderr = flags & CHECK_CONNECT_QUIET;
+	err = start_command(&rev_list);
+	strbuf_release(&packfile);
+	if (err)
 		return error(_("Could not run 'git rev-list'"));
 
 	sigchain_push(SIGPIPE, SIG_IGN);
diff --git a/connected.h b/connected.h
index 7e4585a..ed0b559 100644
--- a/connected.h
+++ b/connected.h
@@ -1,6 +1,8 @@
 #ifndef CONNECTED_H
 #define CONNECTED_H
 
+#define CHECK_CONNECT_QUIET  1
+
 /*
  * Take callback data, and return next object name in the buffer.
  * When called after returning the name for the last object, return -1
@@ -15,6 +17,7 @@ typedef int (*sha1_iterate_fn)(void *, unsigned char [20]);
  *
  * Return 0 if Ok, non zero otherwise (i.e. some missing objects)
  */
-extern int check_everything_connected(sha1_iterate_fn, int quiet, void *cb_data);
+extern int check_everything_connected(sha1_iterate_fn, unsigned int flags,
+				      const char *pack_lockfile, void *cb_data);
 
 #endif /* CONNECTED_H */
diff --git a/revision.c b/revision.c
index 819ff01..1c2d017 100644
--- a/revision.c
+++ b/revision.c
@@ -1451,6 +1451,11 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg
 		revs->tree_objects = 1;
 		revs->blob_objects = 1;
 		revs->verify_objects = 1;
+	} else if (!strcmp(arg, "--safe-pack")) {
+		if (strlen(argv[1]) != 40)
+			die("--safe-pack requires an SHA-1 as pack id, not %s", argv[1]);
+		strcpy(revs->safe_pack, argv[1]);
+		return 2;
 	} else if (!strcmp(arg, "--unpacked")) {
 		revs->unpacked = 1;
 	} else if (!prefixcmp(arg, "--unpacked=")) {
diff --git a/revision.h b/revision.h
index b8e9223..2790910 100644
--- a/revision.h
+++ b/revision.h
@@ -168,6 +168,9 @@ struct rev_info {
 	int count_left;
 	int count_right;
 	int count_same;
+
+	/* --verify-objects */
+	char safe_pack[41];
 };
 
 #define REV_TREE_SAME		0
-- 
1.7.8.36.g69ee2

  parent reply	other threads:[~2012-03-14 14:43 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-02-27 15:39 [PATCH] Perform cheaper connectivity check when pack is used as medium Nguyễn Thái Ngọc Duy
2012-02-27 18:14 ` Junio C Hamano
2012-02-28 13:18   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
2012-02-28 15:40     ` Johannes Sixt
2012-02-28 15:47       ` Andreas Ericsson
2012-02-29  1:41       ` Nguyen Thai Ngoc Duy
2012-03-02  6:10     ` Junio C Hamano
2012-03-02 14:05       ` Nguyen Thai Ngoc Duy
2012-03-02 17:31         ` Junio C Hamano
2012-03-03  3:05           ` Nguyen Thai Ngoc Duy
2012-03-03  6:59             ` Junio C Hamano
2012-03-03 10:09               ` Nguyen Thai Ngoc Duy
2012-03-06 11:20               ` Nguyen Thai Ngoc Duy
2012-03-06 15:56                 ` Junio C Hamano
2012-03-14 14:40               ` Nguyễn Thái Ngọc Duy [this message]
2012-03-14 19:05                 ` [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack Junio C Hamano
2012-03-15 21:09                 ` Junio C Hamano
2012-03-15 21:57                   ` Junio C Hamano
2012-03-16  2:28                     ` Nguyen Thai Ngoc Duy
2012-03-16  3:42                       ` Junio C Hamano
2012-03-16  3:42                         ` Nguyen Thai Ngoc Duy
2012-03-16  2:24                   ` Nguyen Thai Ngoc Duy
2012-03-16  3:46                     ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1331736055-21019-1-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).