All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack
Date: Wed, 14 Mar 2012 21:40:55 +0700	[thread overview]
Message-ID: <1331736055-21019-1-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <7vfwdq8914.fsf@alter.siamese.dyndns.org>

When we fetch or push, usually "git rev-list --verify-objects --not
--all --stdin" is used to make sure that all objects between existing
refs and new refs are good. This means no gaps in between, all objects
are well-formed, object content agrees with its sha-1 signature.

For the last one, --verify-objects calls check_sha1_signature() via
parse_object(). check_sha1_signature() is an expensive operation,
especially when new refs are far away from existing ones because all
objects in between are re-hashed. Because objects coming from the new
pack are already hashed by index-pack, we can trust their
integrity. The only objects left to check are existing ones in repo
but has no connection to any current refs.

Pass the new pack id down to--verify-objects and skip
check_sha1_signature() on objects from that pack.

As an (extreme) example, a repository is created with only one commit:
e83c516 (Initial revision of "git", the information manager from hell
- 2005-04-07). The rest of git.git is fetched on top. Without the
patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125656, done.
remote: Compressing objects: 100% (33280/33280), done.
remote: Total 125656 (delta 92585), reused 123464 (delta 90682)
Receiving objects: 100% (125656/125656), 34.60 MiB | 8.47 MiB/s, done.
Resolving deltas: 100% (92585/92585), done.
From file:///home/pclouds/t/test/
 * branch            HEAD       -> FETCH_HEAD

real    1m30.437s
user    1m31.338s
sys     0m1.687s

With the patch:

$ time git fetch file:///home/pclouds/w/git/.git
remote: Counting objects: 125656, done.
remote: Compressing objects: 100% (33280/33280), done.
remote: Total 125656 (delta 92585), reused 123464 (delta 90682)
Receiving objects: 100% (125656/125656), 34.60 MiB | 7.86 MiB/s, done.
Resolving deltas: 100% (92585/92585), done.
From file:///home/pclouds/t/test/
 * branch            HEAD       -> FETCH_HEAD

real    0m52.182s
user    0m53.151s
sys     0m1.465s

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Another try. Exclude all objects reachable from refs and the new pack
 from --verify-objects tests.
 
 I keep CHECK_CONNECT_QUIET change because it seems a good change
 anyway. transport.[ch] changes are dropped because we can detect pack
 case based on pack_lockfile.

 builtin/fetch.c        |   12 +++++++-----
 builtin/receive-pack.c |    7 +++----
 builtin/rev-list.c     |   20 ++++++++++++++++++--
 connected.c            |   31 +++++++++++++++++++++++--------
 connected.h            |    5 ++++-
 revision.c             |    5 +++++
 revision.h             |    3 +++
 7 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/builtin/fetch.c b/builtin/fetch.c
index 65f5f9b..cc84d04 100644
--- a/builtin/fetch.c
+++ b/builtin/fetch.c
@@ -368,7 +368,7 @@ static int iterate_ref_map(void *cb_data, unsigned char sha1[20])
 }
 
 static int store_updated_refs(const char *raw_url, const char *remote_name,
-		struct ref *ref_map)
+			      struct ref *ref_map, const char *pack_lockfile)
 {
 	FILE *fp;
 	struct commit *commit;
@@ -389,7 +389,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name,
 		url = xstrdup("foreign");
 
 	rm = ref_map;
-	if (check_everything_connected(iterate_ref_map, 0, &rm)) {
+	if (check_everything_connected(iterate_ref_map, 0, pack_lockfile, &rm)) {
 		rc = error(_("%s did not send all necessary objects\n"), url);
 		goto abort;
 	}
@@ -516,7 +516,8 @@ static int quickfetch(struct ref *ref_map)
 	 */
 	if (depth)
 		return -1;
-	return check_everything_connected(iterate_ref_map, 1, &rm);
+	return check_everything_connected(iterate_ref_map,
+					  CHECK_CONNECT_QUIET, NULL, &rm);
 }
 
 static int fetch_refs(struct transport *transport, struct ref *ref_map)
@@ -526,8 +527,9 @@ static int fetch_refs(struct transport *transport, struct ref *ref_map)
 		ret = transport_fetch_refs(transport, ref_map);
 	if (!ret)
 		ret |= store_updated_refs(transport->url,
-				transport->remote->name,
-				ref_map);
+					  transport->remote->name,
+					  ref_map,
+					  transport->pack_lockfile);
 	transport_unlock_pack(transport);
 	return ret;
 }
diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
index 0afb8b2..01d37f6 100644
--- a/builtin/receive-pack.c
+++ b/builtin/receive-pack.c
@@ -40,6 +40,7 @@ static int auto_gc = 1;
 static const char *head_name;
 static void *head_name_to_free;
 static int sent_capabilities;
+static const char *pack_lockfile;
 
 static enum deny_action parse_deny_action(const char *var, const char *value)
 {
@@ -669,7 +670,7 @@ static void set_connectivity_errors(struct command *commands)
 	for (cmd = commands; cmd; cmd = cmd->next) {
 		struct command *singleton = cmd;
 		if (!check_everything_connected(command_singleton_iterator,
-						0, &singleton))
+						0, pack_lockfile, &singleton))
 			continue;
 		cmd->error_string = "missing necessary objects";
 	}
@@ -705,7 +706,7 @@ static void execute_commands(struct command *commands, const char *unpacker_erro
 
 	cmd = commands;
 	if (check_everything_connected(iterate_receive_command_list,
-				       0, &cmd))
+				       0, pack_lockfile, &cmd))
 		set_connectivity_errors(commands);
 
 	if (run_receive_hook(commands, pre_receive_hook, 0)) {
@@ -797,8 +798,6 @@ static const char *parse_pack_header(struct pack_header *hdr)
 	}
 }
 
-static const char *pack_lockfile;
-
 static const char *unpack(void)
 {
 	struct pack_header hdr;
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index 4c4d404..21d714b 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -180,8 +180,24 @@ static void finish_object(struct object *obj,
 	struct rev_list_info *info = cb_data;
 	if (obj->type == OBJ_BLOB && !has_sha1_file(obj->sha1))
 		die("missing blob object '%s'", sha1_to_hex(obj->sha1));
-	if (info->revs->verify_objects && !obj->parsed && obj->type != OBJ_COMMIT)
-		parse_object(obj->sha1);
+	if (info->revs->verify_objects &&
+	    !obj->parsed && obj->type != OBJ_COMMIT) {
+		const char *safe_pack = info->revs->safe_pack;
+		struct object_info oi;
+		int safe = 0;
+		memset(&oi, 0, sizeof(oi));
+		if (*safe_pack &&
+		    sha1_object_info_extended(obj->sha1, &oi) >= 0 &&
+		    oi.whence == OI_PACKED) {
+			const char *pack = oi.u.packed.pack->pack_name;
+			int len = strlen(pack);
+			assert(strncmp(pack + len - 51, "/pack-", 6) == 0);
+			assert(strcmp(pack + len - 5, ".pack") == 0);
+			safe = !memcmp(safe_pack, pack + len - 45, 40);
+		}
+		if (!safe)
+			parse_object(obj->sha1);
+	}
 }
 
 static void show_object(struct object *obj,
diff --git a/connected.c b/connected.c
index d762423..af81049 100644
--- a/connected.c
+++ b/connected.c
@@ -14,28 +14,43 @@
  *
  * Returns 0 if everything is connected, non-zero otherwise.
  */
-int check_everything_connected(sha1_iterate_fn fn, int quiet, void *cb_data)
+int check_everything_connected(sha1_iterate_fn fn, unsigned int flags,
+			       const char *pack_lockfile, void *cb_data)
 {
 	struct child_process rev_list;
-	const char *argv[] = {"rev-list", "--verify-objects",
-			      "--stdin", "--not", "--all", NULL, NULL};
+	const char *argv[] = {"rev-list", "--verify-objects", "--stdin",
+			      "--not", "--all", NULL, NULL, NULL, NULL };
 	char commit[41];
 	unsigned char sha1[20];
-	int err = 0;
+	int err = 0, ac = 5;
+	struct strbuf packfile = STRBUF_INIT;
 
 	if (fn(cb_data, sha1))
 		return err;
 
-	if (quiet)
-		argv[5] = "--quiet";
+	if (flags & CHECK_CONNECT_QUIET)
+		argv[ac++] = "--quiet";
+	if (pack_lockfile) {
+		strbuf_addstr(&packfile, pack_lockfile);
+		/* xxx/pack-%40s.keep */
+		assert(strcmp(packfile.buf + packfile.len - 5, ".keep") == 0);
+		assert(strncmp(packfile.buf + packfile.len - 51, "/pack-", 6) == 0);
+		strbuf_setlen(&packfile, packfile.len - 5);
+		strbuf_remove(&packfile, 0, packfile.len - 40);
+		argv[ac++] = "--safe-pack";
+		argv[ac++] = packfile.buf;
+	}
+	assert(ac < ARRAY_SIZE(argv) && argv[ac] == NULL);
 
 	memset(&rev_list, 0, sizeof(rev_list));
 	rev_list.argv = argv;
 	rev_list.git_cmd = 1;
 	rev_list.in = -1;
 	rev_list.no_stdout = 1;
-	rev_list.no_stderr = quiet;
-	if (start_command(&rev_list))
+	rev_list.no_stderr = flags & CHECK_CONNECT_QUIET;
+	err = start_command(&rev_list);
+	strbuf_release(&packfile);
+	if (err)
 		return error(_("Could not run 'git rev-list'"));
 
 	sigchain_push(SIGPIPE, SIG_IGN);
diff --git a/connected.h b/connected.h
index 7e4585a..ed0b559 100644
--- a/connected.h
+++ b/connected.h
@@ -1,6 +1,8 @@
 #ifndef CONNECTED_H
 #define CONNECTED_H
 
+#define CHECK_CONNECT_QUIET  1
+
 /*
  * Take callback data, and return next object name in the buffer.
  * When called after returning the name for the last object, return -1
@@ -15,6 +17,7 @@ typedef int (*sha1_iterate_fn)(void *, unsigned char [20]);
  *
  * Return 0 if Ok, non zero otherwise (i.e. some missing objects)
  */
-extern int check_everything_connected(sha1_iterate_fn, int quiet, void *cb_data);
+extern int check_everything_connected(sha1_iterate_fn, unsigned int flags,
+				      const char *pack_lockfile, void *cb_data);
 
 #endif /* CONNECTED_H */
diff --git a/revision.c b/revision.c
index 819ff01..1c2d017 100644
--- a/revision.c
+++ b/revision.c
@@ -1451,6 +1451,11 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg
 		revs->tree_objects = 1;
 		revs->blob_objects = 1;
 		revs->verify_objects = 1;
+	} else if (!strcmp(arg, "--safe-pack")) {
+		if (strlen(argv[1]) != 40)
+			die("--safe-pack requires an SHA-1 as pack id, not %s", argv[1]);
+		strcpy(revs->safe_pack, argv[1]);
+		return 2;
 	} else if (!strcmp(arg, "--unpacked")) {
 		revs->unpacked = 1;
 	} else if (!prefixcmp(arg, "--unpacked=")) {
diff --git a/revision.h b/revision.h
index b8e9223..2790910 100644
--- a/revision.h
+++ b/revision.h
@@ -168,6 +168,9 @@ struct rev_info {
 	int count_left;
 	int count_right;
 	int count_same;
+
+	/* --verify-objects */
+	char safe_pack[41];
 };
 
 #define REV_TREE_SAME		0
-- 
1.7.8.36.g69ee2

  parent reply	other threads:[~2012-03-14 14:43 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-02-27 15:39 [PATCH] Perform cheaper connectivity check when pack is used as medium Nguyễn Thái Ngọc Duy
2012-02-27 18:14 ` Junio C Hamano
2012-02-28 13:18   ` [PATCH v2] " Nguyễn Thái Ngọc Duy
2012-02-28 15:40     ` Johannes Sixt
2012-02-28 15:47       ` Andreas Ericsson
2012-02-29  1:41       ` Nguyen Thai Ngoc Duy
2012-03-02  6:10     ` Junio C Hamano
2012-03-02 14:05       ` Nguyen Thai Ngoc Duy
2012-03-02 17:31         ` Junio C Hamano
2012-03-03  3:05           ` Nguyen Thai Ngoc Duy
2012-03-03  6:59             ` Junio C Hamano
2012-03-03 10:09               ` Nguyen Thai Ngoc Duy
2012-03-06 11:20               ` Nguyen Thai Ngoc Duy
2012-03-06 15:56                 ` Junio C Hamano
2012-03-14 14:40               ` Nguyễn Thái Ngọc Duy [this message]
2012-03-14 19:05                 ` [PATCH] {fetch,receive}-pack: skip sha-1 integrity test on objects from new pack Junio C Hamano
2012-03-15 21:09                 ` Junio C Hamano
2012-03-15 21:57                   ` Junio C Hamano
2012-03-16  2:28                     ` Nguyen Thai Ngoc Duy
2012-03-16  3:42                       ` Junio C Hamano
2012-03-16  3:42                         ` Nguyen Thai Ngoc Duy
2012-03-16  2:24                   ` Nguyen Thai Ngoc Duy
2012-03-16  3:46                     ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1331736055-21019-1-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.