All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v2.1 4/6] index-pack: use streaming interface for collision test on large blobs
Date: Thu, 24 May 2012 20:55:44 +0700	[thread overview]
Message-ID: <1337867744-24704-1-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1337782191-10091-4-git-send-email-pclouds@gmail.com>

When putting whole objects in core is unavoidable, try to match object
type and size first before actually inflating.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 Bypassing has_sha1_object() and calling sha1_object_info() directly was
 mainly meant to help avoid unnecessary collision tests when the user runs
 this command on an in-repo pack. But I realized there's no sure way to
 check if the given pack (especially with --stdin) is in the repo. So I'll
 drop 5/6 and 6/6. People just have to set GIT_DIR=/nowhere.

 builtin/index-pack.c   | 82 +++++++++++++++++++++++++++++++++++++++++++++++---
 t/t5300-pack-object.sh |  5 +++
 2 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 9129299..8b5c1eb 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -9,6 +9,7 @@
 #include "progress.h"
 #include "fsck.h"
 #include "exec_cmd.h"
+#include "streaming.h"
 #include "thread-utils.h"
 
 static const char index_pack_usage[] =
@@ -621,31 +622,102 @@ static void find_delta_children(const union delta_base *base,
 	*last_index = last;
 }
 
+struct compare_data {
+	struct object_entry *entry;
+	struct git_istream *st;
+	unsigned char *buf;
+	unsigned long buf_size;
+};
+
+static int compare_objects(const unsigned char *buf, unsigned long size,
+			   void *cb_data)
+{
+	struct compare_data *data = cb_data;
+
+	if (data->buf_size < size) {
+		free(data->buf);
+		data->buf = xmalloc(size);
+		data->buf_size = size;
+	}
+
+	while (size) {
+		ssize_t len = read_istream(data->st, data->buf, size);
+		if (len == 0)
+			die(_("SHA1 COLLISION FOUND WITH %s !"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		if (len < 0)
+			die(_("unable to read %s"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		if (memcmp(buf, data->buf, len))
+			die(_("SHA1 COLLISION FOUND WITH %s !"),
+			    sha1_to_hex(data->entry->idx.sha1));
+		size -= len;
+		buf += len;
+	}
+	return 0;
+}
+
+static int check_collison(struct object_entry *entry)
+{
+	struct compare_data data;
+	enum object_type type;
+	unsigned long size;
+
+	if (entry->size <= big_file_threshold || entry->type != OBJ_BLOB)
+		return -1;
+
+	memset(&data, 0, sizeof(data));
+	data.entry = entry;
+	data.st = open_istream(entry->idx.sha1, &type, &size, NULL);
+	if (!data.st)
+		return -1;
+	if (size != entry->size || type != entry->type)
+		die(_("SHA1 COLLISION FOUND WITH %s !"),
+		    sha1_to_hex(entry->idx.sha1));
+	unpack_data(entry, compare_objects, &data);
+	close_istream(data.st);
+	free(data.buf);
+	return 0;
+}
+
 static void sha1_object(const void *data, struct object_entry *obj_entry,
 			unsigned long size, enum object_type type,
 			const unsigned char *sha1)
 {
 	void *new_data = NULL;
+	int collision_test_needed;
 
 	assert(data || obj_entry);
 
 	read_lock();
-	if (has_sha1_file(sha1)) {
+	collision_test_needed = has_sha1_file(sha1);
+	read_unlock();
+
+	if (collision_test_needed && !data) {
+		read_lock();
+		if (!check_collison(obj_entry))
+			collision_test_needed = 0;
+		read_unlock();
+	}
+	if (collision_test_needed) {
 		void *has_data;
 		enum object_type has_type;
 		unsigned long has_size;
-		if (!data)
-			data = new_data = get_data_from_pack(obj_entry);
+		read_lock();
+		has_type = sha1_object_info(sha1, &has_size);
+		if (has_type != type || has_size != size)
+			die(_("SHA1 COLLISION FOUND WITH %s !"), sha1_to_hex(sha1));
 		has_data = read_sha1_file(sha1, &has_type, &has_size);
 		read_unlock();
+		if (!data)
+			data = new_data = get_data_from_pack(obj_entry);
 		if (!has_data)
 			die(_("cannot read existing object %s"), sha1_to_hex(sha1));
 		if (size != has_size || type != has_type ||
 		    memcmp(data, has_data, size) != 0)
 			die(_("SHA1 COLLISION FOUND WITH %s !"), sha1_to_hex(sha1));
 		free(has_data);
-	} else
-		read_unlock();
+	}
 
 	if (strict) {
 		read_lock();
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index d9d856b..300ed91 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -418,4 +418,9 @@ test_expect_success \
     'test_must_fail git index-pack -o bad.idx test-3.pack 2>msg &&
      grep "SHA1 COLLISION FOUND" msg'
 
+test_expect_success \
+    'make sure index-pack detects the SHA1 collision (large blobs)' \
+    'test_must_fail git -c core.bigfilethreshold=1 index-pack -o bad.idx test-3.pack 2>msg &&
+     grep "SHA1 COLLISION FOUND" msg'
+
 test_done
-- 
1.7.10.2.549.g9354186

  parent reply	other threads:[~2012-05-24 13:59 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-05-16 12:50 [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Nguyễn Thái Ngọc Duy
2012-05-16 12:50 ` [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
2012-05-18 22:20   ` Junio C Hamano
2012-05-19  5:31     ` Nguyen Thai Ngoc Duy
2012-05-18 22:15 ` [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Junio C Hamano
2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 2/6] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 3/6] index-pack: factor out unpack core from get_data_from_pack Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs Nguyễn Thái Ngọc Duy
2012-05-23 16:03     ` Junio C Hamano
2012-05-24 13:55     ` Nguyễn Thái Ngọc Duy [this message]
2012-05-23 14:09   ` [PATCH v2 5/6] index-pack: avoid collision test when verifying in-repo pack Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects Nguyễn Thái Ngọc Duy
2012-05-23 14:24     ` Nguyen Thai Ngoc Duy
2012-05-23 16:01       ` Junio C Hamano
2012-05-24 13:12         ` Nguyen Thai Ngoc Duy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1337867744-24704-1-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.