git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v2 2/6] index-pack: use streaming interface on large blobs (most of the time)
Date: Wed, 23 May 2012 21:09:47 +0700	[thread overview]
Message-ID: <1337782191-10091-2-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1337782191-10091-1-git-send-email-pclouds@gmail.com>

unpack_raw_entry() will not allocate and return decompressed blobs if
they are larger than core.bigFileThreshold. sha1_object() may not be
called on those objects because there's no actual content.

sha1_object() is called later on those objects, where we can safely
use get_data_from_pack() to retrieve blob content for checking.
However we always do that when we definitely need the blob
content. And we often don't.

There are two cases when we may need object content. The first case is
when we find an in-repo blob with the same SHA-1. We need to do
collision test, byte-on-byte. If this test is on, the blob must be
loaded on memory (i.e. no streaming). Normally (e.g. in
fetch/pull/clone) this does not happen because git avoid to send
objects that client already has.

The other case is when --strict is specified and the object in
question is not a blob, which can't happen in reality becase we deal
with large _blobs_ here.

Note: --verify (or git-verify-pack) a pack from current repository
will trigger collision test on every object in the pack, which
effectively disables this patch. This could be easily worked around by
setting GIT_DIR to an imaginary place with no packs.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c | 52 ++++++++++++++++++++++++++++++++++++++++++++--------
 t/t1050-large.sh     |  5 +++++
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index a744856..c7c2b88 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -392,9 +392,10 @@ static int is_delta_type(enum object_type type)
 static void *unpack_entry_data(unsigned long offset, unsigned long size,
 			       enum object_type type, unsigned char *sha1)
 {
+	static char fixed_buf[8192];
 	int status;
 	git_zstream stream;
-	void *buf = xmalloc(size);
+	void *buf;
 	git_SHA_CTX c;
 	char hdr[32];
 	int hdrlen;
@@ -405,11 +406,15 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		git_SHA1_Update(&c, hdr, hdrlen);
 	} else
 		sha1 = NULL;
+	if (type == OBJ_BLOB && size > big_file_threshold)
+		buf = fixed_buf;
+	else
+		buf = xmalloc(size);
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
 
 	do {
 		unsigned char *last_out = stream.next_out;
@@ -419,13 +424,17 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		use(input_len - stream.avail_in);
 		if (sha1)
 			git_SHA1_Update(&c, last_out, stream.next_out - last_out);
+		if (buf == fixed_buf) {
+			stream.next_out = buf;
+			stream.avail_out = sizeof(fixed_buf);
+		}
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
 		bad_object(offset, _("inflate returned %d"), status);
 	git_inflate_end(&stream);
 	if (sha1)
 		git_SHA1_Final(sha1, &c);
-	return buf;
+	return buf == fixed_buf ? NULL : buf;
 }
 
 static void *unpack_raw_entry(struct object_entry *obj,
@@ -591,14 +600,21 @@ static void find_delta_children(const union delta_base *base,
 	*last_index = last;
 }
 
-static void sha1_object(const void *data, unsigned long size,
-			enum object_type type, const unsigned char *sha1)
+static void sha1_object(const void *data, struct object_entry *obj_entry,
+			unsigned long size, enum object_type type,
+			const unsigned char *sha1)
 {
+	void *new_data = NULL;
+
+	assert(data || obj_entry);
+
 	read_lock();
 	if (has_sha1_file(sha1)) {
 		void *has_data;
 		enum object_type has_type;
 		unsigned long has_size;
+		if (!data)
+			data = new_data = get_data_from_pack(obj_entry);
 		has_data = read_sha1_file(sha1, &has_type, &has_size);
 		read_unlock();
 		if (!has_data)
@@ -623,6 +639,9 @@ static void sha1_object(const void *data, unsigned long size,
 			int eaten;
 			void *buf = (void *) data;
 
+			if (!buf)
+				buf = new_data = get_data_from_pack(obj_entry);
+
 			/*
 			 * we do not need to free the memory here, as the
 			 * buf is deleted by the caller.
@@ -647,6 +666,8 @@ static void sha1_object(const void *data, unsigned long size,
 		}
 		read_unlock();
 	}
+
+	free(new_data);
 }
 
 /*
@@ -730,7 +751,7 @@ static void resolve_delta(struct object_entry *delta_obj,
 		bad_object(delta_obj->idx.offset, _("failed to apply delta"));
 	hash_sha1_file(result->data, result->size,
 		       typename(delta_obj->real_type), delta_obj->idx.sha1);
-	sha1_object(result->data, result->size, delta_obj->real_type,
+	sha1_object(result->data, NULL, result->size, delta_obj->real_type,
 		    delta_obj->idx.sha1);
 	counter_lock();
 	nr_resolved_deltas++;
@@ -860,7 +881,7 @@ static void *threaded_second_pass(void *data)
  */
 static void parse_pack_objects(unsigned char *sha1)
 {
-	int i;
+	int i, nr_delays = 0;
 	struct delta_entry *delta = deltas;
 	struct stat st;
 
@@ -876,8 +897,12 @@ static void parse_pack_objects(unsigned char *sha1)
 			nr_deltas++;
 			delta->obj_no = i;
 			delta++;
+		} else if (!data) {
+			/* large blobs, check later */
+			obj->real_type = OBJ_BAD;
+			nr_delays++;
 		} else
-			sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+			sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
 		free(data);
 		display_progress(progress, i+1);
 	}
@@ -897,6 +922,17 @@ static void parse_pack_objects(unsigned char *sha1)
 	if (S_ISREG(st.st_mode) &&
 			lseek(input_fd, 0, SEEK_CUR) - input_len != st.st_size)
 		die(_("pack has junk at the end"));
+
+	for (i = 0; i < nr_objects; i++) {
+		struct object_entry *obj = &objects[i];
+		if (obj->real_type != OBJ_BAD)
+			continue;
+		obj->real_type = obj->type;
+		sha1_object(NULL, obj, obj->size, obj->type, obj->idx.sha1);
+		nr_delays--;
+	}
+	if (nr_delays)
+		die(_("confusion beyond insanity in parse_pack_objects()"));
 }
 
 /*
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..3f80688 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -130,6 +130,11 @@ test_expect_success 'git-show a large file' '
 
 '
 
+test_expect_success 'index-pack' '
+	git clone file://"`pwd`"/.git foo &&
+	GIT_DIR=non-existent git index-pack --strict --verify foo/.git/objects/pack/*.pack
+'
+
 test_expect_success 'repack' '
 	git repack -ad
 '
-- 
1.7.10.2.549.g9354186

  reply	other threads:[~2012-05-23 14:14 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-05-16 12:50 [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Nguyễn Thái Ngọc Duy
2012-05-16 12:50 ` [PATCH 2/2] index-pack: use streaming interface on large blobs (most of the time) Nguyễn Thái Ngọc Duy
2012-05-18 22:20   ` Junio C Hamano
2012-05-19  5:31     ` Nguyen Thai Ngoc Duy
2012-05-18 22:15 ` [PATCH 1/2] index-pack: hash non-delta objects while reading from stream Junio C Hamano
2012-05-23 14:09 ` [PATCH v2 1/6] " Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` Nguyễn Thái Ngọc Duy [this message]
2012-05-23 14:09   ` [PATCH v2 3/6] index-pack: factor out unpack core from get_data_from_pack Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 4/6] index-pack: use streaming interface for collision test on large blobs Nguyễn Thái Ngọc Duy
2012-05-23 16:03     ` Junio C Hamano
2012-05-24 13:55     ` [PATCH v2.1 " Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 5/6] index-pack: avoid collision test when verifying in-repo pack Nguyễn Thái Ngọc Duy
2012-05-23 14:09   ` [PATCH v2 6/6] sha1_loose_object_info: do not complain out loud on non-existent objects Nguyễn Thái Ngọc Duy
2012-05-23 14:24     ` Nguyen Thai Ngoc Duy
2012-05-23 16:01       ` Junio C Hamano
2012-05-24 13:12         ` Nguyen Thai Ngoc Duy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1337782191-10091-2-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).