From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
	"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v3 07/11] index-pack: reduce memory usage when the pack has large blobs
Date: Mon,  5 Mar 2012 10:43:44 +0700	[thread overview]
Message-ID: <1330919028-6611-8-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1330865996-2069-1-git-send-email-pclouds@gmail.com>

This command unpacks every non-delta object in order to:

1. calculate its sha-1
2. do a byte-by-byte sha-1 collision test if we happen to have
   objects with the same sha-1
3. validate object content in strict mode

All of this requires the entire object to stay in memory, which is bad
news for giant blobs. This patch lowers memory consumption by not
keeping the object in memory whenever possible, calculating the SHA-1
while unpacking the object instead.
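
Outside of this patch, the core of the technique looks roughly like
the following stand-alone sketch, which hashes a deflated blob through
a fixed-size buffer. It uses plain zlib and OpenSSL's SHA1 API as
stand-ins for git's internal git_inflate/git_SHA1_* wrappers; the
function name and I/O source are hypothetical:

#include <stdio.h>
#include <string.h>
#include <zlib.h>
#include <openssl/sha.h>

static int hash_deflated_blob(FILE *in, unsigned long size,
			      unsigned char sha1[SHA_DIGEST_LENGTH])
{
	static unsigned char inbuf[8192], outbuf[8192];
	char hdr[32];
	int hdrlen = snprintf(hdr, sizeof(hdr), "blob %lu", size) + 1;
	SHA_CTX c;
	z_stream stream;
	int status;

	/* hash the "blob <size>\0" header first, as hash_sha1_file does */
	SHA1_Init(&c);
	SHA1_Update(&c, hdr, hdrlen);

	memset(&stream, 0, sizeof(stream));
	if (inflateInit(&stream) != Z_OK)
		return -1;
	do {
		stream.next_in = inbuf;
		stream.avail_in = fread(inbuf, 1, sizeof(inbuf), in);
		do {
			/* inflate into the fixed buffer, hash it, reuse it */
			stream.next_out = outbuf;
			stream.avail_out = sizeof(outbuf);
			status = inflate(&stream, Z_NO_FLUSH);
			SHA1_Update(&c, outbuf,
				    sizeof(outbuf) - stream.avail_out);
		} while (status == Z_OK && !stream.avail_out);
	} while (status == Z_OK);
	inflateEnd(&stream);
	if (status != Z_STREAM_END || stream.total_out != size)
		return -1;
	SHA1_Final(sha1, &c);
	return 0;
}

The object data only ever occupies one 8192-byte buffer, no matter how
large the blob is.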

This patch assumes that the collision test is rarely needed. When it
is, it is done later in the second pass, which loads the entire object
back into memory again. (We could even do the collision test without
putting the entire object back in memory, by comparing it chunk by
chunk as we unpack it.)
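
That parenthetical idea could look something like the following
hypothetical helper, which compares two object streams chunk by chunk
so the collision test needs only two fixed buffers. The read_fn
callbacks are stand-ins for whatever streaming readers the caller has
(say, one over the existing object, one re-inflating the incoming pack
entry); the sketch assumes each reader returns full chunks until EOF:

#include <string.h>

/* returns bytes read, 0 at EOF, negative on error */
typedef long (*read_fn)(void *opaque, void *buf, unsigned long len);

static int streams_equal(read_fn read_a, void *state_a,
			 read_fn read_b, void *state_b)
{
	static unsigned char buf_a[8192], buf_b[8192];

	for (;;) {
		long len_a = read_a(state_a, buf_a, sizeof(buf_a));
		long len_b = read_b(state_b, buf_b, sizeof(buf_b));

		if (len_a < 0 || len_b < 0 || len_a != len_b)
			return 0;	/* read error or length mismatch */
		if (!len_a)
			return 1;	/* both hit EOF: contents identical */
		if (memcmp(buf_a, buf_b, len_a))
			return 0;	/* same sha-1, different bytes */
	}
}

A production version would have to rebuffer if the two readers return
differently sized chunks, but the memory bound is the point here.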

In strict mode, non-blob objects are always kept in memory for
validation (blobs do not need data validation).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c |   64 +++++++++++++++++++++++++++++++++++++++----------
 t/t1050-large.sh     |    4 +-
 2 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 918684f..db27133 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -276,30 +276,60 @@ static void unlink_base_data(struct base_data *c)
 	free_base_data(c);
 }
 
-static void *unpack_entry_data(unsigned long offset, unsigned long size)
+static void *unpack_entry_data(unsigned long offset, unsigned long size,
+			       enum object_type type, unsigned char *sha1)
 {
+	static char fixed_buf[8192];
 	int status;
 	git_zstream stream;
-	void *buf = xmalloc(size);
+	void *buf;
+	git_SHA_CTX c;
+
+	if (sha1) {		/* do hash_sha1_file internally */
+		char hdr[32];
+		int hdrlen = sprintf(hdr, "%s %lu", typename(type), size)+1;
+		git_SHA1_Init(&c);
+		git_SHA1_Update(&c, hdr, hdrlen);
+
+		buf = fixed_buf;
+	} else {
+		buf = xmalloc(size);
+	}
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
 
 	do {
 		stream.next_in = fill(1);
 		stream.avail_in = input_len;
 		status = git_inflate(&stream, 0);
 		use(input_len - stream.avail_in);
+		if (sha1) {
+			git_SHA1_Update(&c, buf, stream.next_out - (unsigned char *)buf);
+			stream.next_out = buf;
+			stream.avail_out = sizeof(fixed_buf);
+		}
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
 		bad_object(offset, "inflate returned %d", status);
 	git_inflate_end(&stream);
+	if (sha1) {
+		git_SHA1_Final(sha1, &c);
+		buf = NULL;
+	}
 	return buf;
 }
 
-static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base)
+static int is_delta_type(enum object_type type)
+{
+	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
+}
+
+static void *unpack_raw_entry(struct object_entry *obj,
+			      union delta_base *delta_base,
+			      unsigned char *sha1)
 {
 	unsigned char *p;
 	unsigned long size, c;
@@ -359,7 +389,9 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 	}
 	obj->hdr_size = consumed_bytes - obj->idx.offset;
 
-	data = unpack_entry_data(obj->idx.offset, obj->size);
+	if (is_delta_type(obj->type) || strict)
+		sha1 = NULL;	/* save unpacked object */
+	data = unpack_entry_data(obj->idx.offset, obj->size, obj->type, sha1);
 	obj->idx.crc32 = input_crc32;
 	return data;
 }
@@ -460,8 +492,9 @@ static void find_delta_children(const union delta_base *base,
 static void sha1_object(const void *data, unsigned long size,
 			enum object_type type, unsigned char *sha1)
 {
-	hash_sha1_file(data, size, typename(type), sha1);
-	if (has_sha1_file(sha1)) {
+	if (data)
+		hash_sha1_file(data, size, typename(type), sha1);
+	if (data && has_sha1_file(sha1)) {
 		void *has_data;
 		enum object_type has_type;
 		unsigned long has_size;
@@ -510,11 +543,6 @@ static void sha1_object(const void *data, unsigned long size,
 	}
 }
 
-static int is_delta_type(enum object_type type)
-{
-	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
-}
-
 /*
  * This function is part of find_unresolved_deltas(). There are two
  * walkers going in the opposite ways.
@@ -689,10 +717,20 @@ static int compare_delta_entry(const void *a, const void *b)
  * - if used as a base, uncompress the object and apply all deltas,
  *   recursively checking if the resulting object is used as a base
  *   for some more deltas.
+ * - if the same object exists in repository and we're not in strict
+ *   mode, we skipped the sha-1 collision test in the first pass.
+ *   Do it now.
  */
 static void second_pass(struct object_entry *obj)
 {
 	struct base_data *base_obj = alloc_base_data();
+
+	if (!strict && has_sha1_file(obj->idx.sha1)) {
+		void *data = get_data_from_pack(obj);
+		sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+		free(data);
+	}
+
 	base_obj->obj = obj;
 	base_obj->data = NULL;
 	find_unresolved_deltas(base_obj);
@@ -718,7 +756,7 @@ static void parse_pack_objects(unsigned char *sha1)
 				nr_objects);
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
-		void *data = unpack_raw_entry(obj, &delta->base);
+		void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1);
 		obj->real_type = obj->type;
 		if (is_delta_type(obj->type)) {
 			nr_deltas++;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 4e08e02..e4b77a2 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -130,11 +130,11 @@ test_expect_success 'git-show a large file' '
 
 '
 
-test_expect_failure 'clone' '
+test_expect_success 'clone' '
 	git clone file://"$PWD"/.git new
 '
 
-test_expect_failure 'fetch updates' '
+test_expect_success 'fetch updates' '
 	echo modified >> large1 &&
 	git commit -q -a -m updated &&
 	(
-- 
1.7.3.1.256.g2539c.dirty
