git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH 2/4] index-pack: reduce memory usage when the pack has large blobs
Date: Sat, 25 Feb 2012 17:56:14 +0700	[thread overview]
Message-ID: <1330167376-24859-3-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1330167376-24859-1-git-send-email-pclouds@gmail.com>

This command unpacks every non-delta objects in order to:

1. calculate sha-1
2. do byte-to-byte sha-1 collision test if we happen to have objects
   with the same sha-1
3. validate object content in strict mode

All this requires the entire object to stay in memory, a bad news for
giant blobs. This patch lowers memory consumption by not saving the
object in memory whenever possible, calculating SHA-1 while unpacking
the object.

This patch assumes that the collision test is rarely needed. The
collision test will be done later in second pass if necessary, which
puts the entire object back to memory again (We could even do the
collision test without putting the entire object back in memory, by
comparing as we unpack it).

In strict mode, it always keeps non-blob objects in memory for
validation (blobs do not need data validation). "--strict --verify"
also keeps blobs in memory.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 builtin/index-pack.c |   74 +++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index cee83b9..ab24dd8 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -277,30 +277,60 @@ static void unlink_base_data(struct base_data *c)
 	free_base_data(c);
 }
 
-static void *unpack_entry_data(unsigned long offset, unsigned long size)
+static void *unpack_entry_data(unsigned long offset, unsigned long size,
+			       enum object_type type, unsigned char *sha1)
 {
+	static char fixed_buf[8192];
 	int status;
 	git_zstream stream;
-	void *buf = xmalloc(size);
+	void *buf;
+	git_SHA_CTX c;
+
+	if (sha1) {		/* do hash_sha1_file internally */
+		char hdr[32];
+		int hdrlen = sprintf(hdr, "%s %lu", typename(type), size)+1;
+		git_SHA1_Init(&c);
+		git_SHA1_Update(&c, hdr, hdrlen);
+
+		buf = fixed_buf;
+	} else {
+		buf = xmalloc(size);
+	}
 
 	memset(&stream, 0, sizeof(stream));
 	git_inflate_init(&stream);
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
 
 	do {
 		stream.next_in = fill(1);
 		stream.avail_in = input_len;
 		status = git_inflate(&stream, 0);
 		use(input_len - stream.avail_in);
+		if (sha1) {
+			git_SHA1_Update(&c, buf, stream.next_out - (unsigned char *)buf);
+			stream.next_out = buf;
+			stream.avail_out = sizeof(fixed_buf);
+		}
 	} while (status == Z_OK);
 	if (stream.total_out != size || status != Z_STREAM_END)
 		bad_object(offset, "inflate returned %d", status);
 	git_inflate_end(&stream);
+	if (sha1) {
+		git_SHA1_Final(sha1, &c);
+		buf = NULL;
+	}
 	return buf;
 }
 
-static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_base)
+static int is_delta_type(enum object_type type)
+{
+	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
+}
+
+static void *unpack_raw_entry(struct object_entry *obj,
+			      union delta_base *delta_base,
+			      unsigned char *sha1)
 {
 	unsigned char *p;
 	unsigned long size, c;
@@ -360,7 +390,17 @@ static void *unpack_raw_entry(struct object_entry *obj, union delta_base *delta_
 	}
 	obj->hdr_size = consumed_bytes - obj->idx.offset;
 
-	data = unpack_entry_data(obj->idx.offset, obj->size);
+	/*
+	 * --verify --strict: sha1_object() does all collision test
+	 *          --strict: sha1_object() does all except blobs,
+	 *                    blobs tested in second pass
+	 * --verify         : no collision test
+	 *                  : all in second pass
+	 */
+	if (is_delta_type(obj->type) ||
+	    (strict && (verify || obj->type != OBJ_BLOB)))
+		sha1 = NULL;	/* save unpacked object */
+	data = unpack_entry_data(obj->idx.offset, obj->size, obj->type, sha1);
 	obj->idx.crc32 = input_crc32;
 	return data;
 }
@@ -461,8 +501,9 @@ static void find_delta_children(const union delta_base *base,
 static void sha1_object(const void *data, unsigned long size,
 			enum object_type type, unsigned char *sha1)
 {
-	hash_sha1_file(data, size, typename(type), sha1);
-	if ((strict || !verify) && has_sha1_file(sha1)) {
+	if (data)
+		hash_sha1_file(data, size, typename(type), sha1);
+	if (data && (strict || !verify) && has_sha1_file(sha1)) {
 		void *has_data;
 		enum object_type has_type;
 		unsigned long has_size;
@@ -511,11 +552,6 @@ static void sha1_object(const void *data, unsigned long size,
 	}
 }
 
-static int is_delta_type(enum object_type type)
-{
-	return (type == OBJ_REF_DELTA || type == OBJ_OFS_DELTA);
-}
-
 /*
  * This function is part of find_unresolved_deltas(). There are two
  * walkers going in the opposite ways.
@@ -702,7 +738,7 @@ static void parse_pack_objects(unsigned char *sha1)
 				nr_objects);
 	for (i = 0; i < nr_objects; i++) {
 		struct object_entry *obj = &objects[i];
-		void *data = unpack_raw_entry(obj, &delta->base);
+		void *data = unpack_raw_entry(obj, &delta->base, obj->idx.sha1);
 		obj->real_type = obj->type;
 		if (is_delta_type(obj->type)) {
 			nr_deltas++;
@@ -744,6 +780,9 @@ static void parse_pack_objects(unsigned char *sha1)
 	 * - if used as a base, uncompress the object and apply all deltas,
 	 *   recursively checking if the resulting object is used as a base
 	 *   for some more deltas.
+	 * - if the same object exists in repository and we're not in strict
+	 *   mode, we skipped the sha-1 collision test in the first pass.
+	 *   Do it now.
 	 */
 	if (verbose)
 		progress = start_progress("Resolving deltas", nr_deltas);
@@ -753,6 +792,15 @@ static void parse_pack_objects(unsigned char *sha1)
 
 		if (is_delta_type(obj->type))
 			continue;
+
+		if (((!strict && !verify) ||
+		     (strict && !verify && obj->type == OBJ_BLOB)) &&
+		    has_sha1_file(obj->idx.sha1)) {
+			void *data = get_data_from_pack(obj);
+			sha1_object(data, obj->size, obj->type, obj->idx.sha1);
+			free(data);
+		}
+
 		base_obj->obj = obj;
 		base_obj->data = NULL;
 		find_unresolved_deltas(base_obj);
-- 
1.7.8.36.g69ee2

  parent reply	other threads:[~2012-02-25 10:54 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-02-25 10:56 [PATCH 0/4] index-pack improvements Nguyễn Thái Ngọc Duy
2012-02-25 10:56 ` [PATCH 1/4] index-pack --verify: skip sha-1 collision test Nguyễn Thái Ngọc Duy
2012-02-25 10:56 ` Nguyễn Thái Ngọc Duy [this message]
2012-02-25 10:56 ` [PATCH 3/4] index-pack: move second pass code into separate function Nguyễn Thái Ngọc Duy
2012-02-25 10:56 ` [PATCH 4/4] index-pack: support multithreaded delta resolving Nguyễn Thái Ngọc Duy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1330167376-24859-3-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).