All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Shawn O. Pearce" <spearce@spearce.org>
To: Junio C Hamano <gitster@pobox.com>
Cc: git <git@vger.kernel.org>, Nicolas Pitre <nico@fluxnic.net>
Subject: [PATCH v2] fast-import: Stream very large blobs directly to pack
Date: Fri, 29 Jan 2010 08:38:38 -0800	[thread overview]
Message-ID: <20100129163838.GD21821@spearce.org> (raw)
In-Reply-To: <20100129152254.GC21821@spearce.org>

If a blob is larger than the configured big-file-threshold, instead
of reading it into a single buffer obtained from malloc, stream it
onto the end of the current pack file.  Streaming the larger objects
into the pack avoids the 4+ GiB memory footprint that occurs when
fast-import is processing 2+ GiB blobs.

Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---

 Fixed a missing skip_optional_lf() at the end of the large blob
 command case.

 Documentation/git-fast-import.txt |    6 ++
 fast-import.c                     |  172 +++++++++++++++++++++++++++++++++----
 t/t5705-clone-2gb.sh              |    2 +-
 t/t9300-fast-import.sh            |   46 ++++++++++
 4 files changed, 208 insertions(+), 18 deletions(-)

diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt
index e6d364f..16d3596 100644
--- a/Documentation/git-fast-import.txt
+++ b/Documentation/git-fast-import.txt
@@ -50,6 +50,12 @@ OPTIONS
 	importers may wish to lower this, such as to ensure the
 	resulting packfiles fit on CDs.
 
+--big-file-threshold=<n>::
+	Maximum size of a blob that fast-import will attempt to
+	create a delta for, expressed in MiB.  The default is 512.
+	Some importers may wish to lower this on systems with
+	constrained memory.
+
 --depth=<n>::
 	Maximum delta depth, for blob and tree deltification.
 	Default is 10.
diff --git a/fast-import.c b/fast-import.c
index 60d0aa2..8055f73 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -280,6 +280,7 @@ struct recent_command
 /* Configured limits on output */
 static unsigned long max_depth = 10;
 static off_t max_packsize = (1LL << 32) - 1;
+static uintmax_t big_file_threshold = 512 * 1024 * 1024;
 static int force_update;
 static int pack_compression_level = Z_DEFAULT_COMPRESSION;
 static int pack_compression_seen;
@@ -1003,7 +1004,7 @@ static void cycle_packfile(void)
 
 static size_t encode_header(
 	enum object_type type,
-	size_t size,
+	uintmax_t size,
 	unsigned char *hdr)
 {
 	int n = 1;
@@ -1159,6 +1160,118 @@ static int store_object(
 	return 0;
 }
 
+static void truncate_pack(off_t to)
+{
+	if (ftruncate(pack_data->pack_fd, to)
+	 || lseek(pack_data->pack_fd, to, SEEK_SET) != to)
+		die_errno("cannot truncate pack to skip duplicate");
+	pack_size = to;
+}
+
+static void stream_blob(uintmax_t len, unsigned char *sha1out, uintmax_t mark)
+{
+	size_t in_sz = 64 * 1024, out_sz = 64 * 1024;
+	unsigned char *in_buf = xmalloc(in_sz);
+	unsigned char *out_buf = xmalloc(out_sz);
+	struct object_entry *e;
+	unsigned char sha1[20];
+	unsigned long hdrlen;
+	off_t offset;
+	git_SHA_CTX c;
+	z_stream s;
+	int status = Z_OK;
+
+	/* Determine if we should auto-checkpoint. */
+	if ((pack_size + 60 + len) > max_packsize
+		|| (pack_size + 60 + len) < pack_size)
+		cycle_packfile();
+
+	offset = pack_size;
+
+	hdrlen = snprintf((char *)out_buf, out_sz, "blob %" PRIuMAX, len) + 1;
+	if (out_sz <= hdrlen)
+		die("impossibly large object header");
+
+	git_SHA1_Init(&c);
+	git_SHA1_Update(&c, out_buf, hdrlen);
+
+	memset(&s, 0, sizeof(s));
+	deflateInit(&s, pack_compression_level);
+
+	hdrlen = encode_header(OBJ_BLOB, len, out_buf);
+	if (out_sz <= hdrlen)
+		die("impossibly large object header");
+
+	s.next_out = out_buf + hdrlen;
+	s.avail_out = out_sz - hdrlen;
+
+	while (status != Z_STREAM_END) {
+		if (0 < len && !s.avail_in) {
+			size_t cnt = in_sz < len ? in_sz : (size_t)len;
+			size_t n = fread(in_buf, 1, cnt, stdin);
+			if (!n && feof(stdin))
+				die("EOF in data (%" PRIuMAX " bytes remaining)", len);
+
+			git_SHA1_Update(&c, in_buf, n);
+			s.next_in = in_buf;
+			s.avail_in = n;
+			len -= n;
+		}
+
+		status = deflate(&s, len ? 0 : Z_FINISH);
+
+		if (!s.avail_out || status == Z_STREAM_END) {
+			size_t n = s.next_out - out_buf;
+			write_or_die(pack_data->pack_fd, out_buf, n);
+			pack_size += n;
+			s.next_out = out_buf;
+			s.avail_out = out_sz;
+		}
+
+		switch (status) {
+		case Z_OK:
+		case Z_BUF_ERROR:
+		case Z_STREAM_END:
+			continue;
+		default:
+			die("unexpected deflate failure: %d", status);
+		}
+	}
+	deflateEnd(&s);
+	git_SHA1_Final(sha1, &c);
+
+	if (sha1out)
+		hashcpy(sha1out, sha1);
+
+	e = insert_object(sha1);
+
+	if (mark)
+		insert_mark(mark, e);
+
+	if (e->offset) {
+		duplicate_count_by_type[OBJ_BLOB]++;
+		truncate_pack(offset);
+
+	} else if (find_sha1_pack(sha1, packed_git)) {
+		e->type = OBJ_BLOB;
+		e->pack_id = MAX_PACK_ID;
+		e->offset = 1; /* just not zero! */
+		duplicate_count_by_type[OBJ_BLOB]++;
+		truncate_pack(offset);
+
+	} else {
+		e->depth = 0;
+		e->type = OBJ_BLOB;
+		e->pack_id = pack_id;
+		e->offset = offset;
+		object_count++;
+		object_count_by_type[OBJ_BLOB]++;
+	}
+
+	free(in_buf);
+	free(out_buf);
+}
+
 /* All calls must be guarded by find_object() or find_mark() to
  * ensure the 'struct object_entry' passed was written by this
  * process instance.  We unpack the entry by the offset, avoiding
@@ -1704,7 +1817,7 @@ static void parse_mark(void)
 		next_mark = 0;
 }
 
-static void parse_data(struct strbuf *sb)
+static int parse_data(struct strbuf *sb, uintmax_t limit, uintmax_t *len_res)
 {
 	strbuf_reset(sb);
 
@@ -1728,9 +1841,15 @@ static void parse_data(struct strbuf *sb)
 		free(term);
 	}
 	else {
-		size_t n = 0, length;
+		uintmax_t len = strtoumax(command_buf.buf + 5, NULL, 10);
+		size_t n = 0, length = (size_t)len;
 
-		length = strtoul(command_buf.buf + 5, NULL, 10);
+		if (limit && limit < len) {
+			*len_res = len;
+			return 0;
+		}
+		if (length < len)
+			die("data is too large to use in this context");
 
 		while (n < length) {
 			size_t s = strbuf_fread(sb, length - n, stdin);
@@ -1742,6 +1861,7 @@ static void parse_data(struct strbuf *sb)
 	}
 
 	skip_optional_lf();
+	return 1;
 }
 
 static int validate_raw_date(const char *src, char *result, int maxlen)
@@ -1806,14 +1926,32 @@ static char *parse_ident(const char *buf)
 	return ident;
 }
 
-static void parse_new_blob(void)
+static void parse_and_store_blob(
+	struct last_object *last,
+	unsigned char *sha1out,
+	uintmax_t mark)
 {
 	static struct strbuf buf = STRBUF_INIT;
+	uintmax_t len;
 
+	if (parse_data(&buf, big_file_threshold, &len))
+		store_object(OBJ_BLOB, &buf, last, sha1out, mark);
+	else {
+		if (last) {
+			strbuf_release(&last->data);
+			last->offset = 0;
+			last->depth = 0;
+		}
+		stream_blob(len, sha1out, mark);
+		skip_optional_lf();
+	}
+}
+
+static void parse_new_blob(void)
+{
 	read_next_command();
 	parse_mark();
-	parse_data(&buf);
-	store_object(OBJ_BLOB, &buf, &last_blob, NULL, next_mark);
+	parse_and_store_blob(&last_blob, NULL, next_mark);
 }
 
 static void unload_one_branch(void)
@@ -1924,15 +2062,12 @@ static void file_change_m(struct branch *b)
 		 * another repository.
 		 */
 	} else if (inline_data) {
-		static struct strbuf buf = STRBUF_INIT;
-
 		if (p != uq.buf) {
 			strbuf_addstr(&uq, p);
 			p = uq.buf;
 		}
 		read_next_command();
-		parse_data(&buf);
-		store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0);
+		parse_and_store_blob(&last_blob, sha1, 0);
 	} else if (oe) {
 		if (oe->type != OBJ_BLOB)
 			die("Not a blob (actually a %s): %s",
@@ -2058,15 +2193,12 @@ static void note_change_n(struct branch *b)
 		die("Invalid ref name or SHA1 expression: %s", p);
 
 	if (inline_data) {
-		static struct strbuf buf = STRBUF_INIT;
-
 		if (p != uq.buf) {
 			strbuf_addstr(&uq, p);
 			p = uq.buf;
 		}
 		read_next_command();
-		parse_data(&buf);
-		store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0);
+		parse_and_store_blob(&last_blob, sha1, 0);
 	} else if (oe) {
 		if (oe->type != OBJ_BLOB)
 			die("Not a blob (actually a %s): %s",
@@ -2232,7 +2364,7 @@ static void parse_new_commit(void)
 	}
 	if (!committer)
 		die("Expected committer but didn't get one");
-	parse_data(&msg);
+	parse_data(&msg, 0, NULL);
 	read_next_command();
 	parse_from(b);
 	merge_list = parse_merge(&merge_count);
@@ -2353,7 +2485,7 @@ static void parse_new_tag(void)
 		tagger = NULL;
 
 	/* tag payload/message */
-	parse_data(&msg);
+	parse_data(&msg, 0, NULL);
 
 	/* build the tag object */
 	strbuf_reset(&new_data);
@@ -2473,6 +2605,10 @@ static int git_pack_config(const char *k, const char *v, void *cb)
 		pack_compression_seen = 1;
 		return 0;
 	}
+	if (!strcmp(k, "core.bigfilethreshold")) {
+		long n = git_config_int(k, v);
+		big_file_threshold = 0 < n ? n : 0;
+	}
 	return git_default_config(k, v, cb);
 }
 
@@ -2518,6 +2654,8 @@ int main(int argc, const char **argv)
 		}
 		else if (!prefixcmp(a, "--max-pack-size="))
 			max_packsize = strtoumax(a + 16, NULL, 0) * 1024 * 1024;
+		else if (!prefixcmp(a, "--big-file-threshold="))
+			big_file_threshold = strtoumax(a + 21, NULL, 0) * 1024 * 1024;
 		else if (!prefixcmp(a, "--depth=")) {
 			max_depth = strtoul(a + 8, NULL, 0);
 			if (max_depth > MAX_DEPTH)
diff --git a/t/t5705-clone-2gb.sh b/t/t5705-clone-2gb.sh
index 9f52154..adfaae8 100755
--- a/t/t5705-clone-2gb.sh
+++ b/t/t5705-clone-2gb.sh
@@ -31,7 +31,7 @@ test_expect_success 'setup' '
 	 echo "data 5" &&
 	 echo ">2gb" &&
 	 cat commit) |
-	git fast-import &&
+	git fast-import --big-file-threshold=2 &&
 	test ! -f exit-status
 
 '
diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
index b49815d..513db86 100755
--- a/t/t9300-fast-import.sh
+++ b/t/t9300-fast-import.sh
@@ -1254,4 +1254,50 @@ test_expect_success \
 	'Q: verify note for third commit' \
 	'git cat-file blob refs/notes/foobar:$commit3 >actual && test_cmp expect actual'
 
+##
+## R: very large blobs
+##
+blobsize=$((2*1024*1024 + 53))
+test-genrandom bar $blobsize >expect
+cat >input <<INPUT_END
+commit refs/heads/big-file
+committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
+data <<COMMIT
+R - big file
+COMMIT
+
+M 644 inline big1
+data $blobsize
+INPUT_END
+cat expect >>input
+cat >>input <<INPUT_END
+M 644 inline big2
+data $blobsize
+INPUT_END
+cat expect >>input
+echo >>input
+
+test_expect_success \
+	'R: blob bigger than threshold' \
+	'test_create_repo R &&
+	 git --git-dir=R/.git fast-import --big-file-threshold=1 <input'
+test_expect_success \
+	'R: verify created pack' \
+	': >verify &&
+	 for p in R/.git/objects/pack/*.pack;
+	 do
+	   git verify-pack -v $p >>verify || exit;
+	 done'
+test_expect_success \
+	'R: verify written objects' \
+	'git --git-dir=R/.git cat-file blob big-file:big1 >actual &&
+	 test_cmp expect actual &&
+	 a=$(git --git-dir=R/.git rev-parse big-file:big1) &&
+	 b=$(git --git-dir=R/.git rev-parse big-file:big2) &&
+	 test $a = $b'
+test_expect_success \
+	'R: blob appears only once' \
+	'n=$(grep $a verify | wc -l) &&
+	 test 1 = $n'
+
 test_done
-- 
1.7.0.rc0.170.g7207c

-- 
Shawn.

  reply	other threads:[~2010-01-29 16:39 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-01-29  1:23 [PATCH] fast-import: Stream very large blobs directly to pack Shawn O. Pearce
2010-01-29  2:33 ` Nicolas Pitre
2010-01-29  2:37   ` Shawn O. Pearce
2010-01-29  5:29 ` Junio C Hamano
2010-01-29 15:22   ` Shawn O. Pearce
2010-01-29 16:38     ` Shawn O. Pearce [this message]
2010-01-29 18:29       ` [PATCH v2] " Jakub Narebski
2010-01-29 18:30         ` Shawn O. Pearce
2010-01-29 23:02           ` A Large Angry SCM
2010-01-30  7:17             ` Junio C Hamano
2010-01-29 18:35 ` [PATCH] " Sverre Rabbelier
2010-01-29 18:37   ` Shawn O. Pearce
2010-01-29 18:41     ` Sverre Rabbelier
2010-01-29 18:44       ` Shawn O. Pearce
2010-01-30  3:41     ` Junio C Hamano
2010-01-30  6:19       ` Junio C Hamano
2010-01-30  7:33         ` Junio C Hamano
2010-02-01 15:28           ` Shawn O. Pearce
2010-02-01 20:14             ` Junio C Hamano
2010-02-04  2:01             ` Junio C Hamano
2010-02-04  2:07               ` Shawn O. Pearce
2010-02-04  2:25                 ` Junio C Hamano
2010-02-04  2:27                   ` Junio C Hamano
2010-02-04  2:30                     ` Shawn O. Pearce
2010-02-04  2:28               ` Nicolas Pitre
2010-02-01 15:41           ` [PATCH] fast-import: Document the core.bigFileThreshold configuration setting Shawn O. Pearce
2010-02-01 15:23         ` [PATCH] fast-import: Stream very large blobs directly to pack Shawn O. Pearce

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100129163838.GD21821@spearce.org \
    --to=spearce@spearce.org \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=nico@fluxnic.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.