* [PATCH] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-01-29 1:23 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git, Nicolas Pitre
If a blob is larger than the configured big-file-threshold, instead
of reading it into a single buffer obtained from malloc, stream it
onto the end of the current pack file. Streaming the larger objects
into the pack avoids the 4+ GiB memory footprint that occurs when
fast-import is processing 2+ GiB blobs.
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
Documentation/git-fast-import.txt | 6 ++
fast-import.c | 174 +++++++++++++++++++++++++++++++++----
t/t5705-clone-2gb.sh | 2 +-
t/t9300-fast-import.sh | 46 ++++++++++
4 files changed, 210 insertions(+), 18 deletions(-)
diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt
index e6d364f..16d3596 100644
--- a/Documentation/git-fast-import.txt
+++ b/Documentation/git-fast-import.txt
@@ -50,6 +50,12 @@ OPTIONS
importers may wish to lower this, such as to ensure the
resulting packfiles fit on CDs.
+--big-file-threshold=<n>::
+ Maximum size of a blob that fast-import will attempt to
+ create a delta for, expressed in MiB. The default is 512.
+ Some importers may wish to lower this on systems with
+ constrained memory.
+
--depth=<n>::
Maximum delta depth, for blob and tree deltification.
Default is 10.
diff --git a/fast-import.c b/fast-import.c
index 60d0aa2..8114f93 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -280,6 +280,7 @@ struct recent_command
/* Configured limits on output */
static unsigned long max_depth = 10;
static off_t max_packsize = (1LL << 32) - 1;
+static uintmax_t big_file_threshold = 512 * 1024 * 1024;
static int force_update;
static int pack_compression_level = Z_DEFAULT_COMPRESSION;
static int pack_compression_seen;
@@ -1003,7 +1004,7 @@ static void cycle_packfile(void)
static size_t encode_header(
enum object_type type,
- size_t size,
+ uintmax_t size,
unsigned char *hdr)
{
int n = 1;
@@ -1159,6 +1160,121 @@ static int store_object(
return 0;
}
+static void truncate_pack(off_t to)
+{
+ if (ftruncate(pack_data->pack_fd, to)
+ || lseek(pack_data->pack_fd, to, SEEK_SET) != to)
+ die_errno("cannot truncate pack to skip duplicate");
+ pack_size = to;
+}
+
+static void stream_blob(
+ uintmax_t len,
+ unsigned char *sha1out,
+ uintmax_t mark)
+{
+ size_t in_sz = 64 * 1024, out_sz = 64 * 1024;
+ unsigned char *in_buf = xmalloc(in_sz);
+ unsigned char *out_buf = xmalloc(out_sz);
+ struct object_entry *e;
+ unsigned char sha1[20];
+ unsigned long hdrlen;
+ off_t offset;
+ git_SHA_CTX c;
+ z_stream s;
+ int status = Z_OK;
+
+ /* Determine if we should auto-checkpoint. */
+ if ((pack_size + 60 + len) > max_packsize
+ || (pack_size + 60 + len) < pack_size)
+ cycle_packfile();
+
+ offset = pack_size;
+
+ hdrlen = snprintf((char *)out_buf, out_sz, "blob %" PRIuMAX, len) + 1;
+ if (out_sz <= hdrlen)
+ die("impossibly large object header");
+
+ git_SHA1_Init(&c);
+ git_SHA1_Update(&c, out_buf, hdrlen);
+
+ memset(&s, 0, sizeof(s));
+ deflateInit(&s, pack_compression_level);
+
+ hdrlen = encode_header(OBJ_BLOB, len, out_buf);
+ if (out_sz <= hdrlen)
+ die("impossibly large object header");
+
+ s.next_out = out_buf + hdrlen;
+ s.avail_out = out_sz - hdrlen;
+
+ while (status != Z_STREAM_END) {
+ if (0 < len && !s.avail_in) {
+ size_t cnt = in_sz < len ? in_sz : (size_t)len;
+ size_t n = fread(in_buf, 1, cnt, stdin);
+ if (!n && feof(stdin))
+ die("EOF in data (%" PRIuMAX " bytes remaining)", len);
+
+ git_SHA1_Update(&c, in_buf, n);
+ s.next_in = in_buf;
+ s.avail_in = n;
+ len -= n;
+ }
+
+ status = deflate(&s, len ? 0 : Z_FINISH);
+
+ if (!s.avail_out || status == Z_STREAM_END) {
+ size_t n = s.next_out - out_buf;
+ write_or_die(pack_data->pack_fd, out_buf, n);
+ pack_size += n;
+ s.next_out = out_buf;
+ s.avail_out = out_sz;
+ }
+
+ switch (status) {
+ case Z_OK:
+ case Z_BUF_ERROR:
+ case Z_STREAM_END:
+ continue;
+ default:
+ die("unexpected deflate failure: %d", status);
+ }
+ }
+ deflateEnd(&s);
+ git_SHA1_Final(sha1, &c);
+
+ if (sha1out)
+ hashcpy(sha1out, sha1);
+
+ e = insert_object(sha1);
+
+ if (mark)
+ insert_mark(mark, e);
+
+ if (e->offset) {
+ duplicate_count_by_type[OBJ_BLOB]++;
+ truncate_pack(offset);
+
+ } else if (find_sha1_pack(sha1, packed_git)) {
+ e->type = OBJ_BLOB;
+ e->pack_id = MAX_PACK_ID;
+ e->offset = 1; /* just not zero! */
+ duplicate_count_by_type[OBJ_BLOB]++;
+ truncate_pack(offset);
+
+ } else {
+ e->depth = 0;
+ e->type = OBJ_BLOB;
+ e->pack_id = pack_id;
+ e->offset = offset;
+ object_count++;
+ object_count_by_type[OBJ_BLOB]++;
+ }
+
+ free(in_buf);
+ free(out_buf);
+}
+
/* All calls must be guarded by find_object() or find_mark() to
* ensure the 'struct object_entry' passed was written by this
* process instance. We unpack the entry by the offset, avoiding
@@ -1704,7 +1820,7 @@ static void parse_mark(void)
next_mark = 0;
}
-static void parse_data(struct strbuf *sb)
+static int parse_data(struct strbuf *sb, uintmax_t limit, uintmax_t *len_res)
{
strbuf_reset(sb);
@@ -1728,9 +1844,15 @@ static void parse_data(struct strbuf *sb)
free(term);
}
else {
- size_t n = 0, length;
+ uintmax_t len = strtoumax(command_buf.buf + 5, NULL, 10);
+ size_t n = 0, length = (size_t)len;
- length = strtoul(command_buf.buf + 5, NULL, 10);
+ if (limit && limit < len) {
+ *len_res = len;
+ return 0;
+ }
+ if (length < len)
+ die("data is too large to use in this context");
while (n < length) {
size_t s = strbuf_fread(sb, length - n, stdin);
@@ -1742,6 +1864,7 @@ static void parse_data(struct strbuf *sb)
}
skip_optional_lf();
+ return 1;
}
static int validate_raw_date(const char *src, char *result, int maxlen)
@@ -1806,14 +1929,31 @@ static char *parse_ident(const char *buf)
return ident;
}
-static void parse_new_blob(void)
+static void parse_and_store_blob(
+ struct last_object *last,
+ unsigned char *sha1out,
+ uintmax_t mark)
{
static struct strbuf buf = STRBUF_INIT;
+ uintmax_t len;
+
+ if (parse_data(&buf, big_file_threshold, &len))
+ store_object(OBJ_BLOB, &buf, last, sha1out, mark);
+ else {
+ if (last) {
+ strbuf_release(&last->data);
+ last->offset = 0;
+ last->depth = 0;
+ }
+ stream_blob(len, sha1out, mark);
+ }
+}
+static void parse_new_blob(void)
+{
read_next_command();
parse_mark();
- parse_data(&buf);
- store_object(OBJ_BLOB, &buf, &last_blob, NULL, next_mark);
+ parse_and_store_blob(&last_blob, NULL, next_mark);
}
static void unload_one_branch(void)
@@ -1924,15 +2064,12 @@ static void file_change_m(struct branch *b)
* another repository.
*/
} else if (inline_data) {
- static struct strbuf buf = STRBUF_INIT;
-
if (p != uq.buf) {
strbuf_addstr(&uq, p);
p = uq.buf;
}
read_next_command();
- parse_data(&buf);
- store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0);
+ parse_and_store_blob(&last_blob, sha1, 0);
} else if (oe) {
if (oe->type != OBJ_BLOB)
die("Not a blob (actually a %s): %s",
@@ -2058,15 +2195,12 @@ static void note_change_n(struct branch *b)
die("Invalid ref name or SHA1 expression: %s", p);
if (inline_data) {
- static struct strbuf buf = STRBUF_INIT;
-
if (p != uq.buf) {
strbuf_addstr(&uq, p);
p = uq.buf;
}
read_next_command();
- parse_data(&buf);
- store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0);
+ parse_and_store_blob(&last_blob, sha1, 0);
} else if (oe) {
if (oe->type != OBJ_BLOB)
die("Not a blob (actually a %s): %s",
@@ -2232,7 +2366,7 @@ static void parse_new_commit(void)
}
if (!committer)
die("Expected committer but didn't get one");
- parse_data(&msg);
+ parse_data(&msg, 0, NULL);
read_next_command();
parse_from(b);
merge_list = parse_merge(&merge_count);
@@ -2353,7 +2487,7 @@ static void parse_new_tag(void)
tagger = NULL;
/* tag payload/message */
- parse_data(&msg);
+ parse_data(&msg, 0, NULL);
/* build the tag object */
strbuf_reset(&new_data);
@@ -2473,6 +2607,10 @@ static int git_pack_config(const char *k, const char *v, void *cb)
pack_compression_seen = 1;
return 0;
}
+ if (!strcmp(k, "core.bigfilethreshold")) {
+ long n = git_config_int(k, v);
+ big_file_threshold = 0 < n ? n : 0;
+ }
return git_default_config(k, v, cb);
}
@@ -2518,6 +2656,8 @@ int main(int argc, const char **argv)
}
else if (!prefixcmp(a, "--max-pack-size="))
max_packsize = strtoumax(a + 16, NULL, 0) * 1024 * 1024;
+ else if (!prefixcmp(a, "--big-file-threshold="))
+ big_file_threshold = strtoumax(a + 21, NULL, 0) * 1024 * 1024;
else if (!prefixcmp(a, "--depth=")) {
max_depth = strtoul(a + 8, NULL, 0);
if (max_depth > MAX_DEPTH)
diff --git a/t/t5705-clone-2gb.sh b/t/t5705-clone-2gb.sh
index 9f52154..adfaae8 100755
--- a/t/t5705-clone-2gb.sh
+++ b/t/t5705-clone-2gb.sh
@@ -31,7 +31,7 @@ test_expect_success 'setup' '
echo "data 5" &&
echo ">2gb" &&
cat commit) |
- git fast-import &&
+ git fast-import --big-file-threshold=2 &&
test ! -f exit-status
'
diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
index b49815d..513db86 100755
--- a/t/t9300-fast-import.sh
+++ b/t/t9300-fast-import.sh
@@ -1254,4 +1254,50 @@ test_expect_success \
'Q: verify note for third commit' \
'git cat-file blob refs/notes/foobar:$commit3 >actual && test_cmp expect actual'
+##
+## R: very large blobs
+##
+blobsize=$((2*1024*1024 + 53))
+test-genrandom bar $blobsize >expect
+cat >input <<INPUT_END
+commit refs/heads/big-file
+committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
+data <<COMMIT
+R - big file
+COMMIT
+
+M 644 inline big1
+data $blobsize
+INPUT_END
+cat expect >>input
+cat >>input <<INPUT_END
+M 644 inline big2
+data $blobsize
+INPUT_END
+cat expect >>input
+echo >>input
+
+test_expect_success \
+ 'R: blob bigger than threshold' \
+ 'test_create_repo R &&
+ git --git-dir=R/.git fast-import --big-file-threshold=1 <input'
+test_expect_success \
+ 'R: verify created pack' \
+ ': >verify &&
+ for p in R/.git/objects/pack/*.pack;
+ do
+ git verify-pack -v $p >>verify || exit;
+ done'
+test_expect_success \
+ 'R: verify written objects' \
+ 'git --git-dir=R/.git cat-file blob big-file:big1 >actual &&
+ test_cmp expect actual &&
+ a=$(git --git-dir=R/.git rev-parse big-file:big1) &&
+ b=$(git --git-dir=R/.git rev-parse big-file:big2) &&
+ test $a = $b'
+test_expect_success \
+ 'R: blob appears only once' \
+ 'n=$(grep $a verify | wc -l) &&
+ test 1 = $n'
+
test_done
--
1.7.0.rc0.170.g7207c
--
Shawn.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Nicolas Pitre @ 2010-01-29 2:33 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Junio C Hamano, git
On Thu, 28 Jan 2010, Shawn O. Pearce wrote:
> If a blob is larger than the configured big-file-threshold, instead
> of reading it into a single buffer obtained from malloc, stream it
> onto the end of the current pack file. Streaming the larger objects
> into the pack avoids the 4+ GiB memory footprint that occurs when
> fast-import is processing 2+ GiB blobs.
Yeah. I've had that item on my todo list for ages now. This
big-file-threshold principle has to be applied to 'git add' too so a big
blob is stored in pack file form right away, and used to bypass delta
searching in 'git pack-objects', used to skip the diff machinery, and so
on.
Nicolas
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-01-29 2:37 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
Nicolas Pitre <nico@fluxnic.net> wrote:
> On Thu, 28 Jan 2010, Shawn O. Pearce wrote:
>
> > If a blob is larger than the configured big-file-threshold, instead
> > of reading it into a single buffer obtained from malloc, stream it
> > onto the end of the current pack file. Streaming the larger objects
> > into the pack avoids the 4+ GiB memory footprint that occurs when
> > fast-import is processing 2+ GiB blobs.
>
> Yeah. I've had that item on my todo list for ages now. This
> big-file-threshold principle has to be applied to 'git add' too so a big
> blob is stored in pack file form right away, and used to bypass delta
> searching in 'git pack-objects', used to skip the diff machinery, and so
> on.
Yea, there are a lot of places we should improve for bigger files.
gfi just happened to be the first one I got a bug report on from
a user...
--
Shawn.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-01-29 5:29 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: git, Nicolas Pitre
"Shawn O. Pearce" <spearce@spearce.org> writes:
> +static void stream_blob(
> + uintmax_t len,
> + unsigned char *sha1out,
> + uintmax_t mark)
A funny way to indent and line wrap...
> +{
> + ...
> + /* Determine if we should auto-checkpoint. */
> + if ((pack_size + 60 + len) > max_packsize
> + || (pack_size + 60 + len) < pack_size)
> + cycle_packfile();
What's "60" in this math?
If the data is not compressible, we could even grow and the end result
might be more than (pack_size + len), busting max_packsize. As we are
streaming out, we cannot say "oops, let me try again after truncating and
closing the current file and then opening a new file", and instead may
have to copy the data from the current one to a new one, and truncate the
current one. Is this something worth worrying about?
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-01-29 15:22 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git, Nicolas Pitre
Junio C Hamano <gitster@pobox.com> wrote:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
>
> > +static void stream_blob(
> > + uintmax_t len,
> > + unsigned char *sha1out,
> > + uintmax_t mark)
>
> A funny way to indent and line wrap...
Consistent with something else in the same file that has a similar
task... store_object(). I literally just copied the lines from
there and pasted here.
> > + /* Determine if we should auto-checkpoint. */
> > + if ((pack_size + 60 + len) > max_packsize
> > + || (pack_size + 60 + len) < pack_size)
> > + cycle_packfile();
>
> What's "60" in this math?
IIRC, 60 is 20 bytes for the SHA-1 footer, and another 40 bytes of padding
for the object header and the deflate() wrapping.
Again, the 60 was stolen from the existing store_object(), which
already makes the same assumption. Only there, I think, len is already
the fully compressed size, so the deflate() wrapping is already
accounted for.
> If the data is not compressible, we could even grow and the end result
> might be more than (pack_size + len), busting max_packsize.
Yea, that's a good point. I probably should run the length through
deflateBound() and use that here in the test. But I didn't think it
would make a huge difference: if the file isn't compressible, what
is the real increment over the uncompressed size? There's a header
and footer, and instructions in the stream to indicate literal data,
but it's not like it's doubling in size.
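For illustration only, here is a minimal sketch of what a deflateBound()-based
check could look like. The helper name and the 40-byte header allowance are
assumptions made for the sketch; the patch as posted keeps the fixed 60-byte
slack instead.

#include <string.h>
#include <stdint.h>
#include <sys/types.h>
#include <zlib.h>

/*
 * Illustrative sketch only, not the posted patch: ask zlib for a
 * worst-case deflated size before deciding whether to cycle to a new
 * packfile, instead of relying on a fixed slack.
 */
static int would_bust_pack(off_t pack_size, off_t max_packsize,
			   uintmax_t len, int compression_level)
{
	z_stream s;
	uLong bound;

	memset(&s, 0, sizeof(s));
	deflateInit(&s, compression_level);
	bound = deflateBound(&s, (uLong)len);	/* worst-case deflated size */
	deflateEnd(&s);

	/* 20-byte SHA-1 pack trailer plus room for the in-pack object header */
	return pack_size + 40 + (off_t)bound > max_packsize;
}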
> As we are
> streaming out, we cannot say "oops, let me try again after truncating and
> closing the current file and then opening a new file", and instead may
> have to copy the data from the current one to a new one, and truncate the
> current one. Is this something worth worrying about?
Hmm. I'm not that worried about it, but then there's the case of
a blob that is larger than the max pack size. We can't store that,
period. Should we try to exceed max pack size for that one object?
Or should we refuse?
The major reason for this test is to ensure an object offset
starts before the 4 GiB boundary so idx v1 can hold the offset.
A much less important reason is to try to support older 32-bit
filesystems, which can't do more than 2 GiB per file.
--
Shawn.
* [PATCH v2] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-01-29 16:38 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git, Nicolas Pitre
If a blob is larger than the configured big-file-threshold, instead
of reading it into a single buffer obtained from malloc, stream it
onto the end of the current pack file. Streaming the larger objects
into the pack avoids the 4+ GiB memory footprint that occurs when
fast-import is processing 2+ GiB blobs.
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
Fixed a missing skip_optional_lf() at the end of the large blob
command case.
Documentation/git-fast-import.txt | 6 ++
fast-import.c | 172 +++++++++++++++++++++++++++++++++----
t/t5705-clone-2gb.sh | 2 +-
t/t9300-fast-import.sh | 46 ++++++++++
4 files changed, 208 insertions(+), 18 deletions(-)
diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt
index e6d364f..16d3596 100644
--- a/Documentation/git-fast-import.txt
+++ b/Documentation/git-fast-import.txt
@@ -50,6 +50,12 @@ OPTIONS
importers may wish to lower this, such as to ensure the
resulting packfiles fit on CDs.
+--big-file-threshold=<n>::
+ Maximum size of a blob that fast-import will attempt to
+ create a delta for, expressed in MiB. The default is 512.
+ Some importers may wish to lower this on systems with
+ constrained memory.
+
--depth=<n>::
Maximum delta depth, for blob and tree deltification.
Default is 10.
diff --git a/fast-import.c b/fast-import.c
index 60d0aa2..8055f73 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -280,6 +280,7 @@ struct recent_command
/* Configured limits on output */
static unsigned long max_depth = 10;
static off_t max_packsize = (1LL << 32) - 1;
+static uintmax_t big_file_threshold = 512 * 1024 * 1024;
static int force_update;
static int pack_compression_level = Z_DEFAULT_COMPRESSION;
static int pack_compression_seen;
@@ -1003,7 +1004,7 @@ static void cycle_packfile(void)
static size_t encode_header(
enum object_type type,
- size_t size,
+ uintmax_t size,
unsigned char *hdr)
{
int n = 1;
@@ -1159,6 +1160,118 @@ static int store_object(
return 0;
}
+static void truncate_pack(off_t to)
+{
+ if (ftruncate(pack_data->pack_fd, to)
+ || lseek(pack_data->pack_fd, to, SEEK_SET) != to)
+ die_errno("cannot truncate pack to skip duplicate");
+ pack_size = to;
+}
+
+static void stream_blob(uintmax_t len, unsigned char *sha1out, uintmax_t mark)
+{
+ size_t in_sz = 64 * 1024, out_sz = 64 * 1024;
+ unsigned char *in_buf = xmalloc(in_sz);
+ unsigned char *out_buf = xmalloc(out_sz);
+ struct object_entry *e;
+ unsigned char sha1[20];
+ unsigned long hdrlen;
+ off_t offset;
+ git_SHA_CTX c;
+ z_stream s;
+ int status = Z_OK;
+
+ /* Determine if we should auto-checkpoint. */
+ if ((pack_size + 60 + len) > max_packsize
+ || (pack_size + 60 + len) < pack_size)
+ cycle_packfile();
+
+ offset = pack_size;
+
+ hdrlen = snprintf((char *)out_buf, out_sz, "blob %" PRIuMAX, len) + 1;
+ if (out_sz <= hdrlen)
+ die("impossibly large object header");
+
+ git_SHA1_Init(&c);
+ git_SHA1_Update(&c, out_buf, hdrlen);
+
+ memset(&s, 0, sizeof(s));
+ deflateInit(&s, pack_compression_level);
+
+ hdrlen = encode_header(OBJ_BLOB, len, out_buf);
+ if (out_sz <= hdrlen)
+ die("impossibly large object header");
+
+ s.next_out = out_buf + hdrlen;
+ s.avail_out = out_sz - hdrlen;
+
+ while (status != Z_STREAM_END) {
+ if (0 < len && !s.avail_in) {
+ size_t cnt = in_sz < len ? in_sz : (size_t)len;
+ size_t n = fread(in_buf, 1, cnt, stdin);
+ if (!n && feof(stdin))
+ die("EOF in data (%" PRIuMAX " bytes remaining)", len);
+
+ git_SHA1_Update(&c, in_buf, n);
+ s.next_in = in_buf;
+ s.avail_in = n;
+ len -= n;
+ }
+
+ status = deflate(&s, len ? 0 : Z_FINISH);
+
+ if (!s.avail_out || status == Z_STREAM_END) {
+ size_t n = s.next_out - out_buf;
+ write_or_die(pack_data->pack_fd, out_buf, n);
+ pack_size += n;
+ s.next_out = out_buf;
+ s.avail_out = out_sz;
+ }
+
+ switch (status) {
+ case Z_OK:
+ case Z_BUF_ERROR:
+ case Z_STREAM_END:
+ continue;
+ default:
+ die("unexpected deflate failure: %d", status);
+ }
+ }
+ deflateEnd(&s);
+ git_SHA1_Final(sha1, &c);
+
+ if (sha1out)
+ hashcpy(sha1out, sha1);
+
+ e = insert_object(sha1);
+
+ if (mark)
+ insert_mark(mark, e);
+
+ if (e->offset) {
+ duplicate_count_by_type[OBJ_BLOB]++;
+ truncate_pack(offset);
+
+ } else if (find_sha1_pack(sha1, packed_git)) {
+ e->type = OBJ_BLOB;
+ e->pack_id = MAX_PACK_ID;
+ e->offset = 1; /* just not zero! */
+ duplicate_count_by_type[OBJ_BLOB]++;
+ truncate_pack(offset);
+
+ } else {
+ e->depth = 0;
+ e->type = OBJ_BLOB;
+ e->pack_id = pack_id;
+ e->offset = offset;
+ object_count++;
+ object_count_by_type[OBJ_BLOB]++;
+ }
+
+ free(in_buf);
+ free(out_buf);
+}
+
/* All calls must be guarded by find_object() or find_mark() to
* ensure the 'struct object_entry' passed was written by this
* process instance. We unpack the entry by the offset, avoiding
@@ -1704,7 +1817,7 @@ static void parse_mark(void)
next_mark = 0;
}
-static void parse_data(struct strbuf *sb)
+static int parse_data(struct strbuf *sb, uintmax_t limit, uintmax_t *len_res)
{
strbuf_reset(sb);
@@ -1728,9 +1841,15 @@ static void parse_data(struct strbuf *sb)
free(term);
}
else {
- size_t n = 0, length;
+ uintmax_t len = strtoumax(command_buf.buf + 5, NULL, 10);
+ size_t n = 0, length = (size_t)len;
- length = strtoul(command_buf.buf + 5, NULL, 10);
+ if (limit && limit < len) {
+ *len_res = len;
+ return 0;
+ }
+ if (length < len)
+ die("data is too large to use in this context");
while (n < length) {
size_t s = strbuf_fread(sb, length - n, stdin);
@@ -1742,6 +1861,7 @@ static void parse_data(struct strbuf *sb)
}
skip_optional_lf();
+ return 1;
}
static int validate_raw_date(const char *src, char *result, int maxlen)
@@ -1806,14 +1926,32 @@ static char *parse_ident(const char *buf)
return ident;
}
-static void parse_new_blob(void)
+static void parse_and_store_blob(
+ struct last_object *last,
+ unsigned char *sha1out,
+ uintmax_t mark)
{
static struct strbuf buf = STRBUF_INIT;
+ uintmax_t len;
+ if (parse_data(&buf, big_file_threshold, &len))
+ store_object(OBJ_BLOB, &buf, last, sha1out, mark);
+ else {
+ if (last) {
+ strbuf_release(&last->data);
+ last->offset = 0;
+ last->depth = 0;
+ }
+ stream_blob(len, sha1out, mark);
+ skip_optional_lf();
+ }
+}
+
+static void parse_new_blob(void)
+{
read_next_command();
parse_mark();
- parse_data(&buf);
- store_object(OBJ_BLOB, &buf, &last_blob, NULL, next_mark);
+ parse_and_store_blob(&last_blob, NULL, next_mark);
}
static void unload_one_branch(void)
@@ -1924,15 +2062,12 @@ static void file_change_m(struct branch *b)
* another repository.
*/
} else if (inline_data) {
- static struct strbuf buf = STRBUF_INIT;
-
if (p != uq.buf) {
strbuf_addstr(&uq, p);
p = uq.buf;
}
read_next_command();
- parse_data(&buf);
- store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0);
+ parse_and_store_blob(&last_blob, sha1, 0);
} else if (oe) {
if (oe->type != OBJ_BLOB)
die("Not a blob (actually a %s): %s",
@@ -2058,15 +2193,12 @@ static void note_change_n(struct branch *b)
die("Invalid ref name or SHA1 expression: %s", p);
if (inline_data) {
- static struct strbuf buf = STRBUF_INIT;
-
if (p != uq.buf) {
strbuf_addstr(&uq, p);
p = uq.buf;
}
read_next_command();
- parse_data(&buf);
- store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0);
+ parse_and_store_blob(&last_blob, sha1, 0);
} else if (oe) {
if (oe->type != OBJ_BLOB)
die("Not a blob (actually a %s): %s",
@@ -2232,7 +2364,7 @@ static void parse_new_commit(void)
}
if (!committer)
die("Expected committer but didn't get one");
- parse_data(&msg);
+ parse_data(&msg, 0, NULL);
read_next_command();
parse_from(b);
merge_list = parse_merge(&merge_count);
@@ -2353,7 +2485,7 @@ static void parse_new_tag(void)
tagger = NULL;
/* tag payload/message */
- parse_data(&msg);
+ parse_data(&msg, 0, NULL);
/* build the tag object */
strbuf_reset(&new_data);
@@ -2473,6 +2605,10 @@ static int git_pack_config(const char *k, const char *v, void *cb)
pack_compression_seen = 1;
return 0;
}
+ if (!strcmp(k, "core.bigfilethreshold")) {
+ long n = git_config_int(k, v);
+ big_file_threshold = 0 < n ? n : 0;
+ }
return git_default_config(k, v, cb);
}
@@ -2518,6 +2654,8 @@ int main(int argc, const char **argv)
}
else if (!prefixcmp(a, "--max-pack-size="))
max_packsize = strtoumax(a + 16, NULL, 0) * 1024 * 1024;
+ else if (!prefixcmp(a, "--big-file-threshold="))
+ big_file_threshold = strtoumax(a + 21, NULL, 0) * 1024 * 1024;
else if (!prefixcmp(a, "--depth=")) {
max_depth = strtoul(a + 8, NULL, 0);
if (max_depth > MAX_DEPTH)
diff --git a/t/t5705-clone-2gb.sh b/t/t5705-clone-2gb.sh
index 9f52154..adfaae8 100755
--- a/t/t5705-clone-2gb.sh
+++ b/t/t5705-clone-2gb.sh
@@ -31,7 +31,7 @@ test_expect_success 'setup' '
echo "data 5" &&
echo ">2gb" &&
cat commit) |
- git fast-import &&
+ git fast-import --big-file-threshold=2 &&
test ! -f exit-status
'
diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh
index b49815d..513db86 100755
--- a/t/t9300-fast-import.sh
+++ b/t/t9300-fast-import.sh
@@ -1254,4 +1254,50 @@ test_expect_success \
'Q: verify note for third commit' \
'git cat-file blob refs/notes/foobar:$commit3 >actual && test_cmp expect actual'
+##
+## R: very large blobs
+##
+blobsize=$((2*1024*1024 + 53))
+test-genrandom bar $blobsize >expect
+cat >input <<INPUT_END
+commit refs/heads/big-file
+committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
+data <<COMMIT
+R - big file
+COMMIT
+
+M 644 inline big1
+data $blobsize
+INPUT_END
+cat expect >>input
+cat >>input <<INPUT_END
+M 644 inline big2
+data $blobsize
+INPUT_END
+cat expect >>input
+echo >>input
+
+test_expect_success \
+ 'R: blob bigger than threshold' \
+ 'test_create_repo R &&
+ git --git-dir=R/.git fast-import --big-file-threshold=1 <input'
+test_expect_success \
+ 'R: verify created pack' \
+ ': >verify &&
+ for p in R/.git/objects/pack/*.pack;
+ do
+ git verify-pack -v $p >>verify || exit;
+ done'
+test_expect_success \
+ 'R: verify written objects' \
+ 'git --git-dir=R/.git cat-file blob big-file:big1 >actual &&
+ test_cmp expect actual &&
+ a=$(git --git-dir=R/.git rev-parse big-file:big1) &&
+ b=$(git --git-dir=R/.git rev-parse big-file:big2) &&
+ test $a = $b'
+test_expect_success \
+ 'R: blob appears only once' \
+ 'n=$(grep $a verify | wc -l) &&
+ test 1 = $n'
+
test_done
--
1.7.0.rc0.170.g7207c
--
Shawn.
* Re: [PATCH v2] fast-import: Stream very large blobs directly to pack
From: Jakub Narebski @ 2010-01-29 18:29 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Junio C Hamano, git, Nicolas Pitre
"Shawn O. Pearce" <spearce@spearce.org> writes:
> Documentation/git-fast-import.txt | 6 ++
> fast-import.c | 172 +++++++++++++++++++++++++++++++++----
> t/t5705-clone-2gb.sh | 2 +-
> t/t9300-fast-import.sh | 46 ++++++++++
> 4 files changed, 208 insertions(+), 18 deletions(-)
> +--big-file-threshold=<n>::
> + Maximum size of a blob that fast-import will attempt to
> + create a delta for, expressed in MiB. The default is 512.
> + Some importers may wish to lower this on systems with
> + constrained memory.
> +
Shouldn't there be a config variable corresponding to this command-line
option, so that it can be set once for a [constrained] system?
--
Jakub Narebski
Poland
ShadeHawk on #git
* Re: [PATCH v2] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-01-29 18:30 UTC (permalink / raw)
To: Jakub Narebski; +Cc: Junio C Hamano, git, Nicolas Pitre
Jakub Narebski <jnareb@gmail.com> wrote:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
>
> > Documentation/git-fast-import.txt | 6 ++
> > fast-import.c | 172 +++++++++++++++++++++++++++++++++----
> > t/t5705-clone-2gb.sh | 2 +-
> > t/t9300-fast-import.sh | 46 ++++++++++
> > 4 files changed, 208 insertions(+), 18 deletions(-)
>
> > +--big-file-threshold=<n>::
> > + Maximum size of a blob that fast-import will attempt to
> > + create a delta for, expressed in MiB. The default is 512.
> > + Some importers may wish to lower this on systems with
> > + constrained memory.
> > +
>
> Shouldn't there be a config variable corresponding to this command-line
> option, so that it can be set once for a [constrained] system?
Implemented as core.bigFileThreshold in this patch... but I didn't
document it...
--
Shawn.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Sverre Rabbelier @ 2010-01-29 18:35 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Junio C Hamano, git, Nicolas Pitre
Heya,
On Fri, Jan 29, 2010 at 02:23, Shawn O. Pearce <spearce@spearce.org> wrote:
> index 60d0aa2..8114f93 100644
Looks like you based it off an old version of fast-import, at least
not on what is in master atm, since that has my fast-import series.
Rebasing shouldn't be too hard, you'll just have to move the option
parsing to the appropriate function.
--
Cheers,
Sverre Rabbelier
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-01-29 18:37 UTC (permalink / raw)
To: Sverre Rabbelier; +Cc: Junio C Hamano, git, Nicolas Pitre
Sverre Rabbelier <srabbelier@gmail.com> wrote:
> On Fri, Jan 29, 2010 at 02:23, Shawn O. Pearce <spearce@spearce.org> wrote:
> > index 60d0aa2..8114f93 100644
>
> Looks like you based it off an old version of fast-import, at least
> not on what is in master atm, since that has my fast-import series.
> Rebasing shouldn't be too hard, you'll just have to move the option
> parsing to the appropriate function.
I was intentionally slating this for maint, to fix a bug a user
reported when handling large streams.
But yea... I guess I also owe Junio a rebased form for master so
he has fewer merge conflicts to resolve.
--
Shawn.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Sverre Rabbelier @ 2010-01-29 18:41 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Junio C Hamano, git, Nicolas Pitre
Heya,
On Fri, Jan 29, 2010 at 19:37, Shawn O. Pearce <spearce@spearce.org> wrote:
> I was intentionally slating this for maint, to fix a bug a user
> reported when handling large streams.
Ah, sorry, did not notice that.
Speaking of which, how does one figure out what commit a patch
is/could be based on? Couldn't we include that in the part behind the
triple dash?
--
Cheers,
Sverre Rabbelier
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-01-29 18:44 UTC (permalink / raw)
To: Sverre Rabbelier; +Cc: Junio C Hamano, git, Nicolas Pitre
Sverre Rabbelier <srabbelier@gmail.com> wrote:
> On Fri, Jan 29, 2010 at 19:37, Shawn O. Pearce <spearce@spearce.org> wrote:
> > I was intentionally slating this for maint, to fix a bug a user
> > reported when handling large streams.
>
> Ah, sorry, did not notice that.
>
> Speaking of which, how does one figure out what commit a patch
> is/could be based on? Couldn't we include that in the part behind the
> triple dash?
You have to tell the other guy in your cover letter section. You can
try to guess, if the index information is accurate, by grepping
around a log output looking for that blob sha-1 and then finding
what branch that's in. It's not pretty.
IIRC other DVCS systems include the ancestor identity in their
patch output. Git doesn't bother, because in a multi-patch series
the 2nd (and subsequent) patches would have useless id information.
--
Shawn.
* Re: [PATCH v2] fast-import: Stream very large blobs directly to pack
From: A Large Angry SCM @ 2010-01-29 23:02 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: git
Shawn O. Pearce wrote:
...
> Implemented as core.bigFileThreshold in this patch... but I didn't
> document it...
Bad dog! No biscuit!
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-01-30 3:41 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Sverre Rabbelier, Junio C Hamano, git, Nicolas Pitre
"Shawn O. Pearce" <spearce@spearce.org> writes:
> Sverre Rabbelier <srabbelier@gmail.com> wrote:
>> On Fri, Jan 29, 2010 at 02:23, Shawn O. Pearce <spearce@spearce.org> wrote:
>> > index 60d0aa2..8114f93 100644
>>
>> Looks like you based it off an old version of fast-import, at least
>> not on what is in master atm, since that has my fast-import series.
>> Rebasing shouldn't be too hard, you'll just have to move the option
>> parsing to the appropriate function.
>
> I was intentionally slating this for maint, to fix a bug a user
> reported when handling large streams.
I personally see that as adding a new feature (especially with new option
and config).
> But yea... I guess I also owe Junio a rebased form for master so
> he has fewer merge conflicts to resolve.
Thanks.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-01-30 6:19 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Sverre Rabbelier, git, Nicolas Pitre
Junio C Hamano <gitster@pobox.com> writes:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
>
>> I was intentionally slating this for maint, to fix a bug a user
>> reported when handling large streams.
>
> I personally see that as adding a new feature (especially with new option
> and config).
Sorry, but I take it back. The new codepath triggers even without any
explicit request and _fixes_ the situation where old code simply failed,
so it is worth queuing for the maintenance track.
Do you want to do the deflatebound thing, or are we Ok without?
* Re: [PATCH v2] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-01-30 7:17 UTC (permalink / raw)
To: gitzilla; +Cc: Shawn O. Pearce, git
A Large Angry SCM <gitzilla@gmail.com> writes:
> Shawn O. Pearce wrote:
> ...
>> Implemented as core.bigFileThreshold in this patch... but I didn't
>> document it...
>
> Bad dog! No biscuit!
:-)
Patches welcome.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-01-30 7:33 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Sverre Rabbelier, git, Nicolas Pitre
I've queued this with the result of my conflict resolution to 'pu' for now,
but please double-check after I push it out.
You may want to add the new option to the output from "cmd -h", and
probably a description of the configuration in the doc, before any of this
gets official.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-02-01 15:23 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Sverre Rabbelier, git, Nicolas Pitre
Junio C Hamano <gitster@pobox.com> wrote:
> Junio C Hamano <gitster@pobox.com> writes:
> > "Shawn O. Pearce" <spearce@spearce.org> writes:
> >
> >> I was intentionally slating this for maint, to fix a bug a user
> >> reported when handling large streams.
> >
> > I personally see that as adding a new feature (especially with new option
> > and config).
>
> Sorry, but I take it back. The new codepath triggers even without any
> explicit request and _fixes_ the situation where old code simply failed,
> so it is worth queuing for the maintenance track.
Thanks.
> Do you want to do the deflatebound thing, or are we Ok without?
No, I think we're OK without it. The pack size limit stuff is only
a rough guess anyway. I never meant for us to be sticking to it to
within a byte. It was only meant to keep us from shoving a 100 MiB
blob onto the end of a pack once we got close to the size requested
by the user.
--
Shawn.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-02-01 15:28 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Sverre Rabbelier, git, Nicolas Pitre
Junio C Hamano <gitster@pobox.com> wrote:
> I've queued this with the result of my conflict resolution to 'pu' for now,
> but please double-check after I push it out.
The strtoumax call got messed up. Squash this into your merge:
diff --git a/fast-import.c b/fast-import.c
index e6ebcf6..9c65a24 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -2800,7 +2800,7 @@ static int parse_one_option(const char *option)
if (!prefixcmp(option, "max-pack-size=")) {
option_max_pack_size(option + 14);
} else if (!prefixcmp(option, "big-file-threshold=")) {
- big_file_threshold = strtoumax(option + 21, NULL, 0) * 1024 * 1024;
+ big_file_threshold = strtoumax(option + 19, NULL, 0) * 1024 * 1024;
} else if (!prefixcmp(option, "depth=")) {
option_depth(option + 6);
} else if (!prefixcmp(option, "active-branches=")) {
> You may want to add the new option to the output from "cmd -h", and
> probably a description of the configuration in the doc, before any of this
> gets official.
I'll send an additional patch in a minute with these documentation
related updates.
--
Shawn.
* [PATCH] fast-import: Document the core.bigFileThreshold configuration setting
From: Shawn O. Pearce @ 2010-02-01 15:41 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Sverre Rabbelier, git, Nicolas Pitre
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
---
Junio C Hamano <gitster@pobox.com> wrote:
> You may want to add the new option to the output from "cmd -h", and
> probably a description of the configuration in the doc, before any of this
> gets official.
Doc patch. :-)
Feel free to squash, or to apply as its own change, I don't care
either way.
Documentation/config.txt | 14 ++++++++++++++
fast-import.c | 2 +-
2 files changed, 15 insertions(+), 1 deletions(-)
diff --git a/Documentation/config.txt b/Documentation/config.txt
index f7728ec..b16a20b 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -409,6 +409,20 @@ You probably do not need to adjust this value.
+
Common unit suffixes of 'k', 'm', or 'g' are supported.
+core.bigFileThreshold::
+ Files larger than this size are stored deflated, without
+ attempting delta compression. Storing large files without
+ delta compression avoids excessive memory usage, at the
+ slight expense of increased disk usage.
++
+Default is 512 MiB on all platforms. This should be reasonable
+for most projects as source code and other text files can still
+be delta compressed, but larger binary media files won't be.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
++
+Currently only linkgit:git-fast-import[1] honors this setting.
+
core.excludesfile::
In addition to '.gitignore' (per-directory) and
'.git/info/exclude', git looks into this file for patterns
diff --git a/fast-import.c b/fast-import.c
index 8055f73..1bcf3bb 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -2613,7 +2613,7 @@ static int git_pack_config(const char *k, const char *v, void *cb)
}
static const char fast_import_usage[] =
-"git fast-import [--date-format=f] [--max-pack-size=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file]";
+"git fast-import [--date-format=f] [--max-pack-size=n] [--big-file-threshold=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file]";
int main(int argc, const char **argv)
{
--
1.7.0.rc0.170.g7207c
--
Shawn.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-02-01 20:14 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Junio C Hamano, Sverre Rabbelier, git, Nicolas Pitre
"Shawn O. Pearce" <spearce@spearce.org> writes:
> Junio C Hamano <gitster@pobox.com> wrote:
>> I'll queued this with a result of my conflict resolution to 'pu' for now
>> but please double check after I push it out.
>
> The strtoumax call got messed up. Squash this into your merge:
>
> diff --git a/fast-import.c b/fast-import.c
> index e6ebcf6..9c65a24 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -2800,7 +2800,7 @@ static int parse_one_option(const char *option)
> if (!prefixcmp(option, "max-pack-size=")) {
> option_max_pack_size(option + 14);
> } else if (!prefixcmp(option, "big-file-threshold=")) {
> - big_file_threshold = strtoumax(option + 21, NULL, 0) * 1024 * 1024;
> + big_file_threshold = strtoumax(option + 19, NULL, 0) * 1024 * 1024;
> } else if (!prefixcmp(option, "depth=")) {
> option_depth(option + 6);
> } else if (!prefixcmp(option, "active-branches=")) {
Ah, of course. Sorry for not being careful and thanks for pointing it
out.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-02-04 2:01 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Junio C Hamano, Sverre Rabbelier, git, Nicolas Pitre
"Shawn O. Pearce" <spearce@spearce.org> writes:
> The strtoumax call got messed up. Squash this into your merge:
>
> diff --git a/fast-import.c b/fast-import.c
> index e6ebcf6..9c65a24 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -2800,7 +2800,7 @@ static int parse_one_option(const char *option)
> if (!prefixcmp(option, "max-pack-size=")) {
> option_max_pack_size(option + 14);
> } else if (!prefixcmp(option, "big-file-threshold=")) {
> - big_file_threshold = strtoumax(option + 21, NULL, 0) * 1024 * 1024;
> + big_file_threshold = strtoumax(option + 19, NULL, 0) * 1024 * 1024;
> } else if (!prefixcmp(option, "depth=")) {
> option_depth(option + 6);
> } else if (!prefixcmp(option, "active-branches=")) {
>
>> You may want to add the new option to the output from "cmd -h" and
>> probably description of the configuration in the doc before any of this
>> gets official.
>
> I'll send an additional patch in a minute with these documentation
> related updates.
Well, well, well....
The documentation says this is counted in bytes, but somehow neither of us
found the above " * 1024 * 1024" suspicious.
Shouldn't it be at least like this? It would probably be a good idea to
use git_parse_ulong() or somesuch while we are at it.
fast-import.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/fast-import.c b/fast-import.c
index ca21082..ea1ac0f 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -2800,7 +2800,7 @@ static int parse_one_option(const char *option)
if (!prefixcmp(option, "max-pack-size=")) {
option_max_pack_size(option + 14);
} else if (!prefixcmp(option, "big-file-threshold=")) {
- big_file_threshold = strtoumax(option + 19, NULL, 0) * 1024 * 1024;
+ big_file_threshold = strtoumax(option + 19, NULL, 0);
} else if (!prefixcmp(option, "depth=")) {
option_depth(option + 6);
} else if (!prefixcmp(option, "active-branches=")) {
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
1 sibling, 1 reply; 27+ messages in thread
From: Shawn O. Pearce @ 2010-02-04 2:07 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Sverre Rabbelier, git, Nicolas Pitre
Junio C Hamano <gitster@pobox.com> wrote:
>
> Well, well, well....
>
> The documentation says this is counted in bytes, but somehow neither of us
> found the above " * 1024 * 1024" suspicious.
>
> Shouldn't it be at least like this? It would probably be a good idea to
> use git_parse_ulong() or somesuch while we are at it.
>
> fast-import.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/fast-import.c b/fast-import.c
> index ca21082..ea1ac0f 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -2800,7 +2800,7 @@ static int parse_one_option(const char *option)
> if (!prefixcmp(option, "max-pack-size=")) {
> option_max_pack_size(option + 14);
> } else if (!prefixcmp(option, "big-file-threshold=")) {
> - big_file_threshold = strtoumax(option + 19, NULL, 0) * 1024 * 1024;
> + big_file_threshold = strtoumax(option + 19, NULL, 0);
In my v3 patch I thought I replaced this code with:
+ else if (!prefixcmp(a, "--big-file-threshold=")) {
+ unsigned long v;
+ if (!git_parse_ulong(a + 21, &v))
+ usage(fast_import_usage);
+ big_file_threshold = v;
So we relied on git_parse_ulong to handle unit suffixes as well.
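For illustration only, a standalone sketch of the kind of unit-suffix
handling git_parse_ulong() provides follows; this hypothetical helper is not
git's implementation.

#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in: accept "512", "512k", "512m", or "512g". */
static int parse_size_with_suffix(const char *arg, unsigned long *out)
{
	char *end;
	unsigned long v = strtoul(arg, &end, 10);

	if (!*end)
		;				/* plain bytes, no suffix */
	else if (!strcmp(end, "k"))
		v *= 1024;
	else if (!strcmp(end, "m"))
		v *= 1024 * 1024;
	else if (!strcmp(end, "g"))
		v *= 1024 * 1024 * 1024;	/* overflow check omitted for brevity */
	else
		return 0;			/* unrecognized suffix */

	*out = v;
	return 1;
}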
--
Shawn.
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-02-04 2:25 UTC (permalink / raw)
To: Shawn O. Pearce; +Cc: Junio C Hamano, Sverre Rabbelier, git, Nicolas Pitre
"Shawn O. Pearce" <spearce@spearce.org> writes:
> In my v3 patch I thought I replaced this code with:
>
> + else if (!prefixcmp(a, "--big-file-threshold=")) {
> + unsigned long v;
> + if (!git_parse_ulong(a + 21, &v))
> + usage(fast_import_usage);
> + big_file_threshold = v;
>
> So we relied on git_parse_ulong to handle unit suffixes as well.
Yeah, you did; but it didn't carry through the merge process across the
code restructure that added the "option_blah" stuff. Sorry about that.
Looking at the output from
$ git grep -n -e ' \* 1024 \* 1024' -- '*.c'
I noticed another issue. Don't we need the same thing for max_packsize?
Or is that _too much_ of a backward-incompatible change, one that should
wait until 1.7.1?
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Junio C Hamano @ 2010-02-04 2:27 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Shawn O. Pearce, Sverre Rabbelier, git, Nicolas Pitre
Junio C Hamano <gitster@pobox.com> writes:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
>
>> In my v3 patch I thought I replaced this code with:
>>
>> + else if (!prefixcmp(a, "--big-file-threshold=")) {
>> + unsigned long v;
>> + if (!git_parse_ulong(a + 21, &v))
>> + usage(fast_import_usage);
>> + big_file_threshold = v;
>>
>> So we relied on git_parse_ulong to handle unit suffixes as well.
>
> Yeah, you did; but it didn't carried through the merge process across the
> code restructure to add "option_blah" stuff. Sorry about that.
And this is a fix-up for the mismerge. I didn't touch max-pack-size stuff.
fast-import.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/fast-import.c b/fast-import.c
index ca21082..a6730d0 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -2800,7 +2800,10 @@ static int parse_one_option(const char *option)
if (!prefixcmp(option, "max-pack-size=")) {
option_max_pack_size(option + 14);
} else if (!prefixcmp(option, "big-file-threshold=")) {
- big_file_threshold = strtoumax(option + 19, NULL, 0) * 1024 * 1024;
+ unsigned long v;
+ if (!git_parse_ulong(option + 19, &v))
+ return 0;
+ big_file_threshold = v;
} else if (!prefixcmp(option, "depth=")) {
option_depth(option + 6);
} else if (!prefixcmp(option, "active-branches=")) {
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Nicolas Pitre @ 2010-02-04 2:28 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Shawn O. Pearce, Sverre Rabbelier, git
On Wed, 3 Feb 2010, Junio C Hamano wrote:
> "Shawn O. Pearce" <spearce@spearce.org> writes:
>
> > The strtoumax call got messed up. Squash this into your merge:
> >
> > diff --git a/fast-import.c b/fast-import.c
> > index e6ebcf6..9c65a24 100644
> > --- a/fast-import.c
> > +++ b/fast-import.c
> > @@ -2800,7 +2800,7 @@ static int parse_one_option(const char *option)
> > if (!prefixcmp(option, "max-pack-size=")) {
> > option_max_pack_size(option + 14);
> > } else if (!prefixcmp(option, "big-file-threshold=")) {
> > - big_file_threshold = strtoumax(option + 21, NULL, 0) * 1024 * 1024;
> > + big_file_threshold = strtoumax(option + 19, NULL, 0) * 1024 * 1024;
> > } else if (!prefixcmp(option, "depth=")) {
> > option_depth(option + 6);
> > } else if (!prefixcmp(option, "active-branches=")) {
> >
> >> You may want to add the new option to the output from "cmd -h" and
> >> probably description of the configuration in the doc before any of this
> >> gets official.
> >
> > I'll send an additional patch in a minute with these documentation
> > related updates.
>
> Well, well, well....
>
> The documentation says this is counted in bytes, but somehow neither of us
> found the above " * 1024 * 1024" suspicious.
>
> Shouldn't it be at least like this? It would probably be a good idea to
> use git_parse_ulong() or somesuch while we are at it.
Yes, definitely. I'm about to post a patch moving --max-pack-size in
that direction too. I just had to fix a couple other unsuspected issues
to get there though. Patches will follow shortly.
Nicolas
* Re: [PATCH] fast-import: Stream very large blobs directly to pack
From: Shawn O. Pearce @ 2010-02-04 2:30 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Sverre Rabbelier, git, Nicolas Pitre
Junio C Hamano <gitster@pobox.com> wrote:
> And this is a fix-up for the mismerge. I didn't touch max-pack-size stuff.
>
> fast-import.c | 5 ++++-
> 1 files changed, 4 insertions(+), 1 deletions(-)
>
> diff --git a/fast-import.c b/fast-import.c
> index ca21082..a6730d0 100644
> --- a/fast-import.c
> +++ b/fast-import.c
> @@ -2800,7 +2800,10 @@ static int parse_one_option(const char *option)
> if (!prefixcmp(option, "max-pack-size=")) {
> option_max_pack_size(option + 14);
> } else if (!prefixcmp(option, "big-file-threshold=")) {
> - big_file_threshold = strtoumax(option + 19, NULL, 0) * 1024 * 1024;
> + unsigned long v;
> + if (!git_parse_ulong(option + 19, &v))
> + return 0;
> + big_file_threshold = v;
Yup, looks good to me.
--
Shawn.