* [PATCH v3 1/6] bulk-checkin: replace fast-import based implementation
From: Junio C Hamano @ 2011-12-02 0:40 UTC (permalink / raw)
To: git
This extends the earlier approach to stream a large file directly from the
filesystem to its own packfile, and allows "git add" to send large files
directly into a single pack. Older code used to spawn fast-import, but the
new bulk-checkin API replaces it.
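
As a rough illustration of the calling convention the new API offers (a
minimal sketch, not code from the patch; in this patch the only real caller
is index_stream() in sha1_file.c, which "git add" reaches through
index_fd(), and "path" stands in for a work-tree file):

    unsigned char sha1[20];
    struct stat st;
    int fd = open(path, O_RDONLY);

    if (fd < 0 || fstat(fd, &st) < 0)
            die_errno("cannot open '%s'", path);

    /* While "plugged", consecutive check-ins go into one packfile */
    plug_bulk_checkin();
    if (index_bulk_checkin(sha1, fd, st.st_size, OBJ_BLOB,
                           path, HASH_WRITE_OBJECT))
            die("unable to check in '%s'", path);
    close(fd);
    unplug_bulk_checkin();  /* flush and finalize the packfile */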
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
Makefile | 2 +
builtin/add.c | 5 +
builtin/pack-objects.c | 6 +-
bulk-checkin.c | 275 ++++++++++++++++++++++++++++++++++++++++++++++++
bulk-checkin.h | 16 +++
cache.h | 2 +
config.c | 4 +
environment.c | 1 +
sha1_file.c | 67 +-----------
t/t1050-large.sh | 94 +++++++++++++++--
zlib.c | 9 ++-
11 files changed, 403 insertions(+), 78 deletions(-)
create mode 100644 bulk-checkin.c
create mode 100644 bulk-checkin.h
diff --git a/Makefile b/Makefile
index 3139c19..418dd2e 100644
--- a/Makefile
+++ b/Makefile
@@ -505,6 +505,7 @@ LIB_H += argv-array.h
LIB_H += attr.h
LIB_H += blob.h
LIB_H += builtin.h
+LIB_H += bulk-checkin.h
LIB_H += cache.h
LIB_H += cache-tree.h
LIB_H += color.h
@@ -591,6 +592,7 @@ LIB_OBJS += base85.o
LIB_OBJS += bisect.o
LIB_OBJS += blob.o
LIB_OBJS += branch.o
+LIB_OBJS += bulk-checkin.o
LIB_OBJS += bundle.o
LIB_OBJS += cache-tree.o
LIB_OBJS += color.o
diff --git a/builtin/add.c b/builtin/add.c
index c59b0c9..1c42900 100644
--- a/builtin/add.c
+++ b/builtin/add.c
@@ -13,6 +13,7 @@
#include "diff.h"
#include "diffcore.h"
#include "revision.h"
+#include "bulk-checkin.h"
static const char * const builtin_add_usage[] = {
"git add [options] [--] <filepattern>...",
@@ -458,11 +459,15 @@ int cmd_add(int argc, const char **argv, const char *prefix)
free(seen);
}
+ plug_bulk_checkin();
+
exit_status |= add_files_to_cache(prefix, pathspec, flags);
if (add_new_files)
exit_status |= add_files(&dir, flags);
+ unplug_bulk_checkin();
+
finish:
if (active_cache_changed) {
if (write_cache(newfd, active_cache, active_nr) ||
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b458b6d..dde913e 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -76,7 +76,7 @@ static struct pack_idx_option pack_idx_opts;
static const char *base_name;
static int progress = 1;
static int window = 10;
-static unsigned long pack_size_limit, pack_size_limit_cfg;
+static unsigned long pack_size_limit;
static int depth = 50;
static int delta_search_threads;
static int pack_to_stdout;
@@ -2009,10 +2009,6 @@ static int git_pack_config(const char *k, const char *v, void *cb)
pack_idx_opts.version);
return 0;
}
- if (!strcmp(k, "pack.packsizelimit")) {
- pack_size_limit_cfg = git_config_ulong(k, v);
- return 0;
- }
return git_default_config(k, v, cb);
}
diff --git a/bulk-checkin.c b/bulk-checkin.c
new file mode 100644
index 0000000..6b0b6d4
--- /dev/null
+++ b/bulk-checkin.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2011, Google Inc.
+ */
+#include "bulk-checkin.h"
+#include "csum-file.h"
+#include "pack.h"
+
+static int pack_compression_level = Z_DEFAULT_COMPRESSION;
+
+static struct bulk_checkin_state {
+ unsigned plugged:1;
+
+ char *pack_tmp_name;
+ struct sha1file *f;
+ off_t offset;
+ struct pack_idx_option pack_idx_opts;
+
+ struct pack_idx_entry **written;
+ uint32_t alloc_written;
+ uint32_t nr_written;
+} state;
+
+static void finish_bulk_checkin(struct bulk_checkin_state *state)
+{
+ unsigned char sha1[20];
+ char packname[PATH_MAX];
+ int i;
+
+ if (!state->f)
+ return;
+
+ if (state->nr_written == 0) {
+ close(state->f->fd);
+ unlink(state->pack_tmp_name);
+ goto clear_exit;
+ } else if (state->nr_written == 1) {
+ sha1close(state->f, sha1, CSUM_FSYNC);
+ } else {
+ int fd = sha1close(state->f, sha1, 0);
+ fixup_pack_header_footer(fd, sha1, state->pack_tmp_name,
+ state->nr_written, sha1,
+ state->offset);
+ close(fd);
+ }
+
+ sprintf(packname, "%s/pack/pack-", get_object_directory());
+ finish_tmp_packfile(packname, state->pack_tmp_name,
+ state->written, state->nr_written,
+ &state->pack_idx_opts, sha1);
+ for (i = 0; i < state->nr_written; i++)
+ free(state->written[i]);
+
+clear_exit:
+ free(state->written);
+ memset(state, 0, sizeof(*state));
+
+ /* Make objects we just wrote available to ourselves */
+ reprepare_packed_git();
+}
+
+static int already_written(struct bulk_checkin_state *state, unsigned char sha1[])
+{
+ int i;
+
+ /* The object may already exist in the repository */
+ if (has_sha1_file(sha1))
+ return 1;
+
+ /* Might want to keep the list sorted */
+ for (i = 0; i < state->nr_written; i++)
+ if (!hashcmp(state->written[i]->sha1, sha1))
+ return 1;
+
+ /* This is a new object we need to keep */
+ return 0;
+}
+
+/*
+ * Read the contents from fd for size bytes, streaming it to the
+ * packfile in state while updating the hash in ctx. Signal a failure
+ * by returning a negative value when the resulting pack would exceed
+ * the pack size limit and this is not the first object in the pack,
+ * so that the caller can discard what we wrote from the current pack
+ * by truncating it and opening a new one. The caller will then call
+ * us again after rewinding the input fd.
+ *
+ * The already_hashed_to pointer is kept untouched by the caller to
+ * make sure we do not hash the same byte when we are called
+ * again. This way, the caller does not have to checkpoint its hash
+ * status before calling us just in case we ask it to call us again
+ * with a new pack.
+ */
+static int stream_to_pack(struct bulk_checkin_state *state,
+ git_SHA_CTX *ctx, off_t *already_hashed_to,
+ int fd, size_t size, enum object_type type,
+ const char *path, unsigned flags)
+{
+ git_zstream s;
+ unsigned char obuf[16384];
+ unsigned hdrlen;
+ int status = Z_OK;
+ int write_object = (flags & HASH_WRITE_OBJECT);
+ off_t offset = 0;
+
+ memset(&s, 0, sizeof(s));
+ git_deflate_init(&s, pack_compression_level);
+
+ hdrlen = encode_in_pack_object_header(type, size, obuf);
+ s.next_out = obuf + hdrlen;
+ s.avail_out = sizeof(obuf) - hdrlen;
+
+ while (status != Z_STREAM_END) {
+ unsigned char ibuf[16384];
+
+ if (size && !s.avail_in) {
+ ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
+ if (xread(fd, ibuf, rsize) != rsize)
+ die("failed to read %d bytes from '%s'",
+ (int)rsize, path);
+ offset += rsize;
+ if (*already_hashed_to < offset) {
+ size_t hsize = offset - *already_hashed_to;
+ if (rsize < hsize)
+ hsize = rsize;
+ if (hsize)
+ git_SHA1_Update(ctx, ibuf, hsize);
+ *already_hashed_to = offset;
+ }
+ s.next_in = ibuf;
+ s.avail_in = rsize;
+ size -= rsize;
+ }
+
+ status = git_deflate(&s, size ? 0 : Z_FINISH);
+
+ if (!s.avail_out || status == Z_STREAM_END) {
+ if (write_object) {
+ size_t written = s.next_out - obuf;
+
+ /* would we bust the size limit? */
+ if (state->nr_written &&
+ pack_size_limit_cfg &&
+ pack_size_limit_cfg < state->offset + written) {
+ git_deflate_abort(&s);
+ return -1;
+ }
+
+ sha1write(state->f, obuf, written);
+ state->offset += written;
+ }
+ s.next_out = obuf;
+ s.avail_out = sizeof(obuf);
+ }
+
+ switch (status) {
+ case Z_OK:
+ case Z_BUF_ERROR:
+ case Z_STREAM_END:
+ continue;
+ default:
+ die("unexpected deflate failure: %d", status);
+ }
+ }
+ git_deflate_end(&s);
+ return 0;
+}
+
+/* Lazily create backing packfile for the state */
+static void prepare_to_stream(struct bulk_checkin_state *state,
+ unsigned flags)
+{
+ if (!(flags & HASH_WRITE_OBJECT) || state->f)
+ return;
+
+ state->f = create_tmp_packfile(&state->pack_tmp_name);
+ reset_pack_idx_option(&state->pack_idx_opts);
+
+ /* Pretend we are going to write only one object */
+ state->offset = write_pack_header(state->f, 1);
+ if (!state->offset)
+ die_errno("unable to write pack header");
+}
+
+static int deflate_to_pack(struct bulk_checkin_state *state,
+ unsigned char result_sha1[],
+ int fd, size_t size,
+ enum object_type type, const char *path,
+ unsigned flags)
+{
+ off_t seekback, already_hashed_to;
+ git_SHA_CTX ctx;
+ unsigned char obuf[16384];
+ unsigned header_len;
+ struct sha1file_checkpoint checkpoint;
+ struct pack_idx_entry *idx = NULL;
+
+ seekback = lseek(fd, 0, SEEK_CUR);
+ if (seekback == (off_t) -1)
+ return error("cannot find the current offset");
+
+ header_len = sprintf((char *)obuf, "%s %" PRIuMAX,
+ typename(type), (uintmax_t)size) + 1;
+ git_SHA1_Init(&ctx);
+ git_SHA1_Update(&ctx, obuf, header_len);
+
+ /* Note: idx is non-NULL when we are writing */
+ if ((flags & HASH_WRITE_OBJECT) != 0)
+ idx = xcalloc(1, sizeof(*idx));
+
+ already_hashed_to = 0;
+
+ while (1) {
+ prepare_to_stream(state, flags);
+ if (idx) {
+ sha1file_checkpoint(state->f, &checkpoint);
+ idx->offset = state->offset;
+ crc32_begin(state->f);
+ }
+ if (!stream_to_pack(state, &ctx, &already_hashed_to,
+ fd, size, type, path, flags))
+ break;
+ /*
+ * Writing this object to the current pack will make
+ * it too big; we need to truncate it, start a new
+ * pack, and write into it.
+ */
+ if (!idx)
+ die("BUG: should not happen");
+ sha1file_truncate(state->f, &checkpoint);
+ state->offset = checkpoint.offset;
+ finish_bulk_checkin(state);
+ if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
+ return error("cannot seek back");
+ }
+ git_SHA1_Final(result_sha1, &ctx);
+ if (!idx)
+ return 0;
+
+ idx->crc32 = crc32_end(state->f);
+ if (already_written(state, result_sha1)) {
+ sha1file_truncate(state->f, &checkpoint);
+ state->offset = checkpoint.offset;
+ free(idx);
+ } else {
+ hashcpy(idx->sha1, result_sha1);
+ ALLOC_GROW(state->written,
+ state->nr_written + 1,
+ state->alloc_written);
+ state->written[state->nr_written++] = idx;
+ }
+ return 0;
+}
+
+int index_bulk_checkin(unsigned char *sha1,
+ int fd, size_t size, enum object_type type,
+ const char *path, unsigned flags)
+{
+ int status = deflate_to_pack(&state, sha1, fd, size, type,
+ path, flags);
+ if (!state.plugged)
+ finish_bulk_checkin(&state);
+ return status;
+}
+
+void plug_bulk_checkin(void)
+{
+ state.plugged = 1;
+}
+
+void unplug_bulk_checkin(void)
+{
+ state.plugged = 0;
+ if (state.f)
+ finish_bulk_checkin(&state);
+}
diff --git a/bulk-checkin.h b/bulk-checkin.h
new file mode 100644
index 0000000..4f599f8
--- /dev/null
+++ b/bulk-checkin.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2011, Google Inc.
+ */
+#ifndef BULK_CHECKIN_H
+#define BULK_CHECKIN_H
+
+#include "cache.h"
+
+extern int index_bulk_checkin(unsigned char sha1[],
+ int fd, size_t size, enum object_type type,
+ const char *path, unsigned flags);
+
+extern void plug_bulk_checkin(void);
+extern void unplug_bulk_checkin(void);
+
+#endif
diff --git a/cache.h b/cache.h
index 2e6ad36..4f20861 100644
--- a/cache.h
+++ b/cache.h
@@ -35,6 +35,7 @@ int git_inflate(git_zstream *, int flush);
void git_deflate_init(git_zstream *, int level);
void git_deflate_init_gzip(git_zstream *, int level);
void git_deflate_end(git_zstream *);
+int git_deflate_abort(git_zstream *);
int git_deflate_end_gently(git_zstream *);
int git_deflate(git_zstream *, int flush);
unsigned long git_deflate_bound(git_zstream *, unsigned long);
@@ -598,6 +599,7 @@ extern size_t packed_git_window_size;
extern size_t packed_git_limit;
extern size_t delta_base_cache_limit;
extern unsigned long big_file_threshold;
+extern unsigned long pack_size_limit_cfg;
extern int read_replace_refs;
extern int fsync_object_files;
extern int core_preload_index;
diff --git a/config.c b/config.c
index edf9914..c736802 100644
--- a/config.c
+++ b/config.c
@@ -797,6 +797,10 @@ int git_default_config(const char *var, const char *value, void *dummy)
return 0;
}
+ if (!strcmp(var, "pack.packsizelimit")) {
+ pack_size_limit_cfg = git_config_ulong(var, value);
+ return 0;
+ }
/* Add other config variables here and to Documentation/config.txt. */
return 0;
}
diff --git a/environment.c b/environment.c
index 0bee6a7..31e4284 100644
--- a/environment.c
+++ b/environment.c
@@ -60,6 +60,7 @@ char *notes_ref_name;
int grafts_replace_parents = 1;
int core_apply_sparse_checkout;
struct startup_info *startup_info;
+unsigned long pack_size_limit_cfg;
/* Parallel index stat data preload? */
int core_preload_index = 0;
diff --git a/sha1_file.c b/sha1_file.c
index 27f3b9b..c96e366 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -18,6 +18,7 @@
#include "refs.h"
#include "pack-revindex.h"
#include "sha1-lookup.h"
+#include "bulk-checkin.h"
#ifndef O_NOATIME
#if defined(__linux__) && (defined(__i386__) || defined(__PPC__))
@@ -2679,10 +2680,8 @@ static int index_core(unsigned char *sha1, int fd, size_t size,
}
/*
- * This creates one packfile per large blob, because the caller
- * immediately wants the result sha1, and fast-import can report the
- * object name via marks mechanism only by closing the created
- * packfile.
+ * This creates one packfile per large blob unless bulk-checkin
+ * machinery is "plugged".
*
* This also bypasses the usual "convert-to-git" dance, and that is on
* purpose. We could write a streaming version of the converting
@@ -2696,65 +2695,7 @@ static int index_stream(unsigned char *sha1, int fd, size_t size,
enum object_type type, const char *path,
unsigned flags)
{
- struct child_process fast_import;
- char export_marks[512];
- const char *argv[] = { "fast-import", "--quiet", export_marks, NULL };
- char tmpfile[512];
- char fast_import_cmd[512];
- char buf[512];
- int len, tmpfd;
-
- strcpy(tmpfile, git_path("hashstream_XXXXXX"));
- tmpfd = git_mkstemp_mode(tmpfile, 0600);
- if (tmpfd < 0)
- die_errno("cannot create tempfile: %s", tmpfile);
- if (close(tmpfd))
- die_errno("cannot close tempfile: %s", tmpfile);
- sprintf(export_marks, "--export-marks=%s", tmpfile);
-
- memset(&fast_import, 0, sizeof(fast_import));
- fast_import.in = -1;
- fast_import.argv = argv;
- fast_import.git_cmd = 1;
- if (start_command(&fast_import))
- die_errno("index-stream: git fast-import failed");
-
- len = sprintf(fast_import_cmd, "blob\nmark :1\ndata %lu\n",
- (unsigned long) size);
- write_or_whine(fast_import.in, fast_import_cmd, len,
- "index-stream: feeding fast-import");
- while (size) {
- char buf[10240];
- size_t sz = size < sizeof(buf) ? size : sizeof(buf);
- ssize_t actual;
-
- actual = read_in_full(fd, buf, sz);
- if (actual < 0)
- die_errno("index-stream: reading input");
- if (write_in_full(fast_import.in, buf, actual) != actual)
- die_errno("index-stream: feeding fast-import");
- size -= actual;
- }
- if (close(fast_import.in))
- die_errno("index-stream: closing fast-import");
- if (finish_command(&fast_import))
- die_errno("index-stream: finishing fast-import");
-
- tmpfd = open(tmpfile, O_RDONLY);
- if (tmpfd < 0)
- die_errno("index-stream: cannot open fast-import mark");
- len = read(tmpfd, buf, sizeof(buf));
- if (len < 0)
- die_errno("index-stream: reading fast-import mark");
- if (close(tmpfd) < 0)
- die_errno("index-stream: closing fast-import mark");
- if (unlink(tmpfile))
- die_errno("index-stream: unlinking fast-import mark");
- if (len != 44 ||
- memcmp(":1 ", buf, 3) ||
- get_sha1_hex(buf + 3, sha1))
- die_errno("index-stream: unexpected fast-import mark: <%s>", buf);
- return 0;
+ return index_bulk_checkin(sha1, fd, size, type, path, flags);
}
int index_fd(unsigned char *sha1, int fd, struct stat *st,
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index deba111..29d6024 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -7,21 +7,97 @@ test_description='adding and checking out large blobs'
test_expect_success setup '
git config core.bigfilethreshold 200k &&
- echo X | dd of=large bs=1k seek=2000
+ echo X | dd of=large1 bs=1k seek=2000 &&
+ echo X | dd of=large2 bs=1k seek=2000 &&
+ echo X | dd of=large3 bs=1k seek=2000 &&
+ echo Y | dd of=huge bs=1k seek=2500
'
-test_expect_success 'add a large file' '
- git add large &&
- # make sure we got a packfile and no loose objects
- test -f .git/objects/pack/pack-*.pack &&
- test ! -f .git/objects/??/??????????????????????????????????????
+test_expect_success 'add a large file or two' '
+ git add large1 huge large2 &&
+ # make sure we got a single packfile and no loose objects
+ bad= count=0 idx= &&
+ for p in .git/objects/pack/pack-*.pack
+ do
+ count=$(( $count + 1 ))
+ if test -f "$p" && idx=${p%.pack}.idx && test -f "$idx"
+ then
+ continue
+ fi
+ bad=t
+ done &&
+ test -z "$bad" &&
+ test $count = 1 &&
+ cnt=$(git show-index <"$idx" | wc -l) &&
+ test $cnt = 2 &&
+ for l in .git/objects/??/??????????????????????????????????????
+ do
+ test -f "$l" || continue
+ bad=t
+ done &&
+ test -z "$bad" &&
+
+ # attempt to add another copy of the same
+ git add large3 &&
+ bad= count=0 &&
+ for p in .git/objects/pack/pack-*.pack
+ do
+ count=$(( $count + 1 ))
+ if test -f "$p" && idx=${p%.pack}.idx && test -f "$idx"
+ then
+ continue
+ fi
+ bad=t
+ done &&
+ test -z "$bad" &&
+ test $count = 1
'
test_expect_success 'checkout a large file' '
- large=$(git rev-parse :large) &&
- git update-index --add --cacheinfo 100644 $large another &&
+ large1=$(git rev-parse :large1) &&
+ git update-index --add --cacheinfo 100644 $large1 another &&
git checkout another &&
- cmp large another ;# this must not be test_cmp
+ cmp large1 another ;# this must not be test_cmp
+'
+
+test_expect_success 'packsize limit' '
+ test_create_repo mid &&
+ (
+ cd mid &&
+ git config core.bigfilethreshold 64k &&
+ git config pack.packsizelimit 256k &&
+
+ # mid1 and mid2 will fit within 256k limit but
+ # appending mid3 will bust the limit and will
+ # result in a separate packfile.
+ test-genrandom "a" $(( 66 * 1024 )) >mid1 &&
+ test-genrandom "b" $(( 80 * 1024 )) >mid2 &&
+ test-genrandom "c" $(( 128 * 1024 )) >mid3 &&
+ git add mid1 mid2 mid3 &&
+
+ count=0
+ for pi in .git/objects/pack/pack-*.idx
+ do
+ test -f "$pi" && count=$(( $count + 1 ))
+ done &&
+ test $count = 2 &&
+
+ (
+ git hash-object --stdin <mid1
+ git hash-object --stdin <mid2
+ git hash-object --stdin <mid3
+ ) |
+ sort >expect &&
+
+ for pi in .git/objects/pack/pack-*.idx
+ do
+ git show-index <"$pi"
+ done |
+ sed -e "s/^[0-9]* \([0-9a-f]*\) .*/\1/" |
+ sort >actual &&
+
+ test_cmp expect actual
+ )
'
test_done
diff --git a/zlib.c b/zlib.c
index 3c63d48..2b2c0c7 100644
--- a/zlib.c
+++ b/zlib.c
@@ -188,13 +188,20 @@ void git_deflate_init_gzip(git_zstream *strm, int level)
strm->z.msg ? strm->z.msg : "no message");
}
-void git_deflate_end(git_zstream *strm)
+int git_deflate_abort(git_zstream *strm)
{
int status;
zlib_pre_call(strm);
status = deflateEnd(&strm->z);
zlib_post_call(strm);
+ return status;
+}
+
+void git_deflate_end(git_zstream *strm)
+{
+ int status = git_deflate_abort(strm);
+
if (status == Z_OK)
return;
error("deflateEnd: %s (%s)", zerr_to_string(status),
--
1.7.8.rc4.177.g4d64
* [PATCH v3 2/6] varint-in-pack: refactor varint encoding/decoding
From: Junio C Hamano @ 2011-12-02 0:40 UTC (permalink / raw)
To: git
Extract the variable-length integer encoding and decoding logic used by the
OFS_DELTA codepaths into new encode_in_pack_varint() and
decode_in_pack_varint() helpers, so that other code can read and write
varints in the pack stream.
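
For example, with this encoding the value 1000 round-trips as two bytes (an
illustrative snippet using the new helpers, not part of the patch):

    unsigned char buf[16];
    const unsigned char *p = buf;
    int len;

    /* buf[] becomes { 0x86, 0x68 }: 0x86 carries (1000 >> 7) - 1 with the
     * continuation bit set, 0x68 carries 1000 & 127 */
    len = encode_in_pack_varint(1000, buf);            /* len == 2 */
    if (decode_in_pack_varint(&p) != 1000 || p != buf + len)
            die("varint round-trip broke");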
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
builtin/pack-objects.c | 28 +++++++++++++---------------
pack-write.c | 27 +++++++++++++++++++++++++++
pack.h | 2 ++
sha1_file.c | 18 ++++++------------
4 files changed, 48 insertions(+), 27 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index dde913e..72206a9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -210,7 +210,7 @@ static unsigned long write_object(struct sha1file *f,
{
unsigned long size, limit, datalen;
void *buf;
- unsigned char header[10], dheader[10];
+ unsigned char header[10];
unsigned hdrlen;
enum object_type type;
int usable_delta, to_reuse;
@@ -304,17 +304,16 @@ static unsigned long write_object(struct sha1file *f,
* base from this object's position in the pack.
*/
off_t ofs = entry->idx.offset - entry->delta->idx.offset;
- unsigned pos = sizeof(dheader) - 1;
- dheader[pos] = ofs & 127;
- while (ofs >>= 7)
- dheader[--pos] = 128 | (--ofs & 127);
- if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
+ unsigned char dheader[10];
+ unsigned pos = encode_in_pack_varint(ofs, dheader);
+
+ if (limit && hdrlen + pos + datalen + 20 >= limit) {
free(buf);
return 0;
}
sha1write(f, header, hdrlen);
- sha1write(f, dheader + pos, sizeof(dheader) - pos);
- hdrlen += sizeof(dheader) - pos;
+ sha1write(f, dheader, pos);
+ hdrlen += pos;
} else if (type == OBJ_REF_DELTA) {
/*
* Deltas with a base reference contain
@@ -369,17 +368,16 @@ static unsigned long write_object(struct sha1file *f,
if (type == OBJ_OFS_DELTA) {
off_t ofs = entry->idx.offset - entry->delta->idx.offset;
- unsigned pos = sizeof(dheader) - 1;
- dheader[pos] = ofs & 127;
- while (ofs >>= 7)
- dheader[--pos] = 128 | (--ofs & 127);
- if (limit && hdrlen + sizeof(dheader) - pos + datalen + 20 >= limit) {
+ unsigned char dheader[10];
+ unsigned pos = encode_in_pack_varint(ofs, dheader);
+
+ if (limit && hdrlen + pos + datalen + 20 >= limit) {
unuse_pack(&w_curs);
return 0;
}
sha1write(f, header, hdrlen);
- sha1write(f, dheader + pos, sizeof(dheader) - pos);
- hdrlen += sizeof(dheader) - pos;
+ sha1write(f, dheader, pos);
+ hdrlen += pos;
reused_delta++;
} else if (type == OBJ_REF_DELTA) {
if (limit && hdrlen + 20 + datalen + 20 >= limit) {
diff --git a/pack-write.c b/pack-write.c
index cadc3e1..5702cec 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -302,6 +302,33 @@ char *index_pack_lockfile(int ip_out)
return NULL;
}
+uintmax_t decode_in_pack_varint(const unsigned char **bufp)
+{
+ const unsigned char *buf = *bufp;
+ unsigned char c = *buf++;
+ uintmax_t val = c & 127;
+ while (c & 128) {
+ val += 1;
+ if (!val || MSB(val, 7))
+ return 0; /* overflow */
+ c = *buf++;
+ val = (val << 7) + (c & 127);
+ }
+ *bufp = buf;
+ return val;
+}
+
+int encode_in_pack_varint(uintmax_t value, unsigned char *buf)
+{
+ unsigned char varint[16];
+ unsigned pos = sizeof(varint) - 1;
+ varint[pos] = value & 127;
+ while (value >>= 7)
+ varint[--pos] = 128 | (--value & 127);
+ memcpy(buf, varint + pos, sizeof(varint) - pos);
+ return sizeof(varint) - pos;
+}
+
/*
* The per-object header is a pretty dense thing, which is
* - first byte: low four bits are "size", then three bits of "type",
diff --git a/pack.h b/pack.h
index cfb0f69..d7dc6ca 100644
--- a/pack.h
+++ b/pack.h
@@ -79,6 +79,8 @@ extern off_t write_pack_header(struct sha1file *f, uint32_t);
extern void fixup_pack_header_footer(int, unsigned char *, const char *, uint32_t, unsigned char *, off_t);
extern char *index_pack_lockfile(int fd);
extern int encode_in_pack_object_header(enum object_type, uintmax_t, unsigned char *);
+extern int encode_in_pack_varint(uintmax_t, unsigned char *);
+extern uintmax_t decode_in_pack_varint(const unsigned char **);
#define PH_ERROR_EOF (-1)
#define PH_ERROR_PACK_SIGNATURE (-2)
diff --git a/sha1_file.c b/sha1_file.c
index c96e366..f066c2b 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1484,20 +1484,14 @@ static off_t get_delta_base(struct packed_git *p,
* is stupid, as then a REF_DELTA would be smaller to store.
*/
if (type == OBJ_OFS_DELTA) {
- unsigned used = 0;
- unsigned char c = base_info[used++];
- base_offset = c & 127;
- while (c & 128) {
- base_offset += 1;
- if (!base_offset || MSB(base_offset, 7))
- return 0; /* overflow */
- c = base_info[used++];
- base_offset = (base_offset << 7) + (c & 127);
- }
- base_offset = delta_obj_offset - base_offset;
+ const unsigned char *buf = base_info;
+ uintmax_t ofs = decode_in_pack_varint(&buf);
+ if (!ofs && buf == base_info)
+ return 0; /* overflow */
+ base_offset = delta_obj_offset - ofs;
if (base_offset <= 0 || base_offset >= delta_obj_offset)
return 0; /* out of bound */
- *curpos += used;
+ *curpos += buf - base_info;
} else if (type == OBJ_REF_DELTA) {
/* The base entry _must_ be in the same pack */
base_offset = find_pack_entry_one(base_info, p);
--
1.7.8.rc4.177.g4d64
* [PATCH v3 3/6] new representation types in the packstream
From: Junio C Hamano @ 2011-12-02 0:40 UTC (permalink / raw)
To: git
In addition to four basic types (commit, tree, blob and tag), the pack
stream can encode a few other "representation" types, such as REF_DELTA
and OFS_DELTA. As we allocate 3 bits in the first byte for this purpose,
we do not have much room to add new representation types in place, but we
do have one value reserved for future expansion.
When encoding a new representation type, the early part of the in-pack
object header is encoded as if its type were OBJ_EXT (= 5), in exactly the
same way as before. That is, the lower 4 bits of the first byte carry the
lowest 4 bits of the size information, the next 3 bits carry the type
information, and the MSB says whether subsequent bytes encode higher bits
of the size information.
An in-pack object header that records OBJ_EXT as the type is followed by
an integer in the same variable-length encoding used for OFS_DELTA offsets.
This value is the real type of the representation minus 8 (as we do not
need to use OBJ_EXT to encode types smaller than 8). Because we do not
foresee very many representation types, in practice this would be a single
byte with its MSB clear, representing types 8-135.
The code does not use type=8 and upwards for anything yet.
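
A worked example of the resulting byte sequence (illustrative only; with
this patch alone a type-8 object is still rejected by the
OBJ_LAST_VALID_TYPE check, so this assumes a later patch raises that
limit):

    /*
     * encode_in_pack_object_header(8, 300, hdr) would emit three bytes:
     *
     *   0xdc  MSB set (more size bytes follow), OBJ_EXT (5) in bits 6-4,
     *         low 4 bits of the size (300 & 15 == 12)
     *   0x12  remaining size bits (300 >> 4 == 18), MSB clear
     *   0x00  varint for the real type minus 8 (8 - 8 == 0)
     *
     * unpack_object_header_buffer() sees OBJ_EXT, decodes the trailing
     * varint, and reports type 8 with size 300.
     */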
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
cache.h | 4 +++-
pack-write.c | 23 +++++++++++++++++------
sha1_file.c | 11 +++++++++++
3 files changed, 31 insertions(+), 7 deletions(-)
diff --git a/cache.h b/cache.h
index 4f20861..4a3b421 100644
--- a/cache.h
+++ b/cache.h
@@ -381,12 +381,14 @@ enum object_type {
OBJ_TREE = 2,
OBJ_BLOB = 3,
OBJ_TAG = 4,
- /* 5 for future expansion */
+ OBJ_EXT = 5,
OBJ_OFS_DELTA = 6,
OBJ_REF_DELTA = 7,
OBJ_ANY,
OBJ_MAX
};
+#define OBJ_LAST_BASE_TYPE OBJ_REF_DELTA
+#define OBJ_LAST_VALID_TYPE OBJ_REF_DELTA
static inline enum object_type object_type(unsigned int mode)
{
diff --git a/pack-write.c b/pack-write.c
index 5702cec..9309dd1 100644
--- a/pack-write.c
+++ b/pack-write.c
@@ -338,22 +338,33 @@ int encode_in_pack_varint(uintmax_t value, unsigned char *buf)
*/
int encode_in_pack_object_header(enum object_type type, uintmax_t size, unsigned char *hdr)
{
- int n = 1;
+ unsigned char *hdr_base;
unsigned char c;
+ enum object_type header_type;
- if (type < OBJ_COMMIT || type > OBJ_REF_DELTA)
+ if (type < OBJ_COMMIT || OBJ_LAST_VALID_TYPE < type)
die("bad type %d", type);
+ else if (OBJ_LAST_BASE_TYPE < type)
+ header_type = OBJ_EXT;
+ else
+ header_type = type;
- c = (type << 4) | (size & 15);
+ c = (header_type << 4) | (size & 15);
size >>= 4;
+ hdr_base = hdr;
while (size) {
*hdr++ = c | 0x80;
c = size & 0x7f;
size >>= 7;
- n++;
}
- *hdr = c;
- return n;
+ *hdr++ = c;
+ if (header_type != type) {
+ int sz;
+ type = type - (OBJ_LAST_BASE_TYPE + 1);
+ sz = encode_in_pack_varint(type, hdr);
+ hdr += sz;
+ }
+ return hdr - hdr_base;
}
struct sha1file *create_tmp_packfile(char **pack_tmp_name)
diff --git a/sha1_file.c b/sha1_file.c
index f066c2b..14902cc 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1275,6 +1275,17 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf,
shift += 7;
}
*sizep = size;
+ if (*type == OBJ_EXT) {
+ const unsigned char *p = buf + used;
+ uintmax_t val = decode_in_pack_varint(&p);
+
+ if (p == buf + used && !val) {
+ error("bad extended object type");
+ return 0;
+ }
+ *type = val + (OBJ_LAST_BASE_TYPE + 1);
+ used = p - buf;
+ }
return used;
}
--
1.7.8.rc4.177.g4d64
* [PATCH v3 4/6] bulk-checkin: allow the same data to be multiply hashed
From: Junio C Hamano @ 2011-12-02 0:40 UTC (permalink / raw)
To: git
This updates the stream_to_pack() machinery to feed the data it is writing
out to multiple hash contexts at the same time. Right now we only use a
single git_SHA_CTX, so there is no change in functionality.
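
For illustration, this is how two contexts chain (a conceptual sketch, not
code from the patch; chunk_SHA1_Update() is file-local to bulk-checkin.c,
and buf/len stand in for the data being streamed):

    struct chunk_ctx parent, child;

    memset(&parent, 0, sizeof(parent));
    git_SHA1_Init(&parent.ctx);

    memset(&child, 0, sizeof(child));
    child.up = &parent;
    git_SHA1_Init(&child.ctx);

    /* feeds child.ctx first, then walks ->up and feeds parent.ctx */
    chunk_SHA1_Update(&child, buf, len);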
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
bulk-checkin.c | 33 +++++++++++++++++++++++++--------
1 files changed, 25 insertions(+), 8 deletions(-)
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6b0b6d4..6f1ce58 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -75,6 +75,20 @@ static int already_written(struct bulk_checkin_state *state, unsigned char sha1[
return 0;
}
+struct chunk_ctx {
+ struct chunk_ctx *up;
+ git_SHA_CTX ctx;
+};
+
+static void chunk_SHA1_Update(struct chunk_ctx *ctx,
+ const unsigned char *buf, size_t size)
+{
+ while (ctx) {
+ git_SHA1_Update(&ctx->ctx, buf, size);
+ ctx = ctx->up;
+ }
+}
+
/*
* Read the contents from fd for size bytes, streaming it to the
* packfile in state while updating the hash in ctx. Signal a failure
@@ -91,7 +105,7 @@ static int already_written(struct bulk_checkin_state *state, unsigned char sha1[
* with a new pack.
*/
static int stream_to_pack(struct bulk_checkin_state *state,
- git_SHA_CTX *ctx, off_t *already_hashed_to,
+ struct chunk_ctx *ctx, off_t *already_hashed_to,
int fd, size_t size, enum object_type type,
const char *path, unsigned flags)
{
@@ -123,7 +137,7 @@ static int stream_to_pack(struct bulk_checkin_state *state,
if (rsize < hsize)
hsize = rsize;
if (hsize)
- git_SHA1_Update(ctx, ibuf, hsize);
+ chunk_SHA1_Update(ctx, ibuf, hsize);
*already_hashed_to = offset;
}
s.next_in = ibuf;
@@ -185,10 +199,11 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
unsigned char result_sha1[],
int fd, size_t size,
enum object_type type, const char *path,
- unsigned flags)
+ unsigned flags,
+ struct chunk_ctx *up)
{
off_t seekback, already_hashed_to;
- git_SHA_CTX ctx;
+ struct chunk_ctx ctx;
unsigned char obuf[16384];
unsigned header_len;
struct sha1file_checkpoint checkpoint;
@@ -200,8 +215,10 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
header_len = sprintf((char *)obuf, "%s %" PRIuMAX,
typename(type), (uintmax_t)size) + 1;
- git_SHA1_Init(&ctx);
- git_SHA1_Update(&ctx, obuf, header_len);
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.up = up;
+ git_SHA1_Init(&ctx.ctx);
+ git_SHA1_Update(&ctx.ctx, obuf, header_len);
/* Note: idx is non-NULL when we are writing */
if ((flags & HASH_WRITE_OBJECT) != 0)
@@ -232,7 +249,7 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
return error("cannot seek back");
}
- git_SHA1_Final(result_sha1, &ctx);
+ git_SHA1_Final(result_sha1, &ctx.ctx);
if (!idx)
return 0;
@@ -256,7 +273,7 @@ int index_bulk_checkin(unsigned char *sha1,
const char *path, unsigned flags)
{
int status = deflate_to_pack(&state, sha1, fd, size, type,
- path, flags);
+ path, flags, NULL);
if (!state.plugged)
finish_bulk_checkin(&state);
return status;
--
1.7.8.rc4.177.g4d64
* [PATCH v3 5/6] bulk-checkin: support chunked-object encoding
From: Junio C Hamano @ 2011-12-02 0:40 UTC (permalink / raw)
To: git
Instead of recording a huge object as a single entry in the packfile,
record it as a concatenation of smaller blob objects. This is primarily to
make it possible to repack repositories with huge objects and transfer
huge objects out of such repositories.
This is the first step of a long and painful journey. We would still need
to teach many codepaths about the new encoding:
* The streaming checkout codepath must learn how to read these from the
object store and write a concatenation of component blob contents to
the working tree. Note that after a repack, a component blob object
could be represented as a delta to another blob object.
* The in-core object reading codepath must learn to notice and at least
reject reading such objects entirely in-core. It is expected that
nobody is interested in producing a patch out of these huge objects, at
least initially.
* The object connectivity machinery must learn that component blob
objects are reachable from an object that uses them, so that "gc"
will not lose them, and "fsck" will not complain about them.
* The pack-object machinery must learn to copy an object that is encoded
in chunked-object encoding literally to the destination (while perhaps
validating the object name).
* The index-pack and verify-pack machineries need to be told about it.
The split-chunk logic used here is kept deliberately simplistic in order to
avoid distracting the reviewers, and also to make sure that the chunked
encoding machinery does not depend on any particular chunk-splitting
heuristic. We may want to replace it with a better heuristic later, perhaps
the one used in "bup" that is based on a self-synchronizing rolling
checksum.
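
For reference, this is the on-pack layout store_in_chunks() in this patch
writes for a chunked blob (a schematic sketch, not code from the patch):

    /*
     *   <in-pack object header: OBJ_CHUNKED_BLOB, total uncompressed size>
     *   <varint: number of component chunks>
     *   <20-byte object name of component blob #1>
     *   ...
     *   <20-byte object name of component blob #N>
     *   <20-byte object name of the whole (logical) blob>
     *
     * The component blobs themselves are written as ordinary deflated
     * blob entries by deflate_to_pack() before this entry is emitted.
     */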
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
Makefile | 1 +
bulk-checkin.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
bulk-checkin.h | 1 +
cache.h | 9 +++-
config.c | 5 ++
environment.c | 1 +
split-chunk.c | 28 ++++++++++++
t/t1050-large.sh | 29 ++++++++++++
8 files changed, 198 insertions(+), 3 deletions(-)
create mode 100644 split-chunk.c
diff --git a/Makefile b/Makefile
index 418dd2e..7d2fc3a 100644
--- a/Makefile
+++ b/Makefile
@@ -679,6 +679,7 @@ LIB_OBJS += sha1_name.o
LIB_OBJS += shallow.o
LIB_OBJS += sideband.o
LIB_OBJS += sigchain.o
+LIB_OBJS += split-chunk.o
LIB_OBJS += strbuf.o
LIB_OBJS += streaming.o
LIB_OBJS += string-list.o
diff --git a/bulk-checkin.c b/bulk-checkin.c
index 6f1ce58..ebdacb8 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -5,6 +5,10 @@
#include "csum-file.h"
#include "pack.h"
+#ifndef CHUNK_MAX
+#define CHUNK_MAX 2000
+#endif
+
static int pack_compression_level = Z_DEFAULT_COMPRESSION;
static struct bulk_checkin_state {
@@ -268,12 +272,131 @@ static int deflate_to_pack(struct bulk_checkin_state *state,
return 0;
}
+/* This is only called when actually writing the object out */
+static int store_in_chunks(struct bulk_checkin_state *state,
+ unsigned char result_sha1[],
+ int fd, size_t size,
+ enum object_type type, const char *path,
+ unsigned flags,
+ struct chunk_ctx *up)
+{
+ struct pack_idx_entry *idx;
+ struct chunk_ctx ctx;
+ unsigned char chunk[CHUNK_MAX][20];
+ unsigned char header[100];
+ unsigned header_len;
+ unsigned chunk_cnt, i;
+ size_t remainder = size;
+ size_t write_size;
+ int status;
+
+ header_len = sprintf((char *)header, "%s %" PRIuMAX,
+ typename(type), (uintmax_t)size) + 1;
+
+ memset(&ctx, 0, sizeof(ctx));
+ ctx.up = up;
+ git_SHA1_Init(&ctx.ctx);
+ git_SHA1_Update(&ctx.ctx, header, header_len);
+
+ for (chunk_cnt = 0, remainder = size;
+ remainder && chunk_cnt < CHUNK_MAX - 1;
+ chunk_cnt++) {
+ size_t chunk_size = carve_chunk(fd, remainder);
+ status = deflate_to_pack(state, chunk[chunk_cnt], fd,
+ chunk_size, OBJ_BLOB, path, flags,
+ &ctx);
+ if (status)
+ return status;
+ remainder -= chunk_size;
+ }
+
+ if (remainder) {
+ if (split_size_limit_cfg && split_size_limit_cfg < remainder)
+ status = store_in_chunks(state, chunk[chunk_cnt], fd,
+ remainder, OBJ_BLOB, path, flags,
+ &ctx);
+ else
+ status = deflate_to_pack(state, chunk[chunk_cnt], fd,
+ remainder, OBJ_BLOB, path, flags,
+ &ctx);
+ if (status)
+ return status;
+ chunk_cnt++;
+ }
+
+ /*
+ * Now we have chunk_cnt chunks (the last one may be a large
+ * blob that itself is represented as concatenation of
+ * multiple blobs).
+ */
+ git_SHA1_Final(result_sha1, &ctx.ctx);
+ if (already_written(state, result_sha1))
+ return 0;
+
+ /*
+ * The standard type & size header is followed by
+ * - the number of chunks (varint)
+ * - the object names of the chunks (20 * chunk_cnt bytes)
+ * - the resulting object name (20 bytes)
+ */
+ type = OBJ_CHUNKED(type);
+ header_len = encode_in_pack_object_header(type, size, header);
+ header_len += encode_in_pack_varint(chunk_cnt, header + header_len);
+
+ write_size = header_len + 20 * (chunk_cnt + 1);
+
+ prepare_to_stream(state, flags);
+ if (state->nr_written &&
+ pack_size_limit_cfg &&
+ pack_size_limit_cfg < (state->offset + write_size)) {
+ finish_bulk_checkin(state);
+ prepare_to_stream(state, flags);
+ }
+
+ idx = xcalloc(1, sizeof(*idx));
+ idx->offset = state->offset;
+
+ crc32_begin(state->f);
+ sha1write(state->f, header, header_len);
+ for (i = 0; i < chunk_cnt; i++)
+ sha1write(state->f, chunk[i], 20);
+ sha1write(state->f, result_sha1, 20);
+ idx->crc32 = crc32_end(state->f);
+ hashcpy(idx->sha1, result_sha1);
+ ALLOC_GROW(state->written,
+ state->nr_written + 1,
+ state->alloc_written);
+ state->written[state->nr_written++] = idx;
+ state->offset += write_size;
+
+ return 0;
+}
+
int index_bulk_checkin(unsigned char *sha1,
int fd, size_t size, enum object_type type,
const char *path, unsigned flags)
{
- int status = deflate_to_pack(&state, sha1, fd, size, type,
- path, flags, NULL);
+ int status;
+
+ /*
+ * For now, we only deal with blob objects, as validation
+ * of a huge tree object that is split into chunks will be
+ * too cumbersome to be worth it.
+ *
+ * Note that we only have to use store_in_chunks() codepath
+ * when we are actually writing things out. deflate_to_pack()
+ * codepath can hash arbitrarily huge object without keeping
+ * everything in core just fine.
+ */
+ if ((flags & HASH_WRITE_OBJECT) &&
+ type == OBJ_BLOB &&
+ split_size_limit_cfg &&
+ split_size_limit_cfg < size)
+ status = store_in_chunks(&state, sha1, fd, size, type,
+ path, flags, NULL);
+ else
+ status = deflate_to_pack(&state, sha1, fd, size, type,
+ path, flags, NULL);
if (!state.plugged)
finish_bulk_checkin(&state);
return status;
diff --git a/bulk-checkin.h b/bulk-checkin.h
index 4f599f8..89f1741 100644
--- a/bulk-checkin.h
+++ b/bulk-checkin.h
@@ -12,5 +12,6 @@ extern int index_bulk_checkin(unsigned char sha1[],
extern void plug_bulk_checkin(void);
extern void unplug_bulk_checkin(void);
+extern size_t carve_chunk(int fd, size_t size);
#endif
diff --git a/cache.h b/cache.h
index 4a3b421..374f712 100644
--- a/cache.h
+++ b/cache.h
@@ -384,11 +384,17 @@ enum object_type {
OBJ_EXT = 5,
OBJ_OFS_DELTA = 6,
OBJ_REF_DELTA = 7,
+ /* OBJ_CHUNKED_COMMIT = 8 */
+ /* OBJ_CHUNKED_TREE = 9 */
+ OBJ_CHUNKED_BLOB = 10,
+ /* OBJ_CHUNKED_TAG = 11 */
OBJ_ANY,
OBJ_MAX
};
#define OBJ_LAST_BASE_TYPE OBJ_REF_DELTA
-#define OBJ_LAST_VALID_TYPE OBJ_REF_DELTA
+#define OBJ_LAST_VALID_TYPE OBJ_CHUNKED_BLOB
+#define OBJ_CHUNKED(type) ((type) + 7)
+#define OBJ_DEKNUHC(type) ((type) - 7)
static inline enum object_type object_type(unsigned int mode)
{
@@ -602,6 +608,7 @@ extern size_t packed_git_limit;
extern size_t delta_base_cache_limit;
extern unsigned long big_file_threshold;
extern unsigned long pack_size_limit_cfg;
+extern unsigned long split_size_limit_cfg;
extern int read_replace_refs;
extern int fsync_object_files;
extern int core_preload_index;
diff --git a/config.c b/config.c
index c736802..bdc3be1 100644
--- a/config.c
+++ b/config.c
@@ -801,6 +801,11 @@ int git_default_config(const char *var, const char *value, void *dummy)
pack_size_limit_cfg = git_config_ulong(var, value);
return 0;
}
+
+ if (!strcmp(var, "pack.splitsizelimit")) {
+ split_size_limit_cfg = git_config_ulong(var, value);
+ return 0;
+ }
/* Add other config variables here and to Documentation/config.txt. */
return 0;
}
diff --git a/environment.c b/environment.c
index 31e4284..b66df28 100644
--- a/environment.c
+++ b/environment.c
@@ -61,6 +61,7 @@ int grafts_replace_parents = 1;
int core_apply_sparse_checkout;
struct startup_info *startup_info;
unsigned long pack_size_limit_cfg;
+unsigned long split_size_limit_cfg;
/* Parallel index stat data preload? */
int core_preload_index = 0;
diff --git a/split-chunk.c b/split-chunk.c
new file mode 100644
index 0000000..1b60e63
--- /dev/null
+++ b/split-chunk.c
@@ -0,0 +1,28 @@
+#include "git-compat-util.h"
+#include "bulk-checkin.h"
+
+/* Cut at around 512kB */
+#define TARGET_CHUNK_SIZE_LOG2 19
+#define TARGET_CHUNK_SIZE (1U << TARGET_CHUNK_SIZE_LOG2)
+
+/*
+ * Carve out around 500kB to be stored as a separate blob
+ */
+size_t carve_chunk(int fd, size_t size)
+{
+ size_t chunk_size;
+ off_t seekback = lseek(fd, 0, SEEK_CUR);
+
+ if (seekback == (off_t) -1)
+ die("cannot find the current offset");
+
+ /* Next patch will do something complex to find out where to cut */
+ chunk_size = size;
+ if (TARGET_CHUNK_SIZE < chunk_size)
+ chunk_size = TARGET_CHUNK_SIZE;
+
+ if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
+ return error("cannot seek back");
+
+ return chunk_size;
+}
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 29d6024..d6cb66d 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -100,4 +100,33 @@ test_expect_success 'packsize limit' '
)
'
+test_expect_success 'split limit' '
+ test_create_repo split &&
+ (
+ cd split &&
+ git config core.bigfilethreshold 2m &&
+ git config pack.splitsizelimit 1m &&
+
+ test-genrandom "a" $(( 4800 * 1024 )) >split &&
+ git add split &&
+
+ # This should result in a new chunked object "tail"
+ # that shares most of the component blobs in its
+ # early part with "split".
+ cat split >tail &&
+ echo cruft >>tail &&
+ git add tail &&
+
+ # This should result in a new chunked object "head"
+ # that begins with its own unique component blobs
+ # but quickly synchronize and start using the same
+ # component blobs with "split" and "tail", once we
+ # switch to a better chunking heuristics.
+ echo cruft >head &&
+ cat split >>head &&
+ git add head
+
+ )
+'
+
test_done
--
1.7.8.rc4.177.g4d64
* [PATCH v3 6/6] chunked-object: fallback checkout codepaths
From: Junio C Hamano @ 2011-12-02 0:40 UTC (permalink / raw)
To: git
This prepares the default codepaths, which are built around the traditional
"slurp everything in-core" read_sha1_file() API, for objects that use the
chunked encoding. Needless to say, these codepaths are unsuitable for the
kind of objects that use the chunked encoding, and are intended only to
serve as a fallback where specialized "large object API" support is still
lacking.
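
In other words, to a caller of the traditional API a chunked blob looks
like any other blob (a conceptual sketch, not code from the patch; "sha1"
names a chunked blob stored in a pack):

    enum object_type type;
    unsigned long size;
    void *buf = read_sha1_file(sha1, &type, &size);

    /* unpack_entry() notices OBJ_CHUNKED_BLOB, reads each component
     * blob and concatenates them; the caller sees type == OBJ_BLOB
     * and the full contents in buf. */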
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
sha1_file.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
t/t1050-large.sh | 14 +++++++++++++-
2 files changed, 67 insertions(+), 1 deletions(-)
diff --git a/sha1_file.c b/sha1_file.c
index 14902cc..7355716 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1607,6 +1607,11 @@ static int packed_object_info(struct packed_git *p, off_t obj_offset,
if (sizep)
*sizep = size;
break;
+ case OBJ_CHUNKED_BLOB:
+ if (sizep)
+ *sizep = size;
+ type = OBJ_DEKNUHC(type);
+ break;
default:
error("unknown object type %i at offset %"PRIuMAX" in %s",
type, (uintmax_t)obj_offset, p->pack_name);
@@ -1648,6 +1653,51 @@ static void *unpack_compressed_entry(struct packed_git *p,
return buffer;
}
+static void *unpack_chunked_entry(struct packed_git *p,
+ struct pack_window **w_curs,
+ off_t curpos,
+ unsigned long size)
+{
+ /*
+ * *NOTE* *NOTE* *NOTE*
+ *
+ * In the longer term, we should aim to exercise this codepath
+ * less and less often, as it defeats the whole purpose of
+ * chunked object encoding!
+ */
+ unsigned char *buffer;
+ const unsigned char *in, *ptr;
+ unsigned long avail, ofs;
+ int chunk_cnt;
+
+ buffer = xmallocz(size);
+ in = use_pack(p, w_curs, curpos, &avail);
+ ptr = in;
+ chunk_cnt = decode_in_pack_varint(&ptr);
+ curpos += ptr - in;
+ ofs = 0;
+ while (chunk_cnt--) {
+ unsigned long csize;
+ unsigned char *data;
+ enum object_type type;
+
+ in = use_pack(p, w_curs, curpos, &avail);
+ data = read_sha1_file(in, &type, &csize);
+ if (!data)
+ die("malformed chunked object contents ('%s' does not exist)",
+ sha1_to_hex(in));
+ if (type != OBJ_BLOB)
+ die("malformed chunked object contents (not a blob)");
+ if (size < ofs + csize)
+ die("malformed chunked object contents (sizes do not add up)");
+ memcpy(buffer + ofs, data, csize);
+ ofs += csize;
+ curpos += 20;
+ free(data);
+ }
+ return buffer;
+}
+
#define MAX_DELTA_CACHE (256)
static size_t delta_base_cached;
@@ -1883,6 +1933,10 @@ void *unpack_entry(struct packed_git *p, off_t obj_offset,
case OBJ_TAG:
data = unpack_compressed_entry(p, &w_curs, curpos, *sizep);
break;
+ case OBJ_CHUNKED_BLOB:
+ data = unpack_chunked_entry(p, &w_curs, curpos, *sizep);
+ *type = OBJ_DEKNUHC(*type);
+ break;
default:
data = NULL;
error("unknown object type %i at offset %"PRIuMAX" in %s",
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index d6cb66d..eea45d1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -124,8 +124,20 @@ test_expect_success 'split limit' '
# switch to a better chunking heuristics.
echo cruft >head &&
cat split >>head &&
- git add head
+ git add head &&
+ echo blob >expect &&
+ git cat-file -t :split >actual &&
+ test_cmp expect actual &&
+
+ git cat-file -p :split >actual &&
+ # You probably do not want to use test_cmp here...
+ cmp split actual &&
+
+ mv split expect &&
+ git checkout split &&
+ # You probably do not want to use test_cmp here...
+ cmp expect split
)
'
--
1.7.8.rc4.177.g4d64