From: Sam Hocevar <sam@zoy.org>
To: git@vger.kernel.org
Subject: [PATCH 2/2] fast-import: treat large blobs (> 100 MiB) specially, by deflating them on-the-fly from stdin instead of keeping an entire copy in memory.
Date: Sun, 8 Mar 2009 19:40:57 +0100 [thread overview]
Message-ID: <20090308184057.GA9606@zoy.org> (raw)
Since deltas need no longer be computed for such files, fast-import is
now twice as fast and memory usage decreases more than threefold when
importing large files.
Signed-off-by: Sam Hocevar <sam@zoy.org>
---
I'd like to hear any suggestions on how to improve this patch. I am
not sure all the decisions I made by trying not to refactor too much
of the code were appropriate. If at least the idea is welcome, I will
also write a proper patch to make the data size threshold a config
option.
Here is a graph of memory usage against time for the current version
of fast-import and a patched version, when importing four 100 MiB files
filled with random data: http://zoy.org/~sam/git/git-faster-import.png
fast-import.c | 155 +++++++++++++++++++++++++++++++++++++++++----------------
1 files changed, 112 insertions(+), 43 deletions(-)
diff --git a/fast-import.c b/fast-import.c
index 6419d00..bdd40e7 100644
--- a/fast-import.c
+++ b/fast-import.c
@@ -1044,12 +1044,14 @@ static int store_object(
struct strbuf *dat,
struct last_object *last,
unsigned char *sha1,
- uintmax_t mark)
+ uintmax_t mark,
+ int orig_bytes)
{
void *out, *delta;
struct object_entry *e;
unsigned char hdr[96];
unsigned long hdrlen, deltalen;
+ size_t outbytes;
z_stream s;
e = insert_object(sha1);
@@ -1066,7 +1068,9 @@ static int store_object(
return 1;
}
- if (last && last->data.buf && last->depth < max_depth) {
+ /* If orig_bytes is set, the object is already deflated and the
+ * caller does not want us to computes delta. */
+ if (!orig_bytes && last && last->data.buf && last->depth < max_depth) {
delta = diff_delta(last->data.buf, last->data.len,
dat->buf, dat->len,
&deltalen, 0);
@@ -1077,24 +1081,30 @@ static int store_object(
} else
delta = NULL;
- memset(&s, 0, sizeof(s));
- deflateInit(&s, pack_compression_level);
- if (delta) {
- s.next_in = delta;
- s.avail_in = deltalen;
+ if (!orig_bytes) {
+ memset(&s, 0, sizeof(s));
+ deflateInit(&s, pack_compression_level);
+ if (delta) {
+ s.next_in = delta;
+ s.avail_in = deltalen;
+ } else {
+ s.next_in = (void *)dat->buf;
+ s.avail_in = dat->len;
+ }
+ s.avail_out = deflateBound(&s, s.avail_in);
+ s.next_out = out = xmalloc(s.avail_out);
+ while (deflate(&s, Z_FINISH) == Z_OK)
+ /* nothing */;
+ deflateEnd(&s);
+ outbytes = s.total_out;
} else {
- s.next_in = (void *)dat->buf;
- s.avail_in = dat->len;
+ out = dat->buf;
+ outbytes = dat->len;
}
- s.avail_out = deflateBound(&s, s.avail_in);
- s.next_out = out = xmalloc(s.avail_out);
- while (deflate(&s, Z_FINISH) == Z_OK)
- /* nothing */;
- deflateEnd(&s);
/* Determine if we should auto-checkpoint. */
- if ((pack_size + 60 + s.total_out) > max_packsize
- || (pack_size + 60 + s.total_out) < pack_size) {
+ if ((pack_size + 60 + outbytes) > max_packsize
+ || (pack_size + 60 + outbytes) < pack_size) {
/* This new object needs to *not* have the current pack_id. */
e->pack_id = pack_id + 1;
@@ -1141,24 +1151,27 @@ static int store_object(
pack_size += sizeof(hdr) - pos;
} else {
e->depth = 0;
- hdrlen = encode_header(type, dat->len, hdr);
+ hdrlen = encode_header(type, orig_bytes ? orig_bytes
+ : dat->len, hdr);
write_or_die(pack_data->pack_fd, hdr, hdrlen);
pack_size += hdrlen;
}
- write_or_die(pack_data->pack_fd, out, s.total_out);
- pack_size += s.total_out;
+ write_or_die(pack_data->pack_fd, out, outbytes);
+ pack_size += outbytes;
- free(out);
free(delta);
- if (last) {
- if (last->no_swap) {
- last->data = *dat;
- } else {
- strbuf_swap(&last->data, dat);
+ if (!orig_bytes) {
+ free(out);
+ if (last) {
+ if (last->no_swap) {
+ last->data = *dat;
+ } else {
+ strbuf_swap(&last->data, dat);
+ }
+ last->offset = e->offset;
+ last->depth = e->depth;
}
- last->offset = e->offset;
- last->depth = e->depth;
}
return 0;
}
@@ -1343,7 +1356,7 @@ static void store_tree(struct tree_entry *root)
mktree(t, 1, &new_tree);
sha1_object(OBJ_TREE, &new_tree, root->versions[1].sha1);
- store_object(OBJ_TREE, &new_tree, &lo, root->versions[1].sha1, 0);
+ store_object(OBJ_TREE, &new_tree, &lo, root->versions[1].sha1, 0, 0);
t->delta_depth = lo.depth;
for (i = 0, j = 0, del = 0; i < t->entry_count; i++) {
@@ -1711,11 +1724,15 @@ static void parse_mark(void)
/* This actually parses a "data" command, with the addition that if sha1out
* is not NULL, it will also compute the sha1 on the fly. */
-static void parse_object_data(
+static int parse_object_data(
enum object_type type,
struct strbuf *sb,
- unsigned char *sha1out)
+ unsigned char *sha1out,
+ int candeflate)
{
+ int orig_bytes = 0;
+ size_t n = 0, length;
+
strbuf_reset(sb);
if (prefixcmp(command_buf.buf, "data "))
@@ -1737,14 +1754,63 @@ static void parse_object_data(
}
free(term);
- if(sha1out)
+ if (sha1out)
sha1_object(type, sb, sha1out);
}
- else {
- size_t n = 0, length;
+ /* TODO: make the hardcoded value a configuration option */
+ else if ((length = strtoul(command_buf.buf + 5, NULL, 10))
+ > 100 * 1024 * 1024
+ && candeflate) {
+ /* The incoming file is really big. As it is pretty unlikely
+ * it will give any interesting deltas, we immediately deflate
+ * it instead of storing the original data in memory. */
+ static struct strbuf tmp = STRBUF_INIT;
+ git_SHA_CTX c;
+ z_stream zs;
+
+ if (sha1out) {
+ unsigned char hdr[96];
+ unsigned long hdrlen;
+ hdrlen = sprintf((char*)hdr,"%s %lu", typename(type),
+ (unsigned long)length) + 1;
+ git_SHA1_Init(&c);
+ git_SHA1_Update(&c, hdr, hdrlen);
+ }
+
+ memset(&zs, 0, sizeof(zs));
+ deflateInit(&zs, pack_compression_level);
+ /* TODO: ideally, this should grow dynamically while we
+ * deflate the file. */
+ zs.avail_out = deflateBound(&zs, length);
+ strbuf_grow(sb, zs.avail_out);
+ zs.next_out = (unsigned char *)sb->buf;
- length = strtoul(command_buf.buf + 5, NULL, 10);
+ while (n < length) {
+ size_t s = strbuf_fread(&tmp, length - n < 4096 ?
+ length - n : 4096, stdin);
+ if (!s && feof(stdin))
+ die("EOF in data (%lu bytes remaining)",
+ (unsigned long)(length - n));
+ if (sha1out)
+ git_SHA1_Update(&c, tmp.buf, s);
+ zs.next_in = (unsigned char *)tmp.buf;
+ zs.avail_in = s;
+ while (deflate(&zs, Z_NO_FLUSH) == Z_OK)
+ /* nothing */;
+ strbuf_reset(&tmp);
+ n += s;
+ }
+ deflate(&zs, Z_FINISH);
+ deflateEnd(&zs);
+ strbuf_setlen(sb, zs.total_out);
+
+ if (sha1out)
+ git_SHA1_Final(sha1out, &c);
+
+ orig_bytes = length;
+ }
+ else {
while (n < length) {
size_t s = strbuf_fread(sb, length - n, stdin);
if (!s && feof(stdin))
@@ -1753,16 +1819,17 @@ static void parse_object_data(
n += s;
}
- if(sha1out)
+ if (sha1out)
sha1_object(type, sb, sha1out);
}
skip_optional_lf();
+ return orig_bytes;
}
static void parse_data(struct strbuf *sb)
{
- parse_object_data(OBJ_NONE, sb, NULL);
+ parse_object_data(OBJ_NONE, sb, NULL, 0);
}
static int validate_raw_date(const char *src, char *result, int maxlen)
@@ -1828,13 +1895,14 @@ static char *parse_ident(const char *buf)
static void parse_new_blob(void)
{
- unsigned char sha1[20];
static struct strbuf buf = STRBUF_INIT;
+ unsigned char sha1[20];
+ int orig_bytes;
read_next_command();
parse_mark();
- parse_object_data(OBJ_BLOB, &buf, sha1);
- store_object(OBJ_BLOB, &buf, &last_blob, sha1, next_mark);
+ orig_bytes = parse_object_data(OBJ_BLOB, &buf, sha1, 1);
+ store_object(OBJ_BLOB, &buf, &last_blob, sha1, next_mark, orig_bytes);
}
static void unload_one_branch(void)
@@ -1946,14 +2014,15 @@ static void file_change_m(struct branch *b)
*/
} else if (inline_data) {
static struct strbuf buf = STRBUF_INIT;
+ int orig_bytes;
if (p != uq.buf) {
strbuf_addstr(&uq, p);
p = uq.buf;
}
read_next_command();
- parse_object_data(OBJ_BLOB, &buf, sha1);
- store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0);
+ orig_bytes = parse_object_data(OBJ_BLOB, &buf, sha1, 1);
+ store_object(OBJ_BLOB, &buf, &last_blob, sha1, 0, orig_bytes);
} else if (oe) {
if (oe->type != OBJ_BLOB)
die("Not a blob (actually a %s): %s",
@@ -2236,7 +2305,7 @@ static void parse_new_commit(void)
free(committer);
sha1_object(OBJ_COMMIT, &new_data, b->sha1);
- if (!store_object(OBJ_COMMIT, &new_data, NULL, b->sha1, next_mark))
+ if (!store_object(OBJ_COMMIT, &new_data, NULL, b->sha1, next_mark, 0))
b->pack_id = pack_id;
b->last_commit = object_count_by_type[OBJ_COMMIT];
}
@@ -2317,7 +2386,7 @@ static void parse_new_tag(void)
free(tagger);
sha1_object(OBJ_TAG, &new_data, t->sha1);
- if (store_object(OBJ_TAG, &new_data, NULL, t->sha1, 0))
+ if (store_object(OBJ_TAG, &new_data, NULL, t->sha1, 0, 0))
t->pack_id = MAX_PACK_ID;
else
t->pack_id = pack_id;
--
1.6.2
next reply other threads:[~2009-03-08 18:48 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-03-08 18:40 Sam Hocevar [this message]
2009-03-08 18:57 ` [PATCH 2/2] fast-import: treat large blobs (> 100 MiB) specially, by deflating them on-the-fly from stdin instead of keeping an entire copy in memory Sam Hocevar
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090308184057.GA9606@zoy.org \
--to=sam@zoy.org \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).