From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Junio C Hamano" <gitster@pobox.com>,
"Nicolas Pitre" <nico@fluxnic.net>,
"Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH] pack-objects: use streaming interface for reading large loose blobs
Date: Sat, 12 May 2012 17:26:15 +0700 [thread overview]
Message-ID: <1336818375-16895-1-git-send-email-pclouds@gmail.com> (raw)
git usually streams large blobs directly to packs. But there are cases
where git can create large loose blobs (unpack-objects, or hash-object
reading from a pipe), or they can come from other git implementations.
core.bigfilethreshold can also be lowered, introducing a new wave of
large loose blobs.
Use the streaming interface to read these blobs and compress/write them
at the same time.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
index-pack's streaming support is on the way. unpack-objects is
another story, because I'm thinking of merging it back into index-pack
first, which may take more than one release cycle.
builtin/pack-objects.c | 73 ++++++++++++++++++++++++++++++++++++++++++++----
t/t1050-large.sh | 16 ++++++++++
2 files changed, 83 insertions(+), 6 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 1861093..98b51c1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -16,6 +16,7 @@
#include "list-objects.h"
#include "progress.h"
#include "refs.h"
+#include "streaming.h"
#include "thread-utils.h"
static const char *pack_usage[] = {
@@ -150,6 +151,55 @@ static unsigned long do_compress(void **pptr, unsigned long size)
return stream.total_out;
}
+static void write_large_blob_data(struct sha1file *f, const unsigned char *sha1)
+{
+ git_zstream stream;
+ unsigned char ibuf[1024 * 16];
+ unsigned char obuf[1024 * 16];
+ int zret;
+
+ struct git_istream *st;
+ enum object_type type;
+ unsigned long sz;
+
+ st = open_istream(sha1, &type, &sz, NULL);
+ if (!st)
+ die(_("failed to read %s"), sha1_to_hex(sha1));
+
+ memset(&stream, 0, sizeof(stream));
+ git_deflate_init(&stream, pack_compression_level);
+
+ if (type != OBJ_BLOB)
+ die("BUG: %s is not a blob", sha1_to_hex(sha1));
+
+ for (;;) {
+ ssize_t readlen;
+ readlen = read_istream(st, ibuf, sizeof(ibuf));
+ if (readlen == -1)
+ die(_("failed to read %s"), sha1_to_hex(sha1));
+
+ stream.next_in = ibuf;
+ stream.avail_in = readlen;
+ zret = Z_OK;
+ while ((stream.avail_in || readlen == 0) &&
+ (zret == Z_OK || zret == Z_BUF_ERROR)) {
+ stream.next_out = obuf;
+ stream.avail_out = sizeof(obuf);
+ zret = git_deflate(&stream, readlen ? 0 : Z_FINISH);
+ sha1write(f, obuf, stream.next_out - obuf);
+ }
+ if (stream.avail_in)
+ die(_("deflate error (%d)"), zret);
+ if (readlen == 0) {
+ if (zret != Z_STREAM_END)
+ die(_("deflate error (%d)"), zret);
+ break;
+ }
+ }
+ close_istream(st);
+ git_deflate_end(&stream);
+}
+
/*
* we are going to reuse the existing object data as is. make
* sure it is not corrupt.
@@ -259,9 +309,14 @@ static unsigned long write_object(struct sha1file *f,
if (!to_reuse) {
no_reuse:
if (!usable_delta) {
- buf = read_sha1_file(entry->idx.sha1, &type, &size);
- if (!buf)
- die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+ type = sha1_object_info(entry->idx.sha1, &size);
+ if (type == OBJ_BLOB && size > big_file_threshold)
+ buf = NULL;
+ else {
+ buf = read_sha1_file(entry->idx.sha1, &type, &size);
+ if (!buf)
+ die("unable to read %s", sha1_to_hex(entry->idx.sha1));
+ }
/*
* make sure no cached delta data remains from a
* previous attempt before a pack split occurred.
@@ -284,8 +339,11 @@ static unsigned long write_object(struct sha1file *f,
if (entry->z_delta_size)
datalen = entry->z_delta_size;
- else
+ else if (buf)
datalen = do_compress(&buf, size);
+ else
+ /* large blob case, just assume we don't compress well */
+ datalen = size;
/*
* The object header is a byte of 'type' followed by zero or
@@ -330,8 +388,11 @@ static unsigned long write_object(struct sha1file *f,
}
sha1write(f, header, hdrlen);
}
- sha1write(f, buf, datalen);
- free(buf);
+ if (buf) {
+ sha1write(f, buf, datalen);
+ free(buf);
+ } else
+ write_large_blob_data(f, entry->idx.sha1);
}
else {
struct packed_git *p = entry->in_pack;
diff --git a/t/t1050-large.sh b/t/t1050-large.sh
index 55ed955..7fbd2e1 100755
--- a/t/t1050-large.sh
+++ b/t/t1050-large.sh
@@ -134,6 +134,22 @@ test_expect_success 'repack' '
git repack -ad
'
+test_expect_success 'pack-objects with large loose object' '
+ echo Z | dd of=large4 bs=1k seek=2000 &&
+ OBJ=9f36d94e145816ec642592c09cc8e601d83af157 &&
+ P=.git/objects/9f/36d94e145816ec642592c09cc8e601d83af157 &&
+ (
+ unset GIT_ALLOC_LIMIT &&
+ cat large4 | git hash-object -w --stdin &&
+ git cat-file blob $OBJ >actual &&
+ cmp large4 actual
+ ) &&
+ echo $OBJ | git pack-objects .git/objects/pack/pack &&
+ rm $P &&
+ git cat-file blob $OBJ >actual &&
+ cmp large4 actual
+'
+
test_expect_success 'tar achiving' '
git archive --format=tar HEAD >/dev/null
'
--
1.7.8.36.g69ee2
next reply other threads:[~2012-05-12 10:30 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-05-12 10:26 Nguyễn Thái Ngọc Duy [this message]
2012-05-12 16:51 ` [PATCH] pack-objects: use streaming interface for reading large loose blobs Nicolas Pitre
2012-05-13 4:37 ` [PATCH v2] " Nguyễn Thái Ngọc Duy
2012-05-14 15:56 ` Junio C Hamano
2012-05-14 19:43 ` Junio C Hamano
2012-05-15 11:18 ` Nguyen Thai Ngoc Duy
2012-05-15 15:27 ` Junio C Hamano
2012-05-16 7:09 ` Nguyen Thai Ngoc Duy
2012-05-16 12:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Nguyễn Thái Ngọc Duy
2012-05-16 12:02 ` [PATCH v2 2/4] pack-objects, streaming: turn "xx >= big_file_threshold" to ".. > .." Nguyễn Thái Ngọc Duy
2012-05-18 21:05 ` Junio C Hamano
2012-05-16 12:02 ` [PATCH v2 3/4] pack-objects: refactor write_object() Nguyễn Thái Ngọc Duy
2012-05-18 21:16 ` Junio C Hamano
2012-05-19 2:43 ` Nicolas Pitre
2012-05-16 12:02 ` [PATCH v2 4/4] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-18 21:02 ` [PATCH v2 1/4] streaming: allow to call close_istream(NULL); Junio C Hamano
-- strict thread matches above, loose matches on Subject: below --
2012-05-26 10:28 [PATCH] pack-objects: use streaming interface for reading large loose blobs Nguyễn Thái Ngọc Duy
2012-05-29 17:56 ` Junio C Hamano
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1336818375-16895-1-git-send-email-pclouds@gmail.com \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=nico@fluxnic.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.