* [PATCH] {fetch,receive}-pack: drop unpack-objects, delay loosing objects until the end
From: Nguyễn Thái Ngọc Duy @ 2013-09-02 3:05 UTC
To: git; +Cc: Nguyễn Thái Ngọc Duy
Current code peaks into the transfered pack's header, if the number of
objects is under a limit, unpack-objects is called to handle the rest,
otherwise index-pack is. This patch makes fetch-pack use index-pack
unconditionally, then turn objects loose and remove the pack at the
end. unpack-objects is deprecated and may be removed in future.
There are a few reasons for this:
- down to two code paths to maintain regarding pack reading
(sha1_file.c and index-pack.c). When .pack v4 comes, we don't need
to duplicate work in index-pack and unpack-objects.
- the number of objects should not be the only indicator for
unpacking. If there are a few large objects in the pack, the pack
should be kept anyway. Keeping the pack lets us examine the whole
pack later and make a better decision.
- by going through index-pack first, then unpack, we pay extra cost
for completing a thin pack into a full one. But compared to fetch's
total time, it should not be noticeable because unpack-objects is
only called when the pack contains a small number of objects.
- unpack-objects does not support streaming large blobs. Granted
force_object_loose(), which is used by this patch, does not either.
But it'll be easier to do so because sha1_file.c interface supports
streaming.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
builtin/receive-pack.c | 120 +++++++++++++------------------------------------
cache.h | 1 +
fetch-pack.c | 77 +++++++++++--------------------
sha1_file.c | 70 ++++++++++++++++++++++++++++-
4 files changed, 128 insertions(+), 140 deletions(-)
diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
index e3eb5fc..6eb4ffb 100644
--- a/builtin/receive-pack.c
+++ b/builtin/receive-pack.c
@@ -792,105 +792,49 @@ static struct command *read_head_info(void)
return commands;
}
-static const char *parse_pack_header(struct pack_header *hdr)
-{
- switch (read_pack_header(0, hdr)) {
- case PH_ERROR_EOF:
- return "eof before pack header was fully read";
-
- case PH_ERROR_PACK_SIGNATURE:
- return "protocol error (pack signature mismatch detected)";
-
- case PH_ERROR_PROTOCOL:
- return "protocol error (pack version unsupported)";
-
- default:
- return "unknown error in parse_pack_header";
-
- case 0:
- return NULL;
- }
-}
-
static const char *pack_lockfile;
static const char *unpack(int err_fd)
{
- struct pack_header hdr;
- const char *hdr_err;
- char hdr_arg[38];
int fsck_objects = (receive_fsck_objects >= 0
? receive_fsck_objects
: transfer_fsck_objects >= 0
? transfer_fsck_objects
: 0);
- hdr_err = parse_pack_header(&hdr);
- if (hdr_err) {
- if (err_fd > 0)
- close(err_fd);
- return hdr_err;
- }
- snprintf(hdr_arg, sizeof(hdr_arg),
- "--pack_header=%"PRIu32",%"PRIu32,
- ntohl(hdr.hdr_version), ntohl(hdr.hdr_entries));
-
- if (ntohl(hdr.hdr_entries) < unpack_limit) {
- int code, i = 0;
- struct child_process child;
- const char *unpacker[5];
- unpacker[i++] = "unpack-objects";
- if (quiet)
- unpacker[i++] = "-q";
- if (fsck_objects)
- unpacker[i++] = "--strict";
- unpacker[i++] = hdr_arg;
- unpacker[i++] = NULL;
- memset(&child, 0, sizeof(child));
- child.argv = unpacker;
- child.no_stdout = 1;
- child.err = err_fd;
- child.git_cmd = 1;
- code = run_command(&child);
- if (!code)
- return NULL;
- return "unpack-objects abnormal exit";
- } else {
- const char *keeper[7];
- int s, status, i = 0;
- char keep_arg[256];
- struct child_process ip;
-
- s = sprintf(keep_arg, "--keep=receive-pack %"PRIuMAX" on ", (uintmax_t) getpid());
- if (gethostname(keep_arg + s, sizeof(keep_arg) - s))
- strcpy(keep_arg + s, "localhost");
-
- keeper[i++] = "index-pack";
- keeper[i++] = "--stdin";
- if (fsck_objects)
- keeper[i++] = "--strict";
- keeper[i++] = "--fix-thin";
- keeper[i++] = hdr_arg;
- keeper[i++] = keep_arg;
- keeper[i++] = NULL;
- memset(&ip, 0, sizeof(ip));
- ip.argv = keeper;
- ip.out = -1;
- ip.err = err_fd;
- ip.git_cmd = 1;
- status = start_command(&ip);
- if (status) {
- return "index-pack fork failed";
- }
- pack_lockfile = index_pack_lockfile(ip.out);
- close(ip.out);
- status = finish_command(&ip);
- if (!status) {
- reprepare_packed_git();
- return NULL;
- }
+ const char *keeper[7];
+ int s, status, i = 0;
+ char keep_arg[256];
+ struct child_process ip;
+
+ s = sprintf(keep_arg, "--keep=receive-pack %"PRIuMAX" on ", (uintmax_t) getpid());
+ if (gethostname(keep_arg + s, sizeof(keep_arg) - s))
+ strcpy(keep_arg + s, "localhost");
+
+ keeper[i++] = "index-pack";
+ keeper[i++] = "--stdin";
+ if (fsck_objects)
+ keeper[i++] = "--strict";
+ keeper[i++] = "--fix-thin";
+ keeper[i++] = keep_arg;
+ keeper[i++] = NULL;
+ memset(&ip, 0, sizeof(ip));
+ ip.argv = keeper;
+ ip.out = -1;
+ ip.err = err_fd;
+ ip.git_cmd = 1;
+ status = start_command(&ip);
+ if (status) {
+ return "index-pack fork failed";
+ }
+ pack_lockfile = index_pack_lockfile(ip.out);
+ close(ip.out);
+ status = finish_command(&ip);
+ if (status)
return "index-pack abnormal exit";
- }
+ if (!maybe_unpack_objects(pack_lockfile, unpack_limit))
+ pack_lockfile = NULL;
+ return NULL;
}
static const char *unpack_with_sideband(void)
diff --git a/cache.h b/cache.h
index 85b544f..0fff958 100644
--- a/cache.h
+++ b/cache.h
@@ -789,6 +789,7 @@ extern int hash_sha1_file(const void *buf, unsigned long len, const char *type,
extern int write_sha1_file(const void *buf, unsigned long len, const char *type, unsigned char *return_sha1);
extern int pretend_sha1_file(void *, unsigned long, enum object_type, unsigned char *);
extern int force_object_loose(const unsigned char *sha1, time_t mtime);
+extern int maybe_unpack_objects(const char *pack_lockfile, int limit);
extern void *map_sha1_file(const unsigned char *sha1, unsigned long *size);
extern int unpack_sha1_header(git_zstream *stream, unsigned char *map, unsigned long mapsize, void *buffer, unsigned long bufsiz);
extern int parse_sha1_header(const char *hdr, unsigned long *sizep);
diff --git a/fetch-pack.c b/fetch-pack.c
index f5d99c1..9c81a15 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -687,9 +687,7 @@ static int get_pack(struct fetch_pack_args *args,
struct async demux;
const char *argv[22];
char keep_arg[256];
- char hdr_arg[256];
const char **av;
- int do_keep = args->keep_pack;
struct child_process cmd;
int ret;
@@ -711,54 +709,29 @@ static int get_pack(struct fetch_pack_args *args,
memset(&cmd, 0, sizeof(cmd));
cmd.argv = argv;
+ cmd.out = -1;
av = argv;
- *hdr_arg = 0;
- if (!args->keep_pack && unpack_limit) {
- struct pack_header header;
-
- if (read_pack_header(demux.out, &header))
- die("protocol error: bad pack header");
- snprintf(hdr_arg, sizeof(hdr_arg),
- "--pack_header=%"PRIu32",%"PRIu32,
- ntohl(header.hdr_version), ntohl(header.hdr_entries));
- if (ntohl(header.hdr_entries) < unpack_limit)
- do_keep = 0;
- else
- do_keep = 1;
- }
if (alternate_shallow_file) {
*av++ = "--shallow-file";
*av++ = alternate_shallow_file;
}
- if (do_keep) {
- if (pack_lockfile)
- cmd.out = -1;
- *av++ = "index-pack";
- *av++ = "--stdin";
- if (!args->quiet && !args->no_progress)
- *av++ = "-v";
- if (args->use_thin_pack)
- *av++ = "--fix-thin";
- if (args->lock_pack || unpack_limit) {
- int s = sprintf(keep_arg,
- "--keep=fetch-pack %"PRIuMAX " on ", (uintmax_t) getpid());
- if (gethostname(keep_arg + s, sizeof(keep_arg) - s))
- strcpy(keep_arg + s, "localhost");
- *av++ = keep_arg;
- }
- if (args->check_self_contained_and_connected)
- *av++ = "--check-self-contained-and-connected";
- }
- else {
- *av++ = "unpack-objects";
- if (args->quiet || args->no_progress)
- *av++ = "-q";
- args->check_self_contained_and_connected = 0;
- }
- if (*hdr_arg)
- *av++ = hdr_arg;
+ *av++ = "index-pack";
+ *av++ = "--stdin";
+ if (!args->quiet && !args->no_progress)
+ *av++ = "-v";
+ if (args->use_thin_pack)
+ *av++ = "--fix-thin";
+ if (args->lock_pack || unpack_limit) {
+ int s = sprintf(keep_arg,
+ "--keep=fetch-pack %"PRIuMAX " on ", (uintmax_t) getpid());
+ if (gethostname(keep_arg + s, sizeof(keep_arg) - s))
+ strcpy(keep_arg + s, "localhost");
+ *av++ = keep_arg;
+ }
+ if (args->check_self_contained_and_connected)
+ *av++ = "--check-self-contained-and-connected";
if (fetch_fsck_objects >= 0
? fetch_fsck_objects
: transfer_fsck_objects >= 0
@@ -771,10 +744,8 @@ static int get_pack(struct fetch_pack_args *args,
cmd.git_cmd = 1;
if (start_command(&cmd))
die("fetch-pack: unable to fork off %s", argv[0]);
- if (do_keep && pack_lockfile) {
- *pack_lockfile = index_pack_lockfile(cmd.out);
- close(cmd.out);
- }
+ *pack_lockfile = index_pack_lockfile(cmd.out);
+ close(cmd.out);
ret = finish_command(&cmd);
if (!ret || (args->check_self_contained_and_connected && ret == 1))
@@ -820,11 +791,12 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args,
int fd[2],
const struct ref *orig_ref,
struct ref **sought, int nr_sought,
- char **pack_lockfile)
+ char **pack_lockfile_p)
{
struct ref *ref = copy_ref_list(orig_ref);
unsigned char sha1[20];
const char *agent_feature;
+ char *pack_lockfile = NULL;
int agent_len;
sort_ref_list(&ref, ref_compare_name);
@@ -899,9 +871,15 @@ static struct ref *do_fetch_pack(struct fetch_pack_args *args,
setup_alternate_shallow();
else
alternate_shallow_file = NULL;
- if (get_pack(args, fd, pack_lockfile))
+ if (get_pack(args, fd, &pack_lockfile))
die("git fetch-pack: fetch failed.");
+ if (!maybe_unpack_objects(pack_lockfile,
+ args->keep_pack ? 0 : unpack_limit))
+ pack_lockfile = NULL;
+ if (pack_lockfile_p)
+ *pack_lockfile_p = pack_lockfile;
+
all_done:
return ref;
}
@@ -997,6 +975,5 @@ struct ref *fetch_pack(struct fetch_pack_args *args,
commit_lock_file(&shallow_lock);
}
- reprepare_packed_git();
return ref_cpy;
}
diff --git a/sha1_file.c b/sha1_file.c
index 8e27db1..a0cdeae 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -61,6 +61,8 @@ static struct cached_object empty_tree = {
};
static struct packed_git *last_found_pack;
+/* temporarily skip one pack in find_pack_entry() */
+static struct packed_git *skip_this_pack;
static struct cached_object *find_cached_object(const unsigned char *sha1)
{
@@ -2376,11 +2378,15 @@ static int find_pack_entry(const unsigned char *sha1, struct pack_entry *e)
if (!packed_git)
return 0;
- if (last_found_pack && fill_pack_entry(sha1, e, last_found_pack))
+ if (last_found_pack &&
+ last_found_pack != skip_this_pack &&
+ fill_pack_entry(sha1, e, last_found_pack))
return 1;
for (p = packed_git; p; p = p->next) {
- if (p == last_found_pack || !fill_pack_entry(sha1, e, p))
+ if (p == last_found_pack ||
+ p == skip_this_pack ||
+ !fill_pack_entry(sha1, e, p))
continue;
last_found_pack = p;
@@ -3133,3 +3139,63 @@ void assert_sha1_type(const unsigned char *sha1, enum object_type expect)
die("%s is not a valid '%s' object", sha1_to_hex(sha1),
typename(expect));
}
+
+static int has_sha1_except_in(struct packed_git *p,
+ const unsigned char *sha1)
+{
+ int has;
+ skip_this_pack = p;
+ has = has_sha1_file(sha1);
+ skip_this_pack = NULL;
+ return has;
+}
+
+/*
+ * Unpack objects if the number of objects in the pack is lower than
+ * the specified limit. Otherwise make sure the new pack is registered
+ * (including when pack_lockfile is NULL). Return 0 if the pack is
+ * removed.
+ */
+int maybe_unpack_objects(const char *pack_lockfile, int limit)
+{
+ const char *ext[] = { ".pack", ".idx", ".keep", NULL };
+ struct packed_git *p;
+ char *path;
+ int len, ret = 0;
+ uint32_t i;
+
+ reprepare_packed_git();
+ if (!pack_lockfile)
+ return -1;
+
+ /* must be enough for any extensions in ext[] */
+ path = xstrdup(pack_lockfile);
+ len = strlen(pack_lockfile) - strlen(".keep");
+ strcpy(path + len, ".pack");
+ for (p = packed_git; p; p = p->next)
+ if (!strcmp(p->pack_name, path))
+ break;
+ if (!p || open_pack_index(p))
+ die("unable to find pack %s", path);
+
+ if (p->num_objects >= limit) {
+ free(path);
+ return -1;
+ }
+
+ for (i = 0; i < p->num_objects; i++) {
+ const unsigned char *sha1 = nth_packed_object_sha1(p, i);
+ if (!has_sha1_except_in(p, sha1)) /* skip --fix-thin objects */
+ ret |= force_object_loose(sha1, p->mtime);
+ }
+
+ if (!ret) {
+ free_pack_by_name(p->pack_name);
+ for (i = 0; ext[i]; i++) {
+ strcpy(path + len, ext[i]);
+ unlink_or_warn(path);
+ }
+ }
+ free(path);
+ return ret;
+}
--
1.8.2.83.gc99314b
* Re: [PATCH] {fetch,receive}-pack: drop unpack-objects, delay loosing objects until the end
From: Eric Sunshine @ 2013-09-02 4:38 UTC
To: Nguyễn Thái Ngọc Duy; +Cc: Git List
On Sun, Sep 1, 2013 at 11:05 PM, Nguyễn Thái Ngọc Duy <pclouds@gmail.com> wrote:
> Current code peaks into the transfered pack's header, if the number of
s/peaks/peeks/
> objects is under a limit, unpack-objects is called to handle the rest,
> otherwise index-pack is. This patch makes fetch-pack use index-pack
> unconditionally, then turn objects loose and remove the pack at the
> end. unpack-objects is deprecated and may be removed in future.
>
> There are a few reasons for this:
>
> - down to two code paths to maintain regarding pack reading
> (sha1_file.c and index-pack.c). When .pack v4 comes, we don't need
> to duplicate work in index-pack and unpack-objects.
>
> - the number of objects should not be the only indicator for
> unpacking. If there are a few large objects in the pack, the pack
> should be kept anyway. Keeping the pack lets us examine the whole
> pack later and make a better decision.
>
> - by going through index-pack first, then unpack, we pay extra cost
> for completing a thin pack into a full one. But compared to fetch's
> total time, it should not be noticeable because unpack-objects is
> only called when the pack contains a small number of objects.
>
> - unpack-objects does not support streaming large blobs. Granted
> force_object_loose(), which is used by this patch, does not either.
> But it'll be easier to do so because sha1_file.c interface supports
> streaming.
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
* Re: [PATCH] {fetch,receive}-pack: drop unpack-objects, delay loosing objects until the end
From: Jeff King @ 2013-09-03 6:49 UTC
To: Nguyễn Thái Ngọc Duy; +Cc: git
On Mon, Sep 02, 2013 at 10:05:07AM +0700, Nguyen Thai Ngoc Duy wrote:
> Current code peaks into the transfered pack's header, if the number of
> objects is under a limit, unpack-objects is called to handle the rest,
> otherwise index-pack is. This patch makes fetch-pack use index-pack
> unconditionally, then turn objects loose and remove the pack at the
> end. unpack-objects is deprecated and may be removed in future.
I do like consolidating the object-receiving code paths, but there is a
downside to this strategy: we increase the I/O in cases where we end up
unpacking, as we spool the tmpfile to disk, and then force objects loose
(whereas with the current code, unpack-objects reads straight from the
network into loose objects). I think that is what you're saying here:
> - by going through index-pack first, then unpack, we pay extra cost
> for completing a thin pack into a full one. But compared to fetch's
> total time, it should not be noticeable because unpack-objects is
> only called when the pack contains a small number of objects.
...but the cost is paid by total pack size, not number of objects. So if
I am pushing up a commit with a large uncompressible blob, I've
effectively doubled my disk I/O. It would make more sense to me for
index-pack to learn command-line options specifying the limits, and then
to operate on the pack as it streams in. E.g., to decide after seeing
the header to unpack rather than index, or to drop large blobs from the
pack (and put them in their own pack directly) as we are streaming into
it (we do not know the blob size ahead of time, but we can make a good
guess if it has a large on-disk size in the pack).
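Roughly, I am imagining something like this in index-pack (option name
and plumbing hypothetical; untested sketch):

	/* in cmd_index_pack()'s option loop */
	} else if (!prefixcmp(arg, "--unpack-limit=")) {
		unpack_limit = strtoul(arg + 15, NULL, 0);
	}

	/* and once the header of the incoming pack has been parsed */
	parse_pack_header();	/* sets nr_objects */
	if (nr_objects < unpack_limit)
		unpack = 1;	/* explode to loose objects while streaming */

i.e., the decision moves into index-pack itself instead of the caller
peeking at the stream.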
-Peff
* Re: [PATCH] {fetch,receive}-pack: drop unpack-objects, delay loosing objects until the end
From: Duy Nguyen @ 2013-09-03 11:56 UTC
To: Jeff King; +Cc: Git Mailing List
On Tue, Sep 3, 2013 at 1:49 PM, Jeff King <peff@peff.net> wrote:
>> - by going through index-pack first, then unpack, we pay extra cost
>> for completing a thin pack into a full one. But compared to fetch's
>> total time, it should not be noticeable because unpack-objects is
>> only called when the pack contains a small number of objects.
>
> ...but the cost is paid by total pack size, not number of objects. So if
> I am pushing up a commit with a large uncompressible blob, I've
> effectively doubled my disk I/O. It would make more sense to me for
> index-pack to learn command-line options specifying the limits, and then
> to operate on the pack as it streams in. E.g., to decide after seeing
> the header to unpack rather than index, or to drop large blobs from the
> pack (and put them in their own pack directly) as we are streaming into
> it (we do not know the blob size ahead of time, but we can make a good
> guess if it has a large on-disk size in the pack).
Yeah letting index-pack do the work was my backup plan :) I think if
there is a big blob in the pack, then the pack should not be unpacked
at all. If you store big blobs in a separate pack you already pay the
the lookup cost of one more pack in find_pack_entry(), why go through
the process of unpacking? index-pack still has the advantage of
streaming though. Will rework.
--
Duy
* Re: [PATCH] {fetch,receive}-pack: drop unpack-objects, delay loosing objects until the end
From: Jeff King @ 2013-09-03 17:25 UTC
To: Duy Nguyen; +Cc: Git Mailing List
On Tue, Sep 03, 2013 at 06:56:23PM +0700, Nguyen Thai Ngoc Duy wrote:
> > ...but the cost is paid by total pack size, not number of objects. So if
> > I am pushing up a commit with a large uncompressible blob, I've
> > effectively doubled my disk I/O. It would make more sense to me for
> > index-pack to learn command-line options specifying the limits, and then
> > to operate on the pack as it streams in. E.g., to decide after seeing
> > the header to unpack rather than index, or to drop large blobs from the
> > pack (and put them in their own pack directly) as we are streaming into
> > it (we do not know the blob size ahead of time, but we can make a good
> > guess if it has a large on-disk size in the pack).
>
> Yeah letting index-pack do the work was my backup plan :) I think if
> there is a big blob in the pack, then the pack should not be unpacked
> at all. If you store big blobs in a separate pack you already pay the
> lookup cost of one more pack in find_pack_entry(), so why go through
> the process of unpacking? index-pack still has the advantage of
> streaming though. Will rework.
In general, our large-blob strategy is to push them out to their own
pack so that we do not incur the I/O overhead of rewriting them whenever
we repack. But the flipside is that we have to pay the cost of an extra
.idx open and lookup for each such object. In the longer term, I think
it might make sense to be able to generate a multi-pack .idx for such a
case (or even to simply store the large blobs in a special area indexed
by the object sha1, as we do for loose objects).
But that is all orthogonal to your patch. I think as long as we are
moving towards "index-pack makes the decisions while it processes the
pack" we are going in a good direction. Even if we do not implement all
of the decisions immediately, it leaves room for doing so later without
loss of efficiency.
-Peff
* [PATCH v2 1/3] index-pack: add --unpack-limit to unpack objects
From: Nguyễn Thái Ngọc Duy @ 2013-09-06 0:46 UTC
To: git; +Cc: Jeff King, Nguyễn Thái Ngọc Duy
If the number of objects in the given pack is less than the limit, all
objects in the pack will be unpacked, and the pack will not be created
if it's streamed in. It's intended to replace unpack-objects.
Unlike unpack-objects, this operation requires writing the stream to
disk for delta resolution. Base objects are not appended to the
temporary pack though. Given that the unpack limit is usually small, the
I/O overhead should be unnoticeable.
When large blobs are present in the stream, things will be
different. Right now, those blobs are uncompressed in memory for
simplicity. But in the future bulk-checkin should be used for streaming
large blobs to a new pack (and they should not be written to the temp
pack, to reduce I/O overhead).
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
I had something that could unpack without writing to a temp pack file,
but I scrapped it and chose this way because it follows closely how
index-pack works. It's a good thing imo because .pack v4 is coming
and I don't know how v4 may impact this unpack code path. Once things
are settled, we can revisit and open a separate code path if it's
still a good idea.
builtin/index-pack.c | 136 +++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 122 insertions(+), 14 deletions(-)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 9c1cfac..a5b69e4 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -81,6 +81,7 @@ static int do_fsck_object;
static int verbose;
static int show_stat;
static int check_self_contained_and_connected;
+static int unpack;
static struct progress *progress;
@@ -93,6 +94,21 @@ static git_SHA_CTX input_ctx;
static uint32_t input_crc32;
static int input_fd, output_fd, pack_fd;
+/*
+ * When unpacking under --strict mode, objects whose reachability is
+ * suspect are kept in core without getting written to the object
+ * store.
+ */
+struct obj_buffer {
+ char *buffer;
+ unsigned long size;
+ enum object_type type;
+ unsigned char sha1[20];
+};
+
+struct obj_buffer *obj_buffers;
+int nr_object_buffers;
+
#ifndef NO_PTHREADS
static struct thread_local *thread_data;
@@ -209,6 +225,23 @@ static unsigned check_object(struct object *obj)
return 0;
}
+static void write_cached_objects(void)
+{
+ unsigned char parano_sha1[20];
+ int i;
+ for (i = 0; i < nr_object_buffers; i++) {
+ struct obj_buffer *obj_buf = obj_buffers + i;
+ if (write_sha1_file(obj_buf->buffer, obj_buf->size,
+ typename(obj_buf->type), parano_sha1) < 0)
+ die(_("failed to write object %s"),
+ sha1_to_hex(obj_buf->sha1));
+ if (hashcmp(obj_buf->sha1, parano_sha1))
+ die(_("confused by unstable object source data for %s"),
+ sha1_to_hex(obj_buf->sha1));
+ free(obj_buf->buffer);
+ }
+}
+
static unsigned check_objects(void)
{
unsigned i, max, foreign_nr = 0;
@@ -424,7 +457,18 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
git_SHA1_Update(&c, hdr, hdrlen);
} else
sha1 = NULL;
- if (type == OBJ_BLOB && size > big_file_threshold)
+ if (type == OBJ_BLOB && size > big_file_threshold &&
+ /*
+ * In future we should use bulk-checkin instead of forcing
+ * large blobs in core like this.
+ *
+ * If we do so, in --stdin mode, we might also want to
+ * update flush() to do seek() instead of write() on large
+ * blobs to reduce I/O and disk consumption (because we
+ * need the temp pack with other objects at correct
+ * position for delta resolution)
+ */
+ !unpack)
buf = fixed_buf;
else
buf = xmalloc(size);
@@ -700,17 +744,18 @@ static int check_collison(struct object_entry *entry)
return 0;
}
-static void sha1_object(const void *data, struct object_entry *obj_entry,
- unsigned long size, enum object_type type,
- const unsigned char *sha1)
+static void *sha1_object(void *data, struct object_entry *obj_entry,
+ unsigned long size, enum object_type type,
+ const unsigned char *sha1)
{
void *new_data = NULL;
int collision_test_needed;
+ int queued = 0;
assert(data || obj_entry);
read_lock();
- collision_test_needed = has_sha1_file(sha1);
+ collision_test_needed = unpack ? 0 : has_sha1_file(sha1);
read_unlock();
if (collision_test_needed && !data) {
@@ -776,11 +821,36 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
commit->buffer = NULL;
}
obj->flags |= FLAG_CHECKED;
+ if (unpack) {
+ obj_buffers[nr_object_buffers].buffer = data;
+ obj_buffers[nr_object_buffers].size = size;
+ obj_buffers[nr_object_buffers].type = type;
+ hashcpy(obj_buffers[nr_object_buffers].sha1, sha1);
+ nr_object_buffers++;
+ data = NULL;
+ queued = 1;
+ }
}
read_unlock();
}
+ if (unpack && !queued) {
+ unsigned char parano_sha1[20];
+
+ read_lock(); /* because write_sha1_file calls has_sha1_file */
+ if (write_sha1_file(data, size, typename(type),
+ parano_sha1))
+ die(_("failed to unpack object at offset %lu"),
+ obj_entry->idx.offset);
+ read_unlock();
+
+ if (hashcmp(sha1, parano_sha1))
+ die(_("confused by unstable object source data for %s"),
+ sha1_to_hex(sha1));
+ }
+
free(new_data);
+ return data == new_data ? NULL : data;
}
/*
@@ -1021,7 +1091,8 @@ static void parse_pack_objects(unsigned char *sha1)
obj->real_type = OBJ_BAD;
nr_delays++;
} else
- sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
+ data = sha1_object(data, NULL, obj->size,
+ obj->type, obj->idx.sha1);
free(data);
display_progress(progress, i+1);
}
@@ -1188,6 +1259,18 @@ static struct object_entry *append_obj_to_pack(struct sha1file *f,
unsigned long s = size;
int n = 0;
unsigned char c = (type << 4) | (s & 15);
+ if (unpack) {
+ /*
+ * Just enough info to make find_unresolved_deltas()
+ * happy. We do not need to actually append the object
+ * in unpack case.
+ */
+ obj[0].size = size;
+ obj[0].type = type;
+ obj[0].real_type = type;
+ hashcpy(obj->idx.sha1, sha1);
+ return obj;
+ }
s >>= 4;
while (s) {
header[n++] = c | 0x80;
@@ -1277,7 +1360,11 @@ static void final(const char *final_pack_name, const char *curr_pack_name,
err = close(output_fd);
if (err)
die_errno(_("error while closing pack file"));
+ if (unpack)
+ unlink(curr_pack_name);
}
+ if (unpack)
+ return;
if (keep_msg) {
int keep_fd, keep_msg_len = strlen(keep_msg);
@@ -1489,14 +1576,14 @@ static void show_pack_info(int stat_only)
int cmd_index_pack(int argc, const char **argv, const char *prefix)
{
int i, fix_thin_pack = 0, verify = 0, stat_only = 0;
- const char *curr_pack, *curr_index;
+ const char *curr_pack, *curr_index = NULL;
const char *index_name = NULL, *pack_name = NULL;
const char *keep_name = NULL, *keep_msg = NULL;
char *index_name_buf = NULL, *keep_name_buf = NULL;
- struct pack_idx_entry **idx_objects;
struct pack_idx_option opts;
unsigned char pack_sha1[20];
unsigned foreign_nr = 1; /* zero is a "good" value, assume bad */
+ int unpack_limit = 0;
if (argc == 2 && !strcmp(argv[1], "-h"))
usage(index_pack_usage);
@@ -1574,6 +1661,11 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
opts.off32_limit = strtoul(c+1, &c, 0);
if (*c || opts.off32_limit & 0x80000000)
die(_("bad %s"), arg);
+ } else if (!prefixcmp(arg, "--unpack-limit=")) {
+ char *end;
+ unpack_limit = strtoul(arg+15, &end, 0);
+ if (!arg[15] || *end || unpack_limit < 0)
+ usage(index_pack_usage);
} else
usage(index_pack_usage);
continue;
@@ -1628,23 +1720,39 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
curr_pack = open_pack_file(pack_name);
parse_pack_header();
+
+ if (nr_objects < unpack_limit) {
+ unpack = 1;
+ if (strict)
+ obj_buffers = xcalloc(nr_objects, sizeof(*obj_buffers));
+ }
+
objects = xcalloc(nr_objects + 1, sizeof(struct object_entry));
deltas = xcalloc(nr_objects, sizeof(struct delta_entry));
parse_pack_objects(pack_sha1);
resolve_deltas();
conclude_pack(fix_thin_pack, curr_pack, pack_sha1);
free(deltas);
- if (strict)
+ if (strict) {
foreign_nr = check_objects();
+ if (unpack) {
+ write_cached_objects();
+ free(obj_buffers);
+ }
+ }
if (show_stat)
show_pack_info(stat_only);
- idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
- for (i = 0; i < nr_objects; i++)
- idx_objects[i] = &objects[i].idx;
- curr_index = write_idx_file(index_name, idx_objects, nr_objects, &opts, pack_sha1);
- free(idx_objects);
+ if (!unpack) {
+ struct pack_idx_entry **idx_objects;
+ idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
+ for (i = 0; i < nr_objects; i++)
+ idx_objects[i] = &objects[i].idx;
+ curr_index = write_idx_file(index_name, idx_objects,
+ nr_objects, &opts, pack_sha1);
+ free(idx_objects);
+ }
if (!verify)
final(pack_name, curr_pack,
--
1.8.2.83.gc99314b
* [PATCH v2 2/3] fetch-pack: use index-pack --unpack-limit instead of unpack-objects
From: Nguyễn Thái Ngọc Duy @ 2013-09-06 0:46 UTC
To: git; +Cc: Jeff King, Nguyễn Thái Ngọc Duy
Current code peeks into the transferred pack's header; if the number of
objects is under a limit, unpack-objects is called to handle the rest,
otherwise index-pack is. This patch makes index-pack handle both
cases. After the next patch, unpack-objects will no longer be used. It
may be removed in the future.
Now we only have two code paths to maintain regarding pack reading
(sha1_file.c and index-pack.c). When .pack v4 comes, we don't need to
duplicate work in index-pack and unpack-objects.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
fetch-pack.c | 86 +++++++++++++++++++++++++-----------------------------------
1 file changed, 35 insertions(+), 51 deletions(-)
diff --git a/fetch-pack.c b/fetch-pack.c
index f5d99c1..44d029f 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -682,16 +682,16 @@ static int sideband_demux(int in, int out, void *data)
}
static int get_pack(struct fetch_pack_args *args,
- int xd[2], char **pack_lockfile)
+ int xd[2], char **pack_lockfile_p)
{
struct async demux;
- const char *argv[22];
+ const char *argv[23];
char keep_arg[256];
- char hdr_arg[256];
+ char unpack_limit_arg[256];
const char **av;
- int do_keep = args->keep_pack;
struct child_process cmd;
int ret;
+ char *pack_lockfile;
memset(&demux, 0, sizeof(demux));
if (use_sideband) {
@@ -711,54 +711,33 @@ static int get_pack(struct fetch_pack_args *args,
memset(&cmd, 0, sizeof(cmd));
cmd.argv = argv;
+ cmd.out = -1;
av = argv;
- *hdr_arg = 0;
- if (!args->keep_pack && unpack_limit) {
- struct pack_header header;
-
- if (read_pack_header(demux.out, &header))
- die("protocol error: bad pack header");
- snprintf(hdr_arg, sizeof(hdr_arg),
- "--pack_header=%"PRIu32",%"PRIu32,
- ntohl(header.hdr_version), ntohl(header.hdr_entries));
- if (ntohl(header.hdr_entries) < unpack_limit)
- do_keep = 0;
- else
- do_keep = 1;
- }
if (alternate_shallow_file) {
*av++ = "--shallow-file";
*av++ = alternate_shallow_file;
}
- if (do_keep) {
- if (pack_lockfile)
- cmd.out = -1;
- *av++ = "index-pack";
- *av++ = "--stdin";
- if (!args->quiet && !args->no_progress)
- *av++ = "-v";
- if (args->use_thin_pack)
- *av++ = "--fix-thin";
- if (args->lock_pack || unpack_limit) {
- int s = sprintf(keep_arg,
- "--keep=fetch-pack %"PRIuMAX " on ", (uintmax_t) getpid());
- if (gethostname(keep_arg + s, sizeof(keep_arg) - s))
- strcpy(keep_arg + s, "localhost");
- *av++ = keep_arg;
- }
- if (args->check_self_contained_and_connected)
- *av++ = "--check-self-contained-and-connected";
- }
- else {
- *av++ = "unpack-objects";
- if (args->quiet || args->no_progress)
- *av++ = "-q";
- args->check_self_contained_and_connected = 0;
- }
- if (*hdr_arg)
- *av++ = hdr_arg;
+ *av++ = "index-pack";
+ *av++ = "--stdin";
+ if (!args->quiet && !args->no_progress)
+ *av++ = "-v";
+ if (args->use_thin_pack)
+ *av++ = "--fix-thin";
+ if (args->lock_pack || unpack_limit) {
+ int s = sprintf(keep_arg,
+ "--keep=fetch-pack %"PRIuMAX " on ", (uintmax_t) getpid());
+ if (gethostname(keep_arg + s, sizeof(keep_arg) - s))
+ strcpy(keep_arg + s, "localhost");
+ *av++ = keep_arg;
+ }
+ if (!args->keep_pack) {
+ sprintf(unpack_limit_arg, "--unpack-limit=%u", unpack_limit);
+ *av++ = unpack_limit_arg;
+ }
+ if (args->check_self_contained_and_connected)
+ *av++ = "--check-self-contained-and-connected";
if (fetch_fsck_objects >= 0
? fetch_fsck_objects
: transfer_fsck_objects >= 0
@@ -771,20 +750,26 @@ static int get_pack(struct fetch_pack_args *args,
cmd.git_cmd = 1;
if (start_command(&cmd))
die("fetch-pack: unable to fork off %s", argv[0]);
- if (do_keep && pack_lockfile) {
- *pack_lockfile = index_pack_lockfile(cmd.out);
- close(cmd.out);
- }
-
+ pack_lockfile = index_pack_lockfile(cmd.out);
+ close(cmd.out);
ret = finish_command(&cmd);
+
if (!ret || (args->check_self_contained_and_connected && ret == 1))
args->self_contained_and_connected =
args->check_self_contained_and_connected &&
ret == 0;
else
die("%s failed", argv[0]);
+
if (use_sideband && finish_async(&demux))
die("error in sideband demultiplexer");
+
+ if (pack_lockfile)
+ reprepare_packed_git();
+
+ if (pack_lockfile_p)
+ *pack_lockfile_p = pack_lockfile;
+
return 0;
}
@@ -997,6 +982,5 @@ struct ref *fetch_pack(struct fetch_pack_args *args,
commit_lock_file(&shallow_lock);
}
- reprepare_packed_git();
return ref_cpy;
}
--
1.8.2.83.gc99314b
* [PATCH v2 3/3] receive-pack: use index-pack --unpack-limit instead of unpack-objects
From: Nguyễn Thái Ngọc Duy @ 2013-09-06 0:46 UTC
To: git; +Cc: Jeff King, Nguyễn Thái Ngọc Duy
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
builtin/receive-pack.c | 122 ++++++++++++++-----------------------------------
1 file changed, 34 insertions(+), 88 deletions(-)
diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c
index e3eb5fc..0a84a61 100644
--- a/builtin/receive-pack.c
+++ b/builtin/receive-pack.c
@@ -792,105 +792,51 @@ static struct command *read_head_info(void)
return commands;
}
-static const char *parse_pack_header(struct pack_header *hdr)
-{
- switch (read_pack_header(0, hdr)) {
- case PH_ERROR_EOF:
- return "eof before pack header was fully read";
-
- case PH_ERROR_PACK_SIGNATURE:
- return "protocol error (pack signature mismatch detected)";
-
- case PH_ERROR_PROTOCOL:
- return "protocol error (pack version unsupported)";
-
- default:
- return "unknown error in parse_pack_header";
-
- case 0:
- return NULL;
- }
-}
-
static const char *pack_lockfile;
static const char *unpack(int err_fd)
{
- struct pack_header hdr;
- const char *hdr_err;
- char hdr_arg[38];
int fsck_objects = (receive_fsck_objects >= 0
? receive_fsck_objects
: transfer_fsck_objects >= 0
? transfer_fsck_objects
: 0);
- hdr_err = parse_pack_header(&hdr);
- if (hdr_err) {
- if (err_fd > 0)
- close(err_fd);
- return hdr_err;
- }
- snprintf(hdr_arg, sizeof(hdr_arg),
- "--pack_header=%"PRIu32",%"PRIu32,
- ntohl(hdr.hdr_version), ntohl(hdr.hdr_entries));
-
- if (ntohl(hdr.hdr_entries) < unpack_limit) {
- int code, i = 0;
- struct child_process child;
- const char *unpacker[5];
- unpacker[i++] = "unpack-objects";
- if (quiet)
- unpacker[i++] = "-q";
- if (fsck_objects)
- unpacker[i++] = "--strict";
- unpacker[i++] = hdr_arg;
- unpacker[i++] = NULL;
- memset(&child, 0, sizeof(child));
- child.argv = unpacker;
- child.no_stdout = 1;
- child.err = err_fd;
- child.git_cmd = 1;
- code = run_command(&child);
- if (!code)
- return NULL;
- return "unpack-objects abnormal exit";
- } else {
- const char *keeper[7];
- int s, status, i = 0;
- char keep_arg[256];
- struct child_process ip;
-
- s = sprintf(keep_arg, "--keep=receive-pack %"PRIuMAX" on ", (uintmax_t) getpid());
- if (gethostname(keep_arg + s, sizeof(keep_arg) - s))
- strcpy(keep_arg + s, "localhost");
-
- keeper[i++] = "index-pack";
- keeper[i++] = "--stdin";
- if (fsck_objects)
- keeper[i++] = "--strict";
- keeper[i++] = "--fix-thin";
- keeper[i++] = hdr_arg;
- keeper[i++] = keep_arg;
- keeper[i++] = NULL;
- memset(&ip, 0, sizeof(ip));
- ip.argv = keeper;
- ip.out = -1;
- ip.err = err_fd;
- ip.git_cmd = 1;
- status = start_command(&ip);
- if (status) {
- return "index-pack fork failed";
- }
- pack_lockfile = index_pack_lockfile(ip.out);
- close(ip.out);
- status = finish_command(&ip);
- if (!status) {
- reprepare_packed_git();
- return NULL;
- }
+ const char *keeper[7];
+ int s, status, i = 0;
+ char keep_arg[256];
+ char unpack_limit_arg[256];
+ struct child_process ip;
+
+ s = sprintf(keep_arg, "--keep=receive-pack %"PRIuMAX" on ", (uintmax_t) getpid());
+ if (gethostname(keep_arg + s, sizeof(keep_arg) - s))
+ strcpy(keep_arg + s, "localhost");
+ sprintf(unpack_limit_arg, "--unpack-limit=%u", unpack_limit);
+
+ keeper[i++] = "index-pack";
+ keeper[i++] = "--stdin";
+ if (fsck_objects)
+ keeper[i++] = "--strict";
+ keeper[i++] = "--fix-thin";
+ keeper[i++] = keep_arg;
+ keeper[i++] = unpack_limit_arg;
+ keeper[i++] = NULL;
+ memset(&ip, 0, sizeof(ip));
+ ip.argv = keeper;
+ ip.out = -1;
+ ip.err = err_fd;
+ ip.git_cmd = 1;
+ status = start_command(&ip);
+ if (status)
+ return "index-pack fork failed";
+ pack_lockfile = index_pack_lockfile(ip.out);
+ close(ip.out);
+ status = finish_command(&ip);
+ if (status)
return "index-pack abnormal exit";
- }
+ if (pack_lockfile)
+ reprepare_packed_git();
+ return NULL;
}
static const char *unpack_with_sideband(void)
--
1.8.2.83.gc99314b
* Re: [PATCH v2 1/3] index-pack: add --unpack-limit to unpack objects
From: Jeff King @ 2013-09-08 4:44 UTC
To: Nguyễn Thái Ngọc Duy; +Cc: git
On Fri, Sep 06, 2013 at 07:46:01AM +0700, Nguyen Thai Ngoc Duy wrote:
> ---
> I had something that could unpack without writing to a temp pack file,
> but I scrapped it and chose this way because it follows closely how
> index-pack works. It's a good thing imo because .pack v4 is coming
> and I don't know how v4 may impact this unpack code path. Once things
> are settled, we can revisit and open a separate code path if it's
> still a good idea.
From a cursory read, this seems fine. If it were done in complete
isolation, I'd say it was a slight regression, just because we are doing
more I/O for the unpack case, and it is not really saving us any code
(it is not like we can throw away unpack-objects, as I think we would
want to keep it as a last resort for getting data out of malformed or
otherwise non-indexable packs).
But I can also see it making pack v4 handling easier. So it would make
sense to me to put it at the start of a series adding pack v4 indexing.
By the end of the series you would be able to see the benefits of the
reduced code complexity. Until then, it is a "probably this will help
later" change.
-Peff
* Re: [PATCH v2 2/3] fetch-pack: use index-pack --unpack-limit instead of unpack-objects
From: Jeff King @ 2013-09-08 4:45 UTC
To: Nguyễn Thái Ngọc Duy; +Cc: git
On Fri, Sep 06, 2013 at 07:46:02AM +0700, Nguyen Thai Ngoc Duy wrote:
> static int get_pack(struct fetch_pack_args *args,
> - int xd[2], char **pack_lockfile)
> + int xd[2], char **pack_lockfile_p)
> {
> struct async demux;
> - const char *argv[22];
> + const char *argv[23];
> char keep_arg[256];
> - char hdr_arg[256];
> + char unpack_limit_arg[256];
Perhaps it is time to convert this to use argv_array? I think you could
even get rid of the fixed-size buffers, too, with argv_array_pushf().
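Something like this, perhaps (untested sketch; assumes the
gethostname() fallback is factored out into a local "hostname" buffer
first):

	struct argv_array av = ARGV_ARRAY_INIT;

	argv_array_push(&av, "index-pack");
	argv_array_push(&av, "--stdin");
	if (args->use_thin_pack)
		argv_array_push(&av, "--fix-thin");
	if (args->lock_pack || unpack_limit)
		/* replaces the fixed-size keep_arg buffer */
		argv_array_pushf(&av, "--keep=fetch-pack %"PRIuMAX" on %s",
				 (uintmax_t)getpid(), hostname);
	if (!args->keep_pack)
		/* replaces unpack_limit_arg */
		argv_array_pushf(&av, "--unpack-limit=%u", unpack_limit);

	cmd.argv = av.argv;
	...
	argv_array_clear(&av);	/* after finish_command() */

That would also stop us from having to bump argv[23] by hand whenever a
new option is added.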
-Peff
* Re: [PATCH v2 1/3] index-pack: add --unpack-limit to unpack objects
From: Duy Nguyen @ 2013-09-08 6:28 UTC
To: Jeff King; +Cc: Git Mailing List
On Sun, Sep 8, 2013 at 11:44 AM, Jeff King <peff@peff.net> wrote:
> On Fri, Sep 06, 2013 at 07:46:01AM +0700, Nguyen Thai Ngoc Duy wrote:
>
>> ---
>> I had something that could unpack without writing to temp pack file
>> but I scraped it and chose this way because it follows closely how
>> index-pack works. It's a good thing imo because .pack v4 is coming
>> and I don't know how v4 may impact this unpack code path. Once things
>> are settled, we can revisit and open a separate code path if it's
>> still a good idea.
>
> From a cursory read, this seems fine. If it were done in complete
> isolation, I'd say it was a slight regression, just because we are doing
> more I/O for the unpack case, and it is not really saving us any code
> (it is not like we can throw away unpack-objects, as I think we would
> want to keep it as a last resort for getting data out of malformed or
> otherwise non-indexable packs).
I can see unpack-objects is more tolerant of corrupt/incomplete
packs. If index-pack finds something wrong, it aborts the whole
process. I think we can make index-pack stop at the first bad object,
adjust nr_objects, and try to recover as much as possible out of the
good part. Any other reasons to keep unpack-objects (because I still
want to kill it)?
> But I can also see it making pack v4 handling easier. So it would make
> sense to me to put it at the start of a series adding pack v4 indexing.
> By the end of the series you would be able to see the benefits of the
> reduced code complexity. Until then, it is a "probably this will help
> later" change.
Agreed.
--
Duy
* Re: [PATCH v2 1/3] index-pack: add --unpack-limit to unpack objects
From: Jeff King @ 2013-09-08 6:35 UTC
To: Duy Nguyen; +Cc: Git Mailing List
On Sun, Sep 08, 2013 at 01:28:47PM +0700, Nguyen Thai Ngoc Duy wrote:
> > From a cursory read, this seems fine. If it were done in complete
> > isolation, I'd say it was a slight regression, just because we are doing
> > more I/O for the unpack case, and it is not really saving us any code
> > (it is not like we can throw away unpack-objects, as I think we would
> > want to keep it as a last resort for getting data out of malformed or
> > otherwise non-indexable packs).
>
> I can see unpack-objects is more tolerant of corrupt/incomplete
> packs. If index-pack finds something wrong, it aborts the whole
> process. I think we can make index-pack stop at the first bad object,
> adjust nr_objects, and try to recover as much as possible out of the
> good part. Any other reasons to keep unpack-objects (because I still
> want to kill it)?
No, I don't think there is another reason. The command name may hang
around for historical reasons, but we can always make it an alias for
"index-pack --unpack-limit=0" or whatever.
I do not think index-pack even needs to be particularly clever about
trying to recover. It is mainly that we may do some extra sanity checks
when writing the index (e.g., the recent discussion on avoiding
duplicates in the index), that do not even come up when simply exploding
the pack in a linear fashion. But I don't think there is any fundamental
reason why index-pack could not learn to be as robust when operating in
unpack mode.
As an aside, you cannot just drop the nr_objects that index-pack's
generated index says it contains. Packfile readers double-check that the
.idx and .pack agree on the number of objects (I tried it as a method
for working around duplicate entries :) ).
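For reference, the check that bit me is in open_packed_git_1() in
sha1_file.c; it goes roughly like this (paraphrased from memory):

	/* Verify the pack matches its index. */
	if (p->num_objects != ntohl(hdr.hdr_entries))
		return error("packfile %s claims to have %"PRIu32" objects"
			     " while index indicates %"PRIu32" objects",
			     p->pack_name, ntohl(hdr.hdr_entries),
			     p->num_objects);

so any mismatch between the .idx count and the .pack header makes the
pack unusable.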
-Peff