* [PATCH 1/3] index-pack: dedup first during outgoing link check
2024-12-02 20:18 [PATCH 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
@ 2024-12-02 20:18 ` Jonathan Tan
2024-12-02 21:24 ` Josh Steadmon
2024-12-02 20:18 ` [PATCH 2/3] index-pack: no blobs " Jonathan Tan
` (5 subsequent siblings)
6 siblings, 1 reply; 29+ messages in thread
From: Jonathan Tan @ 2024-12-02 20:18 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, hanyang.tony
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) fixed a bug with what was believed to be a negligible
decrease in performance [1] [2]. But at $DAYJOB, with at least one repo,
it was found that the decrease in performance was very significant.
Looking at the patch, whenever we parse an object in the packfile to
be indexed, we check the targets of all its outgoing links for its
existence. However, this could be optimized by first collecting all such
targets into an oidset (thus deduplicating them) before checking. Teach
Git to do that.
On a certain fetch from the aforementioned repo, this improved
performance from approximately 7 hours to 24m47.815s. This number will
be further reduced in a subsequent patch.
[1] https://lore.kernel.org/git/CAG1j3zGiNMbri8rZNaF0w+yP+6OdMz0T8+8_Wgd1R_p1HzVasg@mail.gmail.com/
[2] https://lore.kernel.org/git/20241105212849.3759572-1-jonathantanmy@google.com/
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 44 ++++++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 95babdc5ea..8e7d14c17e 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -155,11 +155,11 @@ static int input_fd, output_fd;
static const char *curr_pack;
/*
- * local_links is guarded by read_mutex, and record_local_links is read-only in
- * a thread.
+ * outgoing_links is guarded by read_mutex, and record_outgoing_links is
+ * read-only in a thread.
*/
-static struct oidset local_links = OIDSET_INIT;
-static int record_local_links;
+static struct oidset outgoing_links = OIDSET_INIT;
+static int record_outgoing_links;
static struct thread_local_data *thread_data;
static int nr_dispatched;
@@ -812,18 +812,12 @@ static int check_collison(struct object_entry *entry)
return 0;
}
-static void record_if_local_object(const struct object_id *oid)
+static void record_outgoing_link(const struct object_id *oid)
{
- struct object_info info = OBJECT_INFO_INIT;
- if (oid_object_info_extended(the_repository, oid, &info, 0))
- /* Missing; assume it is a promisor object */
- return;
- if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
- return;
- oidset_insert(&local_links, oid);
+ oidset_insert(&outgoing_links, oid);
}
-static void do_record_local_links(struct object *obj)
+static void do_record_outgoing_links(struct object *obj)
{
if (obj->type == OBJ_TREE) {
struct tree *tree = (struct tree *)obj;
@@ -837,16 +831,16 @@ static void do_record_local_links(struct object *obj)
*/
return;
while (tree_entry_gently(&desc, &entry))
- record_if_local_object(&entry.oid);
+ record_outgoing_link(&entry.oid);
} else if (obj->type == OBJ_COMMIT) {
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
for (; parents; parents = parents->next)
- record_if_local_object(&parents->item->object.oid);
+ record_outgoing_link(&parents->item->object.oid);
} else if (obj->type == OBJ_TAG) {
struct tag *tag = (struct tag *) obj;
- record_if_local_object(get_tagged_oid(tag));
+ record_outgoing_link(get_tagged_oid(tag));
}
}
@@ -896,7 +890,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
free(has_data);
}
- if (strict || do_fsck_object || record_local_links) {
+ if (strict || do_fsck_object || record_outgoing_links) {
read_lock();
if (type == OBJ_BLOB) {
struct blob *blob = lookup_blob(the_repository, oid);
@@ -928,8 +922,8 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
die(_("fsck error in packed object"));
if (strict && fsck_walk(obj, NULL, &fsck_options))
die(_("Not all child objects of %s are reachable"), oid_to_hex(&obj->oid));
- if (record_local_links)
- do_record_local_links(obj);
+ if (record_outgoing_links)
+ do_record_outgoing_links(obj);
if (obj->type == OBJ_TREE) {
struct tree *item = (struct tree *) obj;
@@ -1781,7 +1775,7 @@ static void repack_local_links(void)
struct object_id *oid;
char *base_name;
- if (!oidset_size(&local_links))
+ if (!oidset_size(&outgoing_links))
return;
base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
@@ -1795,8 +1789,14 @@ static void repack_local_links(void)
if (start_command(&cmd))
die(_("could not start pack-objects to repack local links"));
- oidset_iter_init(&local_links, &iter);
+ oidset_iter_init(&outgoing_links, &iter);
while ((oid = oidset_iter_next(&iter))) {
+ struct object_info info = OBJECT_INFO_INIT;
+ if (oid_object_info_extended(the_repository, oid, &info, 0))
+ /* Missing; assume it is a promisor object */
+ continue;
+ if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
+ continue;
if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
write_in_full(cmd.in, "\n", 1) < 0)
die(_("failed to feed local object to pack-objects"));
@@ -1899,7 +1899,7 @@ int cmd_index_pack(int argc,
} else if (skip_to_optional_arg(arg, "--keep", &keep_msg)) {
; /* nothing to do */
} else if (skip_to_optional_arg(arg, "--promisor", &promisor_msg)) {
- record_local_links = 1;
+ record_outgoing_links = 1;
} else if (starts_with(arg, "--threads=")) {
char *end;
nr_threads = strtoul(arg+10, &end, 0);
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread* Re: [PATCH 1/3] index-pack: dedup first during outgoing link check
2024-12-02 20:18 ` [PATCH 1/3] index-pack: dedup first during outgoing link check Jonathan Tan
@ 2024-12-02 21:24 ` Josh Steadmon
0 siblings, 0 replies; 29+ messages in thread
From: Josh Steadmon @ 2024-12-02 21:24 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
On 2024.12.02 12:18, Jonathan Tan wrote:
> Commit c08589efdc (index-pack: repack local links into promisor packs,
> 2024-11-01) fixed a bug with what was believed to be a negligible
> decrease in performance [1] [2]. But at $DAYJOB, with at least one repo,
> it was found that the decrease in performance was very significant.
>
> Looking at the patch, whenever we parse an object in the packfile to
> be indexed, we check the targets of all its outgoing links for its
> existence. However, this could be optimized by first collecting all such
> targets into an oidset (thus deduplicating them) before checking. Teach
> Git to do that.
>
> On a certain fetch from the aforementioned repo, this improved
> performance from approximately 7 hours to 24m47.815s. This number will
> be further reduced in a subsequent patch.
>
> [1] https://lore.kernel.org/git/CAG1j3zGiNMbri8rZNaF0w+yP+6OdMz0T8+8_Wgd1R_p1HzVasg@mail.gmail.com/
> [2] https://lore.kernel.org/git/20241105212849.3759572-1-jonathantanmy@google.com/
>
> Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> ---
> builtin/index-pack.c | 44 ++++++++++++++++++++++----------------------
> 1 file changed, 22 insertions(+), 22 deletions(-)
>
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index 95babdc5ea..8e7d14c17e 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -155,11 +155,11 @@ static int input_fd, output_fd;
> static const char *curr_pack;
>
> /*
> - * local_links is guarded by read_mutex, and record_local_links is read-only in
> - * a thread.
> + * outgoing_links is guarded by read_mutex, and record_outgoing_links is
> + * read-only in a thread.
> */
> -static struct oidset local_links = OIDSET_INIT;
> -static int record_local_links;
> +static struct oidset outgoing_links = OIDSET_INIT;
> +static int record_outgoing_links;
>
> static struct thread_local_data *thread_data;
> static int nr_dispatched;
We're renaming the oidset and flag because our purpose is now more
general. OK.
> @@ -812,18 +812,12 @@ static int check_collison(struct object_entry *entry)
> return 0;
> }
>
> -static void record_if_local_object(const struct object_id *oid)
> +static void record_outgoing_link(const struct object_id *oid)
> {
> - struct object_info info = OBJECT_INFO_INIT;
> - if (oid_object_info_extended(the_repository, oid, &info, 0))
> - /* Missing; assume it is a promisor object */
> - return;
> - if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
> - return;
> - oidset_insert(&local_links, oid);
> + oidset_insert(&outgoing_links, oid);
> }
We're now unconditionally recording linked objects, and this logic has
been moved below. Looks good.
> -static void do_record_local_links(struct object *obj)
> +static void do_record_outgoing_links(struct object *obj)
> {
> if (obj->type == OBJ_TREE) {
> struct tree *tree = (struct tree *)obj;
> @@ -837,16 +831,16 @@ static void do_record_local_links(struct object *obj)
> */
> return;
> while (tree_entry_gently(&desc, &entry))
> - record_if_local_object(&entry.oid);
> + record_outgoing_link(&entry.oid);
> } else if (obj->type == OBJ_COMMIT) {
> struct commit *commit = (struct commit *) obj;
> struct commit_list *parents = commit->parents;
>
> for (; parents; parents = parents->next)
> - record_if_local_object(&parents->item->object.oid);
> + record_outgoing_link(&parents->item->object.oid);
> } else if (obj->type == OBJ_TAG) {
> struct tag *tag = (struct tag *) obj;
> - record_if_local_object(get_tagged_oid(tag));
> + record_outgoing_link(get_tagged_oid(tag));
> }
> }
>
> @@ -896,7 +890,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
> free(has_data);
> }
>
> - if (strict || do_fsck_object || record_local_links) {
> + if (strict || do_fsck_object || record_outgoing_links) {
> read_lock();
> if (type == OBJ_BLOB) {
> struct blob *blob = lookup_blob(the_repository, oid);
> @@ -928,8 +922,8 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
> die(_("fsck error in packed object"));
> if (strict && fsck_walk(obj, NULL, &fsck_options))
> die(_("Not all child objects of %s are reachable"), oid_to_hex(&obj->oid));
> - if (record_local_links)
> - do_record_local_links(obj);
> + if (record_outgoing_links)
> + do_record_outgoing_links(obj);
>
> if (obj->type == OBJ_TREE) {
> struct tree *item = (struct tree *) obj;
> @@ -1781,7 +1775,7 @@ static void repack_local_links(void)
> struct object_id *oid;
> char *base_name;
>
> - if (!oidset_size(&local_links))
> + if (!oidset_size(&outgoing_links))
> return;
>
> base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
> @@ -1795,8 +1789,14 @@ static void repack_local_links(void)
> if (start_command(&cmd))
> die(_("could not start pack-objects to repack local links"));
>
> - oidset_iter_init(&local_links, &iter);
> + oidset_iter_init(&outgoing_links, &iter);
> while ((oid = oidset_iter_next(&iter))) {
> + struct object_info info = OBJECT_INFO_INIT;
> + if (oid_object_info_extended(the_repository, oid, &info, 0))
> + /* Missing; assume it is a promisor object */
> + continue;
> + if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
> + continue;
> if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
> write_in_full(cmd.in, "\n", 1) < 0)
> die(_("failed to feed local object to pack-objects"));
We've moved our logic to skip promisor objects here, after potential
objects have been recorded to the oidset and thus deduped. Now we
`continue` the loop to skip promisor objects, whereas before we had an
early return to avoid recording them in the first place. Seems
straightforward.
> @@ -1899,7 +1899,7 @@ int cmd_index_pack(int argc,
> } else if (skip_to_optional_arg(arg, "--keep", &keep_msg)) {
> ; /* nothing to do */
> } else if (skip_to_optional_arg(arg, "--promisor", &promisor_msg)) {
> - record_local_links = 1;
> + record_outgoing_links = 1;
> } else if (starts_with(arg, "--threads=")) {
> char *end;
> nr_threads = strtoul(arg+10, &end, 0);
> --
> 2.47.0.338.g60cca15819-goog
>
>
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH 2/3] index-pack: no blobs during outgoing link check
2024-12-02 20:18 [PATCH 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
2024-12-02 20:18 ` [PATCH 1/3] index-pack: dedup first during outgoing link check Jonathan Tan
@ 2024-12-02 20:18 ` Jonathan Tan
2024-12-03 6:00 ` Patrick Steinhardt
2024-12-02 20:18 ` [PATCH 3/3] index-pack: commit tree " Jonathan Tan
` (4 subsequent siblings)
6 siblings, 1 reply; 29+ messages in thread
From: Jonathan Tan @ 2024-12-02 20:18 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, hanyang.tony
As a follow-up to the parent of this commit, it was found that not
checking for the existence of blobs linked from trees sped up the fetch
from 24m47.815s to 2m2.127s. Teach Git to do that.
The benefit of doing this is as above (fetch speedup), but the drawback
is that if the packfile to be indexed references a local blob directly
(that is, not through a local tree), that local blob is in danger of
being garbage collected. Such a situation may arise if we push local
commits, including one with a change to a blob in the root tree,
and then the server incorporates them into its main branch through a
"rebase" or "squash" merge strategy, and then we fetch the new main
branch from the server.
This situation has not been observed yet - we have only noticed missing
commits, not missing trees or blobs. (In fact, if it were believed that
only missing commits are problematic, one could argue that we should
also exclude trees during the outgoing link check; but it is safer to
include them.)
Due to the rarity of the situation (it has not been observed to happen
in real life), and because the "penalty" in such a situation is merely
to refetch the missing blob when it's needed, the tradeoff seems
worth it.
(Blobs may also be linked from tag objects, but it is impossible to know
the type of an object linked from a tag object without looking it up in
the object database, so the code for that is untouched.)
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 8e7d14c17e..58d24540dc 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -830,8 +830,10 @@ static void do_record_outgoing_links(struct object *obj)
* verified, so do not print any here.
*/
return;
- while (tree_entry_gently(&desc, &entry))
- record_outgoing_link(&entry.oid);
+ while (tree_entry_gently(&desc, &entry)) {
+ if (S_ISDIR(entry.mode))
+ record_outgoing_link(&entry.oid);
+ }
} else if (obj->type == OBJ_COMMIT) {
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread* Re: [PATCH 2/3] index-pack: no blobs during outgoing link check
2024-12-02 20:18 ` [PATCH 2/3] index-pack: no blobs " Jonathan Tan
@ 2024-12-03 6:00 ` Patrick Steinhardt
2024-12-03 21:40 ` Jonathan Tan
2024-12-03 22:16 ` Junio C Hamano
0 siblings, 2 replies; 29+ messages in thread
From: Patrick Steinhardt @ 2024-12-03 6:00 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
On Mon, Dec 02, 2024 at 12:18:39PM -0800, Jonathan Tan wrote:
> As a follow-up to the parent of this commit, it was found that not
> checking for the existence of blobs linked from trees sped up the fetch
> from 24m47.815s to 2m2.127s. Teach Git to do that.
>
> The benefit of doing this is as above (fetch speedup), but the drawback
> is that if the packfile to be indexed references a local blob directly
> (that is, not through a local tree), that local blob is in danger of
> being garbage collected. Such a situation may arise if we push local
> commits, including one with a change to a blob in the root tree,
> and then the server incorporates them into its main branch through a
> "rebase" or "squash" merge strategy, and then we fetch the new main
> branch from the server.
Okay, so we know that we are basically doing the wrong thing with the
optimization, but by skipping blobs we can get a significant speedup and
the failure mode is that we will re-fetch the object in a later step.
And because we think the situation is rare it shouldn't be a huge issue
in practice.
> This situation has not been observed yet - we have only noticed missing
> commits, not missing trees or blobs. (In fact, if it were believed that
> only missing commits are problematic, one could argue that we should
> also exclude trees during the outgoing link check; but it is safer to
> include them.)
>
> Due to the rarity of the situation (it has not been observed to happen
> in real life), and because the "penalty" in such a situation is merely
> to refetch the missing blob when it's needed, the tradeoff seems
> worth it.
So is this a one-off event that may happen once per blob, or would we
eventually evict the refetched blob and run into the same situation
repeatedly?
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index 8e7d14c17e..58d24540dc 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -830,8 +830,10 @@ static void do_record_outgoing_links(struct object *obj)
> * verified, so do not print any here.
> */
> return;
> - while (tree_entry_gently(&desc, &entry))
> - record_outgoing_link(&entry.oid);
> + while (tree_entry_gently(&desc, &entry)) {
> + if (S_ISDIR(entry.mode))
> + record_outgoing_link(&entry.oid);
> + }
Without the context of the commit message this code snippet likely would
not make any sense to a reader. The "correct" logic would be to record
all objects, regardless of whether they are an object ID or not. But we
explicitly choose not to as a tradeoff between performance and
correctness.
All to say that we should have a comment here that explains what is
going on.
Patrick
^ permalink raw reply [flat|nested] 29+ messages in thread* Re: [PATCH 2/3] index-pack: no blobs during outgoing link check
2024-12-03 6:00 ` Patrick Steinhardt
@ 2024-12-03 21:40 ` Jonathan Tan
2024-12-03 22:16 ` Junio C Hamano
1 sibling, 0 replies; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:40 UTC (permalink / raw)
To: Patrick Steinhardt; +Cc: Jonathan Tan, git, hanyang.tony
Patrick Steinhardt <ps@pks.im> writes:
> > This situation has not been observed yet - we have only noticed missing
> > commits, not missing trees or blobs. (In fact, if it were believed that
> > only missing commits are problematic, one could argue that we should
> > also exclude trees during the outgoing link check; but it is safer to
> > include them.)
> >
> > Due to the rarity of the situation (it has not been observed to happen
> > in real life), and because the "penalty" in such a situation is merely
> > to refetch the missing blob when it's needed, the tradeoff seems
> > worth it.
>
> So is this a one-off event that may happen once per blob, or would we
> eventually evict the refetched blob and run into the same situation
> repeatedly?
One-off, since when refetched, the blob is in a promisor pack (and
thus won't be GC-ed). I've added this to the code comment that I added
following your suggestion below.
> > diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> > index 8e7d14c17e..58d24540dc 100644
> > --- a/builtin/index-pack.c
> > +++ b/builtin/index-pack.c
> > @@ -830,8 +830,10 @@ static void do_record_outgoing_links(struct object *obj)
> > * verified, so do not print any here.
> > */
> > return;
> > - while (tree_entry_gently(&desc, &entry))
> > - record_outgoing_link(&entry.oid);
> > + while (tree_entry_gently(&desc, &entry)) {
> > + if (S_ISDIR(entry.mode))
> > + record_outgoing_link(&entry.oid);
> > + }
>
> Without the context of the commit message this code snippet likely would
> not make any sense to a reader. The "correct" logic would be to record
> all objects, regardless of whether they are an object ID or not. But we
> explicitly choose not to as a tradeoff between performance and
> correctness.
>
> All to say that we should have a comment here that explains what is
> going on.
>
> Patrick
Makes sense. I had to move almost the entirety of the commit message
into a code comment - I don't think putting merely a part here would be
enough context.
^ permalink raw reply [flat|nested] 29+ messages in thread* Re: [PATCH 2/3] index-pack: no blobs during outgoing link check
2024-12-03 6:00 ` Patrick Steinhardt
2024-12-03 21:40 ` Jonathan Tan
@ 2024-12-03 22:16 ` Junio C Hamano
1 sibling, 0 replies; 29+ messages in thread
From: Junio C Hamano @ 2024-12-03 22:16 UTC (permalink / raw)
To: Patrick Steinhardt; +Cc: Jonathan Tan, git, hanyang.tony
Patrick Steinhardt <ps@pks.im> writes:
>> The benefit of doing this is as above (fetch speedup), but the drawback
>> is that if the packfile to be indexed references a local blob directly
>> (that is, not through a local tree), that local blob is in danger of
>> being garbage collected. Such a situation may arise if we push local
>> commits, including one with a change to a blob in the root tree,
>> and then the server incorporates them into its main branch through a
>> "rebase" or "squash" merge strategy, and then we fetch the new main
>> branch from the server.
>
> Okay, so we know that we are basically doing the wrong thing with the
> optimization, but by skipping blobs we can get a significant speedup and
> the failure mode is that we will re-fetch the object in a later step.
> And because we think the situation is rare it shouldn't be a huge issue
> in practice.
That is how I read it, but the description may want to make the pros
and cons more explicit. One of the reasons why the users choose to
use lazy clone is to avoid grabbing large blobs until they need
them, so I am not sure how they feel about having to discard and
then fetch again after creating a potentially large blob. As long
as it does not happen repeatedly for the same blob, it probably is
OK.
> Without the context of the commit message this code snippet likely would
> not make any sense to a reader. The "correct" logic would be to record
> all objects, regardless of whether they are an object ID or not. But we
> explicitly choose not to as a tradeoff between performance and
> correctness.
>
> All to say that we should have a comment here that explains what is
> going on.
Thanks for reviewing and commenting.
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH 3/3] index-pack: commit tree during outgoing link check
2024-12-02 20:18 [PATCH 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
2024-12-02 20:18 ` [PATCH 1/3] index-pack: dedup first during outgoing link check Jonathan Tan
2024-12-02 20:18 ` [PATCH 2/3] index-pack: no blobs " Jonathan Tan
@ 2024-12-02 20:18 ` Jonathan Tan
2024-12-03 3:10 ` Junio C Hamano
2024-12-02 21:25 ` [PATCH 0/3] Performance improvements for repacking non-promisor objects Josh Steadmon
` (3 subsequent siblings)
6 siblings, 1 reply; 29+ messages in thread
From: Jonathan Tan @ 2024-12-02 20:18 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, hanyang.tony
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) seems to contain an oversight in that the tree of a commit
is not checked. The fix slows down a fetch from a certain repo at
$DAYJOB from 2m2.127s to 2m45.052s, but in order to make the fetch
correct, it seems worth it.
In order to test this, we could create server and client repos as
follows...
C S
\ /
O
(O and C are commits both on the client and server. S is a commit
only on the server. C and S have the same tree but different commit
messages.)
...and then, from the client, fetch S from the server.
In theory, the client declares "have C" and the server can use this
information to exclude S's tree (since it knows that the client has C's
tree, which is the same as S's tree). However, it is also possible for
the server to compute that it needs to send S and not O, and proceed
from there; therefore the objects of C are not considered at all when
determining what to send in the packfile. In order to prevent a test of
client functionality from having such a dependence on server behavior, I
have not included such a test.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 58d24540dc..338aeeadc8 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -838,6 +838,7 @@ static void do_record_outgoing_links(struct object *obj)
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
+ record_outgoing_link(get_commit_tree_oid(commit));
for (; parents; parents = parents->next)
record_outgoing_link(&parents->item->object.oid);
} else if (obj->type == OBJ_TAG) {
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread* Re: [PATCH 3/3] index-pack: commit tree during outgoing link check
2024-12-02 20:18 ` [PATCH 3/3] index-pack: commit tree " Jonathan Tan
@ 2024-12-03 3:10 ` Junio C Hamano
2024-12-03 21:42 ` Jonathan Tan
0 siblings, 1 reply; 29+ messages in thread
From: Junio C Hamano @ 2024-12-03 3:10 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
Jonathan Tan <jonathantanmy@google.com> writes:
> Subject: Re: [PATCH 3/3] index-pack: commit tree during outgoing link check
> Commit c08589efdc (index-pack: repack local links into promisor packs,
> 2024-11-01) seems to contain an oversight in that the tree of a commit
> is not checked.
I am having a hard time linking the subject with the statement. The
verb "commit" should probably be something else, as you are not
creating a tree object while checking, but I am not sure?
> The fix slows down a fetch from a certain repo at
> $DAYJOB from 2m2.127s to 2m45.052s, but in order to make the fetch
> correct, it seems worth it.
And "the fix" is not described so a reader is left wondering. Is
the fix for an oversight of not checking merely to check it? IOW,
is
c08589efdc made outgoing links to be checked for commits, but
failed to do so for trees. Make sure we check both
what is happening?
> In order to test this, we could create server and client repos as
> follows...
>
> C S
> \ /
> O
>
> (O and C are commits both on the client and server. S is a commit
> only on the server. C and S have the same tree but different commit
> messages.)
>
> ...and then, from the client, fetch S from the server.
>
> In theory, the client declares "have C" and the server can use this
> information to exclude S's tree (since it knows that the client has C's
> tree, which is the same as S's tree).
OK.
> However, it is also possible for
> the server to compute that it needs to send S and not O, and proceed
> from there;
If O, C, and S have all identical trees, then wouldn't such a test
work well? At that point it does not matter which between O and C
the server bases its decision to send S but not S's tree on, no?
In any case, will queue. Thanks.
> therefore the objects of C are not considered at all when
> determining what to send in the packfile. In order to prevent a test of
> client functionality from having such a dependence on server behavior, I
> have not included such a test.
>
> Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> ---
> builtin/index-pack.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/builtin/index-pack.c b/builtin/index-pack.c
> index 58d24540dc..338aeeadc8 100644
> --- a/builtin/index-pack.c
> +++ b/builtin/index-pack.c
> @@ -838,6 +838,7 @@ static void do_record_outgoing_links(struct object *obj)
> struct commit *commit = (struct commit *) obj;
> struct commit_list *parents = commit->parents;
>
> + record_outgoing_link(get_commit_tree_oid(commit));
> for (; parents; parents = parents->next)
> record_outgoing_link(&parents->item->object.oid);
> } else if (obj->type == OBJ_TAG) {
^ permalink raw reply [flat|nested] 29+ messages in thread* Re: [PATCH 3/3] index-pack: commit tree during outgoing link check
2024-12-03 3:10 ` Junio C Hamano
@ 2024-12-03 21:42 ` Jonathan Tan
2024-12-04 0:21 ` Junio C Hamano
0 siblings, 1 reply; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:42 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Jonathan Tan, git, hanyang.tony
Junio C Hamano <gitster@pobox.com> writes:
> > The fix slows down a fetch from a certain repo at
> > $DAYJOB from 2m2.127s to 2m45.052s, but in order to make the fetch
> > correct, it seems worth it.
>
> And "the fix" is not described so a reader is left wondering. Is
> the fix for an oversight of not checking merely to check it? IOW,
> is
>
> c08589efdc made outgoing links to be checked for commits, but
> failed to do so for trees. Make sure we check both
>
> what is happening?
Yes. I was trying to keep to the character limit and in doing so, made
the commit message title hard to understand. I think the new title
should be easier to understand (and also stated explicitly in the commit
message what is being taught to Git).
> > However, it is also possible for
> > the server to compute that it needs to send S and not O, and proceed
> > from there;
>
> If O, C, and S have all identical trees, then wouldn't such a test
> work well? At that point it does not matter which between O and C
> the server bases its decision to send S but not S's tree on, no?
>
> In any case, will queue. Thanks.
O has a different tree from C and S. I will add a note to clarify this.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH 3/3] index-pack: commit tree during outgoing link check
2024-12-03 21:42 ` Jonathan Tan
@ 2024-12-04 0:21 ` Junio C Hamano
2024-12-09 20:29 ` Jonathan Tan
0 siblings, 1 reply; 29+ messages in thread
From: Junio C Hamano @ 2024-12-04 0:21 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
Jonathan Tan <jonathantanmy@google.com> writes:
> Junio C Hamano <gitster@pobox.com> writes:
>> > The fix slows down a fetch from a certain repo at
>> > $DAYJOB from 2m2.127s to 2m45.052s, but in order to make the fetch
>> > correct, it seems worth it.
>>
>> And "the fix" is not described so a reader is left wondering. Is
>> the fix for an oversight of not checking merely to check it? IOW,
>> is
>>
>> c08589efdc made outgoing links to be checked for commits, but
>> failed to do so for trees. Make sure we check both
>>
>> what is happening?
>
> Yes. I was trying to keep to the character limit and in doing so, made
> the commit message title hard to understand. I think the new title
> should be easier to understand (and also stated explicitly in the commit
> message what is being taught to Git).
Thanks.
>> > However, it is also possible for
>> > the server to compute that it needs to send S and not O, and proceed
>> > from there;
>>
>> If O, C, and S have all identical trees, then wouldn't such a test
>> work well? At that point it does not matter which between O and C
>> the server bases its decision to send S but not S's tree on, no?
>>
>> In any case, will queue. Thanks.
>
> O has a different tree from C and S. I will add a note to clarify this.
No, that is not what I meant. "If you arrange your test so that all
three have the same tree, then would't the reason why such a test
would not work you cited disappear and make this fix testable?" is
what I wanted to ask.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH 3/3] index-pack: commit tree during outgoing link check
2024-12-04 0:21 ` Junio C Hamano
@ 2024-12-09 20:29 ` Jonathan Tan
2024-12-09 23:51 ` Junio C Hamano
0 siblings, 1 reply; 29+ messages in thread
From: Jonathan Tan @ 2024-12-09 20:29 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Jonathan Tan, git, hanyang.tony
Junio C Hamano <gitster@pobox.com> writes:
> >> > However, it is also possible for
> >> > the server to compute that it needs to send S and not O, and proceed
> >> > from there;
> >>
> >> If O, C, and S have all identical trees, then wouldn't such a test
> >> work well? At that point it does not matter which between O and C
> >> the server bases its decision to send S but not S's tree on, no?
> >>
> >> In any case, will queue. Thanks.
> >
> > O has a different tree from C and S. I will add a note to clarify this.
>
> No, that is not what I meant. "If you arrange your test so that all
> three have the same tree, then would't the reason why such a test
> would not work you cited disappear and make this fix testable?" is
> what I wanted to ask.
Copying and pasting the diagram for reference:
C S
\ /
O
I looked into this and I don't think that making O, C, and S have the
same tree will make this fix testable. If they all had the same tree,
whether we check S's tree or not doesn't matter, since that tree is
already in a promisor pack (O and its tree was previously fetched from
the promisor remote, and thus is in a promisor pack). (We are checking
trees in order to repack them into promisor packs if necessary.)
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH 3/3] index-pack: commit tree during outgoing link check
2024-12-09 20:29 ` Jonathan Tan
@ 2024-12-09 23:51 ` Junio C Hamano
0 siblings, 0 replies; 29+ messages in thread
From: Junio C Hamano @ 2024-12-09 23:51 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
Jonathan Tan <jonathantanmy@google.com> writes:
> Copying and pasting the diagram for reference:
>
> C S
> \ /
> O
>
> I looked into this and I don't think that making O, C, and S have the
> same tree will make this fix testable. If they all had the same tree,
> whether we check S's tree or not doesn't matter, since that tree is
> already in a promisor pack (O and its tree was previously fetched from
> the promisor remote, and thus is in a promisor pack). (We are checking
> trees in order to repack them into promisor packs if necessary.)
I see; thanks.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH 0/3] Performance improvements for repacking non-promisor objects
2024-12-02 20:18 [PATCH 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
` (2 preceding siblings ...)
2024-12-02 20:18 ` [PATCH 3/3] index-pack: commit tree " Jonathan Tan
@ 2024-12-02 21:25 ` Josh Steadmon
2024-12-03 4:09 ` Junio C Hamano
2024-12-03 4:18 ` Junio C Hamano
` (2 subsequent siblings)
6 siblings, 1 reply; 29+ messages in thread
From: Josh Steadmon @ 2024-12-02 21:25 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
On 2024.12.02 12:18, Jonathan Tan wrote:
> This is a follow-up to jt/repack-local-promisor [1] (but these patches
> are based on master, since that branch has already been merged).
>
> These patches speed up a fetch that takes 7 hours to take under 3
> minutes. More details are in the commit messages, especially that of
> patch 1.
>
> Thanks in advance to everyone who reviews. While review is going on,
> we'll also be testing these at $DAYJOB (I've tested it to work on one
> known big repo, but there may be others).
>
> [1] https://lore.kernel.org/git/cover.1730491845.git.jonathantanmy@google.com/
>
> Jonathan Tan (3):
> index-pack: dedup first during outgoing link check
> index-pack: no blobs during outgoing link check
> index-pack: commit tree during outgoing link check
>
> builtin/index-pack.c | 49 +++++++++++++++++++++++---------------------
> 1 file changed, 26 insertions(+), 23 deletions(-)
>
> --
> 2.47.0.338.g60cca15819-goog
>
>
This series looks good to me, thanks!
Reviewed-by: Josh Steadmon <steadmon@google.com>
^ permalink raw reply [flat|nested] 29+ messages in thread* Re: [PATCH 0/3] Performance improvements for repacking non-promisor objects
2024-12-02 20:18 [PATCH 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
` (3 preceding siblings ...)
2024-12-02 21:25 ` [PATCH 0/3] Performance improvements for repacking non-promisor objects Josh Steadmon
@ 2024-12-03 4:18 ` Junio C Hamano
2024-12-03 4:20 ` Junio C Hamano
2024-12-03 21:43 ` [PATCH v2 " Jonathan Tan
2024-12-03 21:52 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
6 siblings, 1 reply; 29+ messages in thread
From: Junio C Hamano @ 2024-12-03 4:18 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
Jonathan Tan <jonathantanmy@google.com> writes:
> This is a follow-up to jt/repack-local-promisor [1] (but these patches
> are based on master, since that branch has already been merged).
>
> These patches speed up a fetch that takes 7 hours to take under 3
> minutes. More details are in the commit messages, especially that of
> patch 1.
>
> Thanks in advance to everyone who reviews. While review is going on,
> we'll also be testing these at $DAYJOB (I've tested it to work on one
> known big repo, but there may be others).
>
> [1] https://lore.kernel.org/git/cover.1730491845.git.jonathantanmy@google.com/
>
> Jonathan Tan (3):
> index-pack: dedup first during outgoing link check
> index-pack: no blobs during outgoing link check
> index-pack: commit tree during outgoing link check
>
> builtin/index-pack.c | 49 +++++++++++++++++++++++---------------------
> 1 file changed, 26 insertions(+), 23 deletions(-)
When merged to 'seen', this seems to break quite a many tests, all
related to "pack".
I haven't tried running tests on the topic stand-alone.
Test Summary Report
-------------------
t5310-pack-bitmaps.sh (Wstat: 256 (exited 1) Tests: 230 Failed: 3)
Failed tests: 26, 100, 179
Non-zero exit status: 1
t5327-multi-pack-bitmaps-rev.sh (Wstat: 256 (exited 1) Tests: 314 Failed: 6)
Failed tests: 25, 70, 136, 182, 227, 293
Non-zero exit status: 1
t5616-partial-clone.sh (Wstat: 256 (exited 1) Tests: 47 Failed: 3)
Failed tests: 5, 38-39
Non-zero exit status: 1
t5326-multi-pack-bitmaps.sh (Wstat: 256 (exited 1) Tests: 357 Failed: 6)
Failed tests: 25, 70, 146, 199, 244, 320
Non-zero exit status: 1
t0410-partial-clone.sh (Wstat: 256 (exited 1) Tests: 38 Failed: 4)
Failed tests: 11, 14-15, 38
Non-zero exit status: 1
t6020-bundle-misc.sh (Wstat: 256 (exited 1) Tests: 30 Failed: 4)
Failed tests: 16-19
Non-zero exit status: 1
t9211-scalar-clone.sh (Wstat: 256 (exited 1) Tests: 13 Failed: 2)
Failed tests: 4-5
Non-zero exit status: 1
Files=1031, Tests=31721, 324 wallclock secs (11.58 usr 4.57 sys + 674.15 cusr 6136.28 csys = 6826.58 CPU)
Result: FAIL
g
^ permalink raw reply [flat|nested] 29+ messages in thread* Re: [PATCH 0/3] Performance improvements for repacking non-promisor objects
2024-12-03 4:18 ` Junio C Hamano
@ 2024-12-03 4:20 ` Junio C Hamano
2024-12-03 4:39 ` Junio C Hamano
0 siblings, 1 reply; 29+ messages in thread
From: Junio C Hamano @ 2024-12-03 4:20 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
Junio C Hamano <gitster@pobox.com> writes:
> When merged to 'seen', this seems to break quite a many tests, all
> related to "pack".
>
> I haven't tried running tests on the topic stand-alone.
Ah, sorry for a false alarm. The breakage may be coming from some
other topic. Haven't figured out which one yet.
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH 0/3] Performance improvements for repacking non-promisor objects
2024-12-03 4:20 ` Junio C Hamano
@ 2024-12-03 4:39 ` Junio C Hamano
0 siblings, 0 replies; 29+ messages in thread
From: Junio C Hamano @ 2024-12-03 4:39 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, hanyang.tony
Junio C Hamano <gitster@pobox.com> writes:
> Junio C Hamano <gitster@pobox.com> writes:
>
>> When merged to 'seen', this seems to break quite a many tests, all
>> related to "pack".
>>
>> I haven't tried running tests on the topic stand-alone.
>
> Ah, sorry for a false alarm. The breakage may be coming from some
> other topic. Haven't figured out which one yet.
Sorry for a double false alarm. The breakages are due to this
topic. These do not break without this topic in 'seen', and do
break with this topic in 'seen'.
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH v2 0/3] Performance improvements for repacking non-promisor objects
2024-12-02 20:18 [PATCH 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
` (4 preceding siblings ...)
2024-12-03 4:18 ` Junio C Hamano
@ 2024-12-03 21:43 ` Jonathan Tan
2024-12-03 21:43 ` [PATCH v2 1/3] index-pack --promisor: dedup before checking links Jonathan Tan
` (2 more replies)
2024-12-03 21:52 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
6 siblings, 3 replies; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:43 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, ps, gitster
Thanks everyone for the reviews.
The only code change is to invoke the repacking of non-promisor objects
into a promisor pack only when the first object that needs repacking
is detected. If not, there will be an empty pack created, which is the
cause of the failing tests Junio noticed.
Other than that, there are some code comments and commit message changes
as requested by reviews.
I've checked that all tests pass before and after merging with "seen".
Jonathan Tan (3):
index-pack --promisor: dedup before checking links
index-pack --promisor: don't check blobs
index-pack --promisor: also check commits' trees
builtin/index-pack.c | 101 +++++++++++++++++++++++++++++--------------
1 file changed, 69 insertions(+), 32 deletions(-)
Range-diff against v1:
1: 5f0f114dbd ! 1: 7ae21c921f index-pack: dedup first during outgoing link check
@@ Metadata
Author: Jonathan Tan <jonathantanmy@google.com>
## Commit message ##
- index-pack: dedup first during outgoing link check
+ index-pack --promisor: dedup before checking links
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) fixed a bug with what was believed to be a negligible
@@ builtin/index-pack.c: static void repack_local_links(void)
+ if (!oidset_size(&outgoing_links))
return;
- base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
-@@ builtin/index-pack.c: static void repack_local_links(void)
- if (start_command(&cmd))
- die(_("could not start pack-objects to repack local links"));
-
-- oidset_iter_init(&local_links, &iter);
+- base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
+ oidset_iter_init(&outgoing_links, &iter);
- while ((oid = oidset_iter_next(&iter))) {
++ while ((oid = oidset_iter_next(&iter))) {
+ struct object_info info = OBJECT_INFO_INIT;
+ if (oid_object_info_extended(the_repository, oid, &info, 0))
+ /* Missing; assume it is a promisor object */
+ continue;
+ if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
+ continue;
+
+- strvec_push(&cmd.args, "pack-objects");
+- strvec_push(&cmd.args, "--exclude-promisor-objects-best-effort");
+- strvec_push(&cmd.args, base_name);
+- cmd.git_cmd = 1;
+- cmd.in = -1;
+- cmd.out = -1;
+- if (start_command(&cmd))
+- die(_("could not start pack-objects to repack local links"));
++ if (!cmd.args.nr) {
++ base_name = mkpathdup(
++ "%s/pack/pack",
++ repo_get_object_directory(the_repository));
++ strvec_push(&cmd.args, "pack-objects");
++ strvec_push(&cmd.args,
++ "--exclude-promisor-objects-best-effort");
++ strvec_push(&cmd.args, base_name);
++ cmd.git_cmd = 1;
++ cmd.in = -1;
++ cmd.out = -1;
++ if (start_command(&cmd))
++ die(_("could not start pack-objects to repack local links"));
++ }
+
+- oidset_iter_init(&local_links, &iter);
+- while ((oid = oidset_iter_next(&iter))) {
if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
write_in_full(cmd.in, "\n", 1) < 0)
die(_("failed to feed local object to pack-objects"));
+ }
++
++ if (!cmd.args.nr)
++ return;
++
+ close(cmd.in);
+
+ out = xfdopen(cmd.out, "r");
@@ builtin/index-pack.c: int cmd_index_pack(int argc,
} else if (skip_to_optional_arg(arg, "--keep", &keep_msg)) {
; /* nothing to do */
2: 300f53b8e3 ! 2: 5a63c9a5ca index-pack: no blobs during outgoing link check
@@ Metadata
Author: Jonathan Tan <jonathantanmy@google.com>
## Commit message ##
- index-pack: no blobs during outgoing link check
+ index-pack --promisor: don't check blobs
As a follow-up to the parent of this commit, it was found that not
checking for the existence of blobs linked from trees sped up the fetch
from 24m47.815s to 2m2.127s. Teach Git to do that.
- The benefit of doing this is as above (fetch speedup), but the drawback
- is that if the packfile to be indexed references a local blob directly
- (that is, not through a local tree), that local blob is in danger of
- being garbage collected. Such a situation may arise if we push local
- commits, including one with a change to a blob in the root tree,
- and then the server incorporates them into its main branch through a
- "rebase" or "squash" merge strategy, and then we fetch the new main
- branch from the server.
-
- This situation has not been observed yet - we have only noticed missing
- commits, not missing trees or blobs. (In fact, if it were believed that
- only missing commits are problematic, one could argue that we should
- also exclude trees during the outgoing link check; but it is safer to
- include them.)
-
- Due to the rarity of the situation (it has not been observed to happen
- in real life), and because the "penalty" in such a situation is merely
- to refetch the missing blob when it's needed, the tradeoff seems
- worth it.
+ The tradeoff of not checking blobs is documented in a code comment.
(Blobs may also be linked from tag objects, but it is impossible to know
the type of an object linked from a tag object without looking it up in
@@ Commit message
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
## builtin/index-pack.c ##
+@@ builtin/index-pack.c: static void record_outgoing_link(const struct object_id *oid)
+ oidset_insert(&outgoing_links, oid);
+ }
+
++static void maybe_record_name_entry(const struct name_entry *entry)
++{
++ /*
++ * The benefit of doing this is as above (fetch speedup), but the drawback
++is that if the packfile to be indexed references a local blob directly
++(that is, not through a local tree), that local blob is in danger of
++being garbage collected. Such a situation may arise if we push local
++commits, including one with a change to a blob in the root tree,
++and then the server incorporates them into its main branch through a
++"rebase" or "squash" merge strategy, and then we fetch the new main
++branch from the server.
++
++This situation has not been observed yet - we have only noticed missing
++commits, not missing trees or blobs. (In fact, if it were believed that
++only missing commits are problematic, one could argue that we should
++also exclude trees during the outgoing link check; but it is safer to
++include them.)
++
++Due to the rarity of the situation (it has not been observed to happen
++in real life), and because the "penalty" in such a situation is merely
++to refetch the missing blob when it's needed, the tradeoff seems
++worth it.
++ */
++ if (S_ISDIR(entry->mode))
++ record_outgoing_link(&entry->oid);
++}
++
+ static void do_record_outgoing_links(struct object *obj)
+ {
+ if (obj->type == OBJ_TREE) {
@@ builtin/index-pack.c: static void do_record_outgoing_links(struct object *obj)
- * verified, so do not print any here.
*/
return;
-- while (tree_entry_gently(&desc, &entry))
+ while (tree_entry_gently(&desc, &entry))
- record_outgoing_link(&entry.oid);
-+ while (tree_entry_gently(&desc, &entry)) {
-+ if (S_ISDIR(entry.mode))
-+ record_outgoing_link(&entry.oid);
-+ }
++ maybe_record_name_entry(&entry);
} else if (obj->type == OBJ_COMMIT) {
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
3: 2f2f0db78b ! 3: 8139325bf2 index-pack: commit tree during outgoing link check
@@ Metadata
Author: Jonathan Tan <jonathantanmy@google.com>
## Commit message ##
- index-pack: commit tree during outgoing link check
+ index-pack --promisor: also check commits' trees
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) seems to contain an oversight in that the tree of a commit
- is not checked. The fix slows down a fetch from a certain repo at
- $DAYJOB from 2m2.127s to 2m45.052s, but in order to make the fetch
- correct, it seems worth it.
+ is not checked. Teach git to check these trees.
+
+ The fix slows down a fetch from a certain repo at $DAYJOB from 2m2.127s
+ to 2m45.052s, but in order to make the fetch correct, it seems worth it.
In order to test this, we could create server and client repos as
follows...
@@ Commit message
(O and C are commits both on the client and server. S is a commit
only on the server. C and S have the same tree but different commit
- messages.)
+ messages. The diff between O and C is non-zero.)
...and then, from the client, fetch S from the server.
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply [flat|nested] 29+ messages in thread* [PATCH v2 1/3] index-pack --promisor: dedup before checking links
2024-12-03 21:43 ` [PATCH v2 " Jonathan Tan
@ 2024-12-03 21:43 ` Jonathan Tan
2024-12-03 21:43 ` [PATCH v2 2/3] index-pack --promisor: don't check blobs Jonathan Tan
2024-12-03 21:43 ` [PATCH v2 3/3] index-pack --promisor: also check commits' trees Jonathan Tan
2 siblings, 0 replies; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:43 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, ps, gitster
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) fixed a bug with what was believed to be a negligible
decrease in performance [1] [2]. But at $DAYJOB, with at least one repo,
it was found that the decrease in performance was very significant.
Looking at the patch, whenever we parse an object in the packfile to
be indexed, we check the targets of all its outgoing links for its
existence. However, this could be optimized by first collecting all such
targets into an oidset (thus deduplicating them) before checking. Teach
Git to do that.
On a certain fetch from the aforementioned repo, this improved
performance from approximately 7 hours to 24m47.815s. This number will
be further reduced in a subsequent patch.
[1] https://lore.kernel.org/git/CAG1j3zGiNMbri8rZNaF0w+yP+6OdMz0T8+8_Wgd1R_p1HzVasg@mail.gmail.com/
[2] https://lore.kernel.org/git/20241105212849.3759572-1-jonathantanmy@google.com/
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 73 +++++++++++++++++++++++++-------------------
1 file changed, 41 insertions(+), 32 deletions(-)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 95babdc5ea..d1c777a6af 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -155,11 +155,11 @@ static int input_fd, output_fd;
static const char *curr_pack;
/*
- * local_links is guarded by read_mutex, and record_local_links is read-only in
- * a thread.
+ * outgoing_links is guarded by read_mutex, and record_outgoing_links is
+ * read-only in a thread.
*/
-static struct oidset local_links = OIDSET_INIT;
-static int record_local_links;
+static struct oidset outgoing_links = OIDSET_INIT;
+static int record_outgoing_links;
static struct thread_local_data *thread_data;
static int nr_dispatched;
@@ -812,18 +812,12 @@ static int check_collison(struct object_entry *entry)
return 0;
}
-static void record_if_local_object(const struct object_id *oid)
+static void record_outgoing_link(const struct object_id *oid)
{
- struct object_info info = OBJECT_INFO_INIT;
- if (oid_object_info_extended(the_repository, oid, &info, 0))
- /* Missing; assume it is a promisor object */
- return;
- if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
- return;
- oidset_insert(&local_links, oid);
+ oidset_insert(&outgoing_links, oid);
}
-static void do_record_local_links(struct object *obj)
+static void do_record_outgoing_links(struct object *obj)
{
if (obj->type == OBJ_TREE) {
struct tree *tree = (struct tree *)obj;
@@ -837,16 +831,16 @@ static void do_record_local_links(struct object *obj)
*/
return;
while (tree_entry_gently(&desc, &entry))
- record_if_local_object(&entry.oid);
+ record_outgoing_link(&entry.oid);
} else if (obj->type == OBJ_COMMIT) {
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
for (; parents; parents = parents->next)
- record_if_local_object(&parents->item->object.oid);
+ record_outgoing_link(&parents->item->object.oid);
} else if (obj->type == OBJ_TAG) {
struct tag *tag = (struct tag *) obj;
- record_if_local_object(get_tagged_oid(tag));
+ record_outgoing_link(get_tagged_oid(tag));
}
}
@@ -896,7 +890,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
free(has_data);
}
- if (strict || do_fsck_object || record_local_links) {
+ if (strict || do_fsck_object || record_outgoing_links) {
read_lock();
if (type == OBJ_BLOB) {
struct blob *blob = lookup_blob(the_repository, oid);
@@ -928,8 +922,8 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
die(_("fsck error in packed object"));
if (strict && fsck_walk(obj, NULL, &fsck_options))
die(_("Not all child objects of %s are reachable"), oid_to_hex(&obj->oid));
- if (record_local_links)
- do_record_local_links(obj);
+ if (record_outgoing_links)
+ do_record_outgoing_links(obj);
if (obj->type == OBJ_TREE) {
struct tree *item = (struct tree *) obj;
@@ -1781,26 +1775,41 @@ static void repack_local_links(void)
struct object_id *oid;
char *base_name;
- if (!oidset_size(&local_links))
+ if (!oidset_size(&outgoing_links))
return;
- base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
+ oidset_iter_init(&outgoing_links, &iter);
+ while ((oid = oidset_iter_next(&iter))) {
+ struct object_info info = OBJECT_INFO_INIT;
+ if (oid_object_info_extended(the_repository, oid, &info, 0))
+ /* Missing; assume it is a promisor object */
+ continue;
+ if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
+ continue;
- strvec_push(&cmd.args, "pack-objects");
- strvec_push(&cmd.args, "--exclude-promisor-objects-best-effort");
- strvec_push(&cmd.args, base_name);
- cmd.git_cmd = 1;
- cmd.in = -1;
- cmd.out = -1;
- if (start_command(&cmd))
- die(_("could not start pack-objects to repack local links"));
+ if (!cmd.args.nr) {
+ base_name = mkpathdup(
+ "%s/pack/pack",
+ repo_get_object_directory(the_repository));
+ strvec_push(&cmd.args, "pack-objects");
+ strvec_push(&cmd.args,
+ "--exclude-promisor-objects-best-effort");
+ strvec_push(&cmd.args, base_name);
+ cmd.git_cmd = 1;
+ cmd.in = -1;
+ cmd.out = -1;
+ if (start_command(&cmd))
+ die(_("could not start pack-objects to repack local links"));
+ }
- oidset_iter_init(&local_links, &iter);
- while ((oid = oidset_iter_next(&iter))) {
if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
write_in_full(cmd.in, "\n", 1) < 0)
die(_("failed to feed local object to pack-objects"));
}
+
+ if (!cmd.args.nr)
+ return;
+
close(cmd.in);
out = xfdopen(cmd.out, "r");
@@ -1899,7 +1908,7 @@ int cmd_index_pack(int argc,
} else if (skip_to_optional_arg(arg, "--keep", &keep_msg)) {
; /* nothing to do */
} else if (skip_to_optional_arg(arg, "--promisor", &promisor_msg)) {
- record_local_links = 1;
+ record_outgoing_links = 1;
} else if (starts_with(arg, "--threads=")) {
char *end;
nr_threads = strtoul(arg+10, &end, 0);
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread* [PATCH v2 2/3] index-pack --promisor: don't check blobs
2024-12-03 21:43 ` [PATCH v2 " Jonathan Tan
2024-12-03 21:43 ` [PATCH v2 1/3] index-pack --promisor: dedup before checking links Jonathan Tan
@ 2024-12-03 21:43 ` Jonathan Tan
2024-12-03 21:43 ` [PATCH v2 3/3] index-pack --promisor: also check commits' trees Jonathan Tan
2 siblings, 0 replies; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:43 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, ps, gitster
As a follow-up to the parent of this commit, it was found that not
checking for the existence of blobs linked from trees sped up the fetch
from 24m47.815s to 2m2.127s. Teach Git to do that.
The tradeoff of not checking blobs is documented in a code comment.
(Blobs may also be linked from tag objects, but it is impossible to know
the type of an object linked from a tag object without looking it up in
the object database, so the code for that is untouched.)
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 29 ++++++++++++++++++++++++++++-
1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index d1c777a6af..57b7888c42 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -817,6 +817,33 @@ static void record_outgoing_link(const struct object_id *oid)
oidset_insert(&outgoing_links, oid);
}
+static void maybe_record_name_entry(const struct name_entry *entry)
+{
+ /*
+ * The benefit of doing this is as above (fetch speedup), but the drawback
+is that if the packfile to be indexed references a local blob directly
+(that is, not through a local tree), that local blob is in danger of
+being garbage collected. Such a situation may arise if we push local
+commits, including one with a change to a blob in the root tree,
+and then the server incorporates them into its main branch through a
+"rebase" or "squash" merge strategy, and then we fetch the new main
+branch from the server.
+
+This situation has not been observed yet - we have only noticed missing
+commits, not missing trees or blobs. (In fact, if it were believed that
+only missing commits are problematic, one could argue that we should
+also exclude trees during the outgoing link check; but it is safer to
+include them.)
+
+Due to the rarity of the situation (it has not been observed to happen
+in real life), and because the "penalty" in such a situation is merely
+to refetch the missing blob when it's needed, the tradeoff seems
+worth it.
+ */
+ if (S_ISDIR(entry->mode))
+ record_outgoing_link(&entry->oid);
+}
+
static void do_record_outgoing_links(struct object *obj)
{
if (obj->type == OBJ_TREE) {
@@ -831,7 +858,7 @@ static void do_record_outgoing_links(struct object *obj)
*/
return;
while (tree_entry_gently(&desc, &entry))
- record_outgoing_link(&entry.oid);
+ maybe_record_name_entry(&entry);
} else if (obj->type == OBJ_COMMIT) {
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread* [PATCH v2 3/3] index-pack --promisor: also check commits' trees
2024-12-03 21:43 ` [PATCH v2 " Jonathan Tan
2024-12-03 21:43 ` [PATCH v2 1/3] index-pack --promisor: dedup before checking links Jonathan Tan
2024-12-03 21:43 ` [PATCH v2 2/3] index-pack --promisor: don't check blobs Jonathan Tan
@ 2024-12-03 21:43 ` Jonathan Tan
2 siblings, 0 replies; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:43 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, ps, gitster
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) seems to contain an oversight in that the tree of a commit
is not checked. Teach git to check these trees.
The fix slows down a fetch from a certain repo at $DAYJOB from 2m2.127s
to 2m45.052s, but in order to make the fetch correct, it seems worth it.
In order to test this, we could create server and client repos as
follows...
C S
\ /
O
(O and C are commits both on the client and server. S is a commit
only on the server. C and S have the same tree but different commit
messages. The diff between O and C is non-zero.)
...and then, from the client, fetch S from the server.
In theory, the client declares "have C" and the server can use this
information to exclude S's tree (since it knows that the client has C's
tree, which is the same as S's tree). However, it is also possible for
the server to compute that it needs to send S and not O, and proceed
from there; therefore the objects of C are not considered at all when
determining what to send in the packfile. In order to prevent a test of
client functionality from having such a dependence on server behavior, I
have not included such a test.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 57b7888c42..2250a410e2 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -863,6 +863,7 @@ static void do_record_outgoing_links(struct object *obj)
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
+ record_outgoing_link(get_commit_tree_oid(commit));
for (; parents; parents = parents->next)
record_outgoing_link(&parents->item->object.oid);
} else if (obj->type == OBJ_TAG) {
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v3 0/3] Performance improvements for repacking non-promisor objects
2024-12-02 20:18 [PATCH 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
` (5 preceding siblings ...)
2024-12-03 21:43 ` [PATCH v2 " Jonathan Tan
@ 2024-12-03 21:52 ` Jonathan Tan
2024-12-03 21:52 ` [PATCH v3 1/3] index-pack --promisor: dedup before checking links Jonathan Tan
` (4 more replies)
6 siblings, 5 replies; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:52 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, ps, gitster
Apparently I did not save in my text editor (and didn't notice because
the code comment was still valid syntactically, so everything still
compiled). Here's a version with the updated and correctly formatted
code comment.
Jonathan Tan (3):
index-pack --promisor: dedup before checking links
index-pack --promisor: don't check blobs
index-pack --promisor: also check commits' trees
builtin/index-pack.c | 103 +++++++++++++++++++++++++++++--------------
1 file changed, 71 insertions(+), 32 deletions(-)
Range-diff against v2:
1: 7ae21c921f = 1: 7ae21c921f index-pack --promisor: dedup before checking links
2: 5a63c9a5ca ! 2: a1d2a20203 index-pack --promisor: don't check blobs
@@ builtin/index-pack.c: static void record_outgoing_link(const struct object_id *o
+static void maybe_record_name_entry(const struct name_entry *entry)
+{
+ /*
-+ * The benefit of doing this is as above (fetch speedup), but the drawback
-+is that if the packfile to be indexed references a local blob directly
-+(that is, not through a local tree), that local blob is in danger of
-+being garbage collected. Such a situation may arise if we push local
-+commits, including one with a change to a blob in the root tree,
-+and then the server incorporates them into its main branch through a
-+"rebase" or "squash" merge strategy, and then we fetch the new main
-+branch from the server.
-+
-+This situation has not been observed yet - we have only noticed missing
-+commits, not missing trees or blobs. (In fact, if it were believed that
-+only missing commits are problematic, one could argue that we should
-+also exclude trees during the outgoing link check; but it is safer to
-+include them.)
-+
-+Due to the rarity of the situation (it has not been observed to happen
-+in real life), and because the "penalty" in such a situation is merely
-+to refetch the missing blob when it's needed, the tradeoff seems
-+worth it.
++ * Checking only trees here results in a significantly faster packfile
++ * indexing, but the drawback is that if the packfile to be indexed
++ * references a local blob only directly (that is, never through a
++ * local tree), that local blob is in danger of being garbage
++ * collected. Such a situation may arise if we push local commits,
++ * including one with a change to a blob in the root tree, and then the
++ * server incorporates them into its main branch through a "rebase" or
++ * "squash" merge strategy, and then we fetch the new main branch from
++ * the server.
++ *
++ * This situation has not been observed yet - we have only noticed
++ * missing commits, not missing trees or blobs. (In fact, if it were
++ * believed that only missing commits are problematic, one could argue
++ * that we should also exclude trees during the outgoing link check;
++ * but it is safer to include them.)
++ *
++ * Due to the rarity of the situation (it has not been observed to
++ * happen in real life), and because the "penalty" in such a situation
++ * is merely to refetch the missing blob when it's needed (and this
++ * happens only once - when refetched, the blob goes into a promisor
++ * pack, so it won't be GC-ed, the tradeoff seems worth it.
+ */
+ if (S_ISDIR(entry->mode))
+ record_outgoing_link(&entry->oid);
3: 8139325bf2 = 3: f9f9969a8f index-pack --promisor: also check commits' trees
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply [flat|nested] 29+ messages in thread* [PATCH v3 1/3] index-pack --promisor: dedup before checking links
2024-12-03 21:52 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
@ 2024-12-03 21:52 ` Jonathan Tan
2024-12-04 4:36 ` Junio C Hamano
2024-12-03 21:52 ` [PATCH v3 2/3] index-pack --promisor: don't check blobs Jonathan Tan
` (3 subsequent siblings)
4 siblings, 1 reply; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:52 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, ps, gitster
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) fixed a bug with what was believed to be a negligible
decrease in performance [1] [2]. But at $DAYJOB, with at least one repo,
it was found that the decrease in performance was very significant.
Looking at the patch, whenever we parse an object in the packfile to
be indexed, we check the targets of all its outgoing links for its
existence. However, this could be optimized by first collecting all such
targets into an oidset (thus deduplicating them) before checking. Teach
Git to do that.
On a certain fetch from the aforementioned repo, this improved
performance from approximately 7 hours to 24m47.815s. This number will
be further reduced in a subsequent patch.
[1] https://lore.kernel.org/git/CAG1j3zGiNMbri8rZNaF0w+yP+6OdMz0T8+8_Wgd1R_p1HzVasg@mail.gmail.com/
[2] https://lore.kernel.org/git/20241105212849.3759572-1-jonathantanmy@google.com/
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 73 +++++++++++++++++++++++++-------------------
1 file changed, 41 insertions(+), 32 deletions(-)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 95babdc5ea..d1c777a6af 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -155,11 +155,11 @@ static int input_fd, output_fd;
static const char *curr_pack;
/*
- * local_links is guarded by read_mutex, and record_local_links is read-only in
- * a thread.
+ * outgoing_links is guarded by read_mutex, and record_outgoing_links is
+ * read-only in a thread.
*/
-static struct oidset local_links = OIDSET_INIT;
-static int record_local_links;
+static struct oidset outgoing_links = OIDSET_INIT;
+static int record_outgoing_links;
static struct thread_local_data *thread_data;
static int nr_dispatched;
@@ -812,18 +812,12 @@ static int check_collison(struct object_entry *entry)
return 0;
}
-static void record_if_local_object(const struct object_id *oid)
+static void record_outgoing_link(const struct object_id *oid)
{
- struct object_info info = OBJECT_INFO_INIT;
- if (oid_object_info_extended(the_repository, oid, &info, 0))
- /* Missing; assume it is a promisor object */
- return;
- if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
- return;
- oidset_insert(&local_links, oid);
+ oidset_insert(&outgoing_links, oid);
}
-static void do_record_local_links(struct object *obj)
+static void do_record_outgoing_links(struct object *obj)
{
if (obj->type == OBJ_TREE) {
struct tree *tree = (struct tree *)obj;
@@ -837,16 +831,16 @@ static void do_record_local_links(struct object *obj)
*/
return;
while (tree_entry_gently(&desc, &entry))
- record_if_local_object(&entry.oid);
+ record_outgoing_link(&entry.oid);
} else if (obj->type == OBJ_COMMIT) {
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
for (; parents; parents = parents->next)
- record_if_local_object(&parents->item->object.oid);
+ record_outgoing_link(&parents->item->object.oid);
} else if (obj->type == OBJ_TAG) {
struct tag *tag = (struct tag *) obj;
- record_if_local_object(get_tagged_oid(tag));
+ record_outgoing_link(get_tagged_oid(tag));
}
}
@@ -896,7 +890,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
free(has_data);
}
- if (strict || do_fsck_object || record_local_links) {
+ if (strict || do_fsck_object || record_outgoing_links) {
read_lock();
if (type == OBJ_BLOB) {
struct blob *blob = lookup_blob(the_repository, oid);
@@ -928,8 +922,8 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
die(_("fsck error in packed object"));
if (strict && fsck_walk(obj, NULL, &fsck_options))
die(_("Not all child objects of %s are reachable"), oid_to_hex(&obj->oid));
- if (record_local_links)
- do_record_local_links(obj);
+ if (record_outgoing_links)
+ do_record_outgoing_links(obj);
if (obj->type == OBJ_TREE) {
struct tree *item = (struct tree *) obj;
@@ -1781,26 +1775,41 @@ static void repack_local_links(void)
struct object_id *oid;
char *base_name;
- if (!oidset_size(&local_links))
+ if (!oidset_size(&outgoing_links))
return;
- base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
+ oidset_iter_init(&outgoing_links, &iter);
+ while ((oid = oidset_iter_next(&iter))) {
+ struct object_info info = OBJECT_INFO_INIT;
+ if (oid_object_info_extended(the_repository, oid, &info, 0))
+ /* Missing; assume it is a promisor object */
+ continue;
+ if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
+ continue;
- strvec_push(&cmd.args, "pack-objects");
- strvec_push(&cmd.args, "--exclude-promisor-objects-best-effort");
- strvec_push(&cmd.args, base_name);
- cmd.git_cmd = 1;
- cmd.in = -1;
- cmd.out = -1;
- if (start_command(&cmd))
- die(_("could not start pack-objects to repack local links"));
+ if (!cmd.args.nr) {
+ base_name = mkpathdup(
+ "%s/pack/pack",
+ repo_get_object_directory(the_repository));
+ strvec_push(&cmd.args, "pack-objects");
+ strvec_push(&cmd.args,
+ "--exclude-promisor-objects-best-effort");
+ strvec_push(&cmd.args, base_name);
+ cmd.git_cmd = 1;
+ cmd.in = -1;
+ cmd.out = -1;
+ if (start_command(&cmd))
+ die(_("could not start pack-objects to repack local links"));
+ }
- oidset_iter_init(&local_links, &iter);
- while ((oid = oidset_iter_next(&iter))) {
if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
write_in_full(cmd.in, "\n", 1) < 0)
die(_("failed to feed local object to pack-objects"));
}
+
+ if (!cmd.args.nr)
+ return;
+
close(cmd.in);
out = xfdopen(cmd.out, "r");
@@ -1899,7 +1908,7 @@ int cmd_index_pack(int argc,
} else if (skip_to_optional_arg(arg, "--keep", &keep_msg)) {
; /* nothing to do */
} else if (skip_to_optional_arg(arg, "--promisor", &promisor_msg)) {
- record_local_links = 1;
+ record_outgoing_links = 1;
} else if (starts_with(arg, "--threads=")) {
char *end;
nr_threads = strtoul(arg+10, &end, 0);
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread* Re: [PATCH v3 1/3] index-pack --promisor: dedup before checking links
2024-12-03 21:52 ` [PATCH v3 1/3] index-pack --promisor: dedup before checking links Jonathan Tan
@ 2024-12-04 4:36 ` Junio C Hamano
0 siblings, 0 replies; 29+ messages in thread
From: Junio C Hamano @ 2024-12-04 4:36 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, ps
Jonathan Tan <jonathantanmy@google.com> writes:
> @@ -1781,26 +1775,41 @@ static void repack_local_links(void)
> struct object_id *oid;
> char *base_name;
We may want to give a meaningless NULL initialization to this
variable, due to false positive from a compliler.
> - if (!oidset_size(&local_links))
> + if (!oidset_size(&outgoing_links))
> return;
>
> - base_name = mkpathdup("%s/pack/pack", repo_get_object_directory(the_repository));
It used to be that it was really obvious that base_name is always
initialized. But now due to micro-optimization ...
> + oidset_iter_init(&outgoing_links, &iter);
> + while ((oid = oidset_iter_next(&iter))) {
> + struct object_info info = OBJECT_INFO_INIT;
> + if (oid_object_info_extended(the_repository, oid, &info, 0))
> + /* Missing; assume it is a promisor object */
> + continue;
> + if (info.whence == OI_PACKED && info.u.packed.pack->pack_promisor)
> + continue;
> ...
> + if (!cmd.args.nr) {
> + base_name = mkpathdup(
> + "%s/pack/pack",
> + repo_get_object_directory(the_repository));
... we lazily allocate only after we know we will run a command.
> + strvec_push(&cmd.args, "pack-objects");
> + strvec_push(&cmd.args,
> + "--exclude-promisor-objects-best-effort");
> + strvec_push(&cmd.args, base_name);
> + cmd.git_cmd = 1;
> + cmd.in = -1;
> + cmd.out = -1;
> + if (start_command(&cmd))
> + die(_("could not start pack-objects to repack local links"));
> + }
We know outgoing_links is not empty, so we know we will enter the
while() loop at least once, but it may be possible that all the
objects in the outgoing_links oidset end up to be missing or packed
in a promisor pack, hitting continue and never running the command.
> - oidset_iter_init(&local_links, &iter);
> - while ((oid = oidset_iter_next(&iter))) {
> if (write_in_full(cmd.in, oid_to_hex(oid), the_hash_algo->hexsz) < 0 ||
> write_in_full(cmd.in, "\n", 1) < 0)
> die(_("failed to feed local object to pack-objects"));
> }
> +
> + if (!cmd.args.nr)
> + return;
But then we have this early return, so from human-reader's point of
view, we will never hit free(base_name) at the end of this function.
But GCC used in the macOS build does not seem to realize it.
https://github.com/git/git/actions/runs/12152173257/job/33888089229#step:4:380
It may be safer to give a meaningless NULL as the initial value of
the variable.
Thanks.
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH v3 2/3] index-pack --promisor: don't check blobs
2024-12-03 21:52 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
2024-12-03 21:52 ` [PATCH v3 1/3] index-pack --promisor: dedup before checking links Jonathan Tan
@ 2024-12-03 21:52 ` Jonathan Tan
2024-12-03 21:52 ` [PATCH v3 3/3] index-pack --promisor: also check commits' trees Jonathan Tan
` (2 subsequent siblings)
4 siblings, 0 replies; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:52 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, ps, gitster
As a follow-up to the parent of this commit, it was found that not
checking for the existence of blobs linked from trees sped up the fetch
from 24m47.815s to 2m2.127s. Teach Git to do that.
The tradeoff of not checking blobs is documented in a code comment.
(Blobs may also be linked from tag objects, but it is impossible to know
the type of an object linked from a tag object without looking it up in
the object database, so the code for that is untouched.)
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 31 ++++++++++++++++++++++++++++++-
1 file changed, 30 insertions(+), 1 deletion(-)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index d1c777a6af..2e90fe186e 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -817,6 +817,35 @@ static void record_outgoing_link(const struct object_id *oid)
oidset_insert(&outgoing_links, oid);
}
+static void maybe_record_name_entry(const struct name_entry *entry)
+{
+ /*
+ * Checking only trees here results in a significantly faster packfile
+ * indexing, but the drawback is that if the packfile to be indexed
+ * references a local blob only directly (that is, never through a
+ * local tree), that local blob is in danger of being garbage
+ * collected. Such a situation may arise if we push local commits,
+ * including one with a change to a blob in the root tree, and then the
+ * server incorporates them into its main branch through a "rebase" or
+ * "squash" merge strategy, and then we fetch the new main branch from
+ * the server.
+ *
+ * This situation has not been observed yet - we have only noticed
+ * missing commits, not missing trees or blobs. (In fact, if it were
+ * believed that only missing commits are problematic, one could argue
+ * that we should also exclude trees during the outgoing link check;
+ * but it is safer to include them.)
+ *
+ * Due to the rarity of the situation (it has not been observed to
+ * happen in real life), and because the "penalty" in such a situation
+ * is merely to refetch the missing blob when it's needed (and this
+ * happens only once - when refetched, the blob goes into a promisor
+ * pack, so it won't be GC-ed, the tradeoff seems worth it.
+ */
+ if (S_ISDIR(entry->mode))
+ record_outgoing_link(&entry->oid);
+}
+
static void do_record_outgoing_links(struct object *obj)
{
if (obj->type == OBJ_TREE) {
@@ -831,7 +860,7 @@ static void do_record_outgoing_links(struct object *obj)
*/
return;
while (tree_entry_gently(&desc, &entry))
- record_outgoing_link(&entry.oid);
+ maybe_record_name_entry(&entry);
} else if (obj->type == OBJ_COMMIT) {
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread* [PATCH v3 3/3] index-pack --promisor: also check commits' trees
2024-12-03 21:52 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
2024-12-03 21:52 ` [PATCH v3 1/3] index-pack --promisor: dedup before checking links Jonathan Tan
2024-12-03 21:52 ` [PATCH v3 2/3] index-pack --promisor: don't check blobs Jonathan Tan
@ 2024-12-03 21:52 ` Jonathan Tan
2024-12-04 2:22 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Junio C Hamano
2024-12-04 4:46 ` [PATCH 4/3] index-pack: work around false positive use of uninitialized variable Junio C Hamano
4 siblings, 0 replies; 29+ messages in thread
From: Jonathan Tan @ 2024-12-03 21:52 UTC (permalink / raw)
To: git; +Cc: Jonathan Tan, ps, gitster
Commit c08589efdc (index-pack: repack local links into promisor packs,
2024-11-01) seems to contain an oversight in that the tree of a commit
is not checked. Teach git to check these trees.
The fix slows down a fetch from a certain repo at $DAYJOB from 2m2.127s
to 2m45.052s, but in order to make the fetch correct, it seems worth it.
In order to test this, we could create server and client repos as
follows...
C S
\ /
O
(O and C are commits both on the client and server. S is a commit
only on the server. C and S have the same tree but different commit
messages. The diff between O and C is non-zero.)
...and then, from the client, fetch S from the server.
In theory, the client declares "have C" and the server can use this
information to exclude S's tree (since it knows that the client has C's
tree, which is the same as S's tree). However, it is also possible for
the server to compute that it needs to send S and not O, and proceed
from there; therefore the objects of C are not considered at all when
determining what to send in the packfile. In order to prevent a test of
client functionality from having such a dependence on server behavior, I
have not included such a test.
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
builtin/index-pack.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 2e90fe186e..1594f2b81d 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -865,6 +865,7 @@ static void do_record_outgoing_links(struct object *obj)
struct commit *commit = (struct commit *) obj;
struct commit_list *parents = commit->parents;
+ record_outgoing_link(get_commit_tree_oid(commit));
for (; parents; parents = parents->next)
record_outgoing_link(&parents->item->object.oid);
} else if (obj->type == OBJ_TAG) {
--
2.47.0.338.g60cca15819-goog
^ permalink raw reply related [flat|nested] 29+ messages in thread* Re: [PATCH v3 0/3] Performance improvements for repacking non-promisor objects
2024-12-03 21:52 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
` (2 preceding siblings ...)
2024-12-03 21:52 ` [PATCH v3 3/3] index-pack --promisor: also check commits' trees Jonathan Tan
@ 2024-12-04 2:22 ` Junio C Hamano
2024-12-04 4:46 ` [PATCH 4/3] index-pack: work around false positive use of uninitialized variable Junio C Hamano
4 siblings, 0 replies; 29+ messages in thread
From: Junio C Hamano @ 2024-12-04 2:22 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, ps
Jonathan Tan <jonathantanmy@google.com> writes:
> Apparently I did not save in my text editor (and didn't notice because
> the code comment was still valid syntactically, so everything still
> compiled). Here's a version with the updated and correctly formatted
> code comment.
Thanks. Will replace.
^ permalink raw reply [flat|nested] 29+ messages in thread* [PATCH 4/3] index-pack: work around false positive use of uninitialized variable
2024-12-03 21:52 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Jonathan Tan
` (3 preceding siblings ...)
2024-12-04 2:22 ` [PATCH v3 0/3] Performance improvements for repacking non-promisor objects Junio C Hamano
@ 2024-12-04 4:46 ` Junio C Hamano
4 siblings, 0 replies; 29+ messages in thread
From: Junio C Hamano @ 2024-12-04 4:46 UTC (permalink / raw)
To: Jonathan Tan; +Cc: git, ps
The base_name variable in this function is given a value if cmd.args
array is not empty (i.e., if we run the pack-objects command), and
the function returns when cmd.args is empty before hitting a call to
free(base_name) near the end of the function, so to a human reader,
it can be seen that the variable is not used uninitialized, but to a
semi-intelligent compiler it is not so clear.
Squelch a false positive by a meaningless NULL initialization.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
* Tentatively queued to unblock CI. There may be breakages due to
other topics in flight, but at least this one is easy to resolve
(hopefully---I haven't pushed it out).
https://github.com/git/git/actions/runs/12152173257
builtin/index-pack.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 1594f2b81d..8e600a58bf 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1803,7 +1803,7 @@ static void repack_local_links(void)
struct strbuf line = STRBUF_INIT;
struct oidset_iter iter;
struct object_id *oid;
- char *base_name;
+ char *base_name = NULL;
if (!oidset_size(&outgoing_links))
return;
--
2.47.1-574-g3b2d6bb55a
^ permalink raw reply related [flat|nested] 29+ messages in thread