* [PATCH v6 7/9] pack-objects: swap 'show_{object,commit}_pack_hint'
From: Taylor Blau @ 2025-06-23 22:32 UTC (permalink / raw)
To: git; +Cc: Elijah Newren, Jeff King, Junio C Hamano
In-Reply-To: <cover.1750717921.git.me@ttaylorr.com>
show_commit_pack_hint() has heretofore been a noop, so its position
within its compilation unit only needs to appear before its first use.
But the following commit will sometimes have `show_commit_pack_hint()`
call `show_object_pack_hint()`, so reorder the former to appear after
the latter to minimize the code movement in that patch.
Suggested-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
builtin/pack-objects.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 9580b4ea1a..f44447a3f9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3748,12 +3748,6 @@ static int add_object_entry_from_pack(const struct object_id *oid,
return 0;
}
-static void show_commit_pack_hint(struct commit *commit UNUSED,
- void *data UNUSED)
-{
- /* nothing to do; commits don't have a namehash */
-}
-
static void show_object_pack_hint(struct object *object, const char *name,
void *data UNUSED)
{
@@ -3776,6 +3770,12 @@ static void show_object_pack_hint(struct object *object, const char *name,
stdin_packs_hints_nr++;
}
+static void show_commit_pack_hint(struct commit *commit UNUSED,
+ void *data UNUSED)
+{
+ /* nothing to do; commits don't have a namehash */
+}
+
static int pack_mtime_cmp(const void *_a, const void *_b)
{
struct packed_git *a = ((const struct string_list_item*)_a)->util;
--
2.50.0.61.g1981e40f2d
^ permalink raw reply related
* [PATCH v6 6/9] pack-objects: fix typo in 'show_object_pack_hint()'
From: Taylor Blau @ 2025-06-23 22:32 UTC (permalink / raw)
To: git; +Cc: Elijah Newren, Jeff King, Junio C Hamano
In-Reply-To: <cover.1750717921.git.me@ttaylorr.com>
Noticed-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
builtin/pack-objects.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 3437dbd7f1..9580b4ea1a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3767,7 +3767,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
* would typically pick up during a reachability traversal.
*
* Make a best-effort attempt to fill in the ->hash and ->no_try_delta
- * here using a now in order to perhaps improve the delta selection
+ * fields here in order to perhaps improve the delta selection
* process.
*/
oe->hash = pack_name_hash_fn(name);
--
2.50.0.61.g1981e40f2d
^ permalink raw reply related
* [PATCH v6 5/9] pack-objects: perform name-hash traversal for unpacked objects
From: Taylor Blau @ 2025-06-23 22:32 UTC (permalink / raw)
To: git; +Cc: Elijah Newren, Jeff King, Junio C Hamano
In-Reply-To: <cover.1750717921.git.me@ttaylorr.com>
With '--unpacked', pack-objects adds loose objects (which don't appear
in any of the excluded packs from '--stdin-packs') to the output pack
without considering them as reachability tips for the name-hash
traversal.
This was an oversight in the original implementation of '--stdin-packs',
since the code which enumerates and adds loose objects to the output
pack (`add_unreachable_loose_objects()`) did not have access to the
'rev_info' struct found in `read_packs_list_from_stdin()`.
Excluding unpacked objects from that traversal doesn't affect the
correctness of the resulting pack, but it does make it harder to
discover good deltas for loose objects.
Now that the 'rev_info' struct is declared outside of
`read_packs_list_from_stdin()`, we can pass it to
`add_objects_in_unpacked_packs()` and add any loose objects as tips to
the above-mentioned traversal, in theory producing slightly tighter
packs as a result.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
builtin/pack-objects.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 4258ac1792..3437dbd7f1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3879,7 +3879,7 @@ static void read_packs_list_from_stdin(struct rev_info *revs)
string_list_clear(&exclude_packs, 0);
}
-static void add_unreachable_loose_objects(void);
+static void add_unreachable_loose_objects(struct rev_info *revs);
static void read_stdin_packs(int rev_list_unpacked)
{
@@ -3906,7 +3906,7 @@ static void read_stdin_packs(int rev_list_unpacked)
ignore_packed_keep_in_core = 1;
read_packs_list_from_stdin(&revs);
if (rev_list_unpacked)
- add_unreachable_loose_objects();
+ add_unreachable_loose_objects(&revs);
if (prepare_revision_walk(&revs))
die(_("revision walk setup failed"));
@@ -4025,7 +4025,7 @@ static void enumerate_cruft_objects(void)
_("Enumerating cruft objects"), 0);
add_objects_in_unpacked_packs();
- add_unreachable_loose_objects();
+ add_unreachable_loose_objects(NULL);
stop_progress(&progress_state);
}
@@ -4303,8 +4303,9 @@ static void add_objects_in_unpacked_packs(void)
}
static int add_loose_object(const struct object_id *oid, const char *path,
- void *data UNUSED)
+ void *data)
{
+ struct rev_info *revs = data;
enum object_type type = oid_object_info(the_repository, oid, NULL);
if (type < 0) {
@@ -4325,6 +4326,10 @@ static int add_loose_object(const struct object_id *oid, const char *path,
} else {
add_object_entry(oid, type, "", 0);
}
+
+ if (revs && type == OBJ_COMMIT)
+ add_pending_oid(revs, NULL, oid, 0);
+
return 0;
}
@@ -4333,11 +4338,10 @@ static int add_loose_object(const struct object_id *oid, const char *path,
* add_object_entry will weed out duplicates, so we just add every
* loose object we find.
*/
-static void add_unreachable_loose_objects(void)
+static void add_unreachable_loose_objects(struct rev_info *revs)
{
for_each_loose_file_in_objdir(repo_get_object_directory(the_repository),
- add_loose_object,
- NULL, NULL, NULL);
+ add_loose_object, NULL, NULL, revs);
}
static int has_sha1_pack_kept_or_nonlocal(const struct object_id *oid)
@@ -4684,7 +4688,7 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
if (keep_unreachable)
add_objects_in_unpacked_packs();
if (pack_loose_unreachable)
- add_unreachable_loose_objects();
+ add_unreachable_loose_objects(NULL);
if (unpack_unreachable)
loosen_unused_packed_objects();
--
2.50.0.61.g1981e40f2d
^ permalink raw reply related
* [PATCH v6 4/9] pack-objects: declare 'rev_info' for '--stdin-packs' earlier
From: Taylor Blau @ 2025-06-23 22:32 UTC (permalink / raw)
To: git; +Cc: Elijah Newren, Jeff King, Junio C Hamano
In-Reply-To: <cover.1750717921.git.me@ttaylorr.com>
Once 'read_packs_list_from_stdin()' has called for_each_object_in_pack()
on each of the input packs, we do a reachability traversal to discover
names for any objects we picked up so we can generate name hash values
and hopefully get higher quality deltas as a result.
A future commit will change the purpose of this reachability traversal
to find and pack objects which are reachable from commits in the input
packs, but are packed in an unknown (not included nor excluded) pack.
Extract the code which initializes and performs the reachability
traversal to take place in the caller, not the callee, which prepares us
to share this code for the '--unpacked' case (see the function
add_unreachable_loose_objects() for more details).
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
builtin/pack-objects.c | 71 +++++++++++++++++++++---------------------
1 file changed, 36 insertions(+), 35 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7ce04b71dd..4258ac1792 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3793,7 +3793,7 @@ static int pack_mtime_cmp(const void *_a, const void *_b)
return 0;
}
-static void read_packs_list_from_stdin(void)
+static void read_packs_list_from_stdin(struct rev_info *revs)
{
struct strbuf buf = STRBUF_INIT;
struct string_list include_packs = STRING_LIST_INIT_DUP;
@@ -3801,24 +3801,6 @@ static void read_packs_list_from_stdin(void)
struct string_list_item *item = NULL;
struct packed_git *p;
- struct rev_info revs;
-
- repo_init_revisions(the_repository, &revs, NULL);
- /*
- * Use a revision walk to fill in the namehash of objects in the include
- * packs. To save time, we'll avoid traversing through objects that are
- * in excluded packs.
- *
- * That may cause us to avoid populating all of the namehash fields of
- * all included objects, but our goal is best-effort, since this is only
- * an optimization during delta selection.
- */
- revs.no_kept_objects = 1;
- revs.keep_pack_cache_flags |= IN_CORE_KEEP_PACKS;
- revs.blob_objects = 1;
- revs.tree_objects = 1;
- revs.tag_objects = 1;
- revs.ignore_missing_links = 1;
while (strbuf_getline(&buf, stdin) != EOF) {
if (!buf.len)
@@ -3888,10 +3870,44 @@ static void read_packs_list_from_stdin(void)
struct packed_git *p = item->util;
for_each_object_in_pack(p,
add_object_entry_from_pack,
- &revs,
+ revs,
FOR_EACH_OBJECT_PACK_ORDER);
}
+ strbuf_release(&buf);
+ string_list_clear(&include_packs, 0);
+ string_list_clear(&exclude_packs, 0);
+}
+
+static void add_unreachable_loose_objects(void);
+
+static void read_stdin_packs(int rev_list_unpacked)
+{
+ struct rev_info revs;
+
+ repo_init_revisions(the_repository, &revs, NULL);
+ /*
+ * Use a revision walk to fill in the namehash of objects in the include
+ * packs. To save time, we'll avoid traversing through objects that are
+ * in excluded packs.
+ *
+ * That may cause us to avoid populating all of the namehash fields of
+ * all included objects, but our goal is best-effort, since this is only
+ * an optimization during delta selection.
+ */
+ revs.no_kept_objects = 1;
+ revs.keep_pack_cache_flags |= IN_CORE_KEEP_PACKS;
+ revs.blob_objects = 1;
+ revs.tree_objects = 1;
+ revs.tag_objects = 1;
+ revs.ignore_missing_links = 1;
+
+ /* avoids adding objects in excluded packs */
+ ignore_packed_keep_in_core = 1;
+ read_packs_list_from_stdin(&revs);
+ if (rev_list_unpacked)
+ add_unreachable_loose_objects();
+
if (prepare_revision_walk(&revs))
die(_("revision walk setup failed"));
traverse_commit_list(&revs,
@@ -3903,21 +3919,6 @@ static void read_packs_list_from_stdin(void)
stdin_packs_found_nr);
trace2_data_intmax("pack-objects", the_repository, "stdin_packs_hints",
stdin_packs_hints_nr);
-
- strbuf_release(&buf);
- string_list_clear(&include_packs, 0);
- string_list_clear(&exclude_packs, 0);
-}
-
-static void add_unreachable_loose_objects(void);
-
-static void read_stdin_packs(int rev_list_unpacked)
-{
- /* avoids adding objects in excluded packs */
- ignore_packed_keep_in_core = 1;
- read_packs_list_from_stdin();
- if (rev_list_unpacked)
- add_unreachable_loose_objects();
}
static void add_cruft_object_entry(const struct object_id *oid, enum object_type type,
--
2.50.0.61.g1981e40f2d
^ permalink raw reply related
* [PATCH v6 3/9] pack-objects: factor out handling '--stdin-packs'
From: Taylor Blau @ 2025-06-23 22:32 UTC (permalink / raw)
To: git; +Cc: Elijah Newren, Jeff King, Junio C Hamano
In-Reply-To: <cover.1750717921.git.me@ttaylorr.com>
At the bottom of cmd_pack_objects() we check which mode the command is
running in (e.g., generating a cruft pack, handling '--stdin-packs',
using the internal rev-list, etc.) and handle the mode appropriately.
The '--stdin-packs' case is handled inline (dating back to its
introduction in 339bce27f4 (builtin/pack-objects.c: add '--stdin-packs'
option, 2021-02-22)) since it is relatively short. Extract the body of
"if (stdin_packs)" into its own function to prepare for the
implementation to become lengthier in a following commit.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
builtin/pack-objects.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index d04a36a6bf..7ce04b71dd 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3909,6 +3909,17 @@ static void read_packs_list_from_stdin(void)
string_list_clear(&exclude_packs, 0);
}
+static void add_unreachable_loose_objects(void);
+
+static void read_stdin_packs(int rev_list_unpacked)
+{
+ /* avoids adding objects in excluded packs */
+ ignore_packed_keep_in_core = 1;
+ read_packs_list_from_stdin();
+ if (rev_list_unpacked)
+ add_unreachable_loose_objects();
+}
+
static void add_cruft_object_entry(const struct object_id *oid, enum object_type type,
struct packed_git *pack, off_t offset,
const char *name, uint32_t mtime)
@@ -4004,7 +4015,6 @@ static void mark_pack_kept_in_core(struct string_list *packs, unsigned keep)
}
}
-static void add_unreachable_loose_objects(void);
static void add_objects_in_unpacked_packs(void);
static void enumerate_cruft_objects(void)
@@ -5135,11 +5145,7 @@ int cmd_pack_objects(int argc,
progress_state = start_progress(the_repository,
_("Enumerating objects"), 0);
if (stdin_packs) {
- /* avoids adding objects in excluded packs */
- ignore_packed_keep_in_core = 1;
- read_packs_list_from_stdin();
- if (rev_list_unpacked)
- add_unreachable_loose_objects();
+ read_stdin_packs(rev_list_unpacked);
} else if (cruft) {
read_cruft_objects();
} else if (!use_internal_rev_list) {
--
2.50.0.61.g1981e40f2d
^ permalink raw reply related
* [PATCH v6 2/9] pack-objects: limit scope in 'add_object_entry_from_pack()'
From: Taylor Blau @ 2025-06-23 22:32 UTC (permalink / raw)
To: git; +Cc: Elijah Newren, Jeff King, Junio C Hamano
In-Reply-To: <cover.1750717921.git.me@ttaylorr.com>
In add_object_entry_from_pack() we declare 'revs' (given to us through
the miscellaneous context argument) earlier in the "if (p)" conditional
than is necessary. Move it down as far as it can go to reduce its
scope.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
builtin/pack-objects.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index e7274e0e00..d04a36a6bf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3725,7 +3725,6 @@ static int add_object_entry_from_pack(const struct object_id *oid,
return 0;
if (p) {
- struct rev_info *revs = _data;
struct object_info oi = OBJECT_INFO_INIT;
oi.typep = &type;
@@ -3733,6 +3732,7 @@ static int add_object_entry_from_pack(const struct object_id *oid,
die(_("could not get type of object %s in pack %s"),
oid_to_hex(oid), p->pack_name);
} else if (type == OBJ_COMMIT) {
+ struct rev_info *revs = _data;
/*
* commits in included packs are used as starting points for the
* subsequent revision walk
--
2.50.0.61.g1981e40f2d
^ permalink raw reply related
* [PATCH v6 1/9] pack-objects: use standard option incompatibility functions
From: Taylor Blau @ 2025-06-23 22:32 UTC (permalink / raw)
To: git; +Cc: Elijah Newren, Jeff King, Junio C Hamano
In-Reply-To: <cover.1750717921.git.me@ttaylorr.com>
pack-objects has a handful of explicit checks for pairs of command-line
options which are mutually incompatible. Many of these pre-date
a699367bb8 (i18n: factorize more 'incompatible options' messages,
2022-01-31).
Convert the explicit checks into die_for_incompatible_opt2() calls,
which simplifies the implementation and standardizes pack-objects'
output when given incompatible options (e.g., --stdin-packs with
--filter gives different output than --keep-unreachable with
--unpack-unreachable).
There is one minor piece of test fallout in t5331 that expects the old
format, which has been corrected.
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
builtin/pack-objects.c | 20 +++++++++++---------
t/t5331-pack-objects-stdin.sh | 2 +-
2 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 67941c8a60..e7274e0e00 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -5010,9 +5010,10 @@ int cmd_pack_objects(int argc,
strvec_push(&rp, "--unpacked");
}
- if (exclude_promisor_objects && exclude_promisor_objects_best_effort)
- die(_("options '%s' and '%s' cannot be used together"),
- "--exclude-promisor-objects", "--exclude-promisor-objects-best-effort");
+ die_for_incompatible_opt2(exclude_promisor_objects,
+ "--exclude-promisor-objects",
+ exclude_promisor_objects_best_effort,
+ "--exclude-promisor-objects-best-effort");
if (exclude_promisor_objects) {
use_internal_rev_list = 1;
fetch_if_missing = 0;
@@ -5050,13 +5051,14 @@ int cmd_pack_objects(int argc,
if (!pack_to_stdout && thin)
die(_("--thin cannot be used to build an indexable pack"));
- if (keep_unreachable && unpack_unreachable)
- die(_("options '%s' and '%s' cannot be used together"), "--keep-unreachable", "--unpack-unreachable");
+ die_for_incompatible_opt2(keep_unreachable, "--keep-unreachable",
+ unpack_unreachable, "--unpack-unreachable");
if (!rev_list_all || !rev_list_reflog || !rev_list_index)
unpack_unreachable_expiration = 0;
- if (stdin_packs && filter_options.choice)
- die(_("cannot use --filter with --stdin-packs"));
+ die_for_incompatible_opt2(stdin_packs, "--stdin-packs",
+ filter_options.choice, "--filter");
+
if (stdin_packs && use_internal_rev_list)
die(_("cannot use internal rev list with --stdin-packs"));
@@ -5064,8 +5066,8 @@ int cmd_pack_objects(int argc,
if (cruft) {
if (use_internal_rev_list)
die(_("cannot use internal rev list with --cruft"));
- if (stdin_packs)
- die(_("cannot use --stdin-packs with --cruft"));
+ die_for_incompatible_opt2(stdin_packs, "--stdin-packs",
+ cruft, "--cruft");
}
/*
diff --git a/t/t5331-pack-objects-stdin.sh b/t/t5331-pack-objects-stdin.sh
index b48c0cbe8f..8fd07deb8d 100755
--- a/t/t5331-pack-objects-stdin.sh
+++ b/t/t5331-pack-objects-stdin.sh
@@ -64,7 +64,7 @@ test_expect_success '--stdin-packs is incompatible with --filter' '
cd stdin-packs &&
test_must_fail git pack-objects --stdin-packs --stdout \
--filter=blob:none </dev/null 2>err &&
- test_grep "cannot use --filter with --stdin-packs" err
+ test_grep "options .--stdin-packs. and .--filter. cannot be used together" err
)
'
--
2.50.0.61.g1981e40f2d
^ permalink raw reply related
* [PATCH v6 0/9] repack: avoid MIDX'ing cruft pack(s) where possible
From: Taylor Blau @ 2025-06-23 22:32 UTC (permalink / raw)
To: git; +Cc: Elijah Newren, Jeff King, Junio C Hamano
In-Reply-To: <cover.1744413969.git.me@ttaylorr.com>
Here is an additional reroll of my series to create MIDXs that do not
include a repository's cruft pack(s).
Nearly everything is identical between this version and the previous
(v5), with two exceptions:
- Adjusted where to split a long line in show_object_pack_hint().
- Fixed a test failure with GIT_TEST_MULTI_PACK_INDEX_WRITE_INCREMENTAL
Thanks for Junio and Peff (respectively) for pointing out each of the
above. As usual, a range-diff is attached for convenience.
Thanks in advance for any review :-).
Taylor Blau (9):
pack-objects: use standard option incompatibility functions
pack-objects: limit scope in 'add_object_entry_from_pack()'
pack-objects: factor out handling '--stdin-packs'
pack-objects: declare 'rev_info' for '--stdin-packs' earlier
pack-objects: perform name-hash traversal for unpacked objects
pack-objects: fix typo in 'show_object_pack_hint()'
pack-objects: swap 'show_{object,commit}_pack_hint'
pack-objects: introduce '--stdin-packs=follow'
repack: exclude cruft pack(s) from the MIDX where possible
Documentation/config/repack.adoc | 7 +
Documentation/git-pack-objects.adoc | 10 +-
builtin/pack-objects.c | 193 ++++++++++++++++++----------
builtin/repack.c | 187 ++++++++++++++++++++++++---
t/t5331-pack-objects-stdin.sh | 122 +++++++++++++++++-
t/t7704-repack-cruft.sh | 145 +++++++++++++++++++++
6 files changed, 573 insertions(+), 91 deletions(-)
Range-diff against v5:
1: 19fab7a35c = 1: 8e7b2dacc7 pack-objects: use standard option incompatibility functions
2: 6f2d3f17a4 = 2: 86fb36d317 pack-objects: limit scope in 'add_object_entry_from_pack()'
3: c06f5b264a = 3: 19e8c789e9 pack-objects: factor out handling '--stdin-packs'
4: 40d7d87cb1 = 4: c9f874eb94 pack-objects: declare 'rev_info' for '--stdin-packs' earlier
5: 5e2599436c = 5: 6b0149a32d pack-objects: perform name-hash traversal for unpacked objects
6: 3a5c3f63d8 = 6: f31dd00a98 pack-objects: fix typo in 'show_object_pack_hint()'
7: 796e8743f8 = 7: 5d15055985 pack-objects: swap 'show_{object,commit}_pack_hint'
8: 8830775beb ! 8: 3699c25337 pack-objects: introduce '--stdin-packs=follow'
@@ builtin/pack-objects.c: static int add_object_entry_from_pack(const struct objec
- return;
+ enum stdin_packs_mode mode = *(enum stdin_packs_mode *)data;
+ if (mode == STDIN_PACKS_MODE_FOLLOW) {
-+ if (object->type == OBJ_BLOB && !has_object(the_repository,
-+ &object->oid, 0))
++ if (object->type == OBJ_BLOB &&
++ !has_object(the_repository, &object->oid, 0))
+ return;
+ add_object_entry(&object->oid, object->type, name, 0);
+ } else {
9: 8f505179cc ! 9: f519777059 repack: exclude cruft pack(s) from the MIDX where possible
@@ builtin/repack.c: int cmd_repack(int argc,
string_list_sort(&names);
+ if (get_local_multi_pack_index(the_repository)) {
-+ uint32_t i;
+ struct multi_pack_index *m =
+ get_local_multi_pack_index(the_repository);
+
-+ ALLOC_ARRAY(midx_pack_names, m->num_packs);
-+ for (i = 0; i < m->num_packs; i++)
-+ midx_pack_names[midx_pack_names_nr++] = xstrdup(m->pack_names[i]);
++ ALLOC_ARRAY(midx_pack_names,
++ m->num_packs + m->num_packs_in_base);
++
++ for (; m; m = m->base_midx)
++ for (uint32_t i = 0; i < m->num_packs; i++)
++ midx_pack_names[midx_pack_names_nr++] =
++ xstrdup(m->pack_names[i]);
+ }
+
close_object_store(the_repository->objects);
base-commit: f9aa0eedb37eb94d9d3711ef0d565fd7cb3b6148
--
2.50.0.61.g1981e40f2d
^ permalink raw reply
* Re: Perf bug: rev-list w/ 2+ paths relatively slow with commit-graph
From: Junio C Hamano @ 2025-06-23 21:00 UTC (permalink / raw)
To: Kai Koponen; +Cc: git
In-Reply-To: <CADYQcGrR0mKLEWSYZCrL6b7NYLGfdsZsuKCCFQ_ptpMJ8mofmQ@mail.gmail.com>
Kai Koponen <kaikoponen@google.com> writes:
> I see, more of a perf FR than a bug then.
> I don't have much expertise here, but on the surface of it, it doesn't
> seem to me like there would be any reason the algorithm couldn't check
> each path's bloom filter in turn while searching, other than that this
> would be a large and annoying change.
It looks like that the necessary changes are probably fairly well
isolated to two functions, i.e., prepare_to_use_bloom_filter() and
forbid_bloom_filters(). Right now, for a pathspec that has one
element "dir/file", the code uses two bloom keys for "dir" and
"dir/file", but if we have "dir1/file1" as well, then it does look
like a matter of using two more (and the bloom_keys[] array is
designed to be variable length).
But those who have more intimate knowledge in the area than I do may
point out what is missing in my "it looks like" gut feeling.
^ permalink raw reply
* Re: [GSoC RFC PATCH v2 4/7] repo-info: add the --allow-empty flag
From: Lucas Seiki Oshiro @ 2025-06-23 20:28 UTC (permalink / raw)
To: Karthik Nayak; +Cc: Junio C Hamano, git, ps, ben.knoble
In-Reply-To: <CAOLa=ZTCoc9vfeMrWxqU5psmbxGzW=B-QULeSR+uvF9kQi9WzQ@mail.gmail.com>
> I thought the idea was that we would show a subset of the fields by
> default when no user input is provided. Which would mean we have the
> following:
>
> * `git repo-info` which would show a subset of all fields, giving
> important repository information.
> * `git repo-info --all` which would show all available fields.
> * `git repo-info <fields>` which would only show the requested fields.
>
>
> This is also a good time to think about if we should make the default to
> not show anything as Junio mentioned.
After this review, I'm starting to think that leaving it empty by default
would be better. Specially after the review by Phillip Wood [1], who
has a good argument for it:
"""
As this is a plumbing command I think it would be clearer if the caller
was required to specify the output format and the information that they
require with an "--all" option for "show me everything" as Junio
suggested. If we were to set defaults for the format and keys now we
would be stuck with them forever.
"""
[1] https://lore.kernel.org/git/af27af92-73d5-4f0a-84f4-9c91de6ab6e6@gmail.com/
^ permalink raw reply
* Re: Perf bug: rev-list w/ 2+ paths relatively slow with commit-graph
From: Kai Koponen @ 2025-06-23 20:19 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
In-Reply-To: <xmqq8qli5jyi.fsf@gitster.g>
I see, more of a perf FR than a bug then.
I don't have much expertise here, but on the surface of it, it doesn't
seem to me like there would be any reason the algorithm couldn't check
each path's bloom filter in turn while searching, other than that this
would be a large and annoying change.
On Mon, Jun 23, 2025 at 3:36 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Kai Koponen <kaikoponen@google.com> writes:
>
> > Reproduce steps:
> > ```
> > git clone https://github.com/golang/go.git
> > cd go
> > git config core.commitGraph true
> > git commit-graph write --split --reachable --changed-paths # Without
> > this, all calls equally slow (~1s)
> > time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> > src/clean.bash > /dev/null # ~90ms
> > time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> > src/Make.dist > /dev/null # ~100ms
> > time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> > src/clean.bash src/Make.dist > /dev/null # ~650ms
> > ```
> >
> > The rev-list call with multiple paths takes over 3x longer than the
> > sum of individual calls to it for the same files.
> >
> > Expectation: rev-list with multiple paths should take <= the sum of
> > the time it takes to call it with each path individually (ideally <,
> > since with the count limit it should be able to early-exit and search
> > less commits for either path).
> >
> > Also reproduces without the -10 arg, or with a lower count (double
> > instead of triple w/ -1), but these results are perhaps most
> > surprising with a count present.
>
> I asked
>
> How does "git log -- path" use the changed-paths bloom filter
> stored in the commit-graph file?
>
> to https://deepwiki.com/git/git (there is a text field in the bottom
> of the page), and an early part of its answer explains why in a
> fairly convincing way ;-)
>
> When you run git log -- path, Git first prepares to use bloom
> filters in the prepare_to_use_bloom_filter function. This function:
>
> 1. Validates the pathspec - It calls forbid_bloom_filters to check
> if bloom filters can be used revision.c:674-686 . Bloom filters
> are disabled for wildcards, multiple paths, or complex pathspec
> magic.
>
> ...
>
> In short, the changed-path filter is used only when following
> pathspec with a single element that is not a wildcard. So the
> observed result is (unfortunately) quite expected.
>
^ permalink raw reply
* Re: [PATCH v4 2/5] promisor-remote: allow a server to advertise more fields
From: Justin Tobler @ 2025-06-23 19:59 UTC (permalink / raw)
To: Christian Couder
Cc: git, Junio C Hamano, Patrick Steinhardt, Taylor Blau,
Karthik Nayak, Christian Couder
In-Reply-To: <20250611134506.2975856-3-christian.couder@gmail.com>
On 25/06/11 03:45PM, Christian Couder wrote:
> For now the "promisor-remote" protocol capability can only pass "name"
> and "url" information from a server to a client in the form
> "name=<remote_name>,url=<remote_url>".
>
> Let's make it possible to pass more information by introducing a new
> "promisor.sendFields" configuration variable. This variable should
> contain a comma or space separated list of field names that will be
> looked up in the configuration of the remote on the server to find the
> values that will be passed to the client.
>
> Only a set of predefined fields are allowed. The only fields in this
> set are "partialCloneFilter" and "token". The "partialCloneFilter"
> field specifies the filter definition used by the promisor remote,
> and the "token" field can provide an authentication credential for
> accessing it.
>
> For example, if "promisor.sendFields" is set to "partialCloneFilter",
> and the server has the "remote.<name>.partialCloneFilter" config
> variable set to a value for a remote, then that value will be passed
> in the form "partialCloneFilter=<value>" after the "name" and "url"
> fields.
>
> A following commit will allow the client to use the information to
> decide if it accepts the remote or not. For now the client doesn't do
> anything with the additional information it receives.
>
> Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
> ---
[snip]
> +static char *fields_from_config(struct string_list *fields_list, const char *config_key)
> +{
> + char *fields = NULL;
> +
> + if (!git_config_get_string(config_key, &fields) && *fields) {
> + string_list_split_in_place(fields_list, fields, ", ", -1);
> + string_list_remove_empty_items(fields_list, 0);
Ok, in this version we now filter out empty entries from the
string_list. Previously if fields were specified with both a comma and
SP character (i.e. "partialCloneFilter, token"), an empty entry would be
parsed in the middle and lead to a warning message.
This change is good because it would be pretty natural for a user to
specify the config with both. It might be nice to leave a comment
explaining why we do this though as it may be confusing without context.
-Justin
> + filter_string_list(fields_list, 0, is_valid_field, (void *)config_key);
> + }
> +
> + return fields;
> +}
> +
^ permalink raw reply
* Re: [PATCH v4 1/5] promisor-remote: refactor to get rid of 'struct strvec'
From: Justin Tobler @ 2025-06-23 19:38 UTC (permalink / raw)
To: Christian Couder
Cc: git, Junio C Hamano, Patrick Steinhardt, Taylor Blau,
Karthik Nayak, Christian Couder
In-Reply-To: <20250611134506.2975856-2-christian.couder@gmail.com>
On 25/06/11 03:45PM, Christian Couder wrote:
> In a following commit, we will use the new 'promisor-remote' protocol
> capability introduced by d460267613 (Add 'promisor-remote' capability
> to protocol v2, 2025-02-18) to pass and process more information
> about promisor remotes than just their name and url.
>
> For that purpose, we will need to store information about other
> fields, especially information that might or might not be available
> for different promisor remotes. Unfortunately using 'struct strvec',
> as we currently do, to store information about the promisor remotes
> with one 'struct strvec' for each field like "name" or "url" does not
> scale easily in that case.
>
> Let's refactor this and introduce a new 'struct promisor_info'.
>
> It will only store promisor remote information in its members. For now
> it has only a 'name' member for the promisor remote name and an 'url'
> member for its URL. We will use use a 'struct string_list' to store
s/use use/use/
> the instances of 'struct promisor_info'. For each 'item' in the
> string_list, 'item->string' will point to the promisor remote name and
> 'item->util' will point to the corresponding 'struct promisor_info'
> instance.
>
> Explicit members are used within 'struct promisor_info' for type
> safety and clarity regarding the specific information being handled,
> rather than a generic key-value store. We want to specify and document
> each field and its content, so adding new members to the struct as
> more fields are supported is fine.
>
> Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
> ---
> promisor-remote.c | 111 +++++++++++++++++++++++++++++-----------------
> 1 file changed, 70 insertions(+), 41 deletions(-)
>
> diff --git a/promisor-remote.c b/promisor-remote.c
> index 9d058586df..90a063ea53 100644
> --- a/promisor-remote.c
> +++ b/promisor-remote.c
> @@ -314,9 +314,35 @@ static int allow_unsanitized(char ch)
> return ch > 32 && ch < 127;
> }
>
> -static void promisor_info_vecs(struct repository *repo,
> - struct strvec *names,
> - struct strvec *urls)
> +/*
> + * Struct for promisor remotes involved in the "promisor-remote"
> + * protocol capability.
> + *
> + * Except for "name", each <member> in this struct and its <value>
> + * should correspond (either on the client side or on the server side)
> + * to a "remote.<name>.<member>" config variable set to <value> where
> + * "<name>" is a promisor remote name.
> + */
> +struct promisor_info {
> + const char *name;
> + const char *url;
> +};
Ok so now all promisor info for a given remote is stored in its own
struct. This will enable easier extension in the future to add
additional fields.
> +
> +static void promisor_info_list_clear(struct string_list *list)
> +{
> + for (size_t i = 0; i < list->nr; i++) {
> + struct promisor_info *p = list->items[i].util;
> + free((char *)p->name);
> + free((char *)p->url);
> + }
> + string_list_clear(list, 1);
> +}
> +
> +/*
> + * Populate 'list' with promisor remote information from the config.
> + * The 'util' pointer of each list item will hold a 'struct promisor_info'.
> + */
> +static void promisor_config_info_list(struct repository *repo, struct string_list *list)
> {
> struct promisor_remote *r;
>
> @@ -328,8 +354,14 @@ static void promisor_info_vecs(struct repository *repo,
>
> /* Only add remotes with a non empty URL */
> if (!git_config_get_string_tmp(url_key, &url) && *url) {
> - strvec_push(names, r->name);
> - strvec_push(urls, url);
> + struct promisor_info *new_info = xcalloc(1, sizeof(*new_info));
> + struct string_list_item *item;
> +
> + new_info->name = xstrdup(r->name);
> + new_info->url = xstrdup(url);
> +
> + item = string_list_append(list, new_info->name);
> + item->util = new_info;
> }
In this version, each remote is now stored as a member of a `struct
string_list` instead of a custom collection type specific to promisor
remote info. Nice
> free(url_key);
> @@ -340,47 +372,36 @@ char *promisor_remote_info(struct repository *repo)
> {
> struct strbuf sb = STRBUF_INIT;
> int advertise_promisors = 0;
> - struct strvec names = STRVEC_INIT;
> - struct strvec urls = STRVEC_INIT;
> + struct string_list config_info = STRING_LIST_INIT_NODUP;
nit: Ok in this context, "config_info" is specific to the list of
promisor_info not just generic git configuration. Something like
"promisor_info_list" would be a bit more explicit, but I don't feel
super strongly.
> + struct string_list_item *item;
>
> git_config_get_bool("promisor.advertise", &advertise_promisors);
>
> if (!advertise_promisors)
> return NULL;
>
> - promisor_info_vecs(repo, &names, &urls);
> + promisor_config_info_list(repo, &config_info);
>
> - if (!names.nr)
> + if (!config_info.nr)
> return NULL;
>
> - for (size_t i = 0; i < names.nr; i++) {
> - if (i)
> + for_each_string_list_item(item, &config_info) {
> + struct promisor_info *p = item->util;
> +
> + if (item != config_info.items)
> strbuf_addch(&sb, ';');
Out of curiousity, is it invalid for the trailing promisor remote entry
to end with a ';'? It would be simpler if each entry could just end with
a semi-colon.
> +
> strbuf_addstr(&sb, "name=");
> - strbuf_addstr_urlencode(&sb, names.v[i], allow_unsanitized);
> + strbuf_addstr_urlencode(&sb, p->name, allow_unsanitized);
> strbuf_addstr(&sb, ",url=");
> - strbuf_addstr_urlencode(&sb, urls.v[i], allow_unsanitized);
> + strbuf_addstr_urlencode(&sb, p->url, allow_unsanitized);
> }
>
> - strvec_clear(&names);
> - strvec_clear(&urls);
> + promisor_info_list_clear(&config_info);
>
> return strbuf_detach(&sb, NULL);
> }
>
> -/*
> - * Find first index of 'nicks' where there is 'nick'. 'nick' is
> - * compared case sensitively to the strings in 'nicks'. If not found
> - * 'nicks->nr' is returned.
> - */
> -static size_t remote_nick_find(struct strvec *nicks, const char *nick)
> -{
> - for (size_t i = 0; i < nicks->nr; i++)
> - if (!strcmp(nicks->v[i], nick))
> - return i;
> - return nicks->nr;
> -}
> -
> enum accept_promisor {
> ACCEPT_NONE = 0,
> ACCEPT_KNOWN_URL,
> @@ -390,19 +411,23 @@ enum accept_promisor {
>
> static int should_accept_remote(enum accept_promisor accept,
> const char *remote_name, const char *remote_url,
> - struct strvec *names, struct strvec *urls)
> + struct string_list *config_info)
> {
> - size_t i;
> + struct promisor_info *p;
> + struct string_list_item *item;
>
> if (accept == ACCEPT_ALL)
> return 1;
>
> - i = remote_nick_find(names, remote_name);
> + /* Get config info for that promisor remote */
> + item = string_list_lookup(config_info, remote_name);
>
> - if (i >= names->nr)
> + if (!item)
> /* We don't know about that remote */
> return 0;
>
> + p = item->util;
> +
> if (accept == ACCEPT_KNOWN_NAME)
> return 1;
>
> @@ -414,11 +439,15 @@ static int should_accept_remote(enum accept_promisor accept,
> return 0;
> }
>
> - if (!strcmp(urls->v[i], remote_url))
> + if (!p->url)
> + BUG("bad config_info (invalid URL) for remote '%s'",
> + remote_name);
Ok just to clarify, it is invalid for a promisor remote to not have a
URL specified. If so, it might be better to say "empty URL" or something
along those lines.
> +
> + if (!strcmp(p->url, remote_url))
> return 1;
>
> warning(_("known remote named '%s' but with URL '%s' instead of '%s'"),
> - remote_name, urls->v[i], remote_url);
> + remote_name, p->url, remote_url);
>
> return 0;
> }
> @@ -430,8 +459,7 @@ static void filter_promisor_remote(struct repository *repo,
> struct strbuf **remotes;
> const char *accept_str;
> enum accept_promisor accept = ACCEPT_NONE;
> - struct strvec names = STRVEC_INIT;
> - struct strvec urls = STRVEC_INIT;
> + struct string_list config_info = STRING_LIST_INIT_NODUP;
>
> if (!git_config_get_string_tmp("promisor.acceptfromserver", &accept_str)) {
> if (!*accept_str || !strcasecmp("None", accept_str))
> @@ -450,8 +478,10 @@ static void filter_promisor_remote(struct repository *repo,
> if (accept == ACCEPT_NONE)
> return;
>
> - if (accept != ACCEPT_ALL)
> - promisor_info_vecs(repo, &names, &urls);
> + if (accept != ACCEPT_ALL) {
> + promisor_config_info_list(repo, &config_info);
> + string_list_sort(&config_info);
> + }
>
> /* Parse remote info received */
>
> @@ -482,7 +512,7 @@ static void filter_promisor_remote(struct repository *repo,
> if (remote_url)
> decoded_url = url_percent_decode(remote_url);
>
> - if (decoded_name && should_accept_remote(accept, decoded_name, decoded_url, &names, &urls))
> + if (decoded_name && should_accept_remote(accept, decoded_name, decoded_url, &config_info))
> strvec_push(accepted, decoded_name);
>
> strbuf_list_free(elems);
> @@ -490,8 +520,7 @@ static void filter_promisor_remote(struct repository *repo,
> free(decoded_url);
> }
>
> - strvec_clear(&names);
> - strvec_clear(&urls);
> + promisor_info_list_clear(&config_info);
> strbuf_list_free(remotes);
> }
>
> --
> 2.50.0.rc2.5.ge8efe62b7f
>
^ permalink raw reply
* Re: Perf bug: rev-list w/ 2+ paths relatively slow with commit-graph
From: Junio C Hamano @ 2025-06-23 19:36 UTC (permalink / raw)
To: Kai Koponen; +Cc: git
In-Reply-To: <CADYQcGqaMC=4jgbmnF9Q11oC11jfrqyvH8EuiRRHytpMXd4wYA@mail.gmail.com>
Kai Koponen <kaikoponen@google.com> writes:
> Reproduce steps:
> ```
> git clone https://github.com/golang/go.git
> cd go
> git config core.commitGraph true
> git commit-graph write --split --reachable --changed-paths # Without
> this, all calls equally slow (~1s)
> time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> src/clean.bash > /dev/null # ~90ms
> time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> src/Make.dist > /dev/null # ~100ms
> time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> src/clean.bash src/Make.dist > /dev/null # ~650ms
> ```
>
> The rev-list call with multiple paths takes over 3x longer than the
> sum of individual calls to it for the same files.
>
> Expectation: rev-list with multiple paths should take <= the sum of
> the time it takes to call it with each path individually (ideally <,
> since with the count limit it should be able to early-exit and search
> less commits for either path).
>
> Also reproduces without the -10 arg, or with a lower count (double
> instead of triple w/ -1), but these results are perhaps most
> surprising with a count present.
I asked
How does "git log -- path" use the changed-paths bloom filter
stored in the commit-graph file?
to https://deepwiki.com/git/git (there is a text field in the bottom
of the page), and an early part of its answer explains why in a
fairly convincing way ;-)
When you run git log -- path, Git first prepares to use bloom
filters in the prepare_to_use_bloom_filter function. This function:
1. Validates the pathspec - It calls forbid_bloom_filters to check
if bloom filters can be used revision.c:674-686 . Bloom filters
are disabled for wildcards, multiple paths, or complex pathspec
magic.
...
In short, the changed-path filter is used only when following
pathspec with a single element that is not a wildcard. So the
observed result is (unfortunately) quite expected.
^ permalink raw reply
* Re: [GSoC RFC PATCH v2 1/7] repo-info: declare the repo-info command
From: Lucas Seiki Oshiro @ 2025-06-23 19:04 UTC (permalink / raw)
To: Karthik Nayak; +Cc: git, ps, ben.knoble, gitster
In-Reply-To: <CAOLa=ZSgFYXEwdTyAfC2TSgVqpCSq6D1NDBwPU9cY-unX0Jx6Q@mail.gmail.com>
Hi!
> Something I mentioned in the previous review, but hasn't been addressed
> is the addition of documentation for the new command.
I haven't wrote a documentation because I thought it would be too early
as this command is still under discussion, but it looks like it is time
to starting working on it.
I'll include that in v3.
Thanks!
^ permalink raw reply
* Re: [GSoC RFC PATCH v2 0/7] repo-info: add new command for retrieving repository info
From: Lucas Seiki Oshiro @ 2025-06-23 18:49 UTC (permalink / raw)
To: phillip.wood; +Cc: git, ps, karthik.188, ben.knoble, gitster
In-Reply-To: <af27af92-73d5-4f0a-84f4-9c91de6ab6e6@gmail.com>
> Hi Lucas
Hi, Phillip, thanks for joining this discussion!
> I think using an output format generated by 'printf("%s\n%s\0", key,
> value)' would be easier to parse. This format matches that used by 'git
> config --list -z'.
Thanks for your suggestion! However, this still breaks in the corner case
mentioned by Junio in
https://lore.kernel.org/git/xmqqikl3mtx2.fsf@gitster.g/:
when a value contains a LF, which would be possible to have in the (yet to be
implemented) path values.
> I've not seen any discussion of how paths are going to be encoded in the
> JSON output. As I understand it some JSON decoders only accept utf8 input
> but the paths reported by git are arbitrary NUL terminated byte sequences.
> How is one expected to parse the output for a non utf8 encoded path using
> rust's JSON decoding for example?
By now, I'm directly using the jw_* functions, which format strings using the
function append_quoted_string, introduced in 75459410ed (json_writer: new
routines to create JSON data, 2018-07-13). It was also discussed when that
function was introduced:
"""
We say "JSON-like" because we do not enforce the Unicode (usually UTF-8)
requirement on string fields. Internally, Git does not necessarily have
Unicode/UTF-8 data for most fields, so it is currently unclear the best
way to enforce that requirement. For example, on Linux pathnames can
contain arbitrary 8-bit character data, so a command like "status" would
not know how to encode the reported pathnames. We may want to revisit
this (or double encode such strings) in the future.
"""
So, it looks like that "the future" is soon :-). In this RFC, I'm not handling
paths yet, and I can't propose a proper solution by now as I honestly know
very little about UTF-8 encoding...
The first solution that I can think of is to check if the sequence is a valid
UTF-8 bytestring, aborting the entire command if it's not, which would be
better than just guess the charset and re-encode it as UTF-8. However,
I don't know how hard it would be to do.
> On the subject of paths do you plan to support the equivalent of "git
> rev-parse --git-path"?
Hmmmm... In the way that it works under rev-parse, no, as it may bloat this
command with other things that aren't exactly metadata.
> I'm not sure what the future plans for this command are but when I'm
> scripting around git it would be nice to be able to a single process that I
> could query for the things currently returned by "git rev-parse", "git var"
> and "git config"
My concern here is that this main motivation for this new command is that
rev-parse has too many responsibilities. Giving too many responsibilities to
this new command may turn it into a new rev-parse and create a XKCD 927 [1]
situation
>
> Best Wishes
>
> Phillip
>
Thanks again for bringing more light to this discussion! These first patches
are only outputting hardcoded strings from Git, and dealing with Unicode is
something that I'll really need to think about how to solve.
[1] https://xkcd.com/927/
^ permalink raw reply
* Re: [PATCH v5 9/9] repack: exclude cruft pack(s) from the MIDX where possible
From: Taylor Blau @ 2025-06-23 18:47 UTC (permalink / raw)
To: Jeff King; +Cc: git, Elijah Newren, Junio C Hamano
In-Reply-To: <20250621043551.GA3002138@coredump.intra.peff.net>
On Sat, Jun 21, 2025 at 12:35:51AM -0400, Jeff King wrote:
> On Thu, Jun 19, 2025 at 07:30:33PM -0400, Taylor Blau wrote:
>
> > +test_expect_success 'repack --write-midx excludes cruft where possible' '
> > + setup_cruft_exclude_tests exclude-cruft-when-possible &&
> > + (
> > + cd exclude-cruft-when-possible &&
> > +
> > + GIT_TEST_MULTI_PACK_INDEX=0 \
> > + git repack -d --geometric=2 --write-midx --write-bitmap-index &&
> > +
> > + test-tool read-midx --show-objects $objdir >midx &&
> > + cruft="$(ls $packdir/*.mtimes)" &&
> > + test_grep ! "$(basename "$cruft" .mtimes).idx" midx &&
> > +
> > + git rev-list --all --objects --no-object-names >reachable.raw &&
> > + sort reachable.raw >reachable.objects &&
> > + awk "/\.pack$/ { print \$1 }" <midx | sort >midx.objects &&
> > +
> > + test_cmp reachable.objects midx.objects
> > + )
> > +'
>
> This test (but none of the others) fails when run with:
>
> GIT_TEST_MULTI_PACK_INDEX=1 \
> GIT_TEST_MULTI_PACK_INDEX_WRITE_INCREMENTAL=1 \
> ./t7704-repack-cruft.sh
>
> The culprit is the incremental flag, but you need the first one for the
> second to do anything. The issue is that the cruft pack unexpectedly
> appears in the midx:
>
> [...]
>
> I'm not sure if it's just a funky interaction with the hacky GIT_TEST_*
> variables, or if it's a real bug.
Thanks for spotting. This is definitely a real bug. The root cause here
is that our loop to gather the set of packs we know are in the MIDX does
not account for multi-layered / incremental MIDXs.
In our example, if there's a cruft pack in any other layer of a MIDX
besides the tip, the proposed implementation here won't realize it, and
thus (incorrectly) conclude that the cruft pack is not in the MIDX
already, so can thusly be omitted.
If we do this on top:
--- 8< ---
diff --git a/builtin/repack.c b/builtin/repack.c
index 346d44fbcd..8d1540a0fd 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -1614,13 +1614,16 @@ int cmd_repack(int argc,
string_list_sort(&names);
if (get_local_multi_pack_index(the_repository)) {
- uint32_t i;
struct multi_pack_index *m =
get_local_multi_pack_index(the_repository);
- ALLOC_ARRAY(midx_pack_names, m->num_packs);
- for (i = 0; i < m->num_packs; i++)
- midx_pack_names[midx_pack_names_nr++] = xstrdup(m->pack_names[i]);
+ ALLOC_ARRAY(midx_pack_names,
+ m->num_packs + m->num_packs_in_base);
+
+ for (; m; m = m->base_midx)
+ for (uint32_t i = 0; i < m->num_packs; i++)
+ midx_pack_names[midx_pack_names_nr++] =
+ xstrdup(m->pack_names[i]);
}
close_object_store(the_repository->objects);
--- >8 ---
(Note that this assumes that 'tb/prepare-midx-pack-cleanup' has not
landed, this could be a little bit simplified with the
nth_midx_pack_names() function. I kept these two separate so they could
proceed independently.)
Things work as expected. I'll send out a new round with this fix
Incorporated as well as a style issue that Junio noted earlier in this
thread.
Thanks,
Taylor
^ permalink raw reply related
* Re: [PATCH v2 1/4] userdiff: add javascript diff driver
From: Junio C Hamano @ 2025-06-23 18:09 UTC (permalink / raw)
To: Derick W. de M. Frias; +Cc: j6t, git
In-Reply-To: <20250623090538.154858-2-derick.william.moraes@gmail.com>
"Derick W. de M. Frias" <derick.william.moraes@gmail.com> writes:
> --- a/userdiff.c
> +++ b/userdiff.c
> @@ -238,33 +238,43 @@ PATTERNS("java",
> "|[-+*/<>%&^|=!]="
> "|--|\\+\\+|<<=?|>>>?=?|&&|\\|\\|"),
> PATTERNS("javascript",
> - /* conventional named functions */
> - "^[ \t]*(async[ \t]+)?function[ \t]*\\*?[ \t]*([$_a-zA-Z][$_a-zA-Z0-9]*)[ \t]*\\(.*$|"
Curious to see that anything needs to be removed from "javascript"
section, when we do not have any patterns for the language in the
file. Which version of Git is this patch meant to apply to?
^ permalink raw reply
* Re: Perf bug: rev-list w/ 2+ paths relatively slow with commit-graph
From: Kai Koponen @ 2025-06-23 18:04 UTC (permalink / raw)
To: git, Kai Koponen
In-Reply-To: <CADYQcGqaMC=4jgbmnF9Q11oC11jfrqyvH8EuiRRHytpMXd4wYA@mail.gmail.com>
Re: rev-list perf bug, some `git bugreport` version information:
[System Info]
git version:
git version 2.50.0.714.g196bf9f422-goog
cpu: x86_64
no commit associated with this build
sizeof-long: 8
sizeof-size_t: 8
shell-path: /bin/sh
libcurl: 8.13.0
OpenSSL: OpenSSL 3.4.1 11 Feb 2025
zlib: 1.3.1
SHA-1: SHA1_DC
SHA-256: SHA256_BLK
compiler info: gnuc: 14.2
libc info: glibc: 2.41
On Mon, Jun 23, 2025 at 1:58 PM Kai Koponen <kaikoponen@google.com> wrote:
>
> Reproduce steps:
> ```
> git clone https://github.com/golang/go.git
> cd go
> git config core.commitGraph true
> git commit-graph write --split --reachable --changed-paths # Without
> this, all calls equally slow (~1s)
> time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> src/clean.bash > /dev/null # ~90ms
> time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> src/Make.dist > /dev/null # ~100ms
> time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
> src/clean.bash src/Make.dist > /dev/null # ~650ms
> ```
>
> The rev-list call with multiple paths takes over 3x longer than the
> sum of individual calls to it for the same files.
>
> Expectation: rev-list with multiple paths should take <= the sum of
> the time it takes to call it with each path individually (ideally <,
> since with the count limit it should be able to early-exit and search
> less commits for either path).
>
> Also reproduces without the -10 arg, or with a lower count (double
> instead of triple w/ -1), but these results are perhaps most
> surprising with a count present.
^ permalink raw reply
* Re: [PATCH v2 3/3] contrib: better support symbolic port names in git-credential-netrc
From: Junio C Hamano @ 2025-06-23 18:03 UTC (permalink / raw)
To: Maxim Cournoyer; +Cc: git
In-Reply-To: <20250622152535.11837-4-maxim@guixotic.coop>
Maxim Cournoyer <maxim@guixotic.coop> writes:
> diff --git a/contrib/credential/netrc/git-credential-netrc.perl b/contrib/credential/netrc/git-credential-netrc.perl
> index 09d77b4f69..72d6b6a974 100755
> --- a/contrib/credential/netrc/git-credential-netrc.perl
> +++ b/contrib/credential/netrc/git-credential-netrc.perl
> @@ -268,13 +268,16 @@ sub load_netrc {
> next;
> }
> if (defined $nentry->{port}) {
> - if ($nentry->{port} =~ m/^\d+$/) {
> - $num_port = $nentry->{port};
> - delete $nentry->{port};
> - } else {
> + $num_port = Git::is_port($nentry->{port});
> + unless ($num_port) {
> printf(STDERR "ignoring invalid port `%s' " .
> "from netrc file\n", $nentry->{port});
> }
> + # Since we've already validated and converted
> + # the port to its numerical value, do not
> + # capture it as the `protocol' value, as used
> + # to be the case for symbolic port names.
> + delete $nentry->{port};
> }
OK, so we rewrite textual service names into port number, and
normalize the "host" member of the entry read from the file into
"host:port" form. Earlier we did that only for numeric port
numbers. Nice.
> diff --git a/contrib/credential/netrc/test.pl b/contrib/credential/netrc/test.pl
> index 67a0ede564..8a7fc2588a 100755
> --- a/contrib/credential/netrc/test.pl
> +++ b/contrib/credential/netrc/test.pl
> @@ -45,7 +45,7 @@ BEGIN
> diag "Testing with invalid data\n";
> $cred = run_credential(['-f', $netrc, 'get'],
> "bad data");
> -ok(scalar keys %$cred == 4, "Got first found keys with bad data");
> +ok(scalar keys %$cred == 3, "Got first found keys with bad data");
>
> diag "Testing netrc file for a missing corovamilkbar entry\n";
> $cred = run_credential(['-f', $netrc, 'get'],
> @@ -64,12 +64,12 @@ BEGIN
>
> diag "Testing netrc file for a username-specific entry\n";
> $cred = run_credential(['-f', $netrc, 'get'],
> - { host => 'imap', username => 'bob' });
> + { host => 'imap:993', username => 'bob' });
Is this rewriting an existing test, instead of adding a new test to
trigger a feature that didn't have a test coverage, while keeping
the old test? I am wondering if we want to ensure that both
":port"-less case and "host:port" case keep working even after the
change to -netrc credential helper in this patch.
> diff --git a/perl/Git.pm b/perl/Git.pm
> index 6f47d653ab..1fa535e1ad 100644
> --- a/perl/Git.pm
> +++ b/perl/Git.pm
> @@ -1061,6 +1061,21 @@ sub _close_cat_blob {
> delete @$self{@vars};
> }
>
> +# Predicate to check whether PORT is a valid port number or service
> +# name. The numerical value of PORT is returned, else undef if
> +# invalid.
Hmph. It _can_ be used to validate a random end-user supplied
string names a port, either by being a port number in the valid
range or by being a valid service name. But another use case in the
code after this patch applied that is equally if not more important
is to ensure that a valid port specified by the end-user is turned
into a port number. We should not name such a sub as if its primary
functionality is to serve as a Boolean "is_foo". Perhaps call it
port_num or something?
> +sub is_port {
> + my ($port) = @_;
> +
> + # Port can be either a positive integer within the 16-bit range...
> + if ($port =~ /^\d+$/ && $port > 0 && $port <= (2**16 - 1)) {
> + return $port;
> + }
> +
> + # ... or a symbolic port (service name).
> + my $num = getservbyname($port, '');
> + return defined $num ? $num : undef;
Wouldn't "return $num" work here? getservbyname() would return
"undef" when the given $port is not a valid service name anyway, no?
Or even "return scalar getservbyname($port, 'tcp')" without an
intermediate variable $num?
> +}
>
> =item credential_read( FILEHANDLE )
>
> diff --git a/t/t9001-send-email.sh b/t/t9001-send-email.sh
> index 0c1af43f6f..3e749175ab 100755
> --- a/t/t9001-send-email.sh
> +++ b/t/t9001-send-email.sh
> @@ -201,6 +201,13 @@ test_expect_success $PREREQ 'cc trailer with get_maintainer.pl output' '
> test_cmp expected-cc commandline1
> '
>
> +test_expect_failure $PREREQ 'invalid smtp server port value' '
> + clean_fake_sendmail &&
> + git send-email -1 --to=recipient@example.com \
> + --smtp-server-port=bogus-symbolic-name \
> + --smtp-server="$(pwd)/fake.sendmail"
> +'
> +
> test_expect_success $PREREQ 'setup expect' "
> cat >expected-show-all-headers <<\EOF
> 0001-Second.patch
^ permalink raw reply
* Re: [PATCH 1/2] t7422: replace confusing printf with echo
From: Eric Sunshine @ 2025-06-23 17:59 UTC (permalink / raw)
To: Jeff King; +Cc: git
In-Reply-To: <20250623105542.GA654412@coredump.intra.peff.net>
On Mon, Jun 23, 2025 at 6:57 AM Jeff King <peff@peff.net> wrote:
> While looping over a counter "i", we do:
>
> printf "[submodule \"sm-$i\"]\npath = recursive-submodule-path-$i\n" "$i"
>
> So we are passing "$i" as an argument to be filled in, but there is no
> "%" placeholder in the format string, which is a bit confusing to read.
>
> We could switch both instances of "$i" to "%d" (and pass $i twice). But
> that makes the line even longer. Let's just keep interpolating the value
> in the string, and drop the confusing extra "$i" argument.
>
> And since we are not using any printf specifiers at all, it becomes
> clear that we can swap it out for echo. We do use a "\n" in the middle
> of the string, but breaking this into two separate echo statements
> actually makes it easier to read.
>
> Signed-off-by: Jeff King <peff@peff.net>
> ---
> diff --git a/t/t7422-submodule-output.sh b/t/t7422-submodule-output.sh
> @@ -180,7 +180,8 @@ test_expect_success !MINGW 'git submodule status --recursive propagates SIGPIPE'
> - printf "[submodule \"sm-$i\"]\npath = recursive-submodule-path-$i\n" "$i" ||
> + echo "[submodule \"sm-$i\"]" &&
> + echo "path = recursive-submodule-path-$i" ||
This looks obviously correct and, as the commit message says, is
almost certainly easier to read, but I was more than a little
surprised to see the patch since I thought this code had been fixed
previously[*] and had some discussion around it.
[*] https://lore.kernel.org/git/20250403144852.19153-1-sn03.general@gmail.com/
^ permalink raw reply
* Perf bug: rev-list w/ 2+ paths relatively slow with commit-graph
From: Kai Koponen @ 2025-06-23 17:58 UTC (permalink / raw)
To: git, Kai Koponen
Reproduce steps:
```
git clone https://github.com/golang/go.git
cd go
git config core.commitGraph true
git commit-graph write --split --reachable --changed-paths # Without
this, all calls equally slow (~1s)
time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
src/clean.bash > /dev/null # ~90ms
time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
src/Make.dist > /dev/null # ~100ms
time git rev-list -10 3730814f2f2bf24550920c39a16841583de2dac1 --
src/clean.bash src/Make.dist > /dev/null # ~650ms
```
The rev-list call with multiple paths takes over 3x longer than the
sum of individual calls to it for the same files.
Expectation: rev-list with multiple paths should take <= the sum of
the time it takes to call it with each path individually (ideally <,
since with the count limit it should be able to early-exit and search
less commits for either path).
Also reproduces without the -10 arg, or with a lower count (double
instead of triple w/ -1), but these results are perhaps most
surprising with a count present.
^ permalink raw reply
* Re: [PATCH 2/2] test-lib: teach test_seq the -f option
From: Todd Zullinger @ 2025-06-23 17:27 UTC (permalink / raw)
To: Jeff King; +Cc: Junio C Hamano, Justin Tobler, git
In-Reply-To: <20250623105625.GB654412@coredump.intra.peff.net>
Jeff King wrote:
> The "seq" tool has a "-f" option to produce printf-style formatted
> lines. Let's teach our test_seq helper the same trick. This lets us get
> rid of some shell loops in test snippets (which are particularly verbose
> in our test suite because we have to "|| return 1" to keep the &&-chain
> going).
This is a nice improvement.
> diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
> index bee4a2ca34..8c176f4efc 100644
> --- a/t/test-lib-functions.sh
> +++ b/t/test-lib-functions.sh
> @@ -1454,6 +1454,13 @@ test_cmp_fspath () {
> # from 1.
>
> test_seq () {
> + local fmt="%d"
> + case "$1" in
> + -f)
> + fmt="$2"
> + shift 2
> + ;;
> + esac
> case $# in
> 1) set 1 "$@" ;;
> 2) ;;
> @@ -1462,7 +1469,7 @@ test_seq () {
> test_seq_counter__=$1
> while test "$test_seq_counter__" -le "$2"
> do
> - echo "$test_seq_counter__"
> + printf "$fmt\n" "$test_seq_counter__"
> test_seq_counter__=$(( $test_seq_counter__ + 1 ))
> done
> }
Is it a sharp edge worth caring about that someone might
write `test_seq -f 1 5` where we'd pass 1 as the format
string?
If so, perhaps a check like this might be sufficient to
catch it early?
diff --git i/t/test-lib-functions.sh w/t/test-lib-functions.sh
index 8c176f4efc..87b59d5895 100644
--- i/t/test-lib-functions.sh
+++ w/t/test-lib-functions.sh
@@ -1458,6 +1458,10 @@ test_seq () {
case "$1" in
-f)
fmt="$2"
+ case "$fmt" in
+ *%*) : ;;
+ *) BUG "no % in -f argument" ;;
+ esac
shift 2
;;
esac
I don't know whether it's worth the extra code or not. I
just wondered about how it would fail in the face of a minor
typo. It certainly should cause any test to fail if it were
to output 1 instead of the intended format string, so it's
arguably fine as-is.
Adding -f to the usage note above, as Justin suggested might
help folks avoid making the mistake of cuddling the format
string against -f, e.g.: -f%d. That is caught by the
parameter count check (though perhaps not everyone would
notice why, thinking they did pass an argument to -f).
--
Todd
^ permalink raw reply
* Re: [PATCH 2/2] test-lib: teach test_seq the -f option
From: Justin Tobler @ 2025-06-23 16:25 UTC (permalink / raw)
To: Jeff King; +Cc: git
In-Reply-To: <20250623105625.GB654412@coredump.intra.peff.net>
On 25/06/23 06:56AM, Jeff King wrote:
> The "seq" tool has a "-f" option to produce printf-style formatted
> lines. Let's teach our test_seq helper the same trick. This lets us get
> rid of some shell loops in test snippets (which are particularly verbose
> in our test suite because we have to "|| return 1" to keep the &&-chain
> going).
>
> This converts a few call-sites I found by grepping around the test
> suite. A few notes on these:
>
> - In "seq", the format specifier is a "%g" float. Since test_seq only
> supports integers, I've kept the more natural "%d" (which is what
> these call sites were using already).
Sticking with "%d" definately feels more natural.
> - Like "seq", test_seq automatically adds a newline to the specified
> format. This is what all callers are doing already except for t0021,
> but there we do not care about the exact format. We are just trying
> to printf a large number of bytes to a file. It's not worth
> complicating other callers or adding an option to avoid the newline
> in that caller.
>
> - Most conversions are just replacing a shell loop (which does get rid
> of an extra fork, since $() requires a subshell). In t0612 we can
> replace an awk invocation, which I think makes the end result more
> readable, as there's less quoting.
>
> - In t7422 we can replace one loop, but sadly we have to leave the
> loop directly above it. This is because that earlier loop wants to
> include the seq value twice in the output, which test_seq does not
> support (nor does regular seq). If you run:
>
> test_seq -f "foo-%d %d" 10
>
> the second "%d" will always be the empty string. You might naively
> think that test_seq could add some extra arguments, like:
>
> # 3 ought to be enough for anyone...
> printf "$fmt\n" "$i "$i" $i"
>
> but that just triggers printf to format multiple lines, one per
> extra set of arguments.
>
> So we'd have to actually parse the format string, figure out how
> many "%" placeholders are there, and then feed it that many
> instances of the sequence number. The complexity isn't worth it.
>
> Signed-off-by: Jeff King <peff@peff.net>
> ---
> t/t0021-conversion.sh | 4 ++--
> t/t0610-reftable-basics.sh | 6 +-----
> t/t0612-reftable-jgit-compatibility.sh | 13 +++++--------
> t/t0613-reftable-write-options.sh | 24 ++++--------------------
> t/t1400-update-ref.sh | 10 ++--------
> t/t5004-archive-corner-cases.sh | 5 +----
> t/t6422-merge-rename-corner-cases.sh | 10 ++--------
> t/t7422-submodule-output.sh | 6 +-----
> t/test-lib-functions.sh | 9 ++++++++-
> 9 files changed, 26 insertions(+), 61 deletions(-)
>
> diff --git a/t/t0021-conversion.sh b/t/t0021-conversion.sh
> index bf10d253ec..f0d50d769e 100755
> --- a/t/t0021-conversion.sh
> +++ b/t/t0021-conversion.sh
> @@ -281,7 +281,7 @@ test_expect_success 'required filter with absent smudge field' '
> test_expect_success 'filtering large input to small output should use little memory' '
> test_config filter.devnull.clean "cat >/dev/null" &&
> test_config filter.devnull.required true &&
> - for i in $(test_seq 1 30); do printf "%1048576d" 1 || return 1; done >30MB &&
> + test_seq -f "%1048576d" 1 30 >30MB &&
Very nice quality of life improvement indeed. :)
> echo "30MB filter=devnull" >.gitattributes &&
> GIT_MMAP_LIMIT=1m GIT_ALLOC_LIMIT=1m git add 30MB
> '
[snip]
> diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
> index bee4a2ca34..8c176f4efc 100644
> --- a/t/test-lib-functions.sh
> +++ b/t/test-lib-functions.sh
> @@ -1454,6 +1454,13 @@ test_cmp_fspath () {
> # from 1.
>
> test_seq () {
> + local fmt="%d"
> + case "$1" in
> + -f)
> + fmt="$2"
With the `-f` option, the default format string gets overwritten to what
is provided by the user. Makes sense.
If we want, we could update the comment above this function to mention
this new option.
> + shift 2
> + ;;
> + esac
> case $# in
> 1) set 1 "$@" ;;
> 2) ;;
> @@ -1462,7 +1469,7 @@ test_seq () {
> test_seq_counter__=$1
> while test "$test_seq_counter__" -le "$2"
> do
> - echo "$test_seq_counter__"
> + printf "$fmt\n" "$test_seq_counter__"
Nice and simple! Each of the updated callsites also look good to me.
-Justin
> test_seq_counter__=$(( $test_seq_counter__ + 1 ))
> done
> }
> --
> 2.50.0.385.g2a828bf5b7
>
^ permalink raw reply
* Re: [PATCH v19 00/10] imap-send: make it usable again and add OAuth2.0 support
From: Junio C Hamano @ 2025-06-23 16:27 UTC (permalink / raw)
To: Phillip Wood
Cc: Aditya Garg, git, Eric Sunshine, Zi Yao, brian m . carlson,
Jeff King, Ben Knoble
In-Reply-To: <c787a41c-97c6-437f-aae0-52132c79db7c@gmail.com>
Phillip Wood <phillip.wood123@gmail.com> writes:
> On 20/06/2025 16:50, Junio C Hamano wrote:
>> Aditya Garg <gargaditya08@live.com> writes:
>>
>>> v19: - Use xstrfmt() for OAuth2 strings and strbuf for PLAIN.
>>>
>>> Aditya Garg (10):
>>> imap-send: fix bug causing cfg->folder being set to NULL
>>> imap-send: fix memory leak in case auth_cram_md5 fails
>>> imap-send: gracefully fail if CRAM-MD5 authentication is requested
>>> without OpenSSL
>>> imap-send: add support for OAuth2.0 authentication
>>> imap-send: add PLAIN authentication method to OpenSSL
>>> imap-send: enable specifying the folder using the command line
>>> imap-send: add ability to list the available folders
>>> imap-send: display port alongwith host when git credential is invoked
>>> imap-send: display the destination mailbox when sending a message
>>> imap-send: fix minor mistakes in the logs
>>>
>>> Documentation/config/imap.adoc | 11 +-
>>> Documentation/git-imap-send.adoc | 68 +++++-
>>> imap-send.c | 405 ++++++++++++++++++++++++++-----
>>> 3 files changed, 407 insertions(+), 77 deletions(-)
>> Looking good. Will replace.
>> Should we declare victory and mark the topic for 'next' now?
>
> I think so, the range diff looks good. I've not reviewed each patch
> but I just had a quick scan of
>
> git diff origin/master origin/seen imap-send.c
>
> and it looked reasonable.
>
> Best Wishes
>
> Phillip
Thanks.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox