Git development
 help / color / mirror / Atom feed
* [PATCH 08/10] t5401: speed up creation of many branches
From: Patrick Steinhardt @ 2023-11-29  7:25 UTC (permalink / raw)
  To: git
In-Reply-To: <cover.1701242407.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 1610 bytes --]

One of the tests in t5401 creates a bunch of branches by calling
git-branch(1) for every one of them. This is quite inefficient and takes
a comparatively long time even on Unix systems where spawning processes
is comparatively fast. Refactor it to instead use git-update-ref(1),
which leads to an almost 10-fold speedup:

```
Benchmark 1: ./t5401-update-hooks.sh (rev = HEAD)
  Time (mean ± σ):     983.2 ms ±  97.6 ms    [User: 328.8 ms, System: 679.2 ms]
  Range (min … max):   882.9 ms … 1078.0 ms    3 runs

Benchmark 2: ./t5401-update-hooks.sh (rev = HEAD~)
  Time (mean ± σ):      9.312 s ±  0.398 s    [User: 2.766 s, System: 6.617 s]
  Range (min … max):    8.885 s …  9.674 s    3 runs

Summary
  ./t5401-update-hooks.sh (rev = HEAD) ran
    9.47 ± 1.02 times faster than ./t5401-update-hooks.sh (rev = HEAD~)
```

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 t/t5401-update-hooks.sh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/t/t5401-update-hooks.sh b/t/t5401-update-hooks.sh
index 001b7a17ad..8b8bc47dc0 100755
--- a/t/t5401-update-hooks.sh
+++ b/t/t5401-update-hooks.sh
@@ -133,10 +133,8 @@ test_expect_success 'pre-receive hook that forgets to read its input' '
 	EOF
 	rm -f victim.git/hooks/update victim.git/hooks/post-update &&
 
-	for v in $(test_seq 100 999)
-	do
-		git branch branch_$v main || return
-	done &&
+	printf "create refs/heads/branch_%d main\n" $(test_seq 100 999) >input &&
+	git update-ref --stdin <input &&
 	git push ./victim.git "+refs/heads/*:refs/heads/*"
 '
 
-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH 07/10] t4013: simplify magic parsing and drop "failure"
From: Patrick Steinhardt @ 2023-11-29  7:25 UTC (permalink / raw)
  To: git
In-Reply-To: <cover.1701242407.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 2155 bytes --]

In t14013, we have various different tests that verify whether certain
diffs are generated as expected. As much of the logic is the same across
many of the tests we some common code in there that generates the actual
test cases for us.

As some diffs are more special than others depending on the command line
parameters passed to git-diff(1), these tests need to adapt behaviour to
the specific test case sometimes. This is done via colon-prefixed magic
commands, of which we currently know "failure" and "noellipses". The
logic to parse this magic is a bit convoluted though and hard to grasp,
also due to the rather unnecessary nesting.

Un-nest the cases so that it becomes a bit more straightfoward. The
logic is further simplified by removing support for the "failure" magic,
which is not actually used anymore.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 t/t4013-diff-various.sh | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/t/t4013-diff-various.sh b/t/t4013-diff-various.sh
index 5cc17c2e0d..76b8619e2e 100755
--- a/t/t4013-diff-various.sh
+++ b/t/t4013-diff-various.sh
@@ -178,32 +178,29 @@ process_diffs () {
 V=$(git version | sed -e 's/^git version //' -e 's/\./\\./g')
 while read magic cmd
 do
-	status=success
 	case "$magic" in
 	'' | '#'*)
 		continue ;;
-	:*)
-		magic=${magic#:}
+	:noellipses)
+		magic=noellipses
 		label="$magic-$cmd"
-		case "$magic" in
-		noellipses) ;;
-		failure)
-			status=failure
-			magic=
-			label="$cmd" ;;
-		*)
-			BUG "unknown magic $magic" ;;
-		esac ;;
+		;;
+	:*)
+		BUG "unknown magic $magic"
+		;;
 	*)
-		cmd="$magic $cmd" magic=
-		label="$cmd" ;;
+		cmd="$magic $cmd"
+		magic=
+		label="$cmd"
+		;;
 	esac
+
 	test=$(echo "$label" | sed -e 's|[/ ][/ ]*|_|g')
 	pfx=$(printf "%04d" $test_count)
 	expect="$TEST_DIRECTORY/t4013/diff.$test"
 	actual="$pfx-diff.$test"
 
-	test_expect_$status "git $cmd # magic is ${magic:-(not used)}" '
+	test_expect_success "git $cmd # magic is ${magic:-(not used)}" '
 		{
 			echo "$ git $cmd"
 			case "$magic" in
-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH 06/10] t3310: stop checking for reference existence via `test -f`
From: Patrick Steinhardt @ 2023-11-29  7:25 UTC (permalink / raw)
  To: git
In-Reply-To: <cover.1701242407.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 1314 bytes --]

One of the tests in t3310 exercises whether the special references
`NOTES_MERGE_PARTIAL` and `NOTES_MERGE_REF` exist as expected when the
notes subsystem runs into a merge conflict. This is done by checking
on-disk data structures directly though instead of asking the reference
backend.

Refactor the test to use git-rev-parse(1) instead.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 t/t3310-notes-merge-manual-resolve.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/t/t3310-notes-merge-manual-resolve.sh b/t/t3310-notes-merge-manual-resolve.sh
index 60d6ed2dc8..597df5ebc0 100755
--- a/t/t3310-notes-merge-manual-resolve.sh
+++ b/t/t3310-notes-merge-manual-resolve.sh
@@ -561,9 +561,9 @@ y and z notes on 4th commit
 EOF
 	# Fail to finalize merge
 	test_must_fail git notes merge --commit >output 2>&1 &&
-	# .git/NOTES_MERGE_* must remain
-	test -f .git/NOTES_MERGE_PARTIAL &&
-	test -f .git/NOTES_MERGE_REF &&
+	# NOTES_MERGE_* refs and .git/NOTES_MERGE_* state files must remain
+	git rev-parse --verify NOTES_MERGE_PARTIAL &&
+	git rev-parse --verify NOTES_MERGE_REF &&
 	test -f .git/NOTES_MERGE_WORKTREE/$commit_sha1 &&
 	test -f .git/NOTES_MERGE_WORKTREE/$commit_sha2 &&
 	test -f .git/NOTES_MERGE_WORKTREE/$commit_sha3 &&
-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH 05/10] t1417: make `reflog --updateref` tests backend agnostic
From: Patrick Steinhardt @ 2023-11-29  7:24 UTC (permalink / raw)
  To: git
In-Reply-To: <cover.1701242407.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 1320 bytes --]

The tests for `git reflog delete --updateref` are currently marked to
only run with the reffiles backend. There is no inherent reason that
this should be the case other than the fact that the setup messes with
the on-disk reflogs directly.

Refactor the test to stop doing so and drop the REFFILES prerequisite.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 t/t1417-reflog-updateref.sh | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/t/t1417-reflog-updateref.sh b/t/t1417-reflog-updateref.sh
index 14f13b57c6..0eb5e674bc 100755
--- a/t/t1417-reflog-updateref.sh
+++ b/t/t1417-reflog-updateref.sh
@@ -14,9 +14,13 @@ test_expect_success 'setup' '
 		test_commit B &&
 		test_commit C &&
 
-		cp .git/logs/HEAD HEAD.old &&
+		git reflog HEAD >expect &&
 		git reset --hard HEAD~ &&
-		cp HEAD.old .git/logs/HEAD
+		# Make sure that the reflog does not point to the same commit
+		# as HEAD.
+		git reflog delete HEAD@{0} &&
+		git reflog HEAD >actual &&
+		test_cmp expect actual
 	)
 '
 
@@ -25,7 +29,7 @@ test_reflog_updateref () {
 	shift
 	args="$@"
 
-	test_expect_success REFFILES "get '$exp' with '$args'"  '
+	test_expect_success "get '$exp' with '$args'"  '
 		test_when_finished "rm -rf copy" &&
 		cp -R repo copy &&
 
-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH 04/10] t1410: use test-tool to create empty reflog
From: Patrick Steinhardt @ 2023-11-29  7:24 UTC (permalink / raw)
  To: git
In-Reply-To: <cover.1701242407.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 1041 bytes --]

One of the tests in t1410 is marked to be specific to the files
reference backend, which is because we create a reflog manually by
creating the respective file. Refactor the test to instead use our
`test-tool ref-store` helper to create the reflog so that it works with
other reference backends, as well.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 t/t1410-reflog.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t1410-reflog.sh b/t/t1410-reflog.sh
index aeddc2fb3f..a0ff8d51f0 100755
--- a/t/t1410-reflog.sh
+++ b/t/t1410-reflog.sh
@@ -469,11 +469,11 @@ test_expect_success 'expire one of multiple worktrees' '
 	)
 '
 
-test_expect_success REFFILES 'empty reflog' '
+test_expect_success 'empty reflog' '
 	test_when_finished "rm -rf empty" &&
 	git init empty &&
 	test_commit -C empty A &&
-	>empty/.git/logs/refs/heads/foo &&
+	test-tool ref-store main create-reflog refs/heads/foo &&
 	git -C empty reflog expire --all 2>err &&
 	test_must_be_empty err
 '
-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH 03/10] t1401: stop treating FETCH_HEAD as real reference
From: Patrick Steinhardt @ 2023-11-29  7:24 UTC (permalink / raw)
  To: git
In-Reply-To: <cover.1701242407.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 1181 bytes --]

One of the tests in t1401 asserts that we can create a symref from a
symbolic reference to a top-level reference, which is done by linking
from `refs/heads/top-level` to `FETCH_HEAD`. But `FETCH_HEAD` is not a
proper reference and doesn't even follow the loose reference format, so
it is not a good candidate for the logic under test.

Refactor the test to use `ORIG_HEAD` instead of `FETCH_HEAD`. This also
works with other backends than the reffiles one.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 t/t1401-symbolic-ref.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t1401-symbolic-ref.sh b/t/t1401-symbolic-ref.sh
index c7745e1bf6..3241d35917 100755
--- a/t/t1401-symbolic-ref.sh
+++ b/t/t1401-symbolic-ref.sh
@@ -171,8 +171,8 @@ test_expect_success 'symbolic-ref refuses invalid target for non-HEAD' '
 '
 
 test_expect_success 'symbolic-ref allows top-level target for non-HEAD' '
-	git symbolic-ref refs/heads/top-level FETCH_HEAD &&
-	git update-ref FETCH_HEAD HEAD &&
+	git symbolic-ref refs/heads/top-level ORIG_HEAD &&
+	git update-ref ORIG_HEAD HEAD &&
 	test_cmp_rev top-level HEAD
 '
 
-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH 02/10] t1400: split up generic reflog tests from the reffile-specific ones
From: Patrick Steinhardt @ 2023-11-29  7:24 UTC (permalink / raw)
  To: git
In-Reply-To: <cover.1701242407.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 3840 bytes --]

We have a bunch of tests in t1400 that check whether we correctly read
reflog entries. These tests create the reflog by manually writing to the
respective loose file, which makes them specific to the files backend.
But while some of them do indeed exercise very specific edge cases in
the reffiles backend, most of the tests exercise generic functionality
that should be common to all backends.

Unfortunately, we can't easily adapt all of the tests to work with all
backends. Instead, split out the reffile-specific tests from the ones
that should work with all backends and refactor the generic ones to not
write to the on-disk files directly anymore.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 t/t1400-update-ref.sh | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/t/t1400-update-ref.sh b/t/t1400-update-ref.sh
index 9ac4b7036b..c41cd9b6bf 100755
--- a/t/t1400-update-ref.sh
+++ b/t/t1400-update-ref.sh
@@ -354,14 +354,28 @@ test_expect_success "verifying $m's log (logged by config)" '
 '
 
 test_expect_success 'set up for querying the reflog' '
+	git update-ref -d $m &&
+	test-tool ref-store main delete-reflog $m &&
+
+	GIT_COMMITTER_DATE="1117150320 -0500" git update-ref $m $C &&
+	GIT_COMMITTER_DATE="1117150350 -0500" git update-ref $m $A &&
+	GIT_COMMITTER_DATE="1117150380 -0500" git update-ref $m $B &&
+	GIT_COMMITTER_DATE="1117150680 -0500" git update-ref $m $F &&
+	GIT_COMMITTER_DATE="1117150980 -0500" git update-ref $m $E &&
 	git update-ref $m $D &&
-	cat >.git/logs/$m <<-EOF
+	# Delete the last reflog entry so that the tip of m and the reflog for
+	# it disagree.
+	git reflog delete $m@{0} &&
+
+	cat >expect <<-EOF &&
 	$Z $C $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150320 -0500
 	$C $A $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150350 -0500
 	$A $B $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150380 -0500
-	$F $Z $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150680 -0500
-	$Z $E $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150980 -0500
+	$B $F $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150680 -0500
+	$F $E $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150980 -0500
 	EOF
+	test-tool ref-store main for-each-reflog-ent $m >actual &&
+	test_cmp expect actual
 '
 
 ed="Thu, 26 May 2005 18:32:00 -0500"
@@ -409,13 +423,12 @@ test_expect_success 'Query "main@{2005-05-26 23:33:01}" (middle of history with
 	test_when_finished "rm -f o e" &&
 	git rev-parse --verify "main@{2005-05-26 23:33:01}" >o 2>e &&
 	echo "$B" >expect &&
-	test_cmp expect o &&
-	test_grep -F "warning: log for ref $m has gap after $gd" e
+	test_cmp expect o
 '
 test_expect_success 'Query "main@{2005-05-26 23:38:00}" (middle of history)' '
 	test_when_finished "rm -f o e" &&
 	git rev-parse --verify "main@{2005-05-26 23:38:00}" >o 2>e &&
-	echo "$Z" >expect &&
+	echo "$F" >expect &&
 	test_cmp expect o &&
 	test_must_be_empty e
 '
@@ -436,6 +449,22 @@ test_expect_success 'Query "main@{2005-05-28}" (past end of history)' '
 
 rm -f .git/$m .git/logs/$m expect
 
+test_expect_success REFFILES 'query reflog with gap' '
+	test_when_finished "git update-ref -d $m" &&
+
+	git update-ref $m $F &&
+	cat >.git/logs/$m <<-EOF &&
+	$Z $A $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150320 -0500
+	$A $B $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150380 -0500
+	$D $F $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> 1117150680 -0500
+	EOF
+
+	git rev-parse --verify "main@{2005-05-26 23:33:01}" >actual 2>stderr &&
+	echo "$B" >expect &&
+	test_cmp expect actual &&
+	test_grep -F "warning: log for ref $m has gap after $gd" stderr
+'
+
 test_expect_success 'creating initial files' '
 	test_when_finished rm -f M &&
 	echo TEST >F &&
-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH 01/10] t0410: mark tests to require the reffiles backend
From: Patrick Steinhardt @ 2023-11-29  7:24 UTC (permalink / raw)
  To: git
In-Reply-To: <cover.1701242407.git.ps@pks.im>

[-- Attachment #1: Type: text/plain, Size: 1478 bytes --]

Two of our tests in t0410 verify whether partial clones end up with the
correct repository format version and extensions. These checks require
the reffiles backend because every other backend would by necessity bump
the repository format version to be at least 1.

Mark the tests accordingly.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 t/t0410-partial-clone.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t/t0410-partial-clone.sh b/t/t0410-partial-clone.sh
index 5b7bee888d..6b6424b3df 100755
--- a/t/t0410-partial-clone.sh
+++ b/t/t0410-partial-clone.sh
@@ -49,7 +49,7 @@ test_expect_success 'convert shallow clone to partial clone' '
 	test_cmp_config -C client 1 core.repositoryformatversion
 '
 
-test_expect_success SHA1 'convert to partial clone with noop extension' '
+test_expect_success SHA1,REFFILES 'convert to partial clone with noop extension' '
 	rm -fr server client &&
 	test_create_repo server &&
 	test_commit -C server my_commit 1 &&
@@ -60,7 +60,7 @@ test_expect_success SHA1 'convert to partial clone with noop extension' '
 	git -C client fetch --unshallow --filter="blob:none"
 '
 
-test_expect_success SHA1 'converting to partial clone fails with unrecognized extension' '
+test_expect_success SHA1,REFFILES 'converting to partial clone fails with unrecognized extension' '
 	rm -fr server client &&
 	test_create_repo server &&
 	test_commit -C server my_commit 1 &&
-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply related

* [PATCH 00/10] t: more compatibility fixes with reftables
From: Patrick Steinhardt @ 2023-11-29  7:24 UTC (permalink / raw)
  To: git

[-- Attachment #1: Type: text/plain, Size: 1563 bytes --]

Hi,

this is the second patch series that refactors tests to become
compatible with the upcoming reftables backend. It's in the same spirit
as the first round of patches [1], where most of the refactorings are to
use plumbing tools to access refs or the reflog instead of modifying
on-disk data structures directly.

Patrick

[1]: <cover.1697607222.git.ps@pks.im>

Patrick Steinhardt (10):
  t0410: mark tests to require the reffiles backend
  t1400: split up generic reflog tests from the reffile-specific ones
  t1401: stop treating FETCH_HEAD as real reference
  t1410: use test-tool to create empty reflog
  t1417: make `reflog --updateref` tests backend agnostic
  t3310: stop checking for reference existence via `test -f`
  t4013: simplify magic parsing and drop "failure"
  t5401: speed up creation of many branches
  t5551: stop writing packed-refs directly
  t6301: write invalid object ID via `test-tool ref-store`

 t/t0410-partial-clone.sh              |  4 +--
 t/t1400-update-ref.sh                 | 41 +++++++++++++++++++++++----
 t/t1401-symbolic-ref.sh               |  4 +--
 t/t1410-reflog.sh                     |  4 +--
 t/t1417-reflog-updateref.sh           | 10 +++++--
 t/t3310-notes-merge-manual-resolve.sh |  6 ++--
 t/t4013-diff-various.sh               | 27 ++++++++----------
 t/t5401-update-hooks.sh               |  6 ++--
 t/t5551-http-fetch-smart.sh           |  4 ++-
 t/t6301-for-each-ref-errors.sh        | 13 ++++-----
 10 files changed, 74 insertions(+), 45 deletions(-)

-- 
2.43.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* My hub
From: Mo Mu @ 2023-11-28 23:50 UTC (permalink / raw)
  To: git


Sent from my iPhone

^ permalink raw reply

* [PATCH v5] subtree: fix split processing with multiple subtrees present
From: Zach FettersMoore via GitGitGadget @ 2023-11-28 21:17 UTC (permalink / raw)
  To: git; +Cc: Christian Couder, Zach FettersMoore, Zach FettersMoore
In-Reply-To: <pull.1587.v4.git.1698347871200.gitgitgadget@gmail.com>

From: Zach FettersMoore <zach.fetters@apollographql.com>

When there are multiple subtrees present in a repository and they are
all using 'git subtree split', the 'split' command can take a
significant (and constantly growing) amount of time to run even when
using the '--rejoin' flag. This is due to the fact that when processing
commits to determine the last known split to start from when looking
for changes, if there has been a split/merge done from another subtree
there will be 2 split commits, one mainline and one subtree, for the
second subtree that are part of the processing. The non-mainline
subtree split commit will cause the processing to always need to search
the entire history of the given subtree as part of its processing even
though those commits are totally irrelevant to the current subtree
split being run.

To see this in practice you can use the open source GitHub repo
'apollo-ios-dev' and do the following in order:

-Make a changes to a file in 'apollo-ios'A and 'apollo-ios-codegen'
 directories
-Create a commit containing these changes
-Do a split on apollo-ios-codegen
   - git subtree split --prefix=apollo-ios-codegen --squash --rejoin
-Do a split on apollo-ios
   - git subtree split --prefix=apollo-ios --squash --rejoin
-Make changes to a file in apollo-ios-codegen
-Create a commit containing the change(s)
-Do a split on apollo-ios-codegen
   - git subtree split --prefix=apollo-ios-codegen --squash --rejoin

You will see that the final split is looking for the last split
on apollo-ios-codegen to use as it's starting point to process
commits. Since there is a split commit from apollo-ios in between the
2 splits run on apollo-ios-codegen, the processing ends up traversing
the entire history of apollo-ios which increases the time it takes to
do a split based on how long of a history apollo-ios has, while none
of these commits are relevant to the split being done on
apollo-ios-codegen.

So this commit makes a change to the processing of commits for the
split command in order to ignore non-mainline commits from other
subtrees such as apollo-ios in the above breakdown by adding a new
function 'should_ignore_subtree_commit' which is called during
'process_split_commit'. This allows the split/rejoin processing to
still function as expected but removes all of the unnecessary
processing that takes place currently which greatly inflates the
processing time. In the above example, previously the final split
would take ~10-12 minutes, while after this fix it takes seconds.

Added a test to validate that the proposed fix
solves the issue.

The test accomplishes this by checking the output
of the split command to ensure the output from
the progress of 'process_split_commit' function
that represents the 'extracount' of commits
processed remains at 0, meaning none of the commits
from the second subtree were processed.

This was tested against the original functionality
to show the test failed, and then with this fix
to show the test passes.

This illustrated that when using multiple subtrees,
A and B, when doing a split on subtree B, the
processing does not traverse the entire history
of subtree A which is unnecessary and would cause
the 'extracount' of processed commits to climb
based on the number of commits in the history of
subtree A.

Signed-off-by: Zach FettersMoore <zach.fetters@apollographql.com>
---
    subtree: fix split processing with multiple subtrees present
    
    When there are multiple subtrees in a repo and git subtree split
    --rejoin is being used for the subtrees, the processing of commits for a
    new split can take a significant (and constantly growing) amount of time
    because the split commits from other subtrees cause the processing to
    have to scan the entire history of the other subtree(s). This patch
    filters out the other subtree split commits that are unnecessary for the
    split commit processing.

Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-1587%2FBobaFetters%2Fzf%2Fmulti-subtree-processing-v5
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-1587/BobaFetters/zf/multi-subtree-processing-v5
Pull-Request: https://github.com/gitgitgadget/git/pull/1587

Range-diff vs v4:

 1:  353152910eb ! 1:  e7445a95f30 subtree: fix split processing with multiple subtrees present
     @@ Commit message
          though those commits are totally irrelevant to the current subtree
          split being run.
      
     -    In the diagram below, 'M' represents the mainline repo branch, 'A'
     -    represents one subtree, and 'B' represents another. M3 and B1 represent
     -    a split commit for subtree B that was created from commit M4. M2 and A1
     -    represent a split commit made from subtree A that was also created
     -    based on changes back to and including M4. M1 represents new changes to
     -    the repo, in this scenario if you try to run a 'git subtree split
     -    --rejoin' for subtree B, commits M1, M2, and A1, will be included in
     -    the processing of changes for the new split commit since the last
     -    split/rejoin for subtree B was at M3. The issue is that by having A1
     -    included in this processing the command ends up needing to processing
     -    every commit down tree A even though none of that is needed or relevant
     -    to the current command and result.
     +    To see this in practice you can use the open source GitHub repo
     +    'apollo-ios-dev' and do the following in order:
      
     -    M1
     -     |        \       \
     -    M2         |       |
     -     |        A1       |
     -    M3         |       |
     -     |         |      B1
     -    M4         |       |
     +    -Make a changes to a file in 'apollo-ios'A and 'apollo-ios-codegen'
     +     directories
     +    -Create a commit containing these changes
     +    -Do a split on apollo-ios-codegen
     +       - git subtree split --prefix=apollo-ios-codegen --squash --rejoin
     +    -Do a split on apollo-ios
     +       - git subtree split --prefix=apollo-ios --squash --rejoin
     +    -Make changes to a file in apollo-ios-codegen
     +    -Create a commit containing the change(s)
     +    -Do a split on apollo-ios-codegen
     +       - git subtree split --prefix=apollo-ios-codegen --squash --rejoin
      
     -    So this commit makes a change to the processing of commits for the split
     -    command in order to ignore non-mainline commits from other subtrees such
     -    as A1 in the diagram by adding a new function
     -    'should_ignore_subtree_commit' which is called during
     -    'process_split_commit'. This allows the split/rejoin processing to still
     -    function as expected but removes all of the unnecessary processing that
     -    takes place currently which greatly inflates the processing time.
     +    You will see that the final split is looking for the last split
     +    on apollo-ios-codegen to use as it's starting point to process
     +    commits. Since there is a split commit from apollo-ios in between the
     +    2 splits run on apollo-ios-codegen, the processing ends up traversing
     +    the entire history of apollo-ios which increases the time it takes to
     +    do a split based on how long of a history apollo-ios has, while none
     +    of these commits are relevant to the split being done on
     +    apollo-ios-codegen.
     +
     +    So this commit makes a change to the processing of commits for the
     +    split command in order to ignore non-mainline commits from other
     +    subtrees such as apollo-ios in the above breakdown by adding a new
     +    function 'should_ignore_subtree_commit' which is called during
     +    'process_split_commit'. This allows the split/rejoin processing to
     +    still function as expected but removes all of the unnecessary
     +    processing that takes place currently which greatly inflates the
     +    processing time. In the above example, previously the final split
     +    would take ~10-12 minutes, while after this fix it takes seconds.
      
          Added a test to validate that the proposed fix
          solves the issue.
     @@ Commit message
          of the split command to ensure the output from
          the progress of 'process_split_commit' function
          that represents the 'extracount' of commits
     -    processed does not increment.
     +    processed remains at 0, meaning none of the commits
     +    from the second subtree were processed.
      
          This was tested against the original functionality
          to show the test failed, and then with this fix
     @@ contrib/subtree/git-subtree.sh: ensure_valid_ref_format () {
      +# Usage: check if a commit from another subtree should be
      +# ignored from processing for splits
      +should_ignore_subtree_split_commit () {
     -+  if test -n "$(git log -1 --grep="git-subtree-dir:" $1)"
     -+  then
     -+    if test -z "$(git log -1 --grep="git-subtree-mainline:" $1)" &&
     -+			test -z "$(git log -1 --grep="git-subtree-dir: $arg_prefix$" $1)"
     -+    then
     -+      return 0
     -+    fi
     -+  fi
     -+  return 1
     ++	assert test $# = 1
     ++	local rev="$1"
     ++	if test -n "$(git log -1 --grep="git-subtree-dir:" $rev)"
     ++	then
     ++		if test -z "$(git log -1 --grep="git-subtree-mainline:" $rev)" &&
     ++			test -z "$(git log -1 --grep="git-subtree-dir: $arg_prefix$" $rev)"
     ++		then
     ++			return 0
     ++		fi
     ++	fi
     ++	return 1
      +}
      +
       # Usage: process_split_commit REV PARENTS
     @@ contrib/subtree/git-subtree.sh: cmd_split () {
      +		then
      +			continue
      +		fi
     -+		parsedParents=''
     ++		parsedparents=''
      +		for parent in $parents
      +		do
     -+			should_ignore_subtree_split_commit "$parent"
     -+			if test $? -eq 1
     ++			if ! should_ignore_subtree_split_commit "$parent"
      +			then
     -+				parsedParents+="$parent "
     ++				parsedparents="$parsedparents$parent "
      +			fi
      +		done
     -+		process_split_commit "$rev" "$parsedParents"
     ++		process_split_commit "$rev" "$parsedparents"
       	done || exit $?
       
       	latest_new=$(cache_get latest_new) || exit $?
     @@ contrib/subtree/t/t7900-subtree.sh: test_expect_success 'split sub dir/ with --r
       	)
       '
       
     ++# Tests that commits from other subtrees are not processed as
     ++# part of a split.
     ++#
     ++# This test performs the following:
     ++# - Creates Repo with subtrees 'subA' and 'subB'
     ++# - Creates commits in the repo including changes to subtrees
     ++# - Runs the following 'split' and commit' commands in order:
     ++# 	- Perform 'split' on subtree A
     ++# 	- Perform 'split' on subtree B
     ++# 	- Create new commits with changes to subtree A and B
     ++# 	- Perform split on subtree A
     ++# 	- Check that the commits in subtree B are not processed
     ++#			as part of the subtree A split
      +test_expect_success 'split with multiple subtrees' '
      +	subtree_test_create_repo "$test_count" &&
      +	subtree_test_create_repo "$test_count/subA" &&
     @@ contrib/subtree/t/t7900-subtree.sh: test_expect_success 'split sub dir/ with --r
      +	test_create_commit "$test_count/subA" subA2 &&
      +	test_create_commit "$test_count/subA" subA3 &&
      +	test_create_commit "$test_count/subB" subB1 &&
     -+	(
     -+		cd "$test_count" &&
     -+		git fetch ./subA HEAD &&
     -+		git subtree add --prefix=subADir FETCH_HEAD
     -+	) &&
     -+	(
     -+		cd "$test_count" &&
     -+		git fetch ./subB HEAD &&
     -+		git subtree add --prefix=subBDir FETCH_HEAD
     -+	) &&
     ++	git -C "$test_count" fetch ./subA HEAD &&
     ++	git -C "$test_count" subtree add --prefix=subADir FETCH_HEAD &&
     ++	git -C "$test_count" fetch ./subB HEAD &&
     ++	git -C "$test_count" subtree add --prefix=subBDir FETCH_HEAD &&
      +	test_create_commit "$test_count" subADir/main-subA1 &&
      +	test_create_commit "$test_count" subBDir/main-subB1 &&
     -+	(
     -+		cd "$test_count" &&
     -+		git subtree split --prefix=subADir --squash --rejoin -m "Sub A Split 1"
     -+	) &&
     -+	(
     -+		cd "$test_count" &&
     -+		git subtree split --prefix=subBDir --squash --rejoin -m "Sub B Split 1"
     -+	) &&
     ++	git -C "$test_count" subtree split --prefix=subADir \
     ++		--squash --rejoin -m "Sub A Split 1" &&
     ++	git -C "$test_count" subtree split --prefix=subBDir \
     ++		--squash --rejoin -m "Sub B Split 1" &&
      +	test_create_commit "$test_count" subADir/main-subA2 &&
      +	test_create_commit "$test_count" subBDir/main-subB2 &&
     -+	(
     -+		cd "$test_count" &&
     -+		git subtree split --prefix=subADir --squash --rejoin -m "Sub A Split 2"
     -+	) &&
     -+	(
     -+		cd "$test_count" &&
     -+		test "$(git subtree split --prefix=subBDir --squash --rejoin \
     -+		 -d -m "Sub B Split 1" 2>&1 | grep -w "\[1\]")" = ""
     -+	)
     ++	git -C "$test_count" subtree split --prefix=subADir \
     ++		--squash --rejoin -m "Sub A Split 2" &&
     ++	test "$(git -C "$test_count" subtree split --prefix=subBDir \
     ++		--squash --rejoin -d -m "Sub B Split 1" 2>&1 | grep -w "\[1\]")" = ""
      +'
      +
       test_expect_success 'split sub dir/ with --rejoin from scratch' '


 contrib/subtree/git-subtree.sh     | 30 +++++++++++++++++++++-
 contrib/subtree/t/t7900-subtree.sh | 40 ++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/contrib/subtree/git-subtree.sh b/contrib/subtree/git-subtree.sh
index e0c5d3b0de6..a0bf958ea66 100755
--- a/contrib/subtree/git-subtree.sh
+++ b/contrib/subtree/git-subtree.sh
@@ -778,6 +778,22 @@ ensure_valid_ref_format () {
 		die "fatal: '$1' does not look like a ref"
 }
 
+# Usage: check if a commit from another subtree should be
+# ignored from processing for splits
+should_ignore_subtree_split_commit () {
+	assert test $# = 1
+	local rev="$1"
+	if test -n "$(git log -1 --grep="git-subtree-dir:" $rev)"
+	then
+		if test -z "$(git log -1 --grep="git-subtree-mainline:" $rev)" &&
+			test -z "$(git log -1 --grep="git-subtree-dir: $arg_prefix$" $rev)"
+		then
+			return 0
+		fi
+	fi
+	return 1
+}
+
 # Usage: process_split_commit REV PARENTS
 process_split_commit () {
 	assert test $# = 2
@@ -963,7 +979,19 @@ cmd_split () {
 	eval "$grl" |
 	while read rev parents
 	do
-		process_split_commit "$rev" "$parents"
+		if should_ignore_subtree_split_commit "$rev"
+		then
+			continue
+		fi
+		parsedparents=''
+		for parent in $parents
+		do
+			if ! should_ignore_subtree_split_commit "$parent"
+			then
+				parsedparents="$parsedparents$parent "
+			fi
+		done
+		process_split_commit "$rev" "$parsedparents"
 	done || exit $?
 
 	latest_new=$(cache_get latest_new) || exit $?
diff --git a/contrib/subtree/t/t7900-subtree.sh b/contrib/subtree/t/t7900-subtree.sh
index 49a21dd7c9c..ca4df5be832 100755
--- a/contrib/subtree/t/t7900-subtree.sh
+++ b/contrib/subtree/t/t7900-subtree.sh
@@ -385,6 +385,46 @@ test_expect_success 'split sub dir/ with --rejoin' '
 	)
 '
 
+# Tests that commits from other subtrees are not processed as
+# part of a split.
+#
+# This test performs the following:
+# - Creates Repo with subtrees 'subA' and 'subB'
+# - Creates commits in the repo including changes to subtrees
+# - Runs the following 'split' and commit' commands in order:
+# 	- Perform 'split' on subtree A
+# 	- Perform 'split' on subtree B
+# 	- Create new commits with changes to subtree A and B
+# 	- Perform split on subtree A
+# 	- Check that the commits in subtree B are not processed
+#			as part of the subtree A split
+test_expect_success 'split with multiple subtrees' '
+	subtree_test_create_repo "$test_count" &&
+	subtree_test_create_repo "$test_count/subA" &&
+	subtree_test_create_repo "$test_count/subB" &&
+	test_create_commit "$test_count" main1 &&
+	test_create_commit "$test_count/subA" subA1 &&
+	test_create_commit "$test_count/subA" subA2 &&
+	test_create_commit "$test_count/subA" subA3 &&
+	test_create_commit "$test_count/subB" subB1 &&
+	git -C "$test_count" fetch ./subA HEAD &&
+	git -C "$test_count" subtree add --prefix=subADir FETCH_HEAD &&
+	git -C "$test_count" fetch ./subB HEAD &&
+	git -C "$test_count" subtree add --prefix=subBDir FETCH_HEAD &&
+	test_create_commit "$test_count" subADir/main-subA1 &&
+	test_create_commit "$test_count" subBDir/main-subB1 &&
+	git -C "$test_count" subtree split --prefix=subADir \
+		--squash --rejoin -m "Sub A Split 1" &&
+	git -C "$test_count" subtree split --prefix=subBDir \
+		--squash --rejoin -m "Sub B Split 1" &&
+	test_create_commit "$test_count" subADir/main-subA2 &&
+	test_create_commit "$test_count" subBDir/main-subB2 &&
+	git -C "$test_count" subtree split --prefix=subADir \
+		--squash --rejoin -m "Sub A Split 2" &&
+	test "$(git -C "$test_count" subtree split --prefix=subBDir \
+		--squash --rejoin -d -m "Sub B Split 1" 2>&1 | grep -w "\[1\]")" = ""
+'
+
 test_expect_success 'split sub dir/ with --rejoin from scratch' '
 	subtree_test_create_repo "$test_count" &&
 	test_create_commit "$test_count" main1 &&

base-commit: bda494f4043963b9ec9a1ecd4b19b7d1cd9a0518
-- 
gitgitgadget

^ permalink raw reply related

* Re: [PATCH v4] subtree: fix split processing with multiple subtrees present
From: Zach FettersMoore @ 2023-11-28 21:04 UTC (permalink / raw)
  To: Christian Couder; +Cc: Zach FettersMoore via GitGitGadget, git
In-Reply-To: <CAP8UFD18Hh=m8HQibAgZW1KNAn6zg_rxe9asg0ViC5z27W=Smw@mail.gmail.com>

>> In the diagram below, 'M' represents the mainline repo branch, 'A'
>> represents one subtree, and 'B' represents another. M3 and B1 represent
>> a split commit for subtree B that was created from commit M4. M2 and A1
>> represent a split commit made from subtree A that was also created
>> based on changes back to and including M4. M1 represents new changes to
>> the repo, in this scenario if you try to run a 'git subtree split
>> --rejoin' for subtree B, commits M1, M2, and A1, will be included in
>> the processing of changes for the new split commit since the last
>> split/rejoin for subtree B was at M3. The issue is that by having A1
>> included in this processing the command ends up needing to processing
>> every commit down tree A even though none of that is needed or relevant
>> to the current command and result.
>>
>> M1
>>  |        \       \
>> M2         |       |
>>  |        A1       |
>> M3         |       |
>>  |         |      B1
>> M4         |       |

> About the above, Junio already commented the following:
>
> -> The above paragraph explains which different things you drew in the
> -> diagram are representing, but it is not clear how they relate to
> -> each other.  Do they for example depict parent-child commit
> -> relationship?  What are the wide gaps between these three tracks and
> -> what are the short angled lines leaning to the left near the tip?
> -> Is the time/topology flowing from bottom to top?
>
> and it doesn't look like you have addressed that comment.
>
> When you say "M3 and B1 represent a split commit for subtree B that
> was created from commit M4." I am not sure what it means exactly.
> Could you give example commands that could have created the M3 and B1
> commits?

I removed the diagram from the commit message since it seems a little
unclear, and in its place I added an example of an open source repo
(which I am currently using the fix in) and the commands to replicate
the issue. Hopefully that better illustrates how I came across the issue
and what it is.

>> So this commit makes a change to the processing of commits for the split
>> command in order to ignore non-mainline commits from other subtrees such
>> as A1 in the diagram by adding a new function
>> 'should_ignore_subtree_commit' which is called during
>> 'process_split_commit'. This allows the split/rejoin processing to still
>> function as expected but removes all of the unnecessary processing that
>> takes place currently which greatly inflates the processing time.

> Could you tell a bit more what kind of processing time reduction is or
> would be possible on what kind of repo? Have you benchmark-ed or just
> timed this somehow on one of your repos or better on an open source
> repo (so that we could reproduce if we wanted)?

I added some extra info for this to the commit message as well, but to
answer your question yes I discovered and benchmarked this issue in a
repo I help maintain. I was seeing splits take upwards of 12 minutes
before the fix, and after they were taking only seconds. Also provided
infor on the repo and how to reproduce in the updated commit message.

>> Added a test to validate that the proposed fix
>> solves the issue.
>>
>> The test accomplishes this by checking the output
>> of the split command to ensure the output from
>> the progress of 'process_split_commit' function
>> that represents the 'extracount' of commits
>> processed does not increment.

> Does not increment compared to what?

I reworded this to say the 'extracount' remains at 0 since
there should be no extra processed commits from the second subtree
in the test.

>> This was tested against the original functionality
>> to show the test failed, and then with this fix
>> to show the test passes.
>>
>> This illustrated that when using multiple subtrees,
>> A and B, when doing a split on subtree B, the
>> processing does not traverse the entire history
>> of subtree A which is unnecessary and would cause
>> the 'extracount' of processed commits to climb
>> based on the number of commits in the history of
>> subtree A.

> Does this mean that the test checks that the extracount is the same
> when subtree A exists as when it doesn't exist?

This means the test is checking that the 'extracount' remains at
0, because anything above 0 would mean commits from subtree A were
being processed, which is where the issue stems from.

>>  contrib/subtree/git-subtree.sh     | 29 ++++++++++++++++++++-
>>  contrib/subtree/t/t7900-subtree.sh | 42 ++++++++++++++++++++++++++++++
>>  2 files changed, 70 insertions(+), 1 deletion(-)
>>
>> diff --git a/contrib/subtree/git-subtree.sh b/contrib/subtree/git-subtree=
>.sh
>> index e0c5d3b0de6..e69991a9d80 100755
>> --- a/contrib/subtree/git-subtree.sh
>> +++ b/contrib/subtree/git-subtree.sh
>> @@ -778,6 +778,20 @@ ensure_valid_ref_format () {
>>                 die "fatal: '$1' does not look like a ref"
>>  }
>>
>> +# Usage: check if a commit from another subtree should be
>> +# ignored from processing for splits
>> +should_ignore_subtree_split_commit () {

> Maybe adding:
>
>     assert test $# =3D 1
>     local rev=3D"$1"
>
> here, and using $rev instead of $1 in this function could make things
> a bit clearer and similar to what is done in other functions.

Updated.

>> +  if test -n "$(git log -1 --grep=3D"git-subtree-dir:" $1)"
>> +  then
>> +    if test -z "$(git log -1 --grep=3D"git-subtree-mainline:" $1)" &&
>> +                       test -z "$(git log -1 --grep=3D"git-subtree-dir: =
>>  $arg_prefix$" $1)"
>> +    then
>> +      return 0
>> +    fi
>> +  fi
>> +  return 1
>> +}

> The above doesn't seem to be properly indented. We use tabs not spaces.

Fixed.

>>  # Usage: process_split_commit REV PARENTS
>>  process_split_commit () {
>>         assert test $# =3D 2
>> @@ -963,7 +977,20 @@ cmd_split () {
>>         eval "$grl" |
>>         while read rev parents
>>         do
>> -               process_split_commit "$rev" "$parents"
>> +               if should_ignore_subtree_split_commit "$rev"
>> +               then
>> +                       continue
>> +               fi
>> +               parsedParents=3D''

> It seems to me that we name variables "parsed_parents" (or sometimes
> "parsedparents") rather than "parsedParents".

Fixed.

>> +               for parent in $parents
>> +               do
>> +                       should_ignore_subtree_split_commit "$parent"
>> +                       if test $? -eq 1

> I think the 2 lines above could be replaced by:
>
> +                       if ! should_ignore_subtree_split_commit "$parent"

Updated.

>> +                       then
>> +                               parsedParents+=3D"$parent "

> It doesn't seem to me that we use "+=3D" much in our shell scripts.
> https://www.shellcheck.net/ emits the following:
>
> (warning): In POSIX sh, +=3D is undefined.
>
> so I guess we don't use it because it's not available in some usual shells.
>
> (I haven't checked the script with https://www.shellcheck.net/ before
> and after your patch, but it might help avoid bash-isms and such
> issues.)

Updated this to remove the '+=' usage.

>> +                       fi
>> +               done
>> +               process_split_commit "$rev" "$parsedParents"
>>         done || exit $?

> It looks like we use "exit $?" a lot in git-subtree.sh while we use
> just "exit" most often elsewhere. Not sure why.

Yea I am unsure of the reasoning of that, I was just trying to follow the
what the existing script was already doing.

>>         latest_new=3D$(cache_get latest_new) || exit $?
>> diff --git a/contrib/subtree/t/t7900-subtree.sh b/contrib/subtree/t/t7900=
>> -subtree.sh
>> index 49a21dd7c9c..87d59afd761 100755
>> --- a/contrib/subtree/t/t7900-subtree.sh
>> +++ b/contrib/subtree/t/t7900-subtree.sh
>> @@ -385,6 +385,48 @@ test_expect_success 'split sub dir/ with --rejoin' '
>>         )
>>  '
>>
>> +test_expect_success 'split with multiple subtrees' '
>> +       subtree_test_create_repo "$test_count" &&
>> +       subtree_test_create_repo "$test_count/subA" &&
>> +       subtree_test_create_repo "$test_count/subB" &&
>> +       test_create_commit "$test_count" main1 &&
>> +       test_create_commit "$test_count/subA" subA1 &&
>> +       test_create_commit "$test_count/subA" subA2 &&
>> +       test_create_commit "$test_count/subA" subA3 &&
>> +       test_create_commit "$test_count/subB" subB1 &&
>> +       (
>> +               cd "$test_count" &&
>> +               git fetch ./subA HEAD &&
>> +               git subtree add --prefix=3DsubADir FETCH_HEAD
>> +       ) &&
>> +       (
>> +               cd "$test_count" &&
>> +               git fetch ./subB HEAD &&
>> +               git subtree add --prefix=3DsubBDir FETCH_HEAD
>> +       ) &&
>> +       test_create_commit "$test_count" subADir/main-subA1 &&
>> +       test_create_commit "$test_count" subBDir/main-subB1 &&
>> +       (
>> +               cd "$test_count" &&
>> +               git subtree split --prefix=3DsubADir --squash --rejoin -m=
>> "Sub A Split 1"
>> +       ) &&

> Not sure why there are so many sub-shells used, and why the -C option
> is not used instead to tell Git to work in a subdirectory. I guess you
> copied what most existing (old) tests in this test script do.
>
> For example perhaps the 4 line above could be replaced by just:
>
> +               git -C "$test_count" subtree split --prefix=3DsubADir
> --squash --rejoin -m "Sub A Split 1" &&

Yea I was following what was being done in other existing tests, although
this seems like a better way to do this so I updated the test to remove
the extra sub-shells.

>> +       (
>> +               cd "$test_count" &&
>> +               git subtree split --prefix=3DsubBDir --squash --rejoin -m=
>> "Sub B Split 1"
>> +       ) &&
>> +       test_create_commit "$test_count" subADir/main-subA2 &&
>> +       test_create_commit "$test_count" subBDir/main-subB2 &&
>> +       (
>> +               cd "$test_count" &&
>> +               git subtree split --prefix=3DsubADir --squash --rejoin -m=
>> "Sub A Split 2"
>> +       ) &&
>> +       (
>> +               cd "$test_count" &&
>> +               test "$(git subtree split --prefix=3DsubBDir --squash --r=
>> ejoin \
>> +                -d -m "Sub B Split 1" 2>&1 | grep -w "\[1\]")" =3D ""
>> +       )
>> +'

> It's not clear to me what the test is doing. Maybe you could split it
> into 2 tests. Perhaps one setting up a repo with multiple subtrees and
> one checking that a new split ignores other subtree split commits.
> Perhaps adding a few comments would help too.

Added some comments before the test to describe the steps the test is taking in
order to verify the desired behavior.


On Sat, Nov 18, 2023 at 6:28 AM Christian Couder
<christian.couder@gmail.com> wrote:
>
> On Thu, Oct 26, 2023 at 9:18 PM Zach FettersMoore via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
> >
> > From: Zach FettersMoore <zach.fetters@apollographql.com>
> >
> > When there are multiple subtrees present in a repository and they are
> > all using 'git subtree split', the 'split' command can take a
> > significant (and constantly growing) amount of time to run even when
> > using the '--rejoin' flag. This is due to the fact that when processing
> > commits to determine the last known split to start from when looking
> > for changes, if there has been a split/merge done from another subtree
> > there will be 2 split commits, one mainline and one subtree, for the
> > second subtree that are part of the processing. The non-mainline
> > subtree split commit will cause the processing to always need to search
> > the entire history of the given subtree as part of its processing even
> > though those commits are totally irrelevant to the current subtree
> > split being run.
>
> Thanks for your continued work on this!
>
> I am not familiar with git subtree so I might miss obvious things. On
> the other hand, my comments might help increase a bit the number of
> people who could review this patch.
>
> > In the diagram below, 'M' represents the mainline repo branch, 'A'
> > represents one subtree, and 'B' represents another. M3 and B1 represent
> > a split commit for subtree B that was created from commit M4. M2 and A1
> > represent a split commit made from subtree A that was also created
> > based on changes back to and including M4. M1 represents new changes to
> > the repo, in this scenario if you try to run a 'git subtree split
> > --rejoin' for subtree B, commits M1, M2, and A1, will be included in
> > the processing of changes for the new split commit since the last
> > split/rejoin for subtree B was at M3. The issue is that by having A1
> > included in this processing the command ends up needing to processing
> > every commit down tree A even though none of that is needed or relevant
> > to the current command and result.
> >
> > M1
> >  |        \       \
> > M2         |       |
> >  |        A1       |
> > M3         |       |
> >  |         |      B1
> > M4         |       |
>
> About the above, Junio already commented the following:
>
> -> The above paragraph explains which different things you drew in the
> -> diagram are representing, but it is not clear how they relate to
> -> each other.  Do they for example depict parent-child commit
> -> relationship?  What are the wide gaps between these three tracks and
> -> what are the short angled lines leaning to the left near the tip?
> -> Is the time/topology flowing from bottom to top?
>
> and it doesn't look like you have addressed that comment.
>
> When you say "M3 and B1 represent a split commit for subtree B that
> was created from commit M4." I am not sure what it means exactly.
> Could you give example commands that could have created the M3 and B1
> commits?
>
> > So this commit makes a change to the processing of commits for the split
> > command in order to ignore non-mainline commits from other subtrees such
> > as A1 in the diagram by adding a new function
> > 'should_ignore_subtree_commit' which is called during
> > 'process_split_commit'. This allows the split/rejoin processing to still
> > function as expected but removes all of the unnecessary processing that
> > takes place currently which greatly inflates the processing time.
>
> Could you tell a bit more what kind of processing time reduction is or
> would be possible on what kind of repo? Have you benchmark-ed or just
> timed this somehow on one of your repos or better on an open source
> repo (so that we could reproduce if we wanted)?
>
> > Added a test to validate that the proposed fix
> > solves the issue.
> >
> > The test accomplishes this by checking the output
> > of the split command to ensure the output from
> > the progress of 'process_split_commit' function
> > that represents the 'extracount' of commits
> > processed does not increment.
>
> Does not increment compared to what?
>
> > This was tested against the original functionality
> > to show the test failed, and then with this fix
> > to show the test passes.
> >
> > This illustrated that when using multiple subtrees,
> > A and B, when doing a split on subtree B, the
> > processing does not traverse the entire history
> > of subtree A which is unnecessary and would cause
> > the 'extracount' of processed commits to climb
> > based on the number of commits in the history of
> > subtree A.
>
> Does this mean that the test checks that the extracount is the same
> when subtree A exists as when it doesn't exist?
>
> [...]
>
> >  contrib/subtree/git-subtree.sh     | 29 ++++++++++++++++++++-
> >  contrib/subtree/t/t7900-subtree.sh | 42 ++++++++++++++++++++++++++++++
> >  2 files changed, 70 insertions(+), 1 deletion(-)
> >
> > diff --git a/contrib/subtree/git-subtree.sh b/contrib/subtree/git-subtree.sh
> > index e0c5d3b0de6..e69991a9d80 100755
> > --- a/contrib/subtree/git-subtree.sh
> > +++ b/contrib/subtree/git-subtree.sh
> > @@ -778,6 +778,20 @@ ensure_valid_ref_format () {
> >                 die "fatal: '$1' does not look like a ref"
> >  }
> >
> > +# Usage: check if a commit from another subtree should be
> > +# ignored from processing for splits
> > +should_ignore_subtree_split_commit () {
>
> Maybe adding:
>
>     assert test $# = 1
>     local rev="$1"
>
> here, and using $rev instead of $1 in this function could make things
> a bit clearer and similar to what is done in other functions.
>
> > +  if test -n "$(git log -1 --grep="git-subtree-dir:" $1)"
> > +  then
> > +    if test -z "$(git log -1 --grep="git-subtree-mainline:" $1)" &&
> > +                       test -z "$(git log -1 --grep="git-subtree-dir: $arg_prefix$" $1)"
> > +    then
> > +      return 0
> > +    fi
> > +  fi
> > +  return 1
> > +}
>
> The above doesn't seem to be properly indented. We use tabs not spaces.
>
> >  # Usage: process_split_commit REV PARENTS
> >  process_split_commit () {
> >         assert test $# = 2
> > @@ -963,7 +977,20 @@ cmd_split () {
> >         eval "$grl" |
> >         while read rev parents
> >         do
> > -               process_split_commit "$rev" "$parents"
> > +               if should_ignore_subtree_split_commit "$rev"
> > +               then
> > +                       continue
> > +               fi
> > +               parsedParents=''
>
> It seems to me that we name variables "parsed_parents" (or sometimes
> "parsedparents") rather than "parsedParents".
>
> > +               for parent in $parents
> > +               do
> > +                       should_ignore_subtree_split_commit "$parent"
> > +                       if test $? -eq 1
>
> I think the 2 lines above could be replaced by:
>
> +                       if ! should_ignore_subtree_split_commit "$parent"
>
> > +                       then
> > +                               parsedParents+="$parent "
>
> It doesn't seem to me that we use "+=" much in our shell scripts.
> https://www.shellcheck.net/ emits the following:
>
> (warning): In POSIX sh, += is undefined.
>
> so I guess we don't use it because it's not available in some usual shells.
>
> (I haven't checked the script with https://www.shellcheck.net/ before
> and after your patch, but it might help avoid bash-isms and such
> issues.)
>
> > +                       fi
> > +               done
> > +               process_split_commit "$rev" "$parsedParents"
> >         done || exit $?
>
> It looks like we use "exit $?" a lot in git-subtree.sh while we use
> just "exit" most often elsewhere. Not sure why.
>
> >         latest_new=$(cache_get latest_new) || exit $?
> > diff --git a/contrib/subtree/t/t7900-subtree.sh b/contrib/subtree/t/t7900-subtree.sh
> > index 49a21dd7c9c..87d59afd761 100755
> > --- a/contrib/subtree/t/t7900-subtree.sh
> > +++ b/contrib/subtree/t/t7900-subtree.sh
> > @@ -385,6 +385,48 @@ test_expect_success 'split sub dir/ with --rejoin' '
> >         )
> >  '
> >
> > +test_expect_success 'split with multiple subtrees' '
> > +       subtree_test_create_repo "$test_count" &&
> > +       subtree_test_create_repo "$test_count/subA" &&
> > +       subtree_test_create_repo "$test_count/subB" &&
> > +       test_create_commit "$test_count" main1 &&
> > +       test_create_commit "$test_count/subA" subA1 &&
> > +       test_create_commit "$test_count/subA" subA2 &&
> > +       test_create_commit "$test_count/subA" subA3 &&
> > +       test_create_commit "$test_count/subB" subB1 &&
> > +       (
> > +               cd "$test_count" &&
> > +               git fetch ./subA HEAD &&
> > +               git subtree add --prefix=subADir FETCH_HEAD
> > +       ) &&
> > +       (
> > +               cd "$test_count" &&
> > +               git fetch ./subB HEAD &&
> > +               git subtree add --prefix=subBDir FETCH_HEAD
> > +       ) &&
> > +       test_create_commit "$test_count" subADir/main-subA1 &&
> > +       test_create_commit "$test_count" subBDir/main-subB1 &&
> > +       (
> > +               cd "$test_count" &&
> > +               git subtree split --prefix=subADir --squash --rejoin -m "Sub A Split 1"
> > +       ) &&
>
> Not sure why there are so many sub-shells used, and why the -C option
> is not used instead to tell Git to work in a subdirectory. I guess you
> copied what most existing (old) tests in this test script do.
>
> For example perhaps the 4 line above could be replaced by just:
>
> +               git -C "$test_count" subtree split --prefix=subADir
> --squash --rejoin -m "Sub A Split 1" &&
>
> > +       (
> > +               cd "$test_count" &&
> > +               git subtree split --prefix=subBDir --squash --rejoin -m "Sub B Split 1"
> > +       ) &&
> > +       test_create_commit "$test_count" subADir/main-subA2 &&
> > +       test_create_commit "$test_count" subBDir/main-subB2 &&
> > +       (
> > +               cd "$test_count" &&
> > +               git subtree split --prefix=subADir --squash --rejoin -m "Sub A Split 2"
> > +       ) &&
> > +       (
> > +               cd "$test_count" &&
> > +               test "$(git subtree split --prefix=subBDir --squash --rejoin \
> > +                -d -m "Sub B Split 1" 2>&1 | grep -w "\[1\]")" = ""
> > +       )
> > +'
>
> It's not clear to me what the test is doing. Maybe you could split it
> into 2 tests. Perhaps one setting up a repo with multiple subtrees and
> one checking that a new split ignores other subtree split commits.
> Perhaps adding a few comments would help too.
>
> Best,
> Christian.

^ permalink raw reply

* [PATCH 24/24] t/perf: add performance tests for multi-pack reuse
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

To ensure that we don't regress either the size or runtime performance
of multi-pack reuse, add a performance test to measure both of these.

The test partitions the objects in GIT_TEST_PERF_LARGE_REPO into 1, 10,
and 100 packs, and then tries to perform a "clone" at each stage with
both single- and multi-pack reuse enabled.

Note that the `repack_into_n_chunks()` function in this new test script
differs from the existing `repack_into_n()`. The former partitions the
repository into N equal-sized chunks, while the latter produces N packs
of five commits each (plus their objects), and then another pack with
the remainder.

On git.git, I can produce the following results on my machine:

    Test                                                            this tree
    --------------------------------------------------------------------------------
    5332.3: clone for 1-pack scenario (single-pack reuse)           1.57(2.99+0.15)
    5332.4: clone size for 1-pack scenario (single-pack reuse)               231.8M
    5332.5: clone for 1-pack scenario (multi-pack reuse)            1.79(2.96+0.21)
    5332.6: clone size for 1-pack scenario (multi-pack reuse)                231.7M
    5332.9: clone for 10-pack scenario (single-pack reuse)          3.89(16.75+0.35)
    5332.10: clone size for 10-pack scenario (single-pack reuse)             209.9M
    5332.11: clone for 10-pack scenario (multi-pack reuse)          1.56(2.99+0.17)
    5332.12: clone size for 10-pack scenario (multi-pack reuse)              224.4M
    5332.15: clone for 100-pack scenario (single-pack reuse)        8.24(54.31+0.59)
    5332.16: clone size for 100-pack scenario (single-pack reuse)            278.3M
    5332.17: clone for 100-pack scenario (multi-pack reuse)         2.13(2.44+0.33)
    5332.18: clone size for 100-pack scenario (multi-pack reuse)             357.9M

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/perf/p5332-multi-pack-reuse.sh | 81 ++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100755 t/perf/p5332-multi-pack-reuse.sh

diff --git a/t/perf/p5332-multi-pack-reuse.sh b/t/perf/p5332-multi-pack-reuse.sh
new file mode 100755
index 0000000000..5c6c575d62
--- /dev/null
+++ b/t/perf/p5332-multi-pack-reuse.sh
@@ -0,0 +1,81 @@
+#!/bin/sh
+
+test_description='tests pack performance with multi-pack reuse'
+
+. ./perf-lib.sh
+. "${TEST_DIRECTORY}/perf/lib-pack.sh"
+
+packdir=.git/objects/pack
+
+test_perf_large_repo
+
+find_pack () {
+	for idx in $packdir/pack-*.idx
+	do
+		if git show-index <$idx | grep -q "$1"
+		then
+			basename $idx
+		fi || return 1
+	done
+}
+
+repack_into_n_chunks () {
+	git repack -adk &&
+
+	test "$1" -eq 1 && return ||
+
+	find $packdir -type f | sort >packs.before &&
+
+	# partition the repository into $1 chunks of consecutive commits, and
+	# then create $1 packs with the objects reachable from each chunk
+	# (excluding any objects reachable from the previous chunks)
+	sz="$(($(git rev-list --count --all) / $1))"
+	for rev in $(git rev-list --all | awk "NR % $sz == 0" | tac)
+	do
+		pack="$(echo "$rev" | git pack-objects --revs \
+			--honor-pack-keep --delta-base-offset $packdir/pack)" &&
+		touch $packdir/pack-$pack.keep || return 1
+	done
+
+	# grab any remaining objects not packed by the previous step(s)
+	git pack-objects --revs --all --honor-pack-keep --delta-base-offset \
+		$packdir/pack &&
+
+	find $packdir -type f | sort >packs.after &&
+
+	# and install the whole thing
+	for f in $(comm -12 packs.before packs.after)
+	do
+		rm -f "$f" || return 1
+	done
+	rm -fr $packdir/*.keep
+}
+
+for nr_packs in 1 10 100
+do
+	test_expect_success "create $nr_packs-pack scenario" '
+		repack_into_n_chunks $nr_packs
+	'
+
+	test_expect_success "setup bitmaps for $nr_packs-pack scenario" '
+		find $packdir -type f -name "*.idx" | sed -e "s/.*\/\(.*\)$/+\1/g" |
+		git multi-pack-index write --stdin-packs --bitmap \
+			--preferred-pack="$(find_pack $(git rev-parse HEAD))"
+	'
+
+	for reuse in single multi
+	do
+		test_perf "clone for $nr_packs-pack scenario ($reuse-pack reuse)" "
+			git for-each-ref --format='%(objectname)' refs/heads refs/tags >in &&
+			git -c pack.allowPackReuse=$reuse pack-objects \
+				--revs --delta-base-offset --use-bitmap-index \
+				--stdout <in >result
+		"
+
+		test_size "clone size for $nr_packs-pack scenario ($reuse-pack reuse)" '
+			wc -c <result
+		'
+	done
+done
+
+test_done
-- 
2.43.0.24.g980b318f98

^ permalink raw reply related

* [PATCH 23/24] pack-bitmap: reuse objects from all disjoint packs
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

Now that both the pack-bitmap and pack-objects code are prepared to
handle marking and using objects from multiple disjoint packs for
verbatim reuse, allow marking objects from all disjoint packs as
eligible for reuse.

Within the `reuse_partial_packfile_from_bitmap()` function, we no longer
only mark the pack whose first object is at bit position zero for reuse,
and instead mark any pack which is flagged as disjoint by the MIDX as a
reuse candidate. If no such packs exist (i.e because we are reading a
MIDX written before the "DISP" chunk was introduced), then treat the
preferred pack as disjoint for the purposes of reuse. This is a safe
assumption to make since all duplicate objects are resolved in favor of
the preferred pack.

Provide a handful of test cases in a new script (t5332) exercising
interesting behavior for multi-pack reuse to ensure that we performed
all of the previous steps correctly.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/config/pack.txt |   6 +-
 builtin/pack-objects.c        |   6 +-
 pack-bitmap.c                 |  73 +++++++++---
 pack-bitmap.h                 |   3 +-
 t/t5332-multi-pack-reuse.sh   | 219 ++++++++++++++++++++++++++++++++++
 5 files changed, 290 insertions(+), 17 deletions(-)
 create mode 100755 t/t5332-multi-pack-reuse.sh

diff --git a/Documentation/config/pack.txt b/Documentation/config/pack.txt
index fe100d0fb7..9fe48d41c9 100644
--- a/Documentation/config/pack.txt
+++ b/Documentation/config/pack.txt
@@ -30,7 +30,11 @@ to linkgit:git-repack[1].
 pack.allowPackReuse::
 	When true or "single", and when reachability bitmaps are enabled,
 	pack-objects will try to send parts of the bitmapped packfile
-	verbatim. This can reduce memory and CPU usage to serve fetches,
+	verbatim. When "multi", and when a multi-pack reachability bitmap is
+	available, pack-objects will try to send parts of all packs marked as
+	disjoint by the MIDX. If only a single pack bitmap is available, and
+	`pack.allowPackReuse` is set to "multi", reuse parts of just the
+	bitmapped packfile. This can reduce memory and CPU usage to serve fetches,
 	but might result in sending a slightly larger pack. Defaults to
 	true.
 
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 4853e91251..43b77bff7c 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -233,6 +233,7 @@ static int use_bitmap_index = -1;
 static enum {
 	NO_PACK_REUSE = 0,
 	SINGLE_PACK_REUSE,
+	MULTI_PACK_REUSE,
 } allow_pack_reuse = SINGLE_PACK_REUSE;
 static enum {
 	WRITE_BITMAP_FALSE = 0,
@@ -3253,6 +3254,8 @@ static int git_pack_config(const char *k, const char *v,
 		if (res < 0) {
 			if (!strcasecmp(v, "single"))
 				allow_pack_reuse = SINGLE_PACK_REUSE;
+			else if (!strcasecmp(v, "multi"))
+				allow_pack_reuse = MULTI_PACK_REUSE;
 			else
 				die(_("invalid pack.allowPackReuse value: '%s'"), v);
 		} else if (res) {
@@ -4032,7 +4035,8 @@ static int get_object_list_from_bitmap(struct rev_info *revs)
 		reuse_partial_packfile_from_bitmap(bitmap_git,
 						   &reuse_packfiles,
 						   &reuse_packfiles_nr,
-						   &reuse_packfile_bitmap);
+						   &reuse_packfile_bitmap,
+						   allow_pack_reuse == MULTI_PACK_REUSE);
 
 	if (reuse_packfiles) {
 		reuse_packfile_objects = bitmap_popcount(reuse_packfile_bitmap);
diff --git a/pack-bitmap.c b/pack-bitmap.c
index be53fc6da5..561690c679 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -2061,10 +2061,19 @@ static void reuse_partial_packfile_from_bitmap_1(struct bitmap_index *bitmap_git
 	unuse_pack(&w_curs);
 }
 
+static void make_disjoint_pack(struct bitmapped_pack *out, struct packed_git *p)
+{
+	out->p = p;
+	out->bitmap_pos = 0;
+	out->bitmap_nr = p->num_objects;
+	out->disjoint = 1;
+}
+
 void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 					struct bitmapped_pack **packs_out,
 					size_t *packs_nr_out,
-					struct bitmap **reuse_out)
+					struct bitmap **reuse_out,
+					int multi_pack_reuse)
 {
 	struct repository *r = the_repository;
 	struct bitmapped_pack *packs = NULL;
@@ -2088,24 +2097,62 @@ void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 				free(packs);
 				return;
 			}
-			if (!pack.bitmap_nr)
-				continue; /* no objects from this pack */
-			if (pack.bitmap_pos)
-				continue; /* not preferred pack */
+
+			if (!pack.disjoint)
+				continue;
+
+			if (!multi_pack_reuse && pack.bitmap_pos) {
+				/*
+				 * If we're only reusing a single pack, skip
+				 * over any packs which are not positioned at
+				 * the beginning of the MIDX bitmap.
+				 *
+				 * This is consistent with the existing
+				 * single-pack reuse behavior, which only reuses
+				 * parts of the MIDX's preferred pack.
+				 */
+				continue;
+			}
 
 			ALLOC_GROW(packs, packs_nr + 1, packs_alloc);
 			memcpy(&packs[packs_nr++], &pack, sizeof(pack));
 
 			objects_nr += pack.p->num_objects;
+
+			if (!multi_pack_reuse)
+				break;
+		}
+
+		if (!packs_nr) {
+			/*
+			 * Old MIDXs (i.e. those written before the "DISP" chunk
+			 * existed) will not have any packs marked as disjoint.
+			 *
+			 * But we still want to perform pack reuse with the
+			 * special "preferred pack" as before. To do this, form
+			 * the singleton set containing just the preferred pack,
+			 * which is trivially disjoint with itself.
+			 *
+			 * Moreover, the MIDX is guaranteed to resolve duplicate
+			 * objects in favor of the copy in the preferred pack
+			 * (if one exists). Thus, we can safely perform pack
+			 * reuse on this pack.
+			 */
+			uint32_t preferred_pack_pos;
+			struct packed_git *preferred_pack;
+
+			preferred_pack_pos = midx_preferred_pack(bitmap_git);
+			preferred_pack = bitmap_git->midx->packs[preferred_pack_pos];
+
+			ALLOC_GROW(packs, packs_nr + 1, packs_alloc);
+
+			make_disjoint_pack(&packs[packs_nr], preferred_pack);
+			objects_nr = packs[packs_nr++].p->num_objects;
 		}
 	} else {
 		ALLOC_GROW(packs, packs_nr + 1, packs_alloc);
 
-		packs[packs_nr].p = bitmap_git->pack;
-		packs[packs_nr].bitmap_pos = 0;
-		packs[packs_nr].bitmap_nr = bitmap_git->pack->num_objects;
-		packs[packs_nr].disjoint = 1;
-
+		make_disjoint_pack(&packs[packs_nr], bitmap_git->pack);
 		objects_nr = packs[packs_nr++].p->num_objects;
 	}
 
@@ -2114,10 +2161,8 @@ void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 		word_alloc++;
 	reuse = bitmap_word_alloc(word_alloc);
 
-	if (packs_nr != 1)
-		BUG("pack reuse not yet implemented for multiple packs");
-
-	reuse_partial_packfile_from_bitmap_1(bitmap_git, packs, reuse);
+	for (i = 0; i < packs_nr; i++)
+		reuse_partial_packfile_from_bitmap_1(bitmap_git, &packs[i], reuse);
 
 	if (!bitmap_popcount(reuse)) {
 		free(packs);
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 901a3b86ed..8bb316ce52 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -81,7 +81,8 @@ uint32_t midx_preferred_pack(struct bitmap_index *bitmap_git);
 void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 					struct bitmapped_pack **packs_out,
 					size_t *packs_nr_out,
-					struct bitmap **reuse_out);
+					struct bitmap **reuse_out,
+					int multi_pack_reuse);
 int rebuild_existing_bitmaps(struct bitmap_index *, struct packing_data *mapping,
 			     kh_oid_map_t *reused_bitmaps, int show_progress);
 void free_bitmap_index(struct bitmap_index *);
diff --git a/t/t5332-multi-pack-reuse.sh b/t/t5332-multi-pack-reuse.sh
new file mode 100755
index 0000000000..a9bd3870e6
--- /dev/null
+++ b/t/t5332-multi-pack-reuse.sh
@@ -0,0 +1,219 @@
+#!/bin/sh
+
+test_description='pack-objects multi-pack reuse'
+
+. ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-bitmap.sh
+. "$TEST_DIRECTORY"/lib-disjoint.sh
+
+objdir=.git/objects
+packdir=$objdir/pack
+
+all_packs () {
+	find $packdir -type f -name "*.idx" | sed -e 's/^.*\/\([^\/]\)/\1/g'
+}
+
+all_disjoint () {
+	all_packs | sed -e 's/^/+/g'
+}
+
+test_pack_reused () {
+	test_trace2_data pack-objects pack-reused "$1"
+}
+
+test_packs_reused () {
+	test_trace2_data pack-objects packs-reused "$1"
+}
+
+
+# pack_position <object> </path/to/pack.idx
+pack_position () {
+	git show-index >objects &&
+	grep "$1" objects | cut -d" " -f1
+}
+
+test_expect_success 'setup' '
+	git config pack.allowPackReuse multi
+'
+
+test_expect_success 'preferred pack is reused without packs marked disjoint' '
+	test_commit A &&
+	test_commit B &&
+
+	A="$(echo A | git pack-objects --unpacked --delta-base-offset $packdir/pack)" &&
+	B="$(echo B | git pack-objects --unpacked --delta-base-offset $packdir/pack)" &&
+
+	git prune-packed &&
+
+	git multi-pack-index write --bitmap &&
+
+	test_must_not_be_disjoint "pack-$A.pack" &&
+	test_must_not_be_disjoint "pack-$B.pack" &&
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --revs --all >/dev/null &&
+
+	test_pack_reused 3 <trace2.txt &&
+	test_packs_reused 1 <trace2.txt
+'
+
+test_expect_success 'reuse all objects from subset of disjoint packs' '
+	test_commit C &&
+
+	C="$(echo C | git pack-objects --unpacked --delta-base-offset $packdir/pack)" &&
+
+	git prune-packed &&
+
+	cat >in <<-EOF &&
+	pack-$A.idx
+	+pack-$B.idx
+	+pack-$C.idx
+	EOF
+	git multi-pack-index write --bitmap --stdin-packs <in &&
+
+	test_must_not_be_disjoint "pack-$A.pack" &&
+	test_must_be_disjoint "pack-$B.pack" &&
+	test_must_be_disjoint "pack-$C.pack" &&
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --revs --all >/dev/null &&
+
+	test_pack_reused 6 <trace2.txt &&
+	test_packs_reused 2 <trace2.txt
+'
+
+test_expect_success 'reuse all objects from all disjoint packs' '
+	rm -fr $packdir/multi-pack-index* &&
+
+	all_disjoint >in &&
+	git multi-pack-index write --bitmap --stdin-packs <in &&
+
+	test_must_be_disjoint "pack-$A.pack" &&
+	test_must_be_disjoint "pack-$B.pack" &&
+	test_must_be_disjoint "pack-$C.pack" &&
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --revs --all >/dev/null &&
+
+	test_pack_reused 9 <trace2.txt &&
+	test_packs_reused 3 <trace2.txt
+'
+
+test_expect_success 'reuse objects from first disjoint pack with middle gap' '
+	test_commit D &&
+	test_commit E &&
+	test_commit F &&
+
+	# Set "pack.window" to zero to ensure that we do not create any
+	# deltas, which could alter the amount of pack reuse we perform
+	# (if, for e.g., we are not sending one or more bases).
+	D="$(git -c pack.window=0 pack-objects --all --unpacked $packdir/pack)" &&
+
+	d_pos="$(pack_position $(git rev-parse D) <$packdir/pack-$D.idx)" &&
+	e_pos="$(pack_position $(git rev-parse E) <$packdir/pack-$D.idx)" &&
+	f_pos="$(pack_position $(git rev-parse F) <$packdir/pack-$D.idx)" &&
+
+	# commits F, E, and D, should appear in that order at the
+	# beginning of the pack
+	test $f_pos -lt $e_pos &&
+	test $e_pos -lt $d_pos &&
+
+	# Ensure that the pack we are constructing sorts ahead of any
+	# other packs in lexical/bitmap order by choosing it as the
+	# preferred pack.
+	all_disjoint >in &&
+	git multi-pack-index write --bitmap --preferred-pack="pack-$D.idx" \
+		--stdin-packs <in &&
+
+	test_must_be_disjoint pack-$A.pack &&
+	test_must_be_disjoint pack-$B.pack &&
+	test_must_be_disjoint pack-$C.pack &&
+	test_must_be_disjoint pack-$D.pack &&
+
+	cat >in <<-EOF &&
+	$(git rev-parse E)
+	^$(git rev-parse D)
+	EOF
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --delta-base-offset --revs <in >/dev/null &&
+
+	test_pack_reused 3 <trace2.txt &&
+	test_packs_reused 1 <trace2.txt
+'
+
+test_expect_success 'reuse objects from middle disjoint pack with middle gap' '
+	rm -fr $packdir/multi-pack-index* &&
+
+	# Ensure that the pack we are constructing sort into any
+	# position *but* the first one, by choosing a different pack as
+	# the preferred one.
+	all_disjoint >in &&
+	git multi-pack-index write --bitmap --preferred-pack="pack-$A.idx" \
+		--stdin-packs <in &&
+
+	cat >in <<-EOF &&
+	$(git rev-parse E)
+	^$(git rev-parse D)
+	EOF
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --delta-base-offset --revs <in >/dev/null &&
+
+	test_pack_reused 3 <trace2.txt &&
+	test_packs_reused 1 <trace2.txt
+'
+
+test_expect_success 'omit delta with uninteresting base' '
+	git repack -adk &&
+
+	test_seq 32 >f &&
+	git add f &&
+	test_tick &&
+	git commit -m "delta" &&
+	delta="$(git rev-parse HEAD)" &&
+
+	test_seq 64 >f &&
+	test_tick &&
+	git commit -a -m "base" &&
+	base="$(git rev-parse HEAD)" &&
+
+	test_commit other &&
+
+	git repack -d &&
+
+	have_delta "$(git rev-parse $delta:f)" "$(git rev-parse $base:f)" &&
+
+	all_disjoint >in &&
+	git multi-pack-index write --bitmap --stdin-packs <in &&
+
+	cat >in <<-EOF &&
+	$(git rev-parse other)
+	^$base
+	EOF
+
+	: >trace2.txt &&
+	GIT_TRACE2_EVENT="$PWD/trace2.txt" \
+		git pack-objects --stdout --delta-base-offset --revs <in >/dev/null &&
+
+	# Even though all packs are marked disjoint, we can only reuse
+	# the 3 objects corresponding to "other" from the latest pack.
+	#
+	# This is because even though we want "delta", we do not want
+	# "base", meaning that we have to inflate the delta/base-pair
+	# corresponding to the blob in commit "delta", which bypasses
+	# the pack-reuse mechanism.
+	#
+	# The remaining objects from the other pack are similarly not
+	# reused because their objects are on the uninteresting side of
+	# the query.
+	test_pack_reused 3 <trace2.txt &&
+	test_packs_reused 1 <trace2.txt
+'
+
+test_done
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 22/24] pack-objects: allow setting `pack.allowPackReuse` to "single"
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

In e704fc7978 (pack-objects: introduce pack.allowPackReuse, 2019-12-18),
the `pack.allowPackReuse` configuration option was introduced, allowing
users to disable the pack reuse mechanism.

To prepare for debugging multi-pack reuse, allow setting configuration
to "single" in addition to the usual bool-or-int values.

"single" implies the same behavior as "true", "1", "yes", and so on. But
it will complement a new "multi" value (to be introduced in a future
commit). When set to "single", we will only perform pack reuse on a
single pack, regardless of whether or not there are multiple disjoint
packs.

This requires no code changes (yet), since we only support single pack
reuse.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 Documentation/config/pack.txt |  2 +-
 builtin/pack-objects.c        | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/Documentation/config/pack.txt b/Documentation/config/pack.txt
index f50df9dbce..fe100d0fb7 100644
--- a/Documentation/config/pack.txt
+++ b/Documentation/config/pack.txt
@@ -28,7 +28,7 @@ all existing objects. You can force recompression by passing the -F option
 to linkgit:git-repack[1].
 
 pack.allowPackReuse::
-	When true, and when reachability bitmaps are enabled,
+	When true or "single", and when reachability bitmaps are enabled,
 	pack-objects will try to send parts of the bitmapped packfile
 	verbatim. This can reduce memory and CPU usage to serve fetches,
 	but might result in sending a slightly larger pack. Defaults to
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fa71fe1ccf..4853e91251 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -230,7 +230,10 @@ static struct bitmap *reuse_packfile_bitmap;
 
 static int use_bitmap_index_default = 1;
 static int use_bitmap_index = -1;
-static int allow_pack_reuse = 1;
+static enum {
+	NO_PACK_REUSE = 0,
+	SINGLE_PACK_REUSE,
+} allow_pack_reuse = SINGLE_PACK_REUSE;
 static enum {
 	WRITE_BITMAP_FALSE = 0,
 	WRITE_BITMAP_QUIET,
@@ -3246,7 +3249,17 @@ static int git_pack_config(const char *k, const char *v,
 		return 0;
 	}
 	if (!strcmp(k, "pack.allowpackreuse")) {
-		allow_pack_reuse = git_config_bool(k, v);
+		int res = git_parse_maybe_bool_text(v);
+		if (res < 0) {
+			if (!strcasecmp(v, "single"))
+				allow_pack_reuse = SINGLE_PACK_REUSE;
+			else
+				die(_("invalid pack.allowPackReuse value: '%s'"), v);
+		} else if (res) {
+			allow_pack_reuse = SINGLE_PACK_REUSE;
+		} else {
+			allow_pack_reuse = NO_PACK_REUSE;
+		}
 		return 0;
 	}
 	if (!strcmp(k, "pack.threads")) {
@@ -4002,7 +4015,7 @@ static void loosen_unused_packed_objects(void)
  */
 static int pack_options_allow_reuse(void)
 {
-	return allow_pack_reuse &&
+	return allow_pack_reuse != NO_PACK_REUSE &&
 	       pack_to_stdout &&
 	       !ignore_packed_keep_on_disk &&
 	       !ignore_packed_keep_in_core &&
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 21/24] t/test-lib-functions.sh: implement `test_trace2_data` helper
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

Introduce a helper function which looks for a specific (category, key,
value) tuple in the output of a trace2 event stream.

We will use this function in a future patch to ensure that the expected
number of objects are reused from an expected number of packs.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/test-lib-functions.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
index 9c3cf12b26..93fe819b0a 100644
--- a/t/test-lib-functions.sh
+++ b/t/test-lib-functions.sh
@@ -1874,6 +1874,20 @@ test_region () {
 	return 0
 }
 
+# Check that the given data fragment was included as part of the
+# trace2-format trace on stdin.
+#
+#	test_trace2_data <category> <key> <value>
+#
+# For example, to look for trace2_data_intmax("pack-objects", repo,
+# "reused", N) in an invocation of "git pack-objects", run:
+#
+#	GIT_TRACE2_EVENT="$(pwd)/trace.txt" git pack-objects ... &&
+#	test_trace2_data pack-objects reused N <trace2.txt
+test_trace2_data () {
+	grep -e '"category":"'"$1"'","key":"'"$2"'","value":"'"$3"'"'
+}
+
 # Given a GIT_TRACE2_EVENT log over stdin, writes to stdout a list of URLs
 # sent to git-remote-https child processes.
 test_remote_https_urls() {
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 20/24] pack-objects: add tracing for various packfile metrics
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

As part of the multi-pack reuse effort, we will want to add some tests
that assert that we reused a certain number of objects from a certain
number of packs.

We could do this by grepping through the stderr output of
`pack-objects`, but doing so would be brittle in case the output format
changed.

Instead, let's use the trace2 mechanism to log various pieces of
information about the generated packfile, which we can then use to
compare against desired values.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 902e70abc5..fa71fe1ccf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -4620,6 +4620,13 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			   reuse_packfile_objects,
 			   (uintmax_t)reuse_packfiles_used_nr);
 
+	trace2_data_intmax("pack-objects", the_repository, "written", written);
+	trace2_data_intmax("pack-objects", the_repository, "written/delta", written_delta);
+	trace2_data_intmax("pack-objects", the_repository, "reused", reused);
+	trace2_data_intmax("pack-objects", the_repository, "reused/delta", reused_delta);
+	trace2_data_intmax("pack-objects", the_repository, "pack-reused", reuse_packfile_objects);
+	trace2_data_intmax("pack-objects", the_repository, "packs-reused", reuse_packfiles_used_nr);
+
 cleanup:
 	free_packing_data(&to_pack);
 	list_objects_filter_release(&filter_options);
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 19/24] pack-bitmap: prepare to mark objects from multiple packs for reuse
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

Now that the pack-objects code is equipped to handle reusing objects
from multiple packs, prepare the pack-bitmap code to mark objects from
multiple packs as reuse candidates.

In order to prepare the pack-bitmap code for this change, remove the
same set of assumptions we unwound in previous commits from the helper
function `reuse_partial_packfile_from_bitmap_1()`, in preparation for it
to be called in a loop over the set of disjoint packs in a following
commit.

Specifically, remove the assumption that the bit position corresponding
to the first object in a given reuse pack candidate is at a word
boundary. Like in previous commits, we have to walk up to the first word
boundary before marking whole words at a time for reuse. Unlike in
previous commits, however, we have to keep track of whether all of the
objects in the run-up to the first word boundary are wanted in the
resulting pack. This is because we cannot blindly reuse whole words at a
time unless we know for certain that we are sending all bases for any
objects stored as deltas within each word.

Once we're on a word boundary (provided that we want a complete prefix
of objects from the pack), we can then reuse the same "whole-words"
optimization from previous patches, marking all of the bits in a single
word at a time.

Any remaining objects (either from a partial word corresponding to
objects at the end of a pack, or starting from the middle of the pack if
we do not have a complete prefix) are dealt with individually via
try_partial_reuse(), which (among other things) ensures that we are
sending the necessary bases for all objects packed as OFS_DELTA or
REF_DELTA.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 pack-bitmap.c | 113 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 93 insertions(+), 20 deletions(-)

diff --git a/pack-bitmap.c b/pack-bitmap.c
index 670deec909..be53fc6da5 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1940,36 +1940,109 @@ static void reuse_partial_packfile_from_bitmap_1(struct bitmap_index *bitmap_git
 {
 	struct bitmap *result = bitmap_git->result;
 	struct pack_window *w_curs = NULL;
-	size_t i = 0;
+	size_t pos, offset;
+	unsigned complete_prefix = 1;
 
-	while (i < result->word_alloc && result->words[i] == (eword_t)~0)
-		i++;
+	pos = pack->bitmap_pos / BITS_IN_EWORD;
+	offset = pack->bitmap_pos % BITS_IN_EWORD;
 
 	/*
-	 * Don't mark objects not in the packfile or preferred pack. This bitmap
-	 * marks objects eligible for reuse, but the pack-reuse code only
-	 * understands how to reuse a single pack. Since the preferred pack is
-	 * guaranteed to have all bases for its deltas (in a multi-pack bitmap),
-	 * we use it instead of another pack. In single-pack bitmaps, the choice
-	 * is made for us.
+	 * If the position of our first object is not on a word
+	 * boundary, check all bits individually until we reach the
+	 * first word boundary.
+	 *
+	 * If no bits are missing between pack->bitmap_pos and the next
+	 * word boundary, then we can move by whole words instead of by
+	 * individual objects. If one or more of the objects are missing
+	 * in that range, we must evaluate each subsequent object
+	 * individually in order to exclude deltas whose base we are not
+	 * sending, etc.
 	 */
-	if (i > pack->p->num_objects / BITS_IN_EWORD)
-		i = pack->p->num_objects / BITS_IN_EWORD;
+	if (offset) {
+		/*
+		 * Scan to the next word boundary, or through the last
+		 * object in this bitmap, whichever occurs earlier.
+		 */
+		size_t last;
+		eword_t word = result->words[pos];
+		if (pack->bitmap_nr < BITS_IN_EWORD - offset)
+			last = offset + pack->bitmap_nr;
+		else
+			last = BITS_IN_EWORD;
 
-	memset(reuse->words, 0xFF, i * sizeof(eword_t));
+		for (; offset < last; offset++) {
+			size_t pack_pos;
+			if (word >> offset == 0) {
+				complete_prefix = 0;
+				continue;
+			}
 
-	for (; i < result->word_alloc; ++i) {
-		eword_t word = result->words[i];
-		size_t pos = (i * BITS_IN_EWORD);
-		size_t offset;
+			offset += ewah_bit_ctz64(word >> offset);
 
-		for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
-			if ((word >> offset) == 0)
+			pack_pos = pos * BITS_IN_EWORD + offset;
+			pack_pos -= pack->bitmap_pos;
+
+			try_partial_reuse(pack, pack_pos, reuse, &w_curs);
+			if (!bitmap_get(reuse, pos + pack->bitmap_pos))
+				complete_prefix = 0;
+		}
+
+		pos++;
+	}
+
+	if (complete_prefix) {
+		/*
+		 * If we are using all of the objects at the beginning
+		 * of this pack, we can safely reuse objects in eword_t
+		 * sized chunks, since we are guaranteed to send all
+		 * potential delta bases.
+		 *
+		 * Scan the nearest word boundaries within range of this
+		 * pack's bit positions. If the pack does not start on a
+		 * word boundary, skip to the next boundary, since we
+		 * have already checked above.
+		 */
+		size_t start = pos;
+		size_t word_end = (pack->bitmap_pos + pack->bitmap_nr) / BITS_IN_EWORD;
+		while (start <= pos && pos < word_end &&
+		       pos < result->word_alloc &&
+		       result->words[pos] == (eword_t)~0)
+			pos++;
+		memset(reuse->words + start, 0xFF, (pos - start) * sizeof(eword_t));
+	}
+
+	/*
+	 * At this point, we know that we are at an eword boundary,
+	 * either because:
+	 *
+	 *   - we started at one and used zero or more whole words
+	 *     following pack->bitmap_pos
+	 *
+	 *   - we started in between two word boundaries, advanced
+	 *     forward to the next word boundary, and then used zero or
+	 *     more (assuming a complete prefix) whole words following.
+	 */
+	for (; pos < result->word_alloc; pos++) {
+		eword_t word = result->words[pos];
+
+		for (offset = 0; offset < BITS_IN_EWORD; offset++) {
+			size_t bit_pos, pack_pos;
+			if (word >> offset == 0)
 				break;
 
 			offset += ewah_bit_ctz64(word >> offset);
-			if (try_partial_reuse(pack, pos + offset,
-					      reuse, &w_curs) < 0) {
+
+			bit_pos = pos * BITS_IN_EWORD + offset;
+			if (bit_pos >= pack->bitmap_pos + pack->bitmap_nr)
+				goto done;
+
+			pack_pos = bit_pos - pack->bitmap_pos;
+			if (pack_pos >= pack->p->num_objects)
+				BUG("advanced beyond the end of pack %s (%"PRIuMAX" > %"PRIu32")",
+				    pack_basename(pack->p), (uintmax_t)pack_pos,
+				    pack->p->num_objects);
+
+			if (try_partial_reuse(pack, pack_pos, reuse, &w_curs) < 0) {
 				/*
 				 * try_partial_reuse indicated we couldn't reuse
 				 * any bits, so there is no point in trying more
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 18/24] pack-objects: include number of packs reused in output
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

In addition to including the number of objects reused verbatim from a
reuse-pack, include the number of packs from which objects were reused.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index e37509568b..902e70abc5 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -224,6 +224,7 @@ static struct progress *progress_state;
 
 static struct bitmapped_pack *reuse_packfiles;
 static size_t reuse_packfiles_nr;
+static size_t reuse_packfiles_used_nr;
 static uint32_t reuse_packfile_objects;
 static struct bitmap *reuse_packfile_bitmap;
 
@@ -1266,6 +1267,8 @@ static void write_pack_file(void)
 			for (j = 0; j < reuse_packfiles_nr; j++) {
 				reused_chunks_nr = 0;
 				write_reused_pack(&reuse_packfiles[j], f);
+				if (reused_chunks_nr)
+					reuse_packfiles_used_nr++;
 			}
 			offset = hashfile_total(f);
 		}
@@ -4612,9 +4615,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		fprintf_ln(stderr,
 			   _("Total %"PRIu32" (delta %"PRIu32"),"
 			     " reused %"PRIu32" (delta %"PRIu32"),"
-			     " pack-reused %"PRIu32),
+			     " pack-reused %"PRIu32" (from %"PRIuMAX")"),
 			   written, written_delta, reused, reused_delta,
-			   reuse_packfile_objects);
+			   reuse_packfile_objects,
+			   (uintmax_t)reuse_packfiles_used_nr);
 
 cleanup:
 	free_packing_data(&to_pack);
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 17/24] pack-objects: prepare `write_reused_pack_verbatim()` for multi-pack reuse
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

The function `write_reused_pack_verbatim()` within
`builtin/pack-objects.c` is responsible for writing out a continuous
set of objects beginning at the start of the reuse packfile.

In the existing implementation, we did something like:

    while (pos < reuse_packfile_bitmap->word_alloc &&
           reuse_packfile_bitmap->words[pos] == (eword_t)~0)
      pos++;

    if (pos)
      /* write first `pos * BITS_IN_WORD` objects from pack */

as an optimization to record a single chunk for the longest continuous
prefix of objects wanted out of the reuse pack, instead of having a
chunk for each individual object. For more details, see bb514de356
(pack-objects: improve partial packfile reuse, 2019-12-18).

In order to retain this optimization in a multi-pack reuse world, we can
no longer assume that the first object in a pack is on a word boundary
in the bitmap storing the set of reusable objects.

Assuming that all objects from the beginning of the reuse packfile up to
the object corresponding to the first bit on a word boundary are part of
the result, consume whole words at a time until the last whole word
belonging to the reuse packfile. Copy those objects to the resulting
packfile, and track that we reused them by recording a single chunk.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 73 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 60 insertions(+), 13 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b5e6f6377a..e37509568b 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1098,31 +1098,78 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile,
 
 static size_t write_reused_pack_verbatim(struct bitmapped_pack *reuse_packfile,
 					 struct hashfile *out,
-					 off_t pack_start UNUSED,
+					 off_t pack_start,
 					 struct pack_window **w_curs)
 {
-	size_t pos = 0;
+	size_t pos = reuse_packfile->bitmap_pos;
+	size_t end;
 
-	while (pos < reuse_packfile_bitmap->word_alloc &&
-			reuse_packfile_bitmap->words[pos] == (eword_t)~0)
-		pos++;
+	if (pos % BITS_IN_EWORD) {
+		size_t word_pos = (pos / BITS_IN_EWORD);
+		size_t offset = pos % BITS_IN_EWORD;
+		size_t last;
+		eword_t word = reuse_packfile_bitmap->words[word_pos];
 
-	if (pos) {
-		off_t to_write;
+		if (offset + reuse_packfile->bitmap_nr < BITS_IN_EWORD)
+			last = offset + reuse_packfile->bitmap_nr;
+		else
+			last = BITS_IN_EWORD;
 
-		written = (pos * BITS_IN_EWORD);
-		to_write = pack_pos_to_offset(reuse_packfile->p, written)
-			- sizeof(struct pack_header);
+		for (; offset < last; offset++) {
+			if (word >> offset == 0)
+				return word_pos;
+			if (!bitmap_get(reuse_packfile_bitmap,
+					word_pos * BITS_IN_EWORD + offset))
+				return word_pos;
+		}
+
+		pos += BITS_IN_EWORD - (pos % BITS_IN_EWORD);
+	}
+
+	/*
+	 * Now we're going to copy as many whole eword_t's as possible.
+	 * "end" is the index of the last whole eword_t we copy, but
+	 * there may be additional bits to process. Those are handled
+	 * individually by write_reused_pack().
+	 *
+	 * Begin by advancing to the first word boundary in range of the
+	 * bit positions occupied by objects in "reuse_packfile". Then
+	 * pick the last word boundary in the same range. If we have at
+	 * least one word's worth of bits to process, continue on.
+	 */
+	end = reuse_packfile->bitmap_pos + reuse_packfile->bitmap_nr;
+	if (end % BITS_IN_EWORD)
+		end -= end % BITS_IN_EWORD;
+	if (pos >= end)
+		return reuse_packfile->bitmap_pos / BITS_IN_EWORD;
+
+	while (pos < end &&
+	       reuse_packfile_bitmap->words[pos / BITS_IN_EWORD] == (eword_t)~0)
+		pos += BITS_IN_EWORD;
+
+	if (pos > end)
+		pos = end;
+
+	if (reuse_packfile->bitmap_pos < pos) {
+		off_t pack_start_off = pack_pos_to_offset(reuse_packfile->p, 0);
+		off_t pack_end_off = pack_pos_to_offset(reuse_packfile->p,
+							pos - reuse_packfile->bitmap_pos);
+
+		written += pos - reuse_packfile->bitmap_pos;
 
 		/* We're recording one chunk, not one object. */
-		record_reused_object(sizeof(struct pack_header), 0);
+		record_reused_object(pack_start_off,
+				     pack_start_off - (hashfile_total(out) - pack_start));
 		hashflush(out);
 		copy_pack_data(out, reuse_packfile->p, w_curs,
-			sizeof(struct pack_header), to_write);
+			pack_start_off, pack_end_off - pack_start_off);
 
 		display_progress(progress_state, written);
 	}
-	return pos;
+	if (pos % BITS_IN_EWORD)
+		BUG("attempted to jump past a word boundary to %"PRIuMAX,
+		    (uintmax_t)pos);
+	return pos / BITS_IN_EWORD;
 }
 
 static void write_reused_pack(struct bitmapped_pack *reuse_packfile,
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 16/24] pack-objects: prepare `write_reused_pack()` for multi-pack reuse
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

The function `write_reused_pack()` within `builtin/pack-objects.c` is
responsible for performing pack-reuse on a single pack, and has two main
functions:

  - it dispatches a call to `write_reused_pack_verbatim()` to see if we
    can reuse portions of the packfile in whole-word chunks

  - for any remaining objects (that is, any objects that appear after
    the first "gap" in the bitmap), call write_reused_pack_one() on that
    object to record it for reuse.

Prepare this function for multi-pack reuse by removing the assumption
that the bit position corresponding to the first object being reused
from a given pack may not be at bit position zero.

The changes in this function are mostly straightforward. Initialize `i`
to the position of the first word to contain bits corresponding to that
reuse pack. In most situations, we throw the initialized value away,
since we end up replacing it with the return value from
write_reused_pack_verbatim(), moving us past the section of whole words
that we reused.

Likewise, modify the per-object loop to ignore any bits at the beginning
of the first word that do not belong to the pack currently being reused,
as well as skip to the "done" section once we have processed the last
bit corresponding to this pack.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 3b7704d062..b5e6f6377a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1128,7 +1128,7 @@ static size_t write_reused_pack_verbatim(struct bitmapped_pack *reuse_packfile,
 static void write_reused_pack(struct bitmapped_pack *reuse_packfile,
 			      struct hashfile *f)
 {
-	size_t i = 0;
+	size_t i = reuse_packfile->bitmap_pos / BITS_IN_EWORD;
 	uint32_t offset;
 	off_t pack_start = hashfile_total(f) - sizeof(struct pack_header);
 	struct pack_window *w_curs = NULL;
@@ -1146,17 +1146,23 @@ static void write_reused_pack(struct bitmapped_pack *reuse_packfile,
 				break;
 
 			offset += ewah_bit_ctz64(word >> offset);
+			if (pos + offset < reuse_packfile->bitmap_pos)
+				continue;
+			if (pos + offset >= reuse_packfile->bitmap_pos + reuse_packfile->bitmap_nr)
+				goto done;
 			/*
 			 * Can use bit positions directly, even for MIDX
 			 * bitmaps. See comment in try_partial_reuse()
 			 * for why.
 			 */
-			write_reused_pack_one(reuse_packfile->p, pos + offset,
+			write_reused_pack_one(reuse_packfile->p,
+					      pos + offset - reuse_packfile->bitmap_pos,
 					      f, pack_start, &w_curs);
 			display_progress(progress_state, ++written);
 		}
 	}
 
+done:
 	unuse_pack(&w_curs);
 }
 
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 15/24] pack-objects: pass `bitmapped_pack`'s to pack-reuse functions
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

Further prepare pack-objects to perform verbatim pack-reuse over
multiple packfiles by converting functions that take in a pointer to a
`struct packed_git` to instead take in a pointer to a `struct
bitmapped_pack`.

The additional information found in the bitmapped_pack struct (such as
the bit position corresponding to the beginning of the pack) will be
necessary in order to perform verbatim pack-reuse.

Note that we don't use any of the extra pieces of information contained
in the bitmapped_pack struct, so this step is merely preparatory and
does not introduce any functional changes.

Note further that we do not change the argument type to
write_reused_pack_one(). That function is responsible for copying
sections of the packfile directly and optionally patching any OFS_DELTAs
to account for not reusing sections of the packfile in between a delta
and its base.

As such, that function is (and should remain) oblivious to multi-pack
reuse, and does not require any of the extra pieces of information
stored in the bitmapped_pack struct.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index eb8be514d1..3b7704d062 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -222,7 +222,8 @@ static int thin;
 static int num_preferred_base;
 static struct progress *progress_state;
 
-static struct packed_git *reuse_packfile;
+static struct bitmapped_pack *reuse_packfiles;
+static size_t reuse_packfiles_nr;
 static uint32_t reuse_packfile_objects;
 static struct bitmap *reuse_packfile_bitmap;
 
@@ -1095,7 +1096,7 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile,
 	copy_pack_data(out, reuse_packfile, w_curs, offset, next - offset);
 }
 
-static size_t write_reused_pack_verbatim(struct packed_git *reuse_packfile,
+static size_t write_reused_pack_verbatim(struct bitmapped_pack *reuse_packfile,
 					 struct hashfile *out,
 					 off_t pack_start UNUSED,
 					 struct pack_window **w_curs)
@@ -1110,13 +1111,13 @@ static size_t write_reused_pack_verbatim(struct packed_git *reuse_packfile,
 		off_t to_write;
 
 		written = (pos * BITS_IN_EWORD);
-		to_write = pack_pos_to_offset(reuse_packfile, written)
+		to_write = pack_pos_to_offset(reuse_packfile->p, written)
 			- sizeof(struct pack_header);
 
 		/* We're recording one chunk, not one object. */
 		record_reused_object(sizeof(struct pack_header), 0);
 		hashflush(out);
-		copy_pack_data(out, reuse_packfile, w_curs,
+		copy_pack_data(out, reuse_packfile->p, w_curs,
 			sizeof(struct pack_header), to_write);
 
 		display_progress(progress_state, written);
@@ -1124,7 +1125,7 @@ static size_t write_reused_pack_verbatim(struct packed_git *reuse_packfile,
 	return pos;
 }
 
-static void write_reused_pack(struct packed_git *reuse_packfile,
+static void write_reused_pack(struct bitmapped_pack *reuse_packfile,
 			      struct hashfile *f)
 {
 	size_t i = 0;
@@ -1150,8 +1151,8 @@ static void write_reused_pack(struct packed_git *reuse_packfile,
 			 * bitmaps. See comment in try_partial_reuse()
 			 * for why.
 			 */
-			write_reused_pack_one(reuse_packfile, pos + offset, f,
-					      pack_start, &w_curs);
+			write_reused_pack_one(reuse_packfile->p, pos + offset,
+					      f, pack_start, &w_curs);
 			display_progress(progress_state, ++written);
 		}
 	}
@@ -1207,9 +1208,12 @@ static void write_pack_file(void)
 
 		offset = write_pack_header(f, nr_remaining);
 
-		if (reuse_packfile) {
+		if (reuse_packfiles_nr) {
 			assert(pack_to_stdout);
-			write_reused_pack(reuse_packfile, f);
+			for (j = 0; j < reuse_packfiles_nr; j++) {
+				reused_chunks_nr = 0;
+				write_reused_pack(&reuse_packfiles[j], f);
+			}
 			offset = hashfile_total(f);
 		}
 
@@ -3952,19 +3956,16 @@ static int pack_options_allow_reuse(void)
 
 static int get_object_list_from_bitmap(struct rev_info *revs)
 {
-	struct bitmapped_pack *packs = NULL;
-	size_t packs_nr = 0;
-
 	if (!(bitmap_git = prepare_bitmap_walk(revs, 0)))
 		return -1;
 
 	if (pack_options_allow_reuse())
-		reuse_partial_packfile_from_bitmap(bitmap_git, &packs,
-						   &packs_nr,
+		reuse_partial_packfile_from_bitmap(bitmap_git,
+						   &reuse_packfiles,
+						   &reuse_packfiles_nr,
 						   &reuse_packfile_bitmap);
 
-	if (packs) {
-		reuse_packfile = packs[0].p;
+	if (reuse_packfiles) {
 		reuse_packfile_objects = bitmap_popcount(reuse_packfile_bitmap);
 		if (!reuse_packfile_objects)
 			BUG("expected non-empty reuse bitmap");
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 14/24] pack-objects: keep track of `pack_start` for each reuse pack
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

When reusing objects from a pack, we keep track of a set of one or more
`reused_chunk`s, corresponding to sections of one or more object(s) from
a source pack that we are reusing. Each chunk contains two pieces of
information:

  - the offset of the first object in the source pack (relative to the
    beginning of the source pack)
  - the difference between that offset, and the corresponding offset in
    the pack we're generating

The purpose of keeping track of these is so that we can patch an
OFS_DELTAs that cross over a section of the reuse pack that we didn't
take.

For instance, consider a hypothetical pack as shown below:

                                                (chunk #2)
                                                __________...
                                               /
                                              /
      +--------+---------+-------------------+---------+
  ... | <base> | <other> |      (unused)     | <delta> | ...
      +--------+---------+-------------------+---------+
       \                /
        \______________/
           (chunk #1)

Suppose that we are sending objects "base", "other", and "delta", and
that the "delta" object is stored as an OFS_DELTA, and that its base is
"base". If we don't send any objects in the "(unused)" range, we can't
copy the delta'd object directly, since its delta offset includes a
range of the pack that we didn't copy, so we have to account for that
difference when patching and reassembling the delta.

In order to compute this value correctly, we need to know not only where
we are in the packfile we're assembling (with `hashfile_total(f)`) but
also the position of the first byte of the packfile that we are
currently reusing.

Together, these two allow us to compute the reused chunk's offset
difference relative to the start of the reused pack, as desired.

Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 7682bd65bb..eb8be514d1 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1016,6 +1016,7 @@ static off_t find_reused_offset(off_t where)
 
 static void write_reused_pack_one(struct packed_git *reuse_packfile,
 				  size_t pos, struct hashfile *out,
+				  off_t pack_start,
 				  struct pack_window **w_curs)
 {
 	off_t offset, next, cur;
@@ -1025,7 +1026,8 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile,
 	offset = pack_pos_to_offset(reuse_packfile, pos);
 	next = pack_pos_to_offset(reuse_packfile, pos + 1);
 
-	record_reused_object(offset, offset - hashfile_total(out));
+	record_reused_object(offset,
+			     offset - (hashfile_total(out) - pack_start));
 
 	cur = offset;
 	type = unpack_object_header(reuse_packfile, w_curs, &cur, &size);
@@ -1095,6 +1097,7 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile,
 
 static size_t write_reused_pack_verbatim(struct packed_git *reuse_packfile,
 					 struct hashfile *out,
+					 off_t pack_start UNUSED,
 					 struct pack_window **w_curs)
 {
 	size_t pos = 0;
@@ -1126,10 +1129,12 @@ static void write_reused_pack(struct packed_git *reuse_packfile,
 {
 	size_t i = 0;
 	uint32_t offset;
+	off_t pack_start = hashfile_total(f) - sizeof(struct pack_header);
 	struct pack_window *w_curs = NULL;
 
 	if (allow_ofs_delta)
-		i = write_reused_pack_verbatim(reuse_packfile, f, &w_curs);
+		i = write_reused_pack_verbatim(reuse_packfile, f, pack_start,
+					       &w_curs);
 
 	for (; i < reuse_packfile_bitmap->word_alloc; ++i) {
 		eword_t word = reuse_packfile_bitmap->words[i];
@@ -1146,7 +1151,7 @@ static void write_reused_pack(struct packed_git *reuse_packfile,
 			 * for why.
 			 */
 			write_reused_pack_one(reuse_packfile, pos + offset, f,
-					      &w_curs);
+					      pack_start, &w_curs);
 			display_progress(progress_state, ++written);
 		}
 	}
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 13/24] pack-objects: parameterize pack-reuse routines over a single pack
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

The routines pack-objects uses to perform verbatim pack-reuse are:

  - write_reused_pack_one()
  - write_reused_pack_verbatim()
  - write_reused_pack()

, all of which assume that there is exactly one packfile being reused:
the global constant `reuse_packfile`.

Prepare for reusing objects from multiple packs by making reuse packfile
a parameter of each of the above functions in preparation for calling
these functions in a loop with multiple packfiles.

Note that we still have the global "reuse_packfile", but pass it through
each of the above function's parameter lists, eliminating all but one
direct access (the top-level caller in `write_pack_file()`). Even after
this series, we will still have a global, but it will hold the array of
reusable packfiles, and we'll pass them one at a time to these functions
in a loop.

Note also that we will eventually need to pass a `bitmapped_pack`
instead of a `packed_git` in order to hold onto additional information
required for reuse (such as the bit position of the first object
belonging to that pack). But that change will be made in a future commit
so as to minimize the noise below as much as possible.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 89de23f39a..7682bd65bb 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1014,7 +1014,8 @@ static off_t find_reused_offset(off_t where)
 	return reused_chunks[lo-1].difference;
 }
 
-static void write_reused_pack_one(size_t pos, struct hashfile *out,
+static void write_reused_pack_one(struct packed_git *reuse_packfile,
+				  size_t pos, struct hashfile *out,
 				  struct pack_window **w_curs)
 {
 	off_t offset, next, cur;
@@ -1092,7 +1093,8 @@ static void write_reused_pack_one(size_t pos, struct hashfile *out,
 	copy_pack_data(out, reuse_packfile, w_curs, offset, next - offset);
 }
 
-static size_t write_reused_pack_verbatim(struct hashfile *out,
+static size_t write_reused_pack_verbatim(struct packed_git *reuse_packfile,
+					 struct hashfile *out,
 					 struct pack_window **w_curs)
 {
 	size_t pos = 0;
@@ -1119,14 +1121,15 @@ static size_t write_reused_pack_verbatim(struct hashfile *out,
 	return pos;
 }
 
-static void write_reused_pack(struct hashfile *f)
+static void write_reused_pack(struct packed_git *reuse_packfile,
+			      struct hashfile *f)
 {
 	size_t i = 0;
 	uint32_t offset;
 	struct pack_window *w_curs = NULL;
 
 	if (allow_ofs_delta)
-		i = write_reused_pack_verbatim(f, &w_curs);
+		i = write_reused_pack_verbatim(reuse_packfile, f, &w_curs);
 
 	for (; i < reuse_packfile_bitmap->word_alloc; ++i) {
 		eword_t word = reuse_packfile_bitmap->words[i];
@@ -1142,7 +1145,8 @@ static void write_reused_pack(struct hashfile *f)
 			 * bitmaps. See comment in try_partial_reuse()
 			 * for why.
 			 */
-			write_reused_pack_one(pos + offset, f, &w_curs);
+			write_reused_pack_one(reuse_packfile, pos + offset, f,
+					      &w_curs);
 			display_progress(progress_state, ++written);
 		}
 	}
@@ -1200,7 +1204,7 @@ static void write_pack_file(void)
 
 		if (reuse_packfile) {
 			assert(pack_to_stdout);
-			write_reused_pack(f);
+			write_reused_pack(reuse_packfile, f);
 			offset = hashfile_total(f);
 		}
 
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related

* [PATCH 12/24] pack-bitmap: return multiple packs via `reuse_partial_packfile_from_bitmap()`
From: Taylor Blau @ 2023-11-28 19:08 UTC (permalink / raw)
  To: git; +Cc: Jeff King, Patrick Steinhardt, Junio C Hamano
In-Reply-To: <cover.1701198172.git.me@ttaylorr.com>

Further prepare for enabling verbatim pack-reuse over multiple packfiles
by changing the signature of reuse_partial_packfile_from_bitmap() to
populate an array of `struct bitmapped_pack *`'s instead of a pointer to
a single packfile.

Since the array we're filling out is sized dynamically[^1], add an
additional `size_t *` parameter which will hold the number of reusable
packs (equal to the number of elements in the array).

Note that since we still have not implemented true multi-pack reuse,
these changes aren't propagated out to the rest of the caller in
builtin/pack-objects.c.

In the interim state, we expect that the array has a single element, and
we use that element to fill out the static `reuse_packfile` variable
(which is a bog-standard `struct packed_git *`). Future commits will
continue to push this change further out through the pack-objects code.

[^1]: That is, even though we know the number of packs which are
  candidates for pack-reuse, we do not know how many of those
  candidates we can actually reuse.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 builtin/pack-objects.c | 9 +++++++--
 pack-bitmap.c          | 6 ++++--
 pack-bitmap.h          | 5 +++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 2bb1b64e8f..89de23f39a 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3943,14 +3943,19 @@ static int pack_options_allow_reuse(void)
 
 static int get_object_list_from_bitmap(struct rev_info *revs)
 {
+	struct bitmapped_pack *packs = NULL;
+	size_t packs_nr = 0;
+
 	if (!(bitmap_git = prepare_bitmap_walk(revs, 0)))
 		return -1;
 
 	if (pack_options_allow_reuse())
-		reuse_partial_packfile_from_bitmap(bitmap_git, &reuse_packfile,
+		reuse_partial_packfile_from_bitmap(bitmap_git, &packs,
+						   &packs_nr,
 						   &reuse_packfile_bitmap);
 
-	if (reuse_packfile) {
+	if (packs) {
+		reuse_packfile = packs[0].p;
 		reuse_packfile_objects = bitmap_popcount(reuse_packfile_bitmap);
 		if (!reuse_packfile_objects)
 			BUG("expected non-empty reuse bitmap");
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 614fc09a4e..670deec909 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1989,7 +1989,8 @@ static void reuse_partial_packfile_from_bitmap_1(struct bitmap_index *bitmap_git
 }
 
 void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
-					struct packed_git **packfile_out,
+					struct bitmapped_pack **packs_out,
+					size_t *packs_nr_out,
 					struct bitmap **reuse_out)
 {
 	struct repository *r = the_repository;
@@ -2056,7 +2057,8 @@ void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
 	 * need to be handled separately.
 	 */
 	bitmap_and_not(result, reuse);
-	*packfile_out = packs[0].p;
+	*packs_out = packs;
+	*packs_nr_out = packs_nr;
 	*reuse_out = reuse;
 }
 
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 5bc1ca5b65..901a3b86ed 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -78,8 +78,9 @@ int test_bitmap_hashes(struct repository *r);
 struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs,
 					 int filter_provided_objects);
 uint32_t midx_preferred_pack(struct bitmap_index *bitmap_git);
-void reuse_partial_packfile_from_bitmap(struct bitmap_index *,
-					struct packed_git **packfile,
+void reuse_partial_packfile_from_bitmap(struct bitmap_index *bitmap_git,
+					struct bitmapped_pack **packs_out,
+					size_t *packs_nr_out,
 					struct bitmap **reuse_out);
 int rebuild_existing_bitmaps(struct bitmap_index *, struct packing_data *mapping,
 			     kh_oid_map_t *reused_bitmaps, int show_progress);
-- 
2.43.0.24.g980b318f98


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox