From: "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: gitster@pobox.com, johannes.schindelin@gmx.de, peff@peff.net,
ps@pks.im, me@ttaylorr.com, johncai86@gmail.com,
newren@gmail.com, Derrick Stolee <stolee@gmail.com>,
Derrick Stolee <derrickstolee@github.com>
Subject: [PATCH 03/30] backfill: basic functionality and tests
Date: Tue, 10 Sep 2024 02:28:28 +0000 [thread overview]
Message-ID: <be21c8370eb5143c3f6e91d354d48465d221692a.1725935335.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.1786.git.1725935335.gitgitgadget@gmail.com>
From: Derrick Stolee <derrickstolee@github.com>
The default behavior of 'git backfill' is to fetch all missing blobs that
are reachable from HEAD. Document and test this behavior.
The implementation is a very simple use of the path-walk API, initializing
the revision walk at HEAD to start the path-walk from all commits reachable
from HEAD. Ignore the object arrays that correspond to tree entries,
assuming that they are all present already.
Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
Documentation/git-backfill.txt | 24 ++++++++
builtin/backfill.c | 101 ++++++++++++++++++++++++++++++++-
t/t5620-backfill.sh | 97 +++++++++++++++++++++++++++++++
3 files changed, 219 insertions(+), 3 deletions(-)
create mode 100755 t/t5620-backfill.sh
diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt
index 640144187d3..0e10f066fef 100644
--- a/Documentation/git-backfill.txt
+++ b/Documentation/git-backfill.txt
@@ -14,6 +14,30 @@ SYNOPSIS
DESCRIPTION
-----------
+Blobless partial clones are created using `git clone --filter=blob:none`
+and then configure the local repository such that the Git client avoids
+downloading blob objects unless they are required for a local operation.
+This initially means that the clone and later fetches download reachable
+commits and trees but no blobs. Later operations that change the `HEAD`
+pointer, such as `git checkout` or `git merge`, may need to download
+missing blobs in order to complete their operation.
+
+In the worst cases, commands that compute blob diffs, such as `git blame`,
+become very slow as they download the missing blobs in single-blob
+requests to satisfy the missing object as the Git command needs it. This
+leads to multiple download requests and no ability for the Git server to
+provide delta compression across those objects.
+
+The `git backfill` command provides a way for the user to request that
+Git downloads the missing blobs (with optional filters) such that the
+missing blobs representing historical versions of files can be downloaded
+in batches. The `backfill` command attempts to optimize the request by
+grouping blobs that appear at the same path, hopefully leading to good
+delta compression in the packfile sent by the server.
+
+By default, `git backfill` downloads all blobs reachable from the `HEAD`
+commit. This set can be restricted or expanded using various options.
+
SEE ALSO
--------
linkgit:git-clone[1].
diff --git a/builtin/backfill.c b/builtin/backfill.c
index 77b05a2f838..23d40fc02a2 100644
--- a/builtin/backfill.c
+++ b/builtin/backfill.c
@@ -1,16 +1,113 @@
#include "builtin.h"
+#include "git-compat-util.h"
#include "config.h"
#include "parse-options.h"
#include "repository.h"
+#include "commit.h"
+#include "hex.h"
+#include "tree.h"
+#include "tree-walk.h"
#include "object.h"
+#include "object-store-ll.h"
+#include "oid-array.h"
+#include "oidset.h"
+#include "promisor-remote.h"
+#include "strmap.h"
+#include "string-list.h"
+#include "revision.h"
+#include "trace2.h"
+#include "progress.h"
+#include "packfile.h"
+#include "path-walk.h"
static const char * const builtin_backfill_usage[] = {
N_("git backfill [<options>]"),
NULL
};
+struct backfill_context {
+ struct repository *repo;
+ struct oid_array current_batch;
+ size_t batch_size;
+};
+
+static void clear_backfill_context(struct backfill_context *ctx)
+{
+ oid_array_clear(&ctx->current_batch);
+}
+
+static void download_batch(struct backfill_context *ctx)
+{
+ promisor_remote_get_direct(ctx->repo,
+ ctx->current_batch.oid,
+ ctx->current_batch.nr);
+ oid_array_clear(&ctx->current_batch);
+
+ /*
+ * We likely have a new packfile. Add it to the packed list to
+ * avoid possible duplicate downloads of the same objects.
+ */
+ reprepare_packed_git(ctx->repo);
+}
+
+static int fill_missing_blobs(const char *path,
+ struct oid_array *list,
+ enum object_type type,
+ void *data)
+{
+ struct backfill_context *ctx = data;
+
+ if (type != OBJ_BLOB)
+ return 0;
+
+ for (size_t i = 0; i < list->nr; i++) {
+ off_t size = 0;
+ struct object_info info = OBJECT_INFO_INIT;
+ info.disk_sizep = &size;
+ if (oid_object_info_extended(the_repository,
+ &list->oid[i],
+ &info,
+ OBJECT_INFO_FOR_PREFETCH) ||
+ !size)
+ oid_array_append(&ctx->current_batch, &list->oid[i]);
+ }
+
+ if (ctx->current_batch.nr >= ctx->batch_size)
+ download_batch(ctx);
+
+ return 0;
+}
+
+static int do_backfill(struct backfill_context *ctx)
+{
+ struct rev_info revs;
+ struct path_walk_info info = PATH_WALK_INFO_INIT;
+ int ret;
+
+ repo_init_revisions(ctx->repo, &revs, "");
+ handle_revision_arg("HEAD", &revs, 0, 0);
+
+ info.revs = &revs;
+ info.path_fn = fill_missing_blobs;
+ info.path_fn_data = ctx;
+
+ ret = walk_objects_by_path(&info);
+
+ /* Download the objects that did not fill a batch. */
+ if (!ret)
+ download_batch(ctx);
+
+ clear_backfill_context(ctx);
+ return ret;
+}
+
int cmd_backfill(int argc, const char **argv, const char *prefix)
{
+ struct backfill_context ctx = {
+ .repo = the_repository,
+ .current_batch = OID_ARRAY_INIT,
+ .batch_size = 16000,
+ };
struct option options[] = {
OPT_END(),
};
@@ -23,7 +120,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix)
git_config(git_default_config, NULL);
- die(_("not implemented"));
-
- return 0;
+ return do_backfill(&ctx);
}
diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh
new file mode 100755
index 00000000000..43868a4a75f
--- /dev/null
+++ b/t/t5620-backfill.sh
@@ -0,0 +1,97 @@
+#!/bin/sh
+
+test_description='git backfill on partial clones'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+TEST_PASSES_SANITIZE_LEAK=0
+export TEST_PASSES_SANITIZE_LEAK
+
+. ./test-lib.sh
+
+# We create objects in the 'src' repo.
+test_expect_success 'setup repo for object creation' '
+ echo "{print \$1}" >print_1.awk &&
+ echo "{print \$2}" >print_2.awk &&
+
+ git init src &&
+
+ mkdir -p src/a/b/c &&
+ mkdir -p src/d/e &&
+
+ for i in 1 2
+ do
+ for n in 1 2 3 4
+ do
+ echo "Version $i of file $n" > src/file.$n.txt &&
+ echo "Version $i of file a/$n" > src/a/file.$n.txt &&
+ echo "Version $i of file a/b/$n" > src/a/b/file.$n.txt &&
+ echo "Version $i of file a/b/c/$n" > src/a/b/c/file.$n.txt &&
+ echo "Version $i of file d/$n" > src/d/file.$n.txt &&
+ echo "Version $i of file d/e/$n" > src/d/e/file.$n.txt &&
+ git -C src add . &&
+ git -C src commit -m "Iteration $n" || return 1
+ done
+ done
+'
+
+# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin
+# server for the partial clone.
+test_expect_success 'setup bare clone for server' '
+ git clone --bare "file://$(pwd)/src" srv.bare &&
+ git -C srv.bare config --local uploadpack.allowfilter 1 &&
+ git -C srv.bare config --local uploadpack.allowanysha1inwant 1
+'
+
+# do basic partial clone from "srv.bare"
+test_expect_success 'do partial clone 1, backfill gets all objects' '
+ git clone --no-checkout --filter=blob:none \
+ --single-branch --branch=main \
+ "file://$(pwd)/srv.bare" backfill1 &&
+
+ # Backfill with no options gets everything reachable from HEAD.
+ GIT_TRACE2_EVENT="$(pwd)/backfill-file-trace" git \
+ -C backfill1 backfill &&
+
+ # We should have engaged the partial clone machinery
+ test_trace2_data promisor fetch_count 48 <backfill-file-trace &&
+
+ # No more missing objects!
+ git -C backfill1 rev-list --quiet --objects --missing=print HEAD >revs2 &&
+ test_line_count = 0 revs2
+'
+
+. "$TEST_DIRECTORY"/lib-httpd.sh
+start_httpd
+
+test_expect_success 'create a partial clone over HTTP' '
+ SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" &&
+ rm -rf "$SERVER" repo &&
+ git clone --bare "file://$(pwd)/src" "$SERVER" &&
+ test_config -C "$SERVER" uploadpack.allowfilter 1 &&
+ test_config -C "$SERVER" uploadpack.allowanysha1inwant 1 &&
+
+ git clone --no-checkout --filter=blob:none \
+ "$HTTPD_URL/smart/server" backfill-http
+'
+
+test_expect_success 'backfilling over HTTP succeeds' '
+ GIT_TRACE2_EVENT="$(pwd)/backfill-http-trace" git \
+ -C backfill-http backfill &&
+
+ # We should have engaged the partial clone machinery
+ test_trace2_data promisor fetch_count 48 <backfill-http-trace &&
+
+ # Confirm all objects are present, none missing.
+ git -C backfill-http rev-list --objects --all >rev-list-out &&
+ awk "{print \$1;}" <rev-list-out >oids &&
+ GIT_TRACE2_EVENT="$(pwd)/walk-trace" git -C backfill-http \
+ cat-file --batch-check <oids >batch-out &&
+ ! grep missing batch-out
+'
+
+# DO NOT add non-httpd-specific tests here, because the last part of this
+# test script is only executed when httpd is available and enabled.
+
+test_done
--
gitgitgadget
next prev parent reply other threads:[~2024-09-10 2:29 UTC|newest]
Thread overview: 38+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-10 2:28 [PATCH 00/30] [RFC] Path-walk API and applications Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 01/30] path-walk: introduce an object walk by path Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 02/30] backfill: add builtin boilerplate Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` Derrick Stolee via GitGitGadget [this message]
2024-09-10 2:28 ` [PATCH 04/30] backfill: add --batch-size=<n> option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 05/30] backfill: add --sparse option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 06/30] backfill: assume --sparse when sparse-checkout is enabled Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 07/30] path-walk: allow consumer to specify object types Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 08/30] path-walk: allow visiting tags Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 09/30] survey: stub in new experimental `git-survey` command Jeff Hostetler via GitGitGadget
2024-09-10 2:28 ` [PATCH 10/30] survey: add command line opts to select references Jeff Hostetler via GitGitGadget
2024-09-10 2:28 ` [PATCH 11/30] survey: collect the set of requested refs Jeff Hostetler via GitGitGadget
2024-09-10 2:28 ` [PATCH 12/30] survey: start pretty printing data in table form Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 13/30] survey: add object count summary Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 14/30] survey: summarize total sizes by object type Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 15/30] survey: show progress during object walk Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 16/30] survey: add ability to track prioritized lists Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 17/30] survey: add report of "largest" paths Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 18/30] revision: create mark_trees_uninteresting_dense() Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 19/30] path-walk: add prune_all_uninteresting option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 20/30] pack-objects: add --path-walk option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 21/30] pack-objects: extract should_attempt_deltas() Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 22/30] pack-objects: introduce GIT_TEST_PACK_PATH_WALK Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 23/30] p5313: add size comparison test Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 24/30] repack: add --path-walk option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 25/30] pack-objects: enable --path-walk via config Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 26/30] scalar: enable path-walk during push " Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 27/30] pack-objects: add --full-name-hash option Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 28/30] test-name-hash: add helper to compute name-hash functions Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 29/30] p5314: add a size test for name-hash collisions Derrick Stolee via GitGitGadget
2024-09-10 2:28 ` [PATCH 30/30] pack-objects: output debug info about deltas Derrick Stolee via GitGitGadget
2024-09-11 21:32 ` [PATCH 00/30] [RFC] Path-walk API and applications Junio C Hamano
2024-09-17 10:41 ` Christian Couder
2024-09-18 23:18 ` Derrick Stolee
2024-09-22 18:37 ` Junio C Hamano
2024-09-23 1:22 ` Derrick Stolee
2024-09-23 16:56 ` Junio C Hamano
2024-09-22 21:08 ` Kristoffer Haugsbakk
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=be21c8370eb5143c3f6e91d354d48465d221692a.1725935335.git.gitgitgadget@gmail.com \
--to=gitgitgadget@gmail.com \
--cc=derrickstolee@github.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=johannes.schindelin@gmx.de \
--cc=johncai86@gmail.com \
--cc=me@ttaylorr.com \
--cc=newren@gmail.com \
--cc=peff@peff.net \
--cc=ps@pks.im \
--cc=stolee@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).