From: "Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: Elijah Newren <newren@gmail.com>, Elijah Newren <newren@gmail.com>
Subject: [PATCH 3/3] grep: prefetch necessary blobs
Date: Thu, 16 Apr 2026 22:48:14 +0000 [thread overview]
Message-ID: <6dbfc7608b7707decf9c036fade5d0fe25459aa8.1776379694.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.2089.git.1776379694.gitgitgadget@gmail.com>
From: Elijah Newren <newren@gmail.com>
In partial clones, `git grep` fetches necessary blobs on-demand one
at a time, which can be very slow. To avoid this, when grepping
tree-ish objects in a partial clone, first do a preliminary walk over
each tree, similar to grep_tree(), to collect the blobs of interest,
and then prefetch them in a single batch.
Signed-off-by: Elijah Newren <newren@gmail.com>
---
builtin/grep.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++
t/t7810-grep.sh | 35 ++++++++++++
2 files changed, 177 insertions(+)
diff --git a/builtin/grep.c b/builtin/grep.c
index e33285e5e6..d559c48d94 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -28,9 +28,12 @@
#include "object-file.h"
#include "object-name.h"
#include "odb.h"
+#include "oid-array.h"
+#include "oidset.h"
#include "packfile.h"
#include "pager.h"
#include "path.h"
+#include "promisor-remote.h"
#include "read-cache-ll.h"
#include "write-or-die.h"
@@ -692,6 +695,143 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
return hit;
}
+/*
+ * Recursively walk "tree" and add to "blob_oids" the object id of every
+ * regular-file entry accepted by "pathspec".
+ *
+ * "base" holds the path of the directory being walked; its first
+ * "tn_len" bytes are a prefix that is skipped when matching against the
+ * pathspec.  "base" is extended while descending and is back at its
+ * incoming length when this function returns.
+ *
+ * Dies if a subtree cannot be read.
+ */
+static void collect_blob_oids_for_tree(struct repository *repo,
+				       const struct pathspec *pathspec,
+				       struct tree_desc *tree,
+				       struct strbuf *base,
+				       int tn_len,
+				       struct oidset *blob_oids)
+{
+	const int saved_len = base->len;
+	enum interesting verdict = entry_not_interesting;
+	struct strbuf rel_base = STRBUF_INIT;
+	struct name_entry entry;
+
+	while (tree_entry(tree, &entry)) {
+		/*
+		 * Once the pathspec machinery reports that everything
+		 * below here matches, stop asking it.
+		 */
+		if (verdict != all_entries_interesting) {
+			/* Match on the path without the leading prefix. */
+			strbuf_addstr(&rel_base, base->buf + tn_len);
+			verdict = tree_entry_interesting(repo->index, &entry,
+							 &rel_base, pathspec);
+			strbuf_reset(&rel_base);
+
+			if (verdict == entry_not_interesting)
+				continue;
+			if (verdict == all_entries_not_interesting)
+				break;
+		}
+
+		strbuf_add(base, entry.path, tree_entry_len(&entry));
+
+		if (S_ISDIR(entry.mode)) {
+			struct tree_desc subtree;
+			enum object_type type;
+			unsigned long size;
+			void *buf = odb_read_object(repo->objects, &entry.oid,
+						    &type, &size);
+
+			if (!buf)
+				die(_("unable to read tree (%s)"),
+				    oid_to_hex(&entry.oid));
+
+			strbuf_addch(base, '/');
+			init_tree_desc(&subtree, &entry.oid, buf, size);
+			collect_blob_oids_for_tree(repo, pathspec, &subtree,
+						   base, tn_len, blob_oids);
+			free(buf);
+		} else if (S_ISREG(entry.mode)) {
+			oidset_insert(blob_oids, &entry.oid);
+		}
+		/*
+		 * S_ISGITLINK entries are deliberately ignored: submodules
+		 * have their own promisor configuration and would need
+		 * separate fetches anyway.
+		 */
+
+		strbuf_setlen(base, saved_len);
+	}
+
+	strbuf_release(&rel_base);
+}
+
+/*
+ * Read the tree that "tree_ish_oid" resolves to (through
+ * odb_read_object_peeled() with OBJ_TREE) and collect into "blob_oids"
+ * the blobs in it that match "pathspec".
+ *
+ * When "name" is non-empty, paths are built with a "<name>:" prefix; the
+ * prefix length is handed to the tree walk so that it is left out of
+ * pathspec matching.  If the tree cannot be read, nothing is collected
+ * and no error is raised here.
+ */
+static void collect_blob_oids_for_treeish(struct grep_opt *opt,
+					  const struct pathspec *pathspec,
+					  const struct object_id *tree_ish_oid,
+					  const char *name,
+					  struct oidset *blob_oids)
+{
+	struct strbuf path = STRBUF_INIT;
+	struct tree_desc desc;
+	unsigned long size;
+	void *buf;
+
+	buf = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
+				     OBJ_TREE, &size, NULL);
+	if (!buf)
+		return;
+
+	if (name && *name)
+		strbuf_addf(&path, "%s:", name);
+
+	init_tree_desc(&desc, tree_ish_oid, buf, size);
+	collect_blob_oids_for_tree(opt->repo, pathspec, &desc,
+				   &path, path.len, blob_oids);
+
+	free(buf);
+	strbuf_release(&path);
+}
+
+/*
+ * In a partial clone, fetch in one batch the blobs that grepping the
+ * tree-ish objects in "list" is going to need.
+ *
+ * Each entry of "list" is dereferenced through tags; commits and trees
+ * are walked with "pathspec" to gather candidate blobs.  Candidates that
+ * are already available locally are dropped, and the remainder is
+ * requested with a single promisor_remote_get_direct() call.
+ *
+ * Does nothing when the repository has no promisor remote.
+ */
+static void prefetch_grep_blobs(struct grep_opt *opt,
+				const struct pathspec *pathspec,
+				const struct object_array *list)
+{
+	struct oidset blob_oids = OIDSET_INIT;
+	struct oid_array to_fetch = OID_ARRAY_INIT;
+	struct oidset_iter iter;
+	const struct object_id *oid;
+
+	/* Exit if we're not in a partial clone */
+	if (!repo_has_promisor_remote(opt->repo))
+		return;
+
+	/* For each tree, gather the blobs in it */
+	for (unsigned int i = 0; i < list->nr; i++) {
+		struct object *real_obj;
+
+		/*
+		 * NOTE(review): the lock presumably guards against grep
+		 * worker threads reading objects concurrently -- confirm
+		 * against the caller.
+		 */
+		obj_read_lock();
+		real_obj = deref_tag(opt->repo, list->objects[i].item,
+				     NULL, 0);
+		obj_read_unlock();
+
+		if (!real_obj)
+			continue;
+		if (real_obj->type != OBJ_COMMIT &&
+		    real_obj->type != OBJ_TREE)
+			continue;
+
+		collect_blob_oids_for_treeish(opt, pathspec,
+					      &real_obj->oid,
+					      list->objects[i].name,
+					      &blob_oids);
+	}
+
+	/*
+	 * Only ask for what is actually missing.  Without this filter,
+	 * every run would request all matching blobs again, including
+	 * those fetched by an earlier run.  OBJECT_INFO_FOR_PREFETCH keeps
+	 * the existence check itself from triggering a lazy fetch.
+	 */
+	oidset_iter_init(&blob_oids, &iter);
+	while ((oid = oidset_iter_next(&iter))) {
+		if (odb_read_object_info_extended(opt->repo->objects, oid,
+						  NULL,
+						  OBJECT_INFO_FOR_PREFETCH))
+			oid_array_append(&to_fetch, oid);
+	}
+
+	/* Prefetch the missing blobs in one request */
+	if (to_fetch.nr)
+		promisor_remote_get_direct(opt->repo, to_fetch.oid,
+					   to_fetch.nr);
+
+	oid_array_clear(&to_fetch);
+	oidset_clear(&blob_oids);
+}
+
static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
struct object *obj, const char *name, const char *path)
{
@@ -732,6 +872,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
int hit = 0;
const unsigned int nr = list->nr;
+ prefetch_grep_blobs(opt, pathspec, list);
+
for (i = 0; i < nr; i++) {
struct object *real_obj;
diff --git a/t/t7810-grep.sh b/t/t7810-grep.sh
index 64ac4f04ee..1f484502fe 100755
--- a/t/t7810-grep.sh
+++ b/t/t7810-grep.sh
@@ -1929,4 +1929,39 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
test_cmp expected actual
'
+test_expect_success 'grep of revision in partial clone does bulk prefetch' '
+	# Remove the scratch files as well as the repositories, so that a
+	# leftover trace file cannot influence a later check.
+	test_when_finished "rm -rf grep-partial-src grep-partial \
+		missing result grep-trace" &&
+
+	# The source repository must allow filtered clones and on-demand
+	# fetches of arbitrary object ids.
+	git init grep-partial-src &&
+	(
+		cd grep-partial-src &&
+		git config uploadpack.allowfilter 1 &&
+		git config uploadpack.allowanysha1inwant 1 &&
+		echo "needle in haystack" >searchme &&
+		echo "no match here" >other &&
+		mkdir subdir &&
+		echo "needle again" >subdir/deep &&
+		git add . &&
+		git commit -m "initial"
+	) &&
+
+	git clone --no-checkout --filter=blob:none \
+		"file://$(pwd)/grep-partial-src" grep-partial &&
+
+	# All blobs (searchme, other, subdir/deep) should be missing after
+	# a blobless clone.
+	git -C grep-partial rev-list --quiet --objects \
+		--missing=print HEAD >missing &&
+	test_line_count = 3 missing &&
+
+	# grep HEAD should batch-prefetch all blobs in one request.
+	GIT_TRACE2_EVENT="$(pwd)/grep-trace" \
+		git -C grep-partial grep -c "needle" HEAD >result &&
+
+	# Should find matches in two files (searchme and subdir/deep).
+	test_line_count = 2 result &&
+
+	# A single fetch covering all 3 blobs shows they were requested
+	# together rather than one at a time.
+	test_trace2_data promisor fetch_count 3 <grep-trace
+'
+
test_done
--
gitgitgadget
next prev parent reply other threads:[~2026-04-16 22:48 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-16 22:48 [PATCH 0/3] Batch prefetching Elijah Newren via GitGitGadget
2026-04-16 22:48 ` [PATCH 1/3] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-04-16 22:48 ` [PATCH 2/3] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-04-17 21:42 ` Junio C Hamano
2026-04-17 22:02 ` Elijah Newren
2026-04-16 22:48 ` Elijah Newren via GitGitGadget [this message]
2026-04-18 0:32 ` [PATCH v2 0/3] Batch prefetching Elijah Newren via GitGitGadget
2026-04-18 0:32 ` [PATCH v2 1/3] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-04-18 0:32 ` [PATCH v2 2/3] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-04-19 14:04 ` Phillip Wood
2026-04-21 21:28 ` Elijah Newren
2026-04-23 15:15 ` Phillip Wood
2026-04-23 17:38 ` Elijah Newren
2026-04-27 13:16 ` Derrick Stolee
2026-05-11 2:51 ` Junio C Hamano
2026-05-11 17:45 ` Elijah Newren
2026-05-13 23:17 ` Elijah Newren
2026-04-18 0:32 ` [PATCH v2 3/3] grep: prefetch necessary blobs Elijah Newren via GitGitGadget
2026-04-27 12:59 ` Derrick Stolee
2026-05-13 19:21 ` Elijah Newren
2026-05-14 16:25 ` [PATCH v3 0/4] Batch prefetching Elijah Newren via GitGitGadget
2026-05-14 16:25 ` [PATCH v3 1/4] promisor-remote: document caller filtering contract Elijah Newren via GitGitGadget
2026-05-14 16:25 ` [PATCH v3 2/4] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-05-14 16:25 ` [PATCH v3 3/4] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-05-14 16:25 ` [PATCH v3 4/4] grep: prefetch necessary blobs Elijah Newren via GitGitGadget
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=6dbfc7608b7707decf9c036fade5d0fe25459aa8.1776379694.git.gitgitgadget@gmail.com \
--to=gitgitgadget@gmail.com \
--cc=git@vger.kernel.org \
--cc=newren@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox