Git development
 help / color / mirror / Atom feed
From: "Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: Elijah Newren <newren@gmail.com>,
	Elijah Newren <newren@gmail.com>,
	Elijah Newren <newren@gmail.com>
Subject: [PATCH v2 3/3] grep: prefetch necessary blobs
Date: Sat, 18 Apr 2026 00:32:27 +0000	[thread overview]
Message-ID: <8fbfe69bc4d0c6166967986f24861ffa393ed7cf.1776472347.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.2089.v2.git.1776472347.gitgitgadget@gmail.com>

From: Elijah Newren <newren@gmail.com>

In partial clones, `git grep` fetches necessary blobs on-demand one
at a time, which can be very slow.  In partial clones, add an extra
preliminary walk over the tree similar to grep_tree() which collects
the blobs of interest, and then prefetches them.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 builtin/grep.c  | 142 ++++++++++++++++++++++++++++++++++++++++++++++++
 t/t7810-grep.sh |  35 ++++++++++++
 2 files changed, 177 insertions(+)

diff --git a/builtin/grep.c b/builtin/grep.c
index e33285e5e6..d559c48d94 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -28,9 +28,12 @@
 #include "object-file.h"
 #include "object-name.h"
 #include "odb.h"
+#include "oid-array.h"
+#include "oidset.h"
 #include "packfile.h"
 #include "pager.h"
 #include "path.h"
+#include "promisor-remote.h"
 #include "read-cache-ll.h"
 #include "write-or-die.h"
 
@@ -692,6 +695,143 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
 	return hit;
 }
 
+static void collect_blob_oids_for_tree(struct repository *repo,
+				       const struct pathspec *pathspec,
+				       struct tree_desc *tree,
+				       struct strbuf *base,
+				       int tn_len,
+				       struct oidset *blob_oids)
+{
+	struct name_entry entry;
+	int old_baselen = base->len;
+	struct strbuf name = STRBUF_INIT;
+	enum interesting match = entry_not_interesting;
+
+	while (tree_entry(tree, &entry)) {
+		if (match != all_entries_interesting) {
+			strbuf_addstr(&name, base->buf + tn_len);
+			match = tree_entry_interesting(repo->index,
+						       &entry, &name,
+						       pathspec);
+			strbuf_reset(&name);
+
+			if (match == all_entries_not_interesting)
+				break;
+			if (match == entry_not_interesting)
+				continue;
+		}
+
+		strbuf_add(base, entry.path, tree_entry_len(&entry));
+
+		if (S_ISREG(entry.mode)) {
+			oidset_insert(blob_oids, &entry.oid);
+		} else if (S_ISDIR(entry.mode)) {
+			enum object_type type;
+			struct tree_desc sub_tree;
+			void *data;
+			unsigned long size;
+
+			data = odb_read_object(repo->objects, &entry.oid,
+					       &type, &size);
+			if (!data)
+				die(_("unable to read tree (%s)"),
+				    oid_to_hex(&entry.oid));
+
+			strbuf_addch(base, '/');
+			init_tree_desc(&sub_tree, &entry.oid, data, size);
+			collect_blob_oids_for_tree(repo, pathspec, &sub_tree,
+						   base, tn_len, blob_oids);
+			free(data);
+		}
+		/*
+		 * ...no else clause for S_ISGITLINK: submodules have their
+		 * own promisor configuration and would need separate fetches
+		 * anyway.
+		 */
+
+		strbuf_setlen(base, old_baselen);
+	}
+
+	strbuf_release(&name);
+}
+
+static void collect_blob_oids_for_treeish(struct grep_opt *opt,
+					  const struct pathspec *pathspec,
+					  const struct object_id *tree_ish_oid,
+					  const char *name,
+					  struct oidset *blob_oids)
+{
+	struct tree_desc tree;
+	void *data;
+	unsigned long size;
+	struct strbuf base = STRBUF_INIT;
+	int len;
+
+	data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
+				      OBJ_TREE, &size, NULL);
+
+	if (!data)
+		return;
+
+	len = name ? strlen(name) : 0;
+	if (len) {
+		strbuf_add(&base, name, len);
+		strbuf_addch(&base, ':');
+	}
+	init_tree_desc(&tree, tree_ish_oid, data, size);
+
+	collect_blob_oids_for_tree(opt->repo, pathspec, &tree,
+				   &base, base.len, blob_oids);
+
+	strbuf_release(&base);
+	free(data);
+}
+
+static void prefetch_grep_blobs(struct grep_opt *opt,
+				const struct pathspec *pathspec,
+				const struct object_array *list)
+{
+	struct oidset blob_oids = OIDSET_INIT;
+
+	/* Exit if we're not in a partial clone */
+	if (!repo_has_promisor_remote(opt->repo))
+		return;
+
+	/* For each tree, gather the blobs in it */
+	for (int i = 0; i < list->nr; i++) {
+		struct object *real_obj;
+
+		obj_read_lock();
+		real_obj = deref_tag(opt->repo, list->objects[i].item,
+				     NULL, 0);
+		obj_read_unlock();
+
+		if (real_obj &&
+		    (real_obj->type == OBJ_COMMIT ||
+		     real_obj->type == OBJ_TREE))
+			collect_blob_oids_for_treeish(opt, pathspec,
+						      &real_obj->oid,
+						      list->objects[i].name,
+						      &blob_oids);
+	}
+
+	/* Prefetch the blobs we found */
+	if (oidset_size(&blob_oids)) {
+		struct oid_array to_fetch = OID_ARRAY_INIT;
+		struct oidset_iter iter;
+		const struct object_id *oid;
+
+		oidset_iter_init(&blob_oids, &iter);
+		while ((oid = oidset_iter_next(&iter)))
+			oid_array_append(&to_fetch, oid);
+
+		promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);
+
+		oid_array_clear(&to_fetch);
+	}
+	oidset_clear(&blob_oids);
+}
+
 static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
 		       struct object *obj, const char *name, const char *path)
 {
@@ -732,6 +872,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
 	int hit = 0;
 	const unsigned int nr = list->nr;
 
+	prefetch_grep_blobs(opt, pathspec, list);
+
 	for (i = 0; i < nr; i++) {
 		struct object *real_obj;
 
diff --git a/t/t7810-grep.sh b/t/t7810-grep.sh
index 64ac4f04ee..1f484502fe 100755
--- a/t/t7810-grep.sh
+++ b/t/t7810-grep.sh
@@ -1929,4 +1929,39 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
 	test_cmp expected actual
 '
 
+test_expect_success 'grep of revision in partial clone does bulk prefetch' '
+	test_when_finished "rm -rf grep-partial-src grep-partial" &&
+
+	git init grep-partial-src &&
+	(
+		cd grep-partial-src &&
+		git config uploadpack.allowfilter 1 &&
+		git config uploadpack.allowanysha1inwant 1 &&
+		echo "needle in haystack" >searchme &&
+		echo "no match here" >other &&
+		mkdir subdir &&
+		echo "needle again" >subdir/deep &&
+		git add . &&
+		git commit -m "initial"
+	) &&
+
+	git clone --no-checkout --filter=blob:none \
+		"file://$(pwd)/grep-partial-src" grep-partial &&
+
+	# All blobs should be missing after a blobless clone.
+	git -C grep-partial rev-list --quiet --objects \
+		--missing=print HEAD >missing &&
+	test_line_count = 3 missing &&
+
+	# grep HEAD should batch-prefetch all blobs in one request.
+	GIT_TRACE2_EVENT="$(pwd)/grep-trace" \
+		git -C grep-partial grep -c "needle" HEAD >result &&
+
+	# Should find matches in two files.
+	test_line_count = 2 result &&
+
+	# Should have prefetched all 3 objects at once
+	test_trace2_data promisor fetch_count 3 <grep-trace
+'
+
 test_done
-- 
gitgitgadget

  parent reply	other threads:[~2026-04-18  0:32 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-16 22:48 [PATCH 0/3] Batch prefetching Elijah Newren via GitGitGadget
2026-04-16 22:48 ` [PATCH 1/3] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-04-16 22:48 ` [PATCH 2/3] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-04-17 21:42   ` Junio C Hamano
2026-04-17 22:02     ` Elijah Newren
2026-04-16 22:48 ` [PATCH 3/3] grep: prefetch necessary blobs Elijah Newren via GitGitGadget
2026-04-18  0:32 ` [PATCH v2 0/3] Batch prefetching Elijah Newren via GitGitGadget
2026-04-18  0:32   ` [PATCH v2 1/3] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-04-18  0:32   ` [PATCH v2 2/3] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-04-19 14:04     ` Phillip Wood
2026-04-21 21:28       ` Elijah Newren
2026-04-23 15:15         ` Phillip Wood
2026-04-23 17:38           ` Elijah Newren
2026-04-27 13:16     ` Derrick Stolee
2026-05-11  2:51       ` Junio C Hamano
2026-05-11 17:45         ` Elijah Newren
2026-05-13 23:17       ` Elijah Newren
2026-04-18  0:32   ` Elijah Newren via GitGitGadget [this message]
2026-04-27 12:59     ` [PATCH v2 3/3] grep: prefetch necessary blobs Derrick Stolee
2026-05-13 19:21       ` Elijah Newren
2026-05-14 16:25   ` [PATCH v3 0/4] Batch prefetching Elijah Newren via GitGitGadget
2026-05-14 16:25     ` [PATCH v3 1/4] promisor-remote: document caller filtering contract Elijah Newren via GitGitGadget
2026-05-14 16:25     ` [PATCH v3 2/4] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-05-14 16:25     ` [PATCH v3 3/4] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-05-14 16:25     ` [PATCH v3 4/4] grep: prefetch necessary blobs Elijah Newren via GitGitGadget

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8fbfe69bc4d0c6166967986f24861ffa393ed7cf.1776472347.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=newren@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox