All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Elijah Newren via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: Elijah Newren <newren@gmail.com>, Elijah Newren <newren@gmail.com>
Subject: [PATCH 3/3] grep: prefetch necessary blobs
Date: Thu, 16 Apr 2026 22:48:14 +0000	[thread overview]
Message-ID: <6dbfc7608b7707decf9c036fade5d0fe25459aa8.1776379694.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.2089.git.1776379694.gitgitgadget@gmail.com>

From: Elijah Newren <newren@gmail.com>

In partial clones, `git grep` fetches necessary blobs on-demand one
at a time, which can be very slow.  In partial clones, add an extra
preliminary walk over the tree similar to grep_tree() which collects
the blobs of interest, and then prefetches them.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 builtin/grep.c  | 142 ++++++++++++++++++++++++++++++++++++++++++++++++
 t/t7810-grep.sh |  35 ++++++++++++
 2 files changed, 177 insertions(+)

diff --git a/builtin/grep.c b/builtin/grep.c
index e33285e5e6..d559c48d94 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -28,9 +28,12 @@
 #include "object-file.h"
 #include "object-name.h"
 #include "odb.h"
+#include "oid-array.h"
+#include "oidset.h"
 #include "packfile.h"
 #include "pager.h"
 #include "path.h"
+#include "promisor-remote.h"
 #include "read-cache-ll.h"
 #include "write-or-die.h"
 
@@ -692,6 +695,143 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
 	return hit;
 }
 
+static void collect_blob_oids_for_tree(struct repository *repo,
+				       const struct pathspec *pathspec,
+				       struct tree_desc *tree,
+				       struct strbuf *base,
+				       int tn_len,
+				       struct oidset *blob_oids)
+{
+	struct name_entry entry;
+	int old_baselen = base->len;
+	struct strbuf name = STRBUF_INIT;
+	enum interesting match = entry_not_interesting;
+
+	while (tree_entry(tree, &entry)) {
+		if (match != all_entries_interesting) {
+			strbuf_addstr(&name, base->buf + tn_len);
+			match = tree_entry_interesting(repo->index,
+						       &entry, &name,
+						       pathspec);
+			strbuf_reset(&name);
+
+			if (match == all_entries_not_interesting)
+				break;
+			if (match == entry_not_interesting)
+				continue;
+		}
+
+		strbuf_add(base, entry.path, tree_entry_len(&entry));
+
+		if (S_ISREG(entry.mode)) {
+			oidset_insert(blob_oids, &entry.oid);
+		} else if (S_ISDIR(entry.mode)) {
+			enum object_type type;
+			struct tree_desc sub_tree;
+			void *data;
+			unsigned long size;
+
+			data = odb_read_object(repo->objects, &entry.oid,
+					       &type, &size);
+			if (!data)
+				die(_("unable to read tree (%s)"),
+				    oid_to_hex(&entry.oid));
+
+			strbuf_addch(base, '/');
+			init_tree_desc(&sub_tree, &entry.oid, data, size);
+			collect_blob_oids_for_tree(repo, pathspec, &sub_tree,
+						   base, tn_len, blob_oids);
+			free(data);
+		}
+		/*
+		 * ...no else clause for S_ISGITLINK: submodules have their
+		 * own promisor configuration and would need separate fetches
+		 * anyway.
+		 */
+
+		strbuf_setlen(base, old_baselen);
+	}
+
+	strbuf_release(&name);
+}
+
+static void collect_blob_oids_for_treeish(struct grep_opt *opt,
+					  const struct pathspec *pathspec,
+					  const struct object_id *tree_ish_oid,
+					  const char *name,
+					  struct oidset *blob_oids)
+{
+	struct tree_desc tree;
+	void *data;
+	unsigned long size;
+	struct strbuf base = STRBUF_INIT;
+	int len;
+
+	data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
+				      OBJ_TREE, &size, NULL);
+
+	if (!data)
+		return;
+
+	len = name ? strlen(name) : 0;
+	if (len) {
+		strbuf_add(&base, name, len);
+		strbuf_addch(&base, ':');
+	}
+	init_tree_desc(&tree, tree_ish_oid, data, size);
+
+	collect_blob_oids_for_tree(opt->repo, pathspec, &tree,
+				   &base, base.len, blob_oids);
+
+	strbuf_release(&base);
+	free(data);
+}
+
+static void prefetch_grep_blobs(struct grep_opt *opt,
+				const struct pathspec *pathspec,
+				const struct object_array *list)
+{
+	struct oidset blob_oids = OIDSET_INIT;
+
+	/* Exit if we're not in a partial clone */
+	if (!repo_has_promisor_remote(opt->repo))
+		return;
+
+	/* For each tree, gather the blobs in it */
+	for (int i = 0; i < list->nr; i++) {
+		struct object *real_obj;
+
+		obj_read_lock();
+		real_obj = deref_tag(opt->repo, list->objects[i].item,
+				     NULL, 0);
+		obj_read_unlock();
+
+		if (real_obj &&
+		    (real_obj->type == OBJ_COMMIT ||
+		     real_obj->type == OBJ_TREE))
+			collect_blob_oids_for_treeish(opt, pathspec,
+						      &real_obj->oid,
+						      list->objects[i].name,
+						      &blob_oids);
+	}
+
+	/* Prefetch the blobs we found */
+	if (oidset_size(&blob_oids)) {
+		struct oid_array to_fetch = OID_ARRAY_INIT;
+		struct oidset_iter iter;
+		const struct object_id *oid;
+
+		oidset_iter_init(&blob_oids, &iter);
+		while ((oid = oidset_iter_next(&iter)))
+			oid_array_append(&to_fetch, oid);
+
+		promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);
+
+		oid_array_clear(&to_fetch);
+	}
+	oidset_clear(&blob_oids);
+}
+
 static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
 		       struct object *obj, const char *name, const char *path)
 {
@@ -732,6 +872,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
 	int hit = 0;
 	const unsigned int nr = list->nr;
 
+	prefetch_grep_blobs(opt, pathspec, list);
+
 	for (i = 0; i < nr; i++) {
 		struct object *real_obj;
 
diff --git a/t/t7810-grep.sh b/t/t7810-grep.sh
index 64ac4f04ee..1f484502fe 100755
--- a/t/t7810-grep.sh
+++ b/t/t7810-grep.sh
@@ -1929,4 +1929,39 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
 	test_cmp expected actual
 '
 
+test_expect_success 'grep of revision in partial clone does bulk prefetch' '
+	test_when_finished "rm -rf grep-partial-src grep-partial" &&
+
+	git init grep-partial-src &&
+	(
+		cd grep-partial-src &&
+		git config uploadpack.allowfilter 1 &&
+		git config uploadpack.allowanysha1inwant 1 &&
+		echo "needle in haystack" >searchme &&
+		echo "no match here" >other &&
+		mkdir subdir &&
+		echo "needle again" >subdir/deep &&
+		git add . &&
+		git commit -m "initial"
+	) &&
+
+	git clone --no-checkout --filter=blob:none \
+		"file://$(pwd)/grep-partial-src" grep-partial &&
+
+	# All blobs should be missing after a blobless clone.
+	git -C grep-partial rev-list --quiet --objects \
+		--missing=print HEAD >missing &&
+	test_line_count = 3 missing &&
+
+	# grep HEAD should batch-prefetch all blobs in one request.
+	GIT_TRACE2_EVENT="$(pwd)/grep-trace" \
+		git -C grep-partial grep -c "needle" HEAD >result &&
+
+	# Should find matches in two files.
+	test_line_count = 2 result &&
+
+	# Should have prefetched all 3 objects at once
+	test_trace2_data promisor fetch_count 3 <grep-trace
+'
+
 test_done
-- 
gitgitgadget

  parent reply	other threads:[~2026-04-16 22:48 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-16 22:48 [PATCH 0/3] Batch prefetching Elijah Newren via GitGitGadget
2026-04-16 22:48 ` [PATCH 1/3] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-04-16 22:48 ` [PATCH 2/3] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-04-17 21:42   ` Junio C Hamano
2026-04-17 22:02     ` Elijah Newren
2026-04-16 22:48 ` Elijah Newren via GitGitGadget [this message]
2026-04-18  0:32 ` [PATCH v2 0/3] Batch prefetching Elijah Newren via GitGitGadget
2026-04-18  0:32   ` [PATCH v2 1/3] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-04-18  0:32   ` [PATCH v2 2/3] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-04-19 14:04     ` Phillip Wood
2026-04-21 21:28       ` Elijah Newren
2026-04-23 15:15         ` Phillip Wood
2026-04-23 17:38           ` Elijah Newren
2026-04-27 13:16     ` Derrick Stolee
2026-05-11  2:51       ` Junio C Hamano
2026-05-11 17:45         ` Elijah Newren
2026-05-13 23:17       ` Elijah Newren
2026-04-18  0:32   ` [PATCH v2 3/3] grep: prefetch necessary blobs Elijah Newren via GitGitGadget
2026-04-27 12:59     ` Derrick Stolee
2026-05-13 19:21       ` Elijah Newren
2026-05-14 16:25   ` [PATCH v3 0/4] Batch prefetching Elijah Newren via GitGitGadget
2026-05-14 16:25     ` [PATCH v3 1/4] promisor-remote: document caller filtering contract Elijah Newren via GitGitGadget
2026-05-14 16:25     ` [PATCH v3 2/4] patch-ids.h: add missing trailing parenthesis in documentation comment Elijah Newren via GitGitGadget
2026-05-14 16:25     ` [PATCH v3 3/4] builtin/log: prefetch necessary blobs for `git cherry` Elijah Newren via GitGitGadget
2026-05-14 16:25     ` [PATCH v3 4/4] grep: prefetch necessary blobs Elijah Newren via GitGitGadget

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6dbfc7608b7707decf9c036fade5d0fe25459aa8.1776379694.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=newren@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.