git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Patrick Steinhardt <ps@pks.im>
To: Aaron Plattner <aplattner@nvidia.com>
Cc: git@vger.kernel.org
Subject: Re: [PATCH] packfile: skip decompressing and hashing blobs in add_promisor_object()
Date: Fri, 5 Dec 2025 13:36:33 +0100	[thread overview]
Message-ID: <aTLR0YXqRO63GkJu@pks.im> (raw)
In-Reply-To: <20251204172132.319360-1-aplattner@nvidia.com>

On Thu, Dec 04, 2025 at 09:21:29AM -0800, Aaron Plattner wrote:
> When is_promisor_object() is called for the first time, it lazily
> initializes a set of all promisor objects by iterating through all
> objects in promisor packs. For each object, add_promisor_object() calls
> parse_object(), which decompresses and hashes the entire object.
> 
> For repositories with large pack files, this can take an extremely long
> time. For example, on a production repository with a 176 GB promisor
> pack:
> 
>  $ time ~/git/git/git-rev-list --objects --all --exclude-promisor-objects --quiet
>  ________________________________________________________
>  Executed in   76.10 mins    fish           external
>     usr time   72.10 mins    1.83 millis   72.10 mins
>     sys time    3.56 mins    0.17 millis    3.56 mins
> 
> add_promisor_object() needs the full object for trees, commits, and
> tags. But blobs contain no references to other objects, so the function
> can just insert their oids into the set and move on.
> 
> For objects that weren't already parsed, use odb_read_object_info() to
> query the object type. If it's a blob, just insert it into the oidset
> without parsing it. This improves performance for very large pack files
> significantly:
> 
>  $ time ~/git/git/git-rev-list --objects --all --exclude-promisor-objects --quiet
>  ________________________________________________________
>  Executed in  118.76 secs    fish           external
>     usr time   50.88 secs   11.02 millis   50.87 secs
>     sys time   36.31 secs    0.08 millis   36.31 secs
> 
> Signed-off-by: Aaron Plattner <aplattner@nvidia.com>
> ---
>  packfile.c | 11 +++++++++++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/packfile.c b/packfile.c
> index 9cc11b6dc5..563fd14f0e 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -2309,6 +2309,17 @@ static int add_promisor_object(const struct object_id *oid,
>  	if (obj && obj->parsed) {
>  		we_parsed_object = 0;
>  	} else {
> +		/*
> +		 * Blobs don't reference other objects, so skip parsing them
> +		 * to save time.
> +		 */
> +		enum object_type type;
> +		type = odb_read_object_info(pack->repo->objects, oid, NULL);
> +		if (type == OBJ_BLOB) {
> +			oidset_insert(set, oid);
> +			return 0;
> +		}
> +
>  		we_parsed_object = 1;
>  		obj = parse_object(pack->repo, oid);
>  	}

This looks like an obvious improvement to me. While looking at the
function I noticed that we also free tree buffers here, which make
sense to reduce memory usage. I was wondering whether we want to do
the same for commit buffers, which may reduce memory usage even further.

See for example the below patch. Might be wort it to test in your large
repository to see whether it has an effect on the maximum RSS.

Thanks!

Patrick

diff --git a/packfile.c b/packfile.c
index 9cc11b6dc5..89227ada98 100644
--- a/packfile.c
+++ b/packfile.c
@@ -2296,12 +2296,17 @@ int for_each_packed_object(struct repository *repo, each_packed_object_fn cb,
 	return r ? r : pack_errors;
 }
 
+struct add_promisor_object_data {
+	struct repository *repo;
+	struct oidset promisor_objects;
+};
+
 static int add_promisor_object(const struct object_id *oid,
 			       struct packed_git *pack,
 			       uint32_t pos UNUSED,
-			       void *set_)
+			       void *cb_data)
 {
-	struct oidset *set = set_;
+	struct add_promisor_object_data *data = cb_data;
 	struct object *obj;
 	int we_parsed_object;
 
@@ -2316,7 +2321,7 @@ static int add_promisor_object(const struct object_id *oid,
 	if (!obj)
 		return 1;
 
-	oidset_insert(set, oid);
+	oidset_insert(&data->promisor_objects, oid);
 
 	/*
 	 * If this is a tree, commit, or tag, the objects it refers
@@ -2334,38 +2339,45 @@ static int add_promisor_object(const struct object_id *oid,
 			 */
 			return 0;
 		while (tree_entry_gently(&desc, &entry))
-			oidset_insert(set, &entry.oid);
+			oidset_insert(&data->promisor_objects, &entry.oid);
 		if (we_parsed_object)
 			free_tree_buffer(tree);
 	} else if (obj->type == OBJ_COMMIT) {
 		struct commit *commit = (struct commit *) obj;
 		struct commit_list *parents = commit->parents;
 
-		oidset_insert(set, get_commit_tree_oid(commit));
+		oidset_insert(&data->promisor_objects, get_commit_tree_oid(commit));
 		for (; parents; parents = parents->next)
-			oidset_insert(set, &parents->item->object.oid);
+			oidset_insert(&data->promisor_objects, &parents->item->object.oid);
+
+		if (we_parsed_object)
+			free_commit_buffer(data->repo->parsed_objects, commit);
 	} else if (obj->type == OBJ_TAG) {
 		struct tag *tag = (struct tag *) obj;
-		oidset_insert(set, get_tagged_oid(tag));
+		oidset_insert(&data->promisor_objects, get_tagged_oid(tag));
 	}
 	return 0;
 }
 
 int is_promisor_object(struct repository *r, const struct object_id *oid)
 {
-	static struct oidset promisor_objects;
+	static struct add_promisor_object_data data = {
+		.promisor_objects = OIDSET_INIT,
+	};
 	static int promisor_objects_prepared;
 
 	if (!promisor_objects_prepared) {
+		data.repo = r;
 		if (repo_has_promisor_remote(r)) {
 			for_each_packed_object(r, add_promisor_object,
-					       &promisor_objects,
+					       &data,
 					       FOR_EACH_OBJECT_PROMISOR_ONLY |
 					       FOR_EACH_OBJECT_PACK_ORDER);
 		}
 		promisor_objects_prepared = 1;
+		data.repo = NULL;
 	}
-	return oidset_contains(&promisor_objects, oid);
+	return oidset_contains(&data.promisor_objects, oid);
 }
 
 int parse_pack_header_option(const char *in, unsigned char *out, unsigned int *len)

  reply	other threads:[~2025-12-05 12:36 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-04 17:21 [PATCH] packfile: skip decompressing and hashing blobs in add_promisor_object() Aaron Plattner
2025-12-05 12:36 ` Patrick Steinhardt [this message]
2025-12-05 16:55   ` Aaron Plattner
2025-12-05 17:59     ` Jeff King
2025-12-05 17:48 ` Jeff King
2025-12-05 18:01   ` Jeff King
2025-12-05 18:50     ` Aaron Plattner
2025-12-05 21:28       ` Jeff King
2025-12-05 21:56         ` Aaron Plattner
2025-12-06  1:58           ` Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aTLR0YXqRO63GkJu@pks.im \
    --to=ps@pks.im \
    --cc=aplattner@nvidia.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).