[PATCH 0/6] odb: introduce generic object counting

public inbox for git@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/6] odb: introduce generic object counting
@ 2026-03-10 15:18 Patrick Steinhardt
  2026-03-10 15:18 ` [PATCH 1/6] odb: stop including "odb/source.h" Patrick Steinhardt
                   ` (6 more replies)
  0 siblings, 7 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-10 15:18 UTC (permalink / raw)
  To: git; +Cc: Justin Tobler

Hi,

this small patch series introduces generic object counting for pluggable
object databases. The series is built on top of d181b9354c (The 13th
batch, 2026-03-09) with ps/odb-sources at d6fc6fe6f8 (odb/source: make
`begin_transaction()` function pluggable, 2026-03-05) merged into it.

Thanks!

Patrick

---
Patrick Steinhardt (6):
      odb: stop including "odb/source.h"
      packfile: extract logic to count number of objects
      object-file: extract logic to approximate object count
      object-file: generalize counting objects
      odb/source: introduce generic object counting
      odb: introduce generic object counting

 builtin/gc.c                | 44 +++++++++----------------
 builtin/multi-pack-index.c  |  1 +
 builtin/submodule--helper.c |  1 +
 commit-graph.c              |  3 +-
 object-file.c               | 57 ++++++++++++++++++++++++++++++++
 object-file.h               | 14 ++++++++
 object-name.c               |  6 +++-
 odb.c                       | 37 ++++++++++++++++++++-
 odb.h                       | 76 +++++++++++++++++++++++++++++++++++++++++--
 odb/source-files.c          | 30 +++++++++++++++++
 odb/source.h                | 79 ++++++++++++++++-----------------------------
 odb/streaming.c             |  1 +
 packfile.c                  | 48 +++++++++++++--------------
 packfile.h                  | 16 +++++----
 repository.c                |  1 +
 submodule-config.c          |  1 +
 tmp-objdir.c                |  1 +
 17 files changed, 299 insertions(+), 117 deletions(-)


---
base-commit: 2247f478a898a7f8f8322cc51bdeb1cc773d8f4a
change-id: 20260224-b4-pks-odb-source-count-objects-479fe682cf6f


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/6] odb: stop including "odb/source.h"
  2026-03-10 15:18 [PATCH 0/6] odb: introduce generic object counting Patrick Steinhardt
@ 2026-03-10 15:18 ` Patrick Steinhardt
  2026-03-10 15:18 ` [PATCH 2/6] packfile: extract logic to count number of objects Patrick Steinhardt
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-10 15:18 UTC (permalink / raw)
  To: git; +Cc: Justin Tobler

The "odb.h" header currently includes the "odb/source.h" file. This is
somewhat roundabout though: most callers shouldn't have to care about
the `struct odb_source`, but should rather use the ODB-level functions.
Furthermore, it means that a couple of definitions have to live on the
source level even though they should be part of the generic interface.

Reverse the relation between "odb/source.h" and "odb.h" and move the
enums and typedefs that relate to the generic interfaces back into
"odb.h". Add the necessary includes to all files that rely on the
transitive include.

Suggested-by: Justin Tobler <jltobler@gmail.com>
Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/multi-pack-index.c  |  1 +
 builtin/submodule--helper.c |  1 +
 odb.h                       | 50 ++++++++++++++++++++++++++++++++++++++++++-
 odb/source.h                | 52 +--------------------------------------------
 odb/streaming.c             |  1 +
 repository.c                |  1 +
 submodule-config.c          |  1 +
 tmp-objdir.c                |  1 +
 8 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c
index 5f364aa816..3fcb207f1a 100644
--- a/builtin/multi-pack-index.c
+++ b/builtin/multi-pack-index.c
@@ -9,6 +9,7 @@
 #include "strbuf.h"
 #include "trace2.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "replace-object.h"
 #include "repository.h"
 
diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
index 143f7cb3cc..4957487536 100644
--- a/builtin/submodule--helper.c
+++ b/builtin/submodule--helper.c
@@ -29,6 +29,7 @@
 #include "object-file.h"
 #include "object-name.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "advice.h"
 #include "branch.h"
 #include "list-objects-filter-options.h"
diff --git a/odb.h b/odb.h
index 86e0365c24..7a583e3873 100644
--- a/odb.h
+++ b/odb.h
@@ -3,7 +3,6 @@
 
 #include "hashmap.h"
 #include "object.h"
-#include "odb/source.h"
 #include "oidset.h"
 #include "oidmap.h"
 #include "string-list.h"
@@ -12,6 +11,7 @@
 struct oidmap;
 struct oidtree;
 struct strbuf;
+struct strvec;
 struct repository;
 struct multi_pack_index;
 
@@ -339,6 +339,42 @@ struct object_info {
  */
 #define OBJECT_INFO_INIT { 0 }
 
+/* Flags that can be passed to `odb_read_object_info_extended()`. */
+enum object_info_flags {
+	/* Invoke lookup_replace_object() on the given hash. */
+	OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
+
+	/* Do not reprepare object sources when the first lookup has failed. */
+	OBJECT_INFO_QUICK = (1 << 1),
+
+	/*
+	 * Do not attempt to fetch the object if missing (even if fetch_is_missing is
+	 * nonzero).
+	 */
+	OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
+
+	/* Die if object corruption (not just an object being missing) was detected. */
+	OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
+
+	/*
+	 * We have already tried reading the object, but it couldn't be found
+	 * via any of the attached sources, and are now doing a second read.
+	 * This second read asks the individual sources to also evaluate
+	 * whether any on-disk state may have changed that may have caused the
+	 * object to appear.
+	 *
+	 * This flag is for internal use, only. The second read only occurs
+	 * when `OBJECT_INFO_QUICK` was not passed.
+	 */
+	OBJECT_INFO_SECOND_READ = (1 << 4),
+
+	/*
+	 * This is meant for bulk prefetching of missing blobs in a partial
+	 * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
+	 */
+	OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
+};
+
 /*
  * Read object info from the object database and populate the `object_info`
  * structure. Returns 0 on success, a negative error code otherwise.
@@ -432,6 +468,18 @@ enum odb_for_each_object_flags {
 	ODB_FOR_EACH_OBJECT_SKIP_ON_DISK_KEPT_PACKS = (1<<4),
 };
 
+/*
+ * A callback function that can be used to iterate through objects. If given,
+ * the optional `oi` parameter will be populated the same as if you would call
+ * `odb_read_object_info()`.
+ *
+ * Returning a non-zero error code will cause iteration to abort. The error
+ * code will be propagated.
+ */
+typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
+				      struct object_info *oi,
+				      void *cb_data);
+
 /*
  * Iterate through all objects contained in the object database. Note that
  * objects may be iterated over multiple times in case they are either stored
diff --git a/odb/source.h b/odb/source.h
index caac558149..a1fd9dd920 100644
--- a/odb/source.h
+++ b/odb/source.h
@@ -2,6 +2,7 @@
 #define ODB_SOURCE_H
 
 #include "object.h"
+#include "odb.h"
 
 enum odb_source_type {
 	/*
@@ -14,61 +15,10 @@ enum odb_source_type {
 	ODB_SOURCE_FILES,
 };
 
-/* Flags that can be passed to `odb_read_object_info_extended()`. */
-enum object_info_flags {
-	/* Invoke lookup_replace_object() on the given hash. */
-	OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
-
-	/* Do not reprepare object sources when the first lookup has failed. */
-	OBJECT_INFO_QUICK = (1 << 1),
-
-	/*
-	 * Do not attempt to fetch the object if missing (even if fetch_is_missing is
-	 * nonzero).
-	 */
-	OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
-
-	/* Die if object corruption (not just an object being missing) was detected. */
-	OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
-
-	/*
-	 * We have already tried reading the object, but it couldn't be found
-	 * via any of the attached sources, and are now doing a second read.
-	 * This second read asks the individual sources to also evaluate
-	 * whether any on-disk state may have changed that may have caused the
-	 * object to appear.
-	 *
-	 * This flag is for internal use, only. The second read only occurs
-	 * when `OBJECT_INFO_QUICK` was not passed.
-	 */
-	OBJECT_INFO_SECOND_READ = (1 << 4),
-
-	/*
-	 * This is meant for bulk prefetching of missing blobs in a partial
-	 * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
-	 */
-	OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
-};
-
 struct object_id;
-struct object_info;
 struct odb_read_stream;
-struct odb_transaction;
-struct odb_write_stream;
 struct strvec;
 
-/*
- * A callback function that can be used to iterate through objects. If given,
- * the optional `oi` parameter will be populated the same as if you would call
- * `odb_read_object_info()`.
- *
- * Returning a non-zero error code will cause iteration to abort. The error
- * code will be propagated.
- */
-typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
-				      struct object_info *oi,
-				      void *cb_data);
-
 /*
  * The source is the part of the object database that stores the actual
  * objects. It thus encapsulates the logic to read and write the specific
diff --git a/odb/streaming.c b/odb/streaming.c
index a4355cd245..5927a12954 100644
--- a/odb/streaming.c
+++ b/odb/streaming.c
@@ -7,6 +7,7 @@
 #include "environment.h"
 #include "repository.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "odb/streaming.h"
 #include "replace-object.h"
 
diff --git a/repository.c b/repository.c
index e7fa42c14f..05c26bdbc3 100644
--- a/repository.c
+++ b/repository.c
@@ -2,6 +2,7 @@
 #include "abspath.h"
 #include "repository.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "config.h"
 #include "object.h"
 #include "lockfile.h"
diff --git a/submodule-config.c b/submodule-config.c
index 1f19fe2077..72a46b7a54 100644
--- a/submodule-config.c
+++ b/submodule-config.c
@@ -14,6 +14,7 @@
 #include "strbuf.h"
 #include "object-name.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "parse-options.h"
 #include "thread-utils.h"
 #include "tree-walk.h"
diff --git a/tmp-objdir.c b/tmp-objdir.c
index e436eed07e..d199d39e7c 100644
--- a/tmp-objdir.c
+++ b/tmp-objdir.c
@@ -11,6 +11,7 @@
 #include "strvec.h"
 #include "quote.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "repository.h"
 
 struct tmp_objdir {

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 2/6] packfile: extract logic to count number of objects
  2026-03-10 15:18 [PATCH 0/6] odb: introduce generic object counting Patrick Steinhardt
  2026-03-10 15:18 ` [PATCH 1/6] odb: stop including "odb/source.h" Patrick Steinhardt
@ 2026-03-10 15:18 ` Patrick Steinhardt
  2026-03-11 12:41   ` Toon Claes
  2026-03-10 15:18 ` [PATCH 3/6] object-file: extract logic to approximate object count Patrick Steinhardt
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-10 15:18 UTC (permalink / raw)
  To: git

In a subsequent commit we're about to introduce a new
`odb_source_count_objects()` function so that we can make the logic
pluggable. Prepare for this change by extracting the logic that we have
to count packed objects into a standalone function.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 packfile.c | 45 +++++++++++++++++++++++++++++++++++----------
 packfile.h |  9 +++++++++
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/packfile.c b/packfile.c
index 215a23e42b..1ee5dd3da3 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1101,6 +1101,36 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
 	return store->packs.head;
 }
 
+int packfile_store_count_objects(struct packfile_store *store,
+				 unsigned long *out)
+{
+	struct packfile_list_entry *e;
+	struct multi_pack_index *m;
+	unsigned long count = 0;
+	int ret;
+
+	m = get_multi_pack_index(store->source);
+	if (m)
+		count += m->num_objects + m->num_objects_in_base;
+
+	for (e = packfile_store_get_packs(store); e; e = e->next) {
+		if (e->pack->multi_pack_index)
+			continue;
+		if (open_pack_index(e->pack)) {
+			ret = -1;
+			goto out;
+		}
+
+		count += e->pack->num_objects;
+	}
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
+}
+
 /*
  * Give a fast, rough count of the number of objects in the repository. This
  * ignores loose objects completely. If you have a lot of them, then either
@@ -1113,21 +1143,16 @@ unsigned long repo_approximate_object_count(struct repository *r)
 	if (!r->objects->approximate_object_count_valid) {
 		struct odb_source *source;
 		unsigned long count = 0;
-		struct packed_git *p;
 
 		odb_prepare_alternates(r->objects);
-
 		for (source = r->objects->sources; source; source = source->next) {
-			struct multi_pack_index *m = get_multi_pack_index(source);
-			if (m)
-				count += m->num_objects + m->num_objects_in_base;
-		}
+			struct odb_source_files *files = odb_source_files_downcast(source);
+			unsigned long c;
 
-		repo_for_each_pack(r, p) {
-			if (p->multi_pack_index || open_pack_index(p))
-				continue;
-			count += p->num_objects;
+			if (!packfile_store_count_objects(files->packed, &c))
+				count += c;
 		}
+
 		r->objects->approximate_object_count = count;
 		r->objects->approximate_object_count_valid = 1;
 	}
diff --git a/packfile.h b/packfile.h
index 8b04a258a7..1da8c729cb 100644
--- a/packfile.h
+++ b/packfile.h
@@ -268,6 +268,15 @@ enum kept_pack_type {
 	KEPT_PACK_IN_CORE = (1 << 1),
 };
 
+/*
+ * Count the number objects contained in the given packfile store. If
+ * successful, the number of objects will be written to the `out` pointer.
+ *
+ * Return 0 on success, a negative error code otherwise.
+ */
+int packfile_store_count_objects(struct packfile_store *store,
+				 unsigned long *out);
+
 /*
  * Retrieve the cache of kept packs from the given packfile store. Accepts a
  * combination of `kept_pack_type` flags. The cache is computed on demand and

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 3/6] object-file: extract logic to approximate object count
  2026-03-10 15:18 [PATCH 0/6] odb: introduce generic object counting Patrick Steinhardt
  2026-03-10 15:18 ` [PATCH 1/6] odb: stop including "odb/source.h" Patrick Steinhardt
  2026-03-10 15:18 ` [PATCH 2/6] packfile: extract logic to count number of objects Patrick Steinhardt
@ 2026-03-10 15:18 ` Patrick Steinhardt
  2026-03-10 17:44   ` Junio C Hamano
  2026-03-11 12:47   ` Toon Claes
  2026-03-10 15:18 ` [PATCH 4/6] object-file: generalize counting objects Patrick Steinhardt
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-10 15:18 UTC (permalink / raw)
  To: git

In "builtin/gc.c" we have some logic that checks whether we need to
repack objects. This is done by counting the number of objects that we
have and checking whether it exceeds a certain threshold. We don't
really need an accurate object count though, which is why we only
open a single object diretcroy shard and then extrapolate from there.

Extract this logic into a new function that is owned by the loose object
database source. This is done to prepare for a subsequent change, where
we'll introduce object counting on the object database source level.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/gc.c  | 37 +++++++++----------------------------
 object-file.c | 41 +++++++++++++++++++++++++++++++++++++++++
 object-file.h | 13 +++++++++++++
 3 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/builtin/gc.c b/builtin/gc.c
index fb329c2cff..a08c7554cb 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -467,37 +467,18 @@ static int rerere_gc_condition(struct gc_config *cfg UNUSED)
 static int too_many_loose_objects(int limit)
 {
 	/*
-	 * Quickly check if a "gc" is needed, by estimating how
-	 * many loose objects there are.  Because SHA-1 is evenly
-	 * distributed, we can check only one and get a reasonable
-	 * estimate.
+	 * This is weird, but stems from legacy behaviour: the GC auto
+	 * threshold was always essentially interpreted as if it was rounded up
+	 * to the next multiple 256 of, so we retain this behaviour for now.
 	 */
-	DIR *dir;
-	struct dirent *ent;
-	int auto_threshold;
-	int num_loose = 0;
-	int needed = 0;
-	const unsigned hexsz_loose = the_hash_algo->hexsz - 2;
-	char *path;
-
-	path = repo_git_path(the_repository, "objects/17");
-	dir = opendir(path);
-	free(path);
-	if (!dir)
+	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
+	unsigned long loose_count;
+
+	if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
+						      &loose_count) < 0)
 		return 0;
 
-	auto_threshold = DIV_ROUND_UP(limit, 256);
-	while ((ent = readdir(dir)) != NULL) {
-		if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose ||
-		    ent->d_name[hexsz_loose] != '\0')
-			continue;
-		if (++num_loose > auto_threshold) {
-			needed = 1;
-			break;
-		}
-	}
-	closedir(dir);
-	return needed;
+	return loose_count > auto_threshold;
 }
 
 static struct packed_git *find_base_packs(struct string_list *packs,
diff --git a/object-file.c b/object-file.c
index a3ff7f586c..da67e3c9ff 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source,
 					     NULL, NULL, &data);
 }
 
+int odb_source_loose_approximate_object_count(struct odb_source *source,
+					      unsigned long *out)
+{
+	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
+	unsigned long count = 0;
+	struct dirent *ent;
+	char *path = NULL;
+	DIR *dir = NULL;
+	int ret;
+
+	path = xstrfmt("%s/17", source->path);
+
+	dir = opendir(path);
+	if (!dir) {
+		if (errno == ENOENT) {
+			*out = 0;
+			ret = 0;
+			goto out;
+		}
+
+		ret = error_errno("cannot open object shard '%s'", path);
+		goto out;
+	}
+
+	while ((ent = readdir(dir)) != NULL) {
+		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
+		    ent->d_name[hexsz] != '\0')
+			continue;
+		count++;
+	}
+
+	*out = count * 256;
+	ret = 0;
+
+out:
+	if (dir)
+		closedir(dir);
+	free(path);
+	return ret;
+}
+
 static int append_loose_object(const struct object_id *oid,
 			       const char *path UNUSED,
 			       void *data)
diff --git a/object-file.h b/object-file.h
index ff6da65296..b870ea9fa8 100644
--- a/object-file.h
+++ b/object-file.h
@@ -139,6 +139,19 @@ int odb_source_loose_for_each_object(struct odb_source *source,
 				     void *cb_data,
 				     unsigned flags);
 
+/*
+ * Count the number of loose objects in this source.
+ *
+ * The object count is approximated by opening a single sharding directory for
+ * loose objects and scanning its contents. The result is then extrapolated by
+ * 256. This should generally work as a reasonable estimate given that the
+ * object hash is supposed to be indistinguishable from random.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+int odb_source_loose_approximate_object_count(struct odb_source *source,
+					      unsigned long *out);
+
 /**
  * format_object_header() is a thin wrapper around s xsnprintf() that
  * writes the initial "<type> <obj-len>" part of the loose object

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 4/6] object-file: generalize counting objects
  2026-03-10 15:18 [PATCH 0/6] odb: introduce generic object counting Patrick Steinhardt
                   ` (2 preceding siblings ...)
  2026-03-10 15:18 ` [PATCH 3/6] object-file: extract logic to approximate object count Patrick Steinhardt
@ 2026-03-10 15:18 ` Patrick Steinhardt
  2026-03-11 13:53   ` Toon Claes
  2026-03-10 15:18 ` [PATCH 5/6] odb/source: introduce generic object counting Patrick Steinhardt
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-10 15:18 UTC (permalink / raw)
  To: git

Generalize the function introduced in the preceding commit to not only
be able to approximate the number of loose objects, but to also provide
an accurate count. The behaviour can be toggled via a new flag.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/gc.c  |  5 +++--
 object-file.c | 58 +++++++++++++++++++++++++++++++++++++---------------------
 object-file.h |  5 +++--
 odb.h         |  9 +++++++++
 4 files changed, 52 insertions(+), 25 deletions(-)

diff --git a/builtin/gc.c b/builtin/gc.c
index a08c7554cb..3a64d28da8 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -474,8 +474,9 @@ static int too_many_loose_objects(int limit)
 	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
 	unsigned long loose_count;
 
-	if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
-						      &loose_count) < 0)
+	if (odb_source_loose_count_objects(the_repository->objects->sources,
+					   ODB_COUNT_OBJECTS_APPROXIMATE,
+					   &loose_count) < 0)
 		return 0;
 
 	return loose_count > auto_threshold;
diff --git a/object-file.c b/object-file.c
index da67e3c9ff..d35cec201f 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1868,40 +1868,56 @@ int odb_source_loose_for_each_object(struct odb_source *source,
 					     NULL, NULL, &data);
 }
 
-int odb_source_loose_approximate_object_count(struct odb_source *source,
-					      unsigned long *out)
+static int count_loose_object(const struct object_id *oid UNUSED,
+			      struct object_info *oi UNUSED,
+			      void *payload)
+{
+	unsigned long *count = payload;
+	(*count)++;
+	return 0;
+}
+
+int odb_source_loose_count_objects(struct odb_source *source,
+				   enum odb_count_objects_flags flags,
+				   unsigned long *out)
 {
 	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
-	unsigned long count = 0;
-	struct dirent *ent;
 	char *path = NULL;
 	DIR *dir = NULL;
 	int ret;
 
-	path = xstrfmt("%s/17", source->path);
+	if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) {
+		unsigned long count = 0;
+		struct dirent *ent;
 
-	dir = opendir(path);
-	if (!dir) {
-		if (errno == ENOENT) {
-			*out = 0;
-			ret = 0;
+		path = xstrfmt("%s/17", source->path);
+
+		dir = opendir(path);
+		if (!dir) {
+			if (errno == ENOENT) {
+				*out = 0;
+				ret = 0;
+				goto out;
+			}
+
+			ret = error_errno("cannot open object shard '%s'", path);
 			goto out;
 		}
 
-		ret = error_errno("cannot open object shard '%s'", path);
-		goto out;
-	}
+		while ((ent = readdir(dir)) != NULL) {
+			if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
+			    ent->d_name[hexsz] != '\0')
+				continue;
+			count++;
+		}
 
-	while ((ent = readdir(dir)) != NULL) {
-		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
-		    ent->d_name[hexsz] != '\0')
-			continue;
-		count++;
+		*out = count * 256;
+		ret = 0;
+	} else {
+		ret = odb_source_loose_for_each_object(source, NULL, count_loose_object,
+						       out, 0);
 	}
 
-	*out = count * 256;
-	ret = 0;
-
 out:
 	if (dir)
 		closedir(dir);
diff --git a/object-file.h b/object-file.h
index b870ea9fa8..f8d8805a18 100644
--- a/object-file.h
+++ b/object-file.h
@@ -149,8 +149,9 @@ int odb_source_loose_for_each_object(struct odb_source *source,
  *
  * Returns 0 on success, a negative error code otherwise.
  */
-int odb_source_loose_approximate_object_count(struct odb_source *source,
-					      unsigned long *out);
+int odb_source_loose_count_objects(struct odb_source *source,
+				   enum odb_count_objects_flags flags,
+				   unsigned long *out);
 
 /**
  * format_object_header() is a thin wrapper around s xsnprintf() that
diff --git a/odb.h b/odb.h
index 7a583e3873..e6057477f6 100644
--- a/odb.h
+++ b/odb.h
@@ -500,6 +500,15 @@ int odb_for_each_object(struct object_database *odb,
 			void *cb_data,
 			unsigned flags);
 
+enum odb_count_objects_flags {
+	/*
+	 * Instead of providing an accurate count, allow the number of objects
+	 * to be approximated. Details of how this approximation works are
+	 * subject to the specific source's implementation.
+	 */
+	ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0),
+};
+
 enum {
 	/*
 	 * By default, `odb_write_object()` does not actually write anything

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 5/6] odb/source: introduce generic object counting
  2026-03-10 15:18 [PATCH 0/6] odb: introduce generic object counting Patrick Steinhardt
                   ` (3 preceding siblings ...)
  2026-03-10 15:18 ` [PATCH 4/6] object-file: generalize counting objects Patrick Steinhardt
@ 2026-03-10 15:18 ` Patrick Steinhardt
  2026-03-10 17:51   ` Junio C Hamano
  2026-03-11 15:03   ` Toon Claes
  2026-03-10 15:18 ` [PATCH 6/6] odb: " Patrick Steinhardt
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
  6 siblings, 2 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-10 15:18 UTC (permalink / raw)
  To: git

Introduce generic object counting on the object database source level
with a new backend-specific callback function.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 odb/source-files.c | 30 ++++++++++++++++++++++++++++++
 odb/source.h       | 27 +++++++++++++++++++++++++++
 packfile.c         |  4 ++--
 packfile.h         |  1 +
 4 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/odb/source-files.c b/odb/source-files.c
index 14cb9adeca..c08d8993e3 100644
--- a/odb/source-files.c
+++ b/odb/source-files.c
@@ -93,6 +93,35 @@ static int odb_source_files_for_each_object(struct odb_source *source,
 	return 0;
 }
 
+static int odb_source_files_count_objects(struct odb_source *source,
+					  enum odb_count_objects_flags flags,
+					  unsigned long *out)
+{
+	struct odb_source_files *files = odb_source_files_downcast(source);
+	unsigned long count;
+	int ret;
+
+	ret = packfile_store_count_objects(files->packed, flags, &count);
+	if (ret < 0)
+		goto out;
+
+	if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) {
+		unsigned long loose_count;
+
+		ret = odb_source_loose_count_objects(source, flags, &loose_count);
+		if (ret < 0)
+			goto out;
+
+		count += loose_count;
+	}
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
+}
+
 static int odb_source_files_freshen_object(struct odb_source *source,
 					   const struct object_id *oid)
 {
@@ -220,6 +249,7 @@ struct odb_source_files *odb_source_files_new(struct object_database *odb,
 	files->base.read_object_info = odb_source_files_read_object_info;
 	files->base.read_object_stream = odb_source_files_read_object_stream;
 	files->base.for_each_object = odb_source_files_for_each_object;
+	files->base.count_objects = odb_source_files_count_objects;
 	files->base.freshen_object = odb_source_files_freshen_object;
 	files->base.write_object = odb_source_files_write_object;
 	files->base.write_object_stream = odb_source_files_write_object_stream;
diff --git a/odb/source.h b/odb/source.h
index a1fd9dd920..96c906e7a1 100644
--- a/odb/source.h
+++ b/odb/source.h
@@ -142,6 +142,21 @@ struct odb_source {
 			       void *cb_data,
 			       unsigned flags);
 
+	/*
+	 * This callback is expected to count objects in the given object
+	 * database source. The callback function does not have to guarantee
+	 * that only unique objects are counted. The result shall be assigned
+	 * to the `out` pointer.
+	 *
+	 * Accepts `enum odb_count_objects_flag` flags to alter the behaviour.
+	 *
+	 * The callback is expected to return 0 on success, or a negative error
+	 * code otherwise.
+	 */
+	int (*count_objects)(struct odb_source *source,
+			     enum odb_count_objects_flags flags,
+			     unsigned long *out);
+
 	/*
 	 * This callback is expected to freshen the given object so that its
 	 * last access time is set to the current time. This is used to ensure
@@ -333,6 +348,18 @@ static inline int odb_source_for_each_object(struct odb_source *source,
 	return source->for_each_object(source, request, cb, cb_data, flags);
 }
 
+/*
+ * Count the number of objects in the given object database source.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static inline int odb_source_count_objects(struct odb_source *source,
+					   enum odb_count_objects_flags flags,
+					   unsigned long *out)
+{
+	return source->count_objects(source, flags, out);
+}
+
 /*
  * Freshen an object in the object database by updating its timestamp.
  * Returns 1 in case the object has been freshened, 0 in case the object does
diff --git a/packfile.c b/packfile.c
index 1ee5dd3da3..8ee462303a 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1102,6 +1102,7 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
 }
 
 int packfile_store_count_objects(struct packfile_store *store,
+				 enum odb_count_objects_flags flags UNUSED,
 				 unsigned long *out)
 {
 	struct packfile_list_entry *e;
@@ -1146,10 +1147,9 @@ unsigned long repo_approximate_object_count(struct repository *r)
 
 		odb_prepare_alternates(r->objects);
 		for (source = r->objects->sources; source; source = source->next) {
-			struct odb_source_files *files = odb_source_files_downcast(source);
 			unsigned long c;
 
-			if (!packfile_store_count_objects(files->packed, &c))
+			if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c))
 				count += c;
 		}
 
diff --git a/packfile.h b/packfile.h
index 1da8c729cb..74b6bc58c5 100644
--- a/packfile.h
+++ b/packfile.h
@@ -275,6 +275,7 @@ enum kept_pack_type {
  * Return 0 on success, a negative error code otherwise.
  */
 int packfile_store_count_objects(struct packfile_store *store,
+				 enum odb_count_objects_flags flags,
 				 unsigned long *out);
 
 /*

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 6/6] odb: introduce generic object counting
  2026-03-10 15:18 [PATCH 0/6] odb: introduce generic object counting Patrick Steinhardt
                   ` (4 preceding siblings ...)
  2026-03-10 15:18 ` [PATCH 5/6] odb/source: introduce generic object counting Patrick Steinhardt
@ 2026-03-10 15:18 ` Patrick Steinhardt
  2026-03-11 15:30   ` Toon Claes
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
  6 siblings, 1 reply; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-10 15:18 UTC (permalink / raw)
  To: git

Similar to the preceding commit, introduce counting of objects on the
object database level, replacing the logic that we have in
`repo_approximate_object_count()`.

Note that the function knows to cache the object count. It's unclear
whether this cache is really required as we shouldn't have that many
cases where we count objects repeatedly. But to be on the safe side the
caching mechanism is retained, with the only excepting being that we
also have to use the passed flags as caching key.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/gc.c   |  6 +++++-
 commit-graph.c |  3 ++-
 object-name.c  |  6 +++++-
 odb.c          | 37 ++++++++++++++++++++++++++++++++++++-
 odb.h          | 17 +++++++++++++++--
 packfile.c     | 27 ---------------------------
 packfile.h     |  6 ------
 7 files changed, 63 insertions(+), 39 deletions(-)

diff --git a/builtin/gc.c b/builtin/gc.c
index 3a64d28da8..cb9ca89a97 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -574,9 +574,13 @@ static uint64_t total_ram(void)
 static uint64_t estimate_repack_memory(struct gc_config *cfg,
 				       struct packed_git *pack)
 {
-	unsigned long nr_objects = repo_approximate_object_count(the_repository);
+	unsigned long nr_objects;
 	size_t os_cache, heap;
 
+	if (odb_count_objects(the_repository->objects,
+			      ODB_COUNT_OBJECTS_APPROXIMATE, &nr_objects) < 0)
+		return 0;
+
 	if (!pack || !nr_objects)
 		return 0;
 
diff --git a/commit-graph.c b/commit-graph.c
index f8e24145a5..c030003330 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -2607,7 +2607,8 @@ int write_commit_graph(struct odb_source *source,
 			replace = ctx.opts->split_flags & COMMIT_GRAPH_SPLIT_REPLACE;
 	}
 
-	ctx.approx_nr_objects = repo_approximate_object_count(r);
+	if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &ctx.approx_nr_objects) < 0)
+		ctx.approx_nr_objects = 0;
 
 	if (ctx.append && g) {
 		for (i = 0; i < g->num_commits; i++) {
diff --git a/object-name.c b/object-name.c
index 7b14c3bf9b..e5adec4c9d 100644
--- a/object-name.c
+++ b/object-name.c
@@ -837,7 +837,11 @@ int repo_find_unique_abbrev_r(struct repository *r, char *hex,
 	const unsigned hexsz = algo->hexsz;
 
 	if (len < 0) {
-		unsigned long count = repo_approximate_object_count(r);
+		unsigned long count;
+
+		if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &count) < 0)
+			count = 0;
+
 		/*
 		 * Add one because the MSB only tells us the highest bit set,
 		 * not including the value of all the _other_ bits (so "15"
diff --git a/odb.c b/odb.c
index 84a31084d3..350e23f3c0 100644
--- a/odb.c
+++ b/odb.c
@@ -917,6 +917,41 @@ int odb_for_each_object(struct object_database *odb,
 	return 0;
 }
 
+int odb_count_objects(struct object_database *odb,
+		      enum odb_count_objects_flags flags,
+		      unsigned long *out)
+{
+	struct odb_source *source;
+	unsigned long count = 0;
+	int ret;
+
+	if (odb->object_count_valid && odb->object_count_flags == flags) {
+		*out = odb->object_count;
+		return 0;
+	}
+
+	odb_prepare_alternates(odb);
+	for (source = odb->sources; source; source = source->next) {
+		unsigned long c;
+
+		ret = odb_source_count_objects(source, flags, &c);
+		if (ret < 0)
+			goto out;
+
+		count += c;
+	}
+
+	odb->object_count = count;
+	odb->object_count_valid = 1;
+	odb->object_count_flags = flags;
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
+}
+
 void odb_assert_oid_type(struct object_database *odb,
 			 const struct object_id *oid, enum object_type expect)
 {
@@ -1030,7 +1065,7 @@ void odb_reprepare(struct object_database *o)
 	for (source = o->sources; source; source = source->next)
 		odb_source_reprepare(source);
 
-	o->approximate_object_count_valid = 0;
+	o->object_count_valid = 0;
 
 	obj_read_unlock();
 }
diff --git a/odb.h b/odb.h
index e6057477f6..7b004f1cf4 100644
--- a/odb.h
+++ b/odb.h
@@ -112,8 +112,9 @@ struct object_database {
 	 * These two fields are not meant for direct access. Use
 	 * repo_approximate_object_count() instead.
 	 */
-	unsigned long approximate_object_count;
-	unsigned approximate_object_count_valid : 1;
+	unsigned long object_count;
+	unsigned object_count_flags;
+	unsigned object_count_valid : 1;
 
 	/*
 	 * Submodule source paths that will be added as additional sources to
@@ -509,6 +510,18 @@ enum odb_count_objects_flags {
 	ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0),
 };
 
+/*
+ * Count the number of objects in the given object database. This object count
+ * may double-count objects that are stored in multiple backends, or which are
+ * stored multiple times in a single backend.
+ *
+ * Returns 0 on success, a negative error code otherwise. The number of objects
+ * will be assigned to the `out` pointer on success.
+ */
+int odb_count_objects(struct object_database *odb,
+		      enum odb_count_objects_flags flags,
+		      unsigned long *out);
+
 enum {
 	/*
 	 * By default, `odb_write_object()` does not actually write anything
diff --git a/packfile.c b/packfile.c
index 8ee462303a..d4de9f3ffe 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1132,33 +1132,6 @@ int packfile_store_count_objects(struct packfile_store *store,
 	return ret;
 }
 
-/*
- * Give a fast, rough count of the number of objects in the repository. This
- * ignores loose objects completely. If you have a lot of them, then either
- * you should repack because your performance will be awful, or they are
- * all unreachable objects about to be pruned, in which case they're not really
- * interesting as a measure of repo size in the first place.
- */
-unsigned long repo_approximate_object_count(struct repository *r)
-{
-	if (!r->objects->approximate_object_count_valid) {
-		struct odb_source *source;
-		unsigned long count = 0;
-
-		odb_prepare_alternates(r->objects);
-		for (source = r->objects->sources; source; source = source->next) {
-			unsigned long c;
-
-			if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c))
-				count += c;
-		}
-
-		r->objects->approximate_object_count = count;
-		r->objects->approximate_object_count_valid = 1;
-	}
-	return r->objects->approximate_object_count;
-}
-
 unsigned long unpack_object_header_buffer(const unsigned char *buf,
 		unsigned long len, enum object_type *type, unsigned long *sizep)
 {
diff --git a/packfile.h b/packfile.h
index 74b6bc58c5..a16ec3950d 100644
--- a/packfile.h
+++ b/packfile.h
@@ -375,12 +375,6 @@ int packfile_store_for_each_object(struct packfile_store *store,
 #define PACKDIR_FILE_GARBAGE 4
 extern void (*report_garbage)(unsigned seen_bits, const char *path);
 
-/*
- * Give a rough count of objects in the repository. This sacrifices accuracy
- * for speed.
- */
-unsigned long repo_approximate_object_count(struct repository *r);
-
 void pack_report(struct repository *repo);
 
 /*

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/6] object-file: extract logic to approximate object count
  2026-03-10 15:18 ` [PATCH 3/6] object-file: extract logic to approximate object count Patrick Steinhardt
@ 2026-03-10 17:44   ` Junio C Hamano
  2026-03-11 12:47   ` Toon Claes
  1 sibling, 0 replies; 27+ messages in thread
From: Junio C Hamano @ 2026-03-10 17:44 UTC (permalink / raw)
  To: Patrick Steinhardt; +Cc: git

Patrick Steinhardt <ps@pks.im> writes:

>  static int too_many_loose_objects(int limit)
>  {
> ...
> +	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
> +	unsigned long loose_count;
> +
> +	if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
> +						      &loose_count) < 0)
>  		return 0;
>  
> -	auto_threshold = DIV_ROUND_UP(limit, 256);
> -	while ((ent = readdir(dir)) != NULL) {
> -		if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose ||
> -		    ent->d_name[hexsz_loose] != '\0')
> -			continue;
> -		if (++num_loose > auto_threshold) {
> -			needed = 1;
> -			break;
> -		}
> -	}
> -	closedir(dir);
> -	return needed;
> +	return loose_count > auto_threshold;
>  }

We used to sample one shared directory and stopped when we know we
have more than auto_threshold, which is roughly 1/256 of the given
limit.  Now, we ask "approximate" function to count and then compare
the result with the same auto_threshold (i.e., 1/256 of the given
limit), which means we expect approximate function to count only
1/256 of the total loose objects somehow?  Let's keep reading.

>  static struct packed_git *find_base_packs(struct string_list *packs,
> diff --git a/object-file.c b/object-file.c
> index a3ff7f586c..da67e3c9ff 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source,
>  					     NULL, NULL, &data);
>  }
>  
> +int odb_source_loose_approximate_object_count(struct odb_source *source,
> +					      unsigned long *out)
> +{
> +	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
> +	unsigned long count = 0;
> +	struct dirent *ent;
> +	char *path = NULL;
> +	DIR *dir = NULL;
> +	int ret;
> +
> +	path = xstrfmt("%s/17", source->path);
> +
> +	dir = opendir(path);
> +	if (!dir) {
> +		if (errno == ENOENT) {
> +			*out = 0;
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		ret = error_errno("cannot open object shard '%s'", path);
> +		goto out;
> +	}
> +
> +	while ((ent = readdir(dir)) != NULL) {
> +		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
> +		    ent->d_name[hexsz] != '\0')
> +			continue;
> +		count++;
> +	}

This counts one shared ("17" that is randomly picked) fully and then ...

> +	*out = count * 256;

... estimate that the entire world would probably have 256 times as
many as the objects in that one shared.

Ah, my earlier read of the caller was confused.  auto_threshold used
to be 1/256 of the limit, but now the number used is computed in a
strange arithmetic, "DIV_ROUND_UP(limit,256) * 256".  Not directly
using "limit" fooled me into thinking that it somehow kept using the
same 1/256 of the limit.

So we are answering "do we have too many?" question using roughly
the same criteria as before, not 1/256 off as I suspected earlier.

The old implementation exited early as soon as the threshold was
hit.  While scanning a single shard directory is likely fast enough
that this may not matter in practice, it is a slight change in
behaviour. If a repository has an extremely large number of loose
objects (e.g. tens of thousands in shard 17), this will now count
all of them instead of stopping at ~30 (if the limit set to around
7000 objects).

Given that this is an "auto" GC check, the performance difference is
probably negligible, but I thought it worth pointing out.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/6] odb/source: introduce generic object counting
  2026-03-10 15:18 ` [PATCH 5/6] odb/source: introduce generic object counting Patrick Steinhardt
@ 2026-03-10 17:51   ` Junio C Hamano
  2026-03-11  6:44     ` Patrick Steinhardt
  2026-03-11 15:03   ` Toon Claes
  1 sibling, 1 reply; 27+ messages in thread
From: Junio C Hamano @ 2026-03-10 17:51 UTC (permalink / raw)
  To: Patrick Steinhardt; +Cc: git

Patrick Steinhardt <ps@pks.im> writes:

> +static int odb_source_files_count_objects(struct odb_source *source,
> +					  enum odb_count_objects_flags flags,
> +					  unsigned long *out)
> +{
> +	struct odb_source_files *files = odb_source_files_downcast(source);
> +	unsigned long count;
> +	int ret;
> +
> +	ret = packfile_store_count_objects(files->packed, flags, &count);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) {
> +		unsigned long loose_count;
> +
> +		ret = odb_source_loose_count_objects(source, flags, &loose_count);
> +		if (ret < 0)
> +			goto out;
> +
> +		count += loose_count;
> +	}
> +
> +	*out = count;
> +	ret = 0;
> +
> +out:
> +	return ret;
> +}

The design to assume that the majority of objects should be in the
packfiles and the number of loose objects can be ignored when we are
getting approximation is inherited from the world before this
series, I think, which is a valid choice for this series to make.

As your "get an approximate count of loose objects" counts a single
shared fully, instead of punting as soon as the limit is hit, we
could ask that function and add it in when the APPROXIMATE flag is
passed, and get a bit more accurate number cheaply even when we are
approximating.  I am not sure what the pros and cons of doing so
myself, but you may already have thought about it and rejected it,
perhaps?

Thanks for a pleasant read.
Queued.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/6] odb/source: introduce generic object counting
  2026-03-10 17:51   ` Junio C Hamano
@ 2026-03-11  6:44     ` Patrick Steinhardt
  0 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-11  6:44 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Tue, Mar 10, 2026 at 10:51:33AM -0700, Junio C Hamano wrote:
> Patrick Steinhardt <ps@pks.im> writes:
> 
> > +static int odb_source_files_count_objects(struct odb_source *source,
> > +					  enum odb_count_objects_flags flags,
> > +					  unsigned long *out)
> > +{
> > +	struct odb_source_files *files = odb_source_files_downcast(source);
> > +	unsigned long count;
> > +	int ret;
> > +
> > +	ret = packfile_store_count_objects(files->packed, flags, &count);
> > +	if (ret < 0)
> > +		goto out;
> > +
> > +	if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) {
> > +		unsigned long loose_count;
> > +
> > +		ret = odb_source_loose_count_objects(source, flags, &loose_count);
> > +		if (ret < 0)
> > +			goto out;
> > +
> > +		count += loose_count;
> > +	}
> > +
> > +	*out = count;
> > +	ret = 0;
> > +
> > +out:
> > +	return ret;
> > +}
> 
> The design to assume that the majority of objects should be in the
> packfiles and the number of loose objects can be ignored when we are
> getting approximation is inherited from the world before this
> series, I think, which is a valid choice for this series to make.
> 
> As your "get an approximate count of loose objects" counts a single
> shared fully, instead of punting as soon as the limit is hit, we
> could ask that function and add it in when the APPROXIMATE flag is
> passed, and get a bit more accurate number cheaply even when we are
> approximating.  I am not sure what the pros and cons of doing so
> myself, but you may already have thought about it and rejected it,
> perhaps?

Yeah, I was very torn on this myself, and I switched back and forth
multiple times. I don't really think there is a large downside if we
started to also count loose objects here. The performance overhead
should be negligible, and it may arrive at a result that is closer to
the real world.

But the loose object counting approximation is somewhat vague overall
with the way we extrapolate the object count, and as you mentioned it
matches the old semantics to not include loose objects. So that's why I
decided against including it, so that we retain semantics.

That being said, with the current set of users it doesn't really matter
too much which of both approaches we pick.

Patrick

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/6] packfile: extract logic to count number of objects
  2026-03-10 15:18 ` [PATCH 2/6] packfile: extract logic to count number of objects Patrick Steinhardt
@ 2026-03-11 12:41   ` Toon Claes
  2026-03-11 13:55     ` Patrick Steinhardt
  0 siblings, 1 reply; 27+ messages in thread
From: Toon Claes @ 2026-03-11 12:41 UTC (permalink / raw)
  To: Patrick Steinhardt, git

Patrick Steinhardt <ps@pks.im> writes:

> In a subsequent commit we're about to introduce a new
> `odb_source_count_objects()` function so that we can make the logic
> pluggable. Prepare for this change by extracting the logic that we have
> to count packed objects into a standalone function.
>
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  packfile.c | 45 +++++++++++++++++++++++++++++++++++----------
>  packfile.h |  9 +++++++++
>  2 files changed, 44 insertions(+), 10 deletions(-)
>
> diff --git a/packfile.c b/packfile.c
> index 215a23e42b..1ee5dd3da3 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -1101,6 +1101,36 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
>  	return store->packs.head;
>  }
>  
> +int packfile_store_count_objects(struct packfile_store *store,
> +				 unsigned long *out)
> +{
> +	struct packfile_list_entry *e;
> +	struct multi_pack_index *m;
> +	unsigned long count = 0;
> +	int ret;
> +
> +	m = get_multi_pack_index(store->source);
> +	if (m)
> +		count += m->num_objects + m->num_objects_in_base;

To make sure I understand correctly:

`m->num_objects` indicates how many objects are in the current pack, and
`m->num_objects_in_base` how many are in it's base (and that accumulates
what's in the bases of the base?).

> +	for (e = packfile_store_get_packs(store); e; e = e->next) {
> +		if (e->pack->multi_pack_index)
> +			continue;

Because we added the count through the midx already, we skip any
packfile that's included in the midx.

But some packfiles are not in the midx so we fall through for those.

Makes sense.


-- 
Cheers,
Toon

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/6] object-file: extract logic to approximate object count
  2026-03-10 15:18 ` [PATCH 3/6] object-file: extract logic to approximate object count Patrick Steinhardt
  2026-03-10 17:44   ` Junio C Hamano
@ 2026-03-11 12:47   ` Toon Claes
  2026-03-11 13:58     ` Patrick Steinhardt
  1 sibling, 1 reply; 27+ messages in thread
From: Toon Claes @ 2026-03-11 12:47 UTC (permalink / raw)
  To: Patrick Steinhardt, git

Patrick Steinhardt <ps@pks.im> writes:

> In "builtin/gc.c" we have some logic that checks whether we need to
> repack objects. This is done by counting the number of objects that we
> have and checking whether it exceeds a certain threshold. We don't
> really need an accurate object count though, which is why we only
> open a single object diretcroy shard and then extrapolate from there.

s/diretcroy/directory/

>
> Extract this logic into a new function that is owned by the loose object
> database source. This is done to prepare for a subsequent change, where
> we'll introduce object counting on the object database source level.
>
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  builtin/gc.c  | 37 +++++++++----------------------------
>  object-file.c | 41 +++++++++++++++++++++++++++++++++++++++++
>  object-file.h | 13 +++++++++++++
>  3 files changed, 63 insertions(+), 28 deletions(-)
>
> diff --git a/builtin/gc.c b/builtin/gc.c
> index fb329c2cff..a08c7554cb 100644
> --- a/builtin/gc.c
> +++ b/builtin/gc.c
> @@ -467,37 +467,18 @@ static int rerere_gc_condition(struct gc_config *cfg UNUSED)
>  static int too_many_loose_objects(int limit)
>  {
>  	/*
> -	 * Quickly check if a "gc" is needed, by estimating how
> -	 * many loose objects there are.  Because SHA-1 is evenly
> -	 * distributed, we can check only one and get a reasonable
> -	 * estimate.
> +	 * This is weird, but stems from legacy behaviour: the GC auto
> +	 * threshold was always essentially interpreted as if it was rounded up
> +	 * to the next multiple 256 of, so we retain this behaviour for now.
>  	 */
> -	DIR *dir;
> -	struct dirent *ent;
> -	int auto_threshold;
> -	int num_loose = 0;
> -	int needed = 0;
> -	const unsigned hexsz_loose = the_hash_algo->hexsz - 2;
> -	char *path;
> -
> -	path = repo_git_path(the_repository, "objects/17");
> -	dir = opendir(path);
> -	free(path);
> -	if (!dir)
> +	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
> +	unsigned long loose_count;
> +
> +	if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
> +						      &loose_count) < 0)
>  		return 0;
>  
> -	auto_threshold = DIV_ROUND_UP(limit, 256);
> -	while ((ent = readdir(dir)) != NULL) {
> -		if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose ||
> -		    ent->d_name[hexsz_loose] != '\0')
> -			continue;
> -		if (++num_loose > auto_threshold) {
> -			needed = 1;
> -			break;
> -		}
> -	}
> -	closedir(dir);
> -	return needed;
> +	return loose_count > auto_threshold;
>  }
>  
>  static struct packed_git *find_base_packs(struct string_list *packs,
> diff --git a/object-file.c b/object-file.c
> index a3ff7f586c..da67e3c9ff 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source,
>  					     NULL, NULL, &data);
>  }
>  
> +int odb_source_loose_approximate_object_count(struct odb_source *source,
> +					      unsigned long *out)
> +{
> +	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
> +	unsigned long count = 0;
> +	struct dirent *ent;
> +	char *path = NULL;
> +	DIR *dir = NULL;
> +	int ret;
> +
> +	path = xstrfmt("%s/17", source->path);
> +
> +	dir = opendir(path);
> +	if (!dir) {
> +		if (errno == ENOENT) {
> +			*out = 0;
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		ret = error_errno("cannot open object shard '%s'", path);
> +		goto out;
> +	}
> +
> +	while ((ent = readdir(dir)) != NULL) {
> +		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
> +		    ent->d_name[hexsz] != '\0')
> +			continue;
> +		count++;
> +	}
> +
> +	*out = count * 256;

This makes the number way larger, but I don't think we need to worry
getting anywhere near ULONG_MAX, because I would expect to have Git
coming to a grind way before that happens (not to mention filesystems
would get unhappy about it too).

-- 
Cheers,
Toon

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] object-file: generalize counting objects
  2026-03-10 15:18 ` [PATCH 4/6] object-file: generalize counting objects Patrick Steinhardt
@ 2026-03-11 13:53   ` Toon Claes
  2026-03-11 14:01     ` Patrick Steinhardt
  0 siblings, 1 reply; 27+ messages in thread
From: Toon Claes @ 2026-03-11 13:53 UTC (permalink / raw)
  To: Patrick Steinhardt, git

Patrick Steinhardt <ps@pks.im> writes:

> Generalize the function introduced in the preceding commit to not only
> be able to approximate the number of loose objects, but to also provide
> an accurate count. The behaviour can be toggled via a new flag.
>
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  builtin/gc.c  |  5 +++--
>  object-file.c | 58 +++++++++++++++++++++++++++++++++++++---------------------
>  object-file.h |  5 +++--
>  odb.h         |  9 +++++++++
>  4 files changed, 52 insertions(+), 25 deletions(-)
>
> diff --git a/builtin/gc.c b/builtin/gc.c
> index a08c7554cb..3a64d28da8 100644
> --- a/builtin/gc.c
> +++ b/builtin/gc.c
> @@ -474,8 +474,9 @@ static int too_many_loose_objects(int limit)
>  	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
>  	unsigned long loose_count;
>  
> -	if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
> -						      &loose_count) < 0)
> +	if (odb_source_loose_count_objects(the_repository->objects->sources,
> +					   ODB_COUNT_OBJECTS_APPROXIMATE,
> +					   &loose_count) < 0)
>  		return 0;
>  
>  	return loose_count > auto_threshold;
> diff --git a/object-file.c b/object-file.c
> index da67e3c9ff..d35cec201f 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1868,40 +1868,56 @@ int odb_source_loose_for_each_object(struct odb_source *source,
>  					     NULL, NULL, &data);
>  }
>  
> -int odb_source_loose_approximate_object_count(struct odb_source *source,
> -					      unsigned long *out)
> +static int count_loose_object(const struct object_id *oid UNUSED,
> +			      struct object_info *oi UNUSED,
> +			      void *payload)
> +{
> +	unsigned long *count = payload;
> +	(*count)++;
> +	return 0;
> +}
> +
> +int odb_source_loose_count_objects(struct odb_source *source,
> +				   enum odb_count_objects_flags flags,
> +				   unsigned long *out)
>  {
>  	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
> -	unsigned long count = 0;
> -	struct dirent *ent;
>  	char *path = NULL;
>  	DIR *dir = NULL;
>  	int ret;
>  
> -	path = xstrfmt("%s/17", source->path);
> +	if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) {
> +		unsigned long count = 0;
> +		struct dirent *ent;
>  
> -	dir = opendir(path);
> -	if (!dir) {
> -		if (errno == ENOENT) {
> -			*out = 0;
> -			ret = 0;
> +		path = xstrfmt("%s/17", source->path);
> +
> +		dir = opendir(path);
> +		if (!dir) {
> +			if (errno == ENOENT) {
> +				*out = 0;
> +				ret = 0;
> +				goto out;
> +			}
> +
> +			ret = error_errno("cannot open object shard '%s'", path);
>  			goto out;
>  		}
>  
> -		ret = error_errno("cannot open object shard '%s'", path);
> -		goto out;
> -	}
> +		while ((ent = readdir(dir)) != NULL) {
> +			if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
> +			    ent->d_name[hexsz] != '\0')
> +				continue;
> +			count++;
> +		}
>  
> -	while ((ent = readdir(dir)) != NULL) {
> -		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
> -		    ent->d_name[hexsz] != '\0')
> -			continue;
> -		count++;
> +		*out = count * 256;
> +		ret = 0;
> +	} else {
> +		ret = odb_source_loose_for_each_object(source, NULL, count_loose_object,
> +						       out, 0);

Isn't `*out` uninitialized here? Should we add `*out = 0;` before this
line?

>  	}
>  
> -	*out = count * 256;
> -	ret = 0;
> -
>  out:
>  	if (dir)
>  		closedir(dir);
> diff --git a/object-file.h b/object-file.h
> index b870ea9fa8..f8d8805a18 100644
> --- a/object-file.h
> +++ b/object-file.h
> @@ -149,8 +149,9 @@ int odb_source_loose_for_each_object(struct odb_source *source,
>   *
>   * Returns 0 on success, a negative error code otherwise.
>   */
> -int odb_source_loose_approximate_object_count(struct odb_source *source,
> -					      unsigned long *out);
> +int odb_source_loose_count_objects(struct odb_source *source,
> +				   enum odb_count_objects_flags flags,
> +				   unsigned long *out);
>  
>  /**
>   * format_object_header() is a thin wrapper around s xsnprintf() that
> diff --git a/odb.h b/odb.h
> index 7a583e3873..e6057477f6 100644
> --- a/odb.h
> +++ b/odb.h
> @@ -500,6 +500,15 @@ int odb_for_each_object(struct object_database *odb,
>  			void *cb_data,
>  			unsigned flags);
>  
> +enum odb_count_objects_flags {
> +	/*
> +	 * Instead of providing an accurate count, allow the number of objects
> +	 * to be approximated. Details of how this approximation works are
> +	 * subject to the specific source's implementation.
> +	 */
> +	ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0),
> +};
> +
>  enum {
>  	/*
>  	 * By default, `odb_write_object()` does not actually write anything
>
> -- 
> 2.53.0.880.g73c4285caa.dirty
>
>

-- 
Cheers,
Toon

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/6] packfile: extract logic to count number of objects
  2026-03-11 12:41   ` Toon Claes
@ 2026-03-11 13:55     ` Patrick Steinhardt
  0 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-11 13:55 UTC (permalink / raw)
  To: Toon Claes; +Cc: git

On Wed, Mar 11, 2026 at 01:41:05PM +0100, Toon Claes wrote:
> Patrick Steinhardt <ps@pks.im> writes:
> 
> > In a subsequent commit we're about to introduce a new
> > `odb_source_count_objects()` function so that we can make the logic
> > pluggable. Prepare for this change by extracting the logic that we have
> > to count packed objects into a standalone function.
> >
> > Signed-off-by: Patrick Steinhardt <ps@pks.im>
> > ---
> >  packfile.c | 45 +++++++++++++++++++++++++++++++++++----------
> >  packfile.h |  9 +++++++++
> >  2 files changed, 44 insertions(+), 10 deletions(-)
> >
> > diff --git a/packfile.c b/packfile.c
> > index 215a23e42b..1ee5dd3da3 100644
> > --- a/packfile.c
> > +++ b/packfile.c
> > @@ -1101,6 +1101,36 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
> >  	return store->packs.head;
> >  }
> >  
> > +int packfile_store_count_objects(struct packfile_store *store,
> > +				 unsigned long *out)
> > +{
> > +	struct packfile_list_entry *e;
> > +	struct multi_pack_index *m;
> > +	unsigned long count = 0;
> > +	int ret;
> > +
> > +	m = get_multi_pack_index(store->source);
> > +	if (m)
> > +		count += m->num_objects + m->num_objects_in_base;
> 
> To make sure I understand correctly:
> 
> `m->num_objects` indicates how many objects are in the current pack, and
> `m->num_objects_in_base` how many are in it's base (and that accumulates
> what's in the bases of the base?).

Yup, that's exactly right, `num_objects_in_base` is basically recursive
across the bitmap layers.

> > +	for (e = packfile_store_get_packs(store); e; e = e->next) {
> > +		if (e->pack->multi_pack_index)
> > +			continue;
> 
> Because we added the count through the midx already, we skip any
> packfile that's included in the midx.
> 
> But some packfiles are not in the midx so we fall through for those.
> 
> Makes sense.

Correct.

Patrick

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/6] object-file: extract logic to approximate object count
  2026-03-11 12:47   ` Toon Claes
@ 2026-03-11 13:58     ` Patrick Steinhardt
  0 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-11 13:58 UTC (permalink / raw)
  To: Toon Claes; +Cc: git

On Wed, Mar 11, 2026 at 01:47:13PM +0100, Toon Claes wrote:
> Patrick Steinhardt <ps@pks.im> writes:
> 
> > In "builtin/gc.c" we have some logic that checks whether we need to
> > repack objects. This is done by counting the number of objects that we
> > have and checking whether it exceeds a certain threshold. We don't
> > really need an accurate object count though, which is why we only
> > open a single object diretcroy shard and then extrapolate from there.
> 
> s/diretcroy/directory/

Thanks, fixed locally.

> > diff --git a/object-file.c b/object-file.c
> > index a3ff7f586c..da67e3c9ff 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source,
> >  					     NULL, NULL, &data);
> >  }
> >  
> > +int odb_source_loose_approximate_object_count(struct odb_source *source,
> > +					      unsigned long *out)
> > +{
> > +	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
> > +	unsigned long count = 0;
> > +	struct dirent *ent;
> > +	char *path = NULL;
> > +	DIR *dir = NULL;
> > +	int ret;
> > +
> > +	path = xstrfmt("%s/17", source->path);
> > +
> > +	dir = opendir(path);
> > +	if (!dir) {
> > +		if (errno == ENOENT) {
> > +			*out = 0;
> > +			ret = 0;
> > +			goto out;
> > +		}
> > +
> > +		ret = error_errno("cannot open object shard '%s'", path);
> > +		goto out;
> > +	}
> > +
> > +	while ((ent = readdir(dir)) != NULL) {
> > +		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
> > +		    ent->d_name[hexsz] != '\0')
> > +			continue;
> > +		count++;
> > +	}
> > +
> > +	*out = count * 256;
> 
> This makes the number way larger, but I don't think we need to worry
> getting anywhere near ULONG_MAX, because I would expect to have Git
> coming to a grind way before that happens (not to mention filesystems
> would get unhappy about it too).

Yup. Even if `unsigned long` was 32 bits that would be >128 million
loose objects in a single directory. I agree that this is probably going
to make some things in Git unhappy. So we could have overflow checks
here, but I'm not sure it's worth it.

Thanks!

Patrick

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/6] object-file: generalize counting objects
  2026-03-11 13:53   ` Toon Claes
@ 2026-03-11 14:01     ` Patrick Steinhardt
  0 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-11 14:01 UTC (permalink / raw)
  To: Toon Claes; +Cc: git

On Wed, Mar 11, 2026 at 02:53:20PM +0100, Toon Claes wrote:
> Patrick Steinhardt <ps@pks.im> writes:
> > diff --git a/object-file.c b/object-file.c
> > index da67e3c9ff..d35cec201f 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1868,40 +1868,56 @@ int odb_source_loose_for_each_object(struct odb_source *source,
> >  					     NULL, NULL, &data);
> >  }
> >  
> > -int odb_source_loose_approximate_object_count(struct odb_source *source,
> > -					      unsigned long *out)
> > +static int count_loose_object(const struct object_id *oid UNUSED,
> > +			      struct object_info *oi UNUSED,
> > +			      void *payload)
> > +{
> > +	unsigned long *count = payload;
> > +	(*count)++;
> > +	return 0;
> > +}
> > +
> > +int odb_source_loose_count_objects(struct odb_source *source,
> > +				   enum odb_count_objects_flags flags,
> > +				   unsigned long *out)
> >  {
> >  	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
> > -	unsigned long count = 0;
> > -	struct dirent *ent;
> >  	char *path = NULL;
> >  	DIR *dir = NULL;
> >  	int ret;
> >  
> > -	path = xstrfmt("%s/17", source->path);
> > +	if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) {
> > +		unsigned long count = 0;
> > +		struct dirent *ent;
> >  
> > -	dir = opendir(path);
> > -	if (!dir) {
> > -		if (errno == ENOENT) {
> > -			*out = 0;
> > -			ret = 0;
> > +		path = xstrfmt("%s/17", source->path);
> > +
> > +		dir = opendir(path);
> > +		if (!dir) {
> > +			if (errno == ENOENT) {
> > +				*out = 0;
> > +				ret = 0;
> > +				goto out;
> > +			}
> > +
> > +			ret = error_errno("cannot open object shard '%s'", path);
> >  			goto out;
> >  		}
> >  
> > -		ret = error_errno("cannot open object shard '%s'", path);
> > -		goto out;
> > -	}
> > +		while ((ent = readdir(dir)) != NULL) {
> > +			if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
> > +			    ent->d_name[hexsz] != '\0')
> > +				continue;
> > +			count++;
> > +		}
> >  
> > -	while ((ent = readdir(dir)) != NULL) {
> > -		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
> > -		    ent->d_name[hexsz] != '\0')
> > -			continue;
> > -		count++;
> > +		*out = count * 256;
> > +		ret = 0;
> > +	} else {
> > +		ret = odb_source_loose_for_each_object(source, NULL, count_loose_object,
> > +						       out, 0);
> 
> Isn't `*out` uninitialized here? Should we add `*out = 0;` before this
> line?

Oh, indeed. Will fix, thanks!

Patrick

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/6] odb/source: introduce generic object counting
  2026-03-10 15:18 ` [PATCH 5/6] odb/source: introduce generic object counting Patrick Steinhardt
  2026-03-10 17:51   ` Junio C Hamano
@ 2026-03-11 15:03   ` Toon Claes
  1 sibling, 0 replies; 27+ messages in thread
From: Toon Claes @ 2026-03-11 15:03 UTC (permalink / raw)
  To: Patrick Steinhardt, git

Patrick Steinhardt <ps@pks.im> writes:

> Introduce generic object counting on the object database source level
> with a new backend-specific callback function.
>
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  odb/source-files.c | 30 ++++++++++++++++++++++++++++++
>  odb/source.h       | 27 +++++++++++++++++++++++++++
>  packfile.c         |  4 ++--
>  packfile.h         |  1 +
>  4 files changed, 60 insertions(+), 2 deletions(-)
>
> diff --git a/odb/source-files.c b/odb/source-files.c
> index 14cb9adeca..c08d8993e3 100644
> --- a/odb/source-files.c
> +++ b/odb/source-files.c
> @@ -93,6 +93,35 @@ static int odb_source_files_for_each_object(struct odb_source *source,
>  	return 0;
>  }
>  
> +static int odb_source_files_count_objects(struct odb_source *source,
> +					  enum odb_count_objects_flags flags,
> +					  unsigned long *out)
> +{
> +	struct odb_source_files *files = odb_source_files_downcast(source);

I didn't read the other series this depends on, but good to see
odb_source_files_downcast() BUGs when the source isn't a 'files'
source.

> +	unsigned long count;
> +	int ret;
> +
> +	ret = packfile_store_count_objects(files->packed, flags, &count);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) {
> +		unsigned long loose_count;
> +
> +		ret = odb_source_loose_count_objects(source, flags, &loose_count);
> +		if (ret < 0)
> +			goto out;
> +
> +		count += loose_count;
> +	}
> +
> +	*out = count;
> +	ret = 0;
> +
> +out:
> +	return ret;
> +}
> +
>  static int odb_source_files_freshen_object(struct odb_source *source,
>  					   const struct object_id *oid)
>  {
> @@ -220,6 +249,7 @@ struct odb_source_files *odb_source_files_new(struct object_database *odb,
>  	files->base.read_object_info = odb_source_files_read_object_info;
>  	files->base.read_object_stream = odb_source_files_read_object_stream;
>  	files->base.for_each_object = odb_source_files_for_each_object;
> +	files->base.count_objects = odb_source_files_count_objects;
>  	files->base.freshen_object = odb_source_files_freshen_object;
>  	files->base.write_object = odb_source_files_write_object;
>  	files->base.write_object_stream = odb_source_files_write_object_stream;
> diff --git a/odb/source.h b/odb/source.h
> index a1fd9dd920..96c906e7a1 100644
> --- a/odb/source.h
> +++ b/odb/source.h
> @@ -142,6 +142,21 @@ struct odb_source {
>  			       void *cb_data,
>  			       unsigned flags);
>  
> +	/*
> +	 * This callback is expected to count objects in the given object
> +	 * database source. The callback function does not have to guarantee
> +	 * that only unique objects are counted. The result shall be assigned
> +	 * to the `out` pointer.
> +	 *
> +	 * Accepts `enum odb_count_objects_flag` flags to alter the behaviour.
> +	 *
> +	 * The callback is expected to return 0 on success, or a negative error
> +	 * code otherwise.
> +	 */
> +	int (*count_objects)(struct odb_source *source,
> +			     enum odb_count_objects_flags flags,
> +			     unsigned long *out);
> +
>  	/*
>  	 * This callback is expected to freshen the given object so that its
>  	 * last access time is set to the current time. This is used to ensure
> @@ -333,6 +348,18 @@ static inline int odb_source_for_each_object(struct odb_source *source,
>  	return source->for_each_object(source, request, cb, cb_data, flags);
>  }
>  
> +/*
> + * Count the number of objects in the given object database source.
> + *
> + * Returns 0 on success, a negative error code otherwise.
> + */
> +static inline int odb_source_count_objects(struct odb_source *source,
> +					   enum odb_count_objects_flags flags,
> +					   unsigned long *out)
> +{
> +	return source->count_objects(source, flags, out);
> +}
> +
>  /*
>   * Freshen an object in the object database by updating its timestamp.
>   * Returns 1 in case the object has been freshened, 0 in case the object does
> diff --git a/packfile.c b/packfile.c
> index 1ee5dd3da3..8ee462303a 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -1102,6 +1102,7 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
>  }
>  
>  int packfile_store_count_objects(struct packfile_store *store,
> +				 enum odb_count_objects_flags flags UNUSED,
>  				 unsigned long *out)
>  {
>  	struct packfile_list_entry *e;
> @@ -1146,10 +1147,9 @@ unsigned long repo_approximate_object_count(struct repository *r)
>  
>  		odb_prepare_alternates(r->objects);
>  		for (source = r->objects->sources; source; source = source->next) {
> -			struct odb_source_files *files = odb_source_files_downcast(source);
>  			unsigned long c;
>  
> -			if (!packfile_store_count_objects(files->packed, &c))
> +			if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c))
>  				count += c;
>  		}
>  
> diff --git a/packfile.h b/packfile.h
> index 1da8c729cb..74b6bc58c5 100644
> --- a/packfile.h
> +++ b/packfile.h
> @@ -275,6 +275,7 @@ enum kept_pack_type {
>   * Return 0 on success, a negative error code otherwise.
>   */
>  int packfile_store_count_objects(struct packfile_store *store,
> +				 enum odb_count_objects_flags flags,
>  				 unsigned long *out);
>  
>  /*
>
> -- 
> 2.53.0.880.g73c4285caa.dirty
>

Okay.

-- 
Cheers,
Toon

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 6/6] odb: introduce generic object counting
  2026-03-10 15:18 ` [PATCH 6/6] odb: " Patrick Steinhardt
@ 2026-03-11 15:30   ` Toon Claes
  2026-03-12  6:57     ` Patrick Steinhardt
  0 siblings, 1 reply; 27+ messages in thread
From: Toon Claes @ 2026-03-11 15:30 UTC (permalink / raw)
  To: Patrick Steinhardt, git

Patrick Steinhardt <ps@pks.im> writes:

> Similar to the preceding commit, introduce counting of objects on the
> object database level, replacing the logic that we have in
> `repo_approximate_object_count()`.
>
> Note that the function knows to cache the object count. It's unclear
> whether this cache is really required as we shouldn't have that many
> cases where we count objects repeatedly. But to be on the safe side the
> caching mechanism is retained, with the only excepting being that we
> also have to use the passed flags as caching key.
>
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  builtin/gc.c   |  6 +++++-
>  commit-graph.c |  3 ++-
>  object-name.c  |  6 +++++-
>  odb.c          | 37 ++++++++++++++++++++++++++++++++++++-
>  odb.h          | 17 +++++++++++++++--
>  packfile.c     | 27 ---------------------------
>  packfile.h     |  6 ------
>  7 files changed, 63 insertions(+), 39 deletions(-)
>
> diff --git a/builtin/gc.c b/builtin/gc.c
> index 3a64d28da8..cb9ca89a97 100644
> --- a/builtin/gc.c
> +++ b/builtin/gc.c
> @@ -574,9 +574,13 @@ static uint64_t total_ram(void)
>  static uint64_t estimate_repack_memory(struct gc_config *cfg,
>  				       struct packed_git *pack)
>  {
> -	unsigned long nr_objects = repo_approximate_object_count(the_repository);
> +	unsigned long nr_objects;
>  	size_t os_cache, heap;
>  
> +	if (odb_count_objects(the_repository->objects,
> +			      ODB_COUNT_OBJECTS_APPROXIMATE, &nr_objects) < 0)
> +		return 0;
> +
>  	if (!pack || !nr_objects)
>  		return 0;
>  
> diff --git a/commit-graph.c b/commit-graph.c
> index f8e24145a5..c030003330 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -2607,7 +2607,8 @@ int write_commit_graph(struct odb_source *source,
>  			replace = ctx.opts->split_flags & COMMIT_GRAPH_SPLIT_REPLACE;
>  	}
>  
> -	ctx.approx_nr_objects = repo_approximate_object_count(r);
> +	if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &ctx.approx_nr_objects) < 0)
> +		ctx.approx_nr_objects = 0;
>  
>  	if (ctx.append && g) {
>  		for (i = 0; i < g->num_commits; i++) {
> diff --git a/object-name.c b/object-name.c
> index 7b14c3bf9b..e5adec4c9d 100644
> --- a/object-name.c
> +++ b/object-name.c
> @@ -837,7 +837,11 @@ int repo_find_unique_abbrev_r(struct repository *r, char *hex,
>  	const unsigned hexsz = algo->hexsz;
>  
>  	if (len < 0) {
> -		unsigned long count = repo_approximate_object_count(r);
> +		unsigned long count;
> +
> +		if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &count) < 0)
> +			count = 0;
> +
>  		/*
>  		 * Add one because the MSB only tells us the highest bit set,
>  		 * not including the value of all the _other_ bits (so "15"
> diff --git a/odb.c b/odb.c
> index 84a31084d3..350e23f3c0 100644
> --- a/odb.c
> +++ b/odb.c
> @@ -917,6 +917,41 @@ int odb_for_each_object(struct object_database *odb,
>  	return 0;
>  }
>  
> +int odb_count_objects(struct object_database *odb,
> +		      enum odb_count_objects_flags flags,
> +		      unsigned long *out)
> +{
> +	struct odb_source *source;
> +	unsigned long count = 0;
> +	int ret;
> +
> +	if (odb->object_count_valid && odb->object_count_flags == flags) {
> +		*out = odb->object_count;
> +		return 0;
> +	}
> +
> +	odb_prepare_alternates(odb);
> +	for (source = odb->sources; source; source = source->next) {
> +		unsigned long c;
> +
> +		ret = odb_source_count_objects(source, flags, &c);
> +		if (ret < 0)
> +			goto out;
> +
> +		count += c;
> +	}
> +
> +	odb->object_count = count;
> +	odb->object_count_valid = 1;
> +	odb->object_count_flags = flags;
> +
> +	*out = count;
> +	ret = 0;
> +
> +out:
> +	return ret;
> +}
> +
>  void odb_assert_oid_type(struct object_database *odb,
>  			 const struct object_id *oid, enum object_type expect)
>  {
> @@ -1030,7 +1065,7 @@ void odb_reprepare(struct object_database *o)
>  	for (source = o->sources; source; source = source->next)
>  		odb_source_reprepare(source);
>  
> -	o->approximate_object_count_valid = 0;
> +	o->object_count_valid = 0;
>  
>  	obj_read_unlock();
>  }
> diff --git a/odb.h b/odb.h
> index e6057477f6..7b004f1cf4 100644
> --- a/odb.h
> +++ b/odb.h
> @@ -112,8 +112,9 @@ struct object_database {
>  	 * These two fields are not meant for direct access. Use
>  	 * repo_approximate_object_count() instead.

This comment needs updating now.

Otherwise no comments here.

-- 
Cheers,
Toon

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 6/6] odb: introduce generic object counting
  2026-03-11 15:30   ` Toon Claes
@ 2026-03-12  6:57     ` Patrick Steinhardt
  0 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-12  6:57 UTC (permalink / raw)
  To: Toon Claes; +Cc: git

On Wed, Mar 11, 2026 at 04:30:50PM +0100, Toon Claes wrote:
> Patrick Steinhardt <ps@pks.im> writes:
> > diff --git a/odb.h b/odb.h
> > index e6057477f6..7b004f1cf4 100644
> > --- a/odb.h
> > +++ b/odb.h
> > @@ -112,8 +112,9 @@ struct object_database {
> >  	 * These two fields are not meant for direct access. Use
> >  	 * repo_approximate_object_count() instead.
> 
> This comment needs updating now.
> 
> Otherwise no comments here.

Good catch, will fix. Thanks!

Patrick

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2 0/6] odb: introduce generic object counting
  2026-03-10 15:18 [PATCH 0/6] odb: introduce generic object counting Patrick Steinhardt
                   ` (5 preceding siblings ...)
  2026-03-10 15:18 ` [PATCH 6/6] odb: " Patrick Steinhardt
@ 2026-03-12  8:42 ` Patrick Steinhardt
  2026-03-12  8:42   ` [PATCH v2 1/6] odb: stop including "odb/source.h" Patrick Steinhardt
                     ` (6 more replies)
  6 siblings, 7 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-12  8:42 UTC (permalink / raw)
  To: git; +Cc: Toon Claes, Junio C Hamano, Justin Tobler

Hi,

this small patch series introduces generic object counting for pluggable
object databases. The series is built on top of d181b9354c (The 13th
batch, 2026-03-09) with ps/odb-sources at d6fc6fe6f8 (odb/source: make
`begin_transaction()` function pluggable, 2026-03-05) merged into it.

Changes in v2:
  - Properly initialize `out` pointer when counting loose objects.
  - Fix a stale comment.
  - Fix a commit message type.
  - Link to v1: https://lore.kernel.org/r/20260310-b4-pks-odb-source-count-objects-v1-0-109e07d425f4@pks.im

Thanks!

Patrick

---
Patrick Steinhardt (6):
      odb: stop including "odb/source.h"
      packfile: extract logic to count number of objects
      object-file: extract logic to approximate object count
      object-file: generalize counting objects
      odb/source: introduce generic object counting
      odb: introduce generic object counting

 builtin/gc.c                | 44 +++++++++----------------
 builtin/multi-pack-index.c  |  1 +
 builtin/submodule--helper.c |  1 +
 commit-graph.c              |  3 +-
 object-file.c               | 58 +++++++++++++++++++++++++++++++++
 object-file.h               | 14 ++++++++
 object-name.c               |  6 +++-
 odb.c                       | 37 ++++++++++++++++++++-
 odb.h                       | 78 +++++++++++++++++++++++++++++++++++++++++---
 odb/source-files.c          | 30 +++++++++++++++++
 odb/source.h                | 79 ++++++++++++++++-----------------------------
 odb/streaming.c             |  1 +
 packfile.c                  | 48 +++++++++++++--------------
 packfile.h                  | 16 +++++----
 repository.c                |  1 +
 submodule-config.c          |  1 +
 tmp-objdir.c                |  1 +
 17 files changed, 301 insertions(+), 118 deletions(-)

Range-diff versus v1:

1:  3d5f8733d5 = 1:  1639bb1725 odb: stop including "odb/source.h"
2:  2fe42618f4 = 2:  056b6f3ae3 packfile: extract logic to count number of objects
3:  4cbf727523 ! 3:  069c908771 object-file: extract logic to approximate object count
    @@ Commit message
         repack objects. This is done by counting the number of objects that we
         have and checking whether it exceeds a certain threshold. We don't
         really need an accurate object count though, which is why we only
    -    open a single object diretcroy shard and then extrapolate from there.
    +    open a single object directory shard and then extrapolate from there.
     
         Extract this logic into a new function that is owned by the loose object
         database source. This is done to prepare for a subsequent change, where
4:  c1a527877a ! 4:  6f293f1352 object-file: generalize counting objects
    @@ object-file.c: int odb_source_loose_for_each_object(struct odb_source *source,
     +		*out = count * 256;
     +		ret = 0;
     +	} else {
    ++		*out = 0;
     +		ret = odb_source_loose_for_each_object(source, NULL, count_loose_object,
     +						       out, 0);
      	}
5:  c12d4ec401 = 5:  0bda4a3d01 odb/source: introduce generic object counting
6:  91307f205d ! 6:  89164dad76 odb: introduce generic object counting
    @@ odb.c: void odb_reprepare(struct object_database *o)
     
      ## odb.h ##
     @@ odb.h: struct object_database {
    + 	/*
    + 	 * A fast, rough count of the number of objects in the repository.
      	 * These two fields are not meant for direct access. Use
    - 	 * repo_approximate_object_count() instead.
    +-	 * repo_approximate_object_count() instead.
    ++	 * odb_count_objects() instead.
      	 */
     -	unsigned long approximate_object_count;
     -	unsigned approximate_object_count_valid : 1;

---
base-commit: 2247f478a898a7f8f8322cc51bdeb1cc773d8f4a
change-id: 20260224-b4-pks-odb-source-count-objects-479fe682cf6f


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2 1/6] odb: stop including "odb/source.h"
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
@ 2026-03-12  8:42   ` Patrick Steinhardt
  2026-03-12  8:42   ` [PATCH v2 2/6] packfile: extract logic to count number of objects Patrick Steinhardt
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-12  8:42 UTC (permalink / raw)
  To: git; +Cc: Toon Claes, Junio C Hamano, Justin Tobler

The "odb.h" header currently includes the "odb/source.h" file. This is
somewhat roundabout though: most callers shouldn't have to care about
the `struct odb_source`, but should rather use the ODB-level functions.
Furthermore, it means that a couple of definitions have to live on the
source level even though they should be part of the generic interface.

Reverse the relation between "odb/source.h" and "odb.h" and move the
enums and typedefs that relate to the generic interfaces back into
"odb.h". Add the necessary includes to all files that rely on the
transitive include.

Suggested-by: Justin Tobler <jltobler@gmail.com>
Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/multi-pack-index.c  |  1 +
 builtin/submodule--helper.c |  1 +
 odb.h                       | 50 ++++++++++++++++++++++++++++++++++++++++++-
 odb/source.h                | 52 +--------------------------------------------
 odb/streaming.c             |  1 +
 repository.c                |  1 +
 submodule-config.c          |  1 +
 tmp-objdir.c                |  1 +
 8 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c
index 5f364aa816..3fcb207f1a 100644
--- a/builtin/multi-pack-index.c
+++ b/builtin/multi-pack-index.c
@@ -9,6 +9,7 @@
 #include "strbuf.h"
 #include "trace2.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "replace-object.h"
 #include "repository.h"
 
diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
index 143f7cb3cc..4957487536 100644
--- a/builtin/submodule--helper.c
+++ b/builtin/submodule--helper.c
@@ -29,6 +29,7 @@
 #include "object-file.h"
 #include "object-name.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "advice.h"
 #include "branch.h"
 #include "list-objects-filter-options.h"
diff --git a/odb.h b/odb.h
index 86e0365c24..7a583e3873 100644
--- a/odb.h
+++ b/odb.h
@@ -3,7 +3,6 @@
 
 #include "hashmap.h"
 #include "object.h"
-#include "odb/source.h"
 #include "oidset.h"
 #include "oidmap.h"
 #include "string-list.h"
@@ -12,6 +11,7 @@
 struct oidmap;
 struct oidtree;
 struct strbuf;
+struct strvec;
 struct repository;
 struct multi_pack_index;
 
@@ -339,6 +339,42 @@ struct object_info {
  */
 #define OBJECT_INFO_INIT { 0 }
 
+/* Flags that can be passed to `odb_read_object_info_extended()`. */
+enum object_info_flags {
+	/* Invoke lookup_replace_object() on the given hash. */
+	OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
+
+	/* Do not reprepare object sources when the first lookup has failed. */
+	OBJECT_INFO_QUICK = (1 << 1),
+
+	/*
+	 * Do not attempt to fetch the object if missing (even if fetch_is_missing is
+	 * nonzero).
+	 */
+	OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
+
+	/* Die if object corruption (not just an object being missing) was detected. */
+	OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
+
+	/*
+	 * We have already tried reading the object, but it couldn't be found
+	 * via any of the attached sources, and are now doing a second read.
+	 * This second read asks the individual sources to also evaluate
+	 * whether any on-disk state may have changed that may have caused the
+	 * object to appear.
+	 *
+	 * This flag is for internal use, only. The second read only occurs
+	 * when `OBJECT_INFO_QUICK` was not passed.
+	 */
+	OBJECT_INFO_SECOND_READ = (1 << 4),
+
+	/*
+	 * This is meant for bulk prefetching of missing blobs in a partial
+	 * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
+	 */
+	OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
+};
+
 /*
  * Read object info from the object database and populate the `object_info`
  * structure. Returns 0 on success, a negative error code otherwise.
@@ -432,6 +468,18 @@ enum odb_for_each_object_flags {
 	ODB_FOR_EACH_OBJECT_SKIP_ON_DISK_KEPT_PACKS = (1<<4),
 };
 
+/*
+ * A callback function that can be used to iterate through objects. If given,
+ * the optional `oi` parameter will be populated the same as if you would call
+ * `odb_read_object_info()`.
+ *
+ * Returning a non-zero error code will cause iteration to abort. The error
+ * code will be propagated.
+ */
+typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
+				      struct object_info *oi,
+				      void *cb_data);
+
 /*
  * Iterate through all objects contained in the object database. Note that
  * objects may be iterated over multiple times in case they are either stored
diff --git a/odb/source.h b/odb/source.h
index caac558149..a1fd9dd920 100644
--- a/odb/source.h
+++ b/odb/source.h
@@ -2,6 +2,7 @@
 #define ODB_SOURCE_H
 
 #include "object.h"
+#include "odb.h"
 
 enum odb_source_type {
 	/*
@@ -14,61 +15,10 @@ enum odb_source_type {
 	ODB_SOURCE_FILES,
 };
 
-/* Flags that can be passed to `odb_read_object_info_extended()`. */
-enum object_info_flags {
-	/* Invoke lookup_replace_object() on the given hash. */
-	OBJECT_INFO_LOOKUP_REPLACE = (1 << 0),
-
-	/* Do not reprepare object sources when the first lookup has failed. */
-	OBJECT_INFO_QUICK = (1 << 1),
-
-	/*
-	 * Do not attempt to fetch the object if missing (even if fetch_is_missing is
-	 * nonzero).
-	 */
-	OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2),
-
-	/* Die if object corruption (not just an object being missing) was detected. */
-	OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3),
-
-	/*
-	 * We have already tried reading the object, but it couldn't be found
-	 * via any of the attached sources, and are now doing a second read.
-	 * This second read asks the individual sources to also evaluate
-	 * whether any on-disk state may have changed that may have caused the
-	 * object to appear.
-	 *
-	 * This flag is for internal use, only. The second read only occurs
-	 * when `OBJECT_INFO_QUICK` was not passed.
-	 */
-	OBJECT_INFO_SECOND_READ = (1 << 4),
-
-	/*
-	 * This is meant for bulk prefetching of missing blobs in a partial
-	 * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK.
-	 */
-	OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK),
-};
-
 struct object_id;
-struct object_info;
 struct odb_read_stream;
-struct odb_transaction;
-struct odb_write_stream;
 struct strvec;
 
-/*
- * A callback function that can be used to iterate through objects. If given,
- * the optional `oi` parameter will be populated the same as if you would call
- * `odb_read_object_info()`.
- *
- * Returning a non-zero error code will cause iteration to abort. The error
- * code will be propagated.
- */
-typedef int (*odb_for_each_object_cb)(const struct object_id *oid,
-				      struct object_info *oi,
-				      void *cb_data);
-
 /*
  * The source is the part of the object database that stores the actual
  * objects. It thus encapsulates the logic to read and write the specific
diff --git a/odb/streaming.c b/odb/streaming.c
index a4355cd245..5927a12954 100644
--- a/odb/streaming.c
+++ b/odb/streaming.c
@@ -7,6 +7,7 @@
 #include "environment.h"
 #include "repository.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "odb/streaming.h"
 #include "replace-object.h"
 
diff --git a/repository.c b/repository.c
index e7fa42c14f..05c26bdbc3 100644
--- a/repository.c
+++ b/repository.c
@@ -2,6 +2,7 @@
 #include "abspath.h"
 #include "repository.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "config.h"
 #include "object.h"
 #include "lockfile.h"
diff --git a/submodule-config.c b/submodule-config.c
index 1f19fe2077..72a46b7a54 100644
--- a/submodule-config.c
+++ b/submodule-config.c
@@ -14,6 +14,7 @@
 #include "strbuf.h"
 #include "object-name.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "parse-options.h"
 #include "thread-utils.h"
 #include "tree-walk.h"
diff --git a/tmp-objdir.c b/tmp-objdir.c
index e436eed07e..d199d39e7c 100644
--- a/tmp-objdir.c
+++ b/tmp-objdir.c
@@ -11,6 +11,7 @@
 #include "strvec.h"
 #include "quote.h"
 #include "odb.h"
+#include "odb/source.h"
 #include "repository.h"
 
 struct tmp_objdir {

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v2 2/6] packfile: extract logic to count number of objects
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
  2026-03-12  8:42   ` [PATCH v2 1/6] odb: stop including "odb/source.h" Patrick Steinhardt
@ 2026-03-12  8:42   ` Patrick Steinhardt
  2026-03-12  8:42   ` [PATCH v2 3/6] object-file: extract logic to approximate object count Patrick Steinhardt
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-12  8:42 UTC (permalink / raw)
  To: git; +Cc: Toon Claes, Junio C Hamano

In a subsequent commit we're about to introduce a new
`odb_source_count_objects()` function so that we can make the logic
pluggable. Prepare for this change by extracting the logic that we have
to count packed objects into a standalone function.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 packfile.c | 45 +++++++++++++++++++++++++++++++++++----------
 packfile.h |  9 +++++++++
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/packfile.c b/packfile.c
index 215a23e42b..1ee5dd3da3 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1101,6 +1101,36 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
 	return store->packs.head;
 }
 
+int packfile_store_count_objects(struct packfile_store *store,
+				 unsigned long *out)
+{
+	struct packfile_list_entry *e;
+	struct multi_pack_index *m;
+	unsigned long count = 0;
+	int ret;
+
+	m = get_multi_pack_index(store->source);
+	if (m)
+		count += m->num_objects + m->num_objects_in_base;
+
+	for (e = packfile_store_get_packs(store); e; e = e->next) {
+		if (e->pack->multi_pack_index)
+			continue;
+		if (open_pack_index(e->pack)) {
+			ret = -1;
+			goto out;
+		}
+
+		count += e->pack->num_objects;
+	}
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
+}
+
 /*
  * Give a fast, rough count of the number of objects in the repository. This
  * ignores loose objects completely. If you have a lot of them, then either
@@ -1113,21 +1143,16 @@ unsigned long repo_approximate_object_count(struct repository *r)
 	if (!r->objects->approximate_object_count_valid) {
 		struct odb_source *source;
 		unsigned long count = 0;
-		struct packed_git *p;
 
 		odb_prepare_alternates(r->objects);
-
 		for (source = r->objects->sources; source; source = source->next) {
-			struct multi_pack_index *m = get_multi_pack_index(source);
-			if (m)
-				count += m->num_objects + m->num_objects_in_base;
-		}
+			struct odb_source_files *files = odb_source_files_downcast(source);
+			unsigned long c;
 
-		repo_for_each_pack(r, p) {
-			if (p->multi_pack_index || open_pack_index(p))
-				continue;
-			count += p->num_objects;
+			if (!packfile_store_count_objects(files->packed, &c))
+				count += c;
 		}
+
 		r->objects->approximate_object_count = count;
 		r->objects->approximate_object_count_valid = 1;
 	}
diff --git a/packfile.h b/packfile.h
index 8b04a258a7..1da8c729cb 100644
--- a/packfile.h
+++ b/packfile.h
@@ -268,6 +268,15 @@ enum kept_pack_type {
 	KEPT_PACK_IN_CORE = (1 << 1),
 };
 
+/*
+ * Count the number objects contained in the given packfile store. If
+ * successful, the number of objects will be written to the `out` pointer.
+ *
+ * Return 0 on success, a negative error code otherwise.
+ */
+int packfile_store_count_objects(struct packfile_store *store,
+				 unsigned long *out);
+
 /*
  * Retrieve the cache of kept packs from the given packfile store. Accepts a
  * combination of `kept_pack_type` flags. The cache is computed on demand and

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v2 3/6] object-file: extract logic to approximate object count
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
  2026-03-12  8:42   ` [PATCH v2 1/6] odb: stop including "odb/source.h" Patrick Steinhardt
  2026-03-12  8:42   ` [PATCH v2 2/6] packfile: extract logic to count number of objects Patrick Steinhardt
@ 2026-03-12  8:42   ` Patrick Steinhardt
  2026-03-12  8:42   ` [PATCH v2 4/6] object-file: generalize counting objects Patrick Steinhardt
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-12  8:42 UTC (permalink / raw)
  To: git; +Cc: Toon Claes, Junio C Hamano

In "builtin/gc.c" we have some logic that checks whether we need to
repack objects. This is done by counting the number of objects that we
have and checking whether it exceeds a certain threshold. We don't
really need an accurate object count though, which is why we only
open a single object directory shard and then extrapolate from there.

Extract this logic into a new function that is owned by the loose object
database source. This is done to prepare for a subsequent change, where
we'll introduce object counting on the object database source level.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/gc.c  | 37 +++++++++----------------------------
 object-file.c | 41 +++++++++++++++++++++++++++++++++++++++++
 object-file.h | 13 +++++++++++++
 3 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/builtin/gc.c b/builtin/gc.c
index fb329c2cff..a08c7554cb 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -467,37 +467,18 @@ static int rerere_gc_condition(struct gc_config *cfg UNUSED)
 static int too_many_loose_objects(int limit)
 {
 	/*
-	 * Quickly check if a "gc" is needed, by estimating how
-	 * many loose objects there are.  Because SHA-1 is evenly
-	 * distributed, we can check only one and get a reasonable
-	 * estimate.
+	 * This is weird, but stems from legacy behaviour: the GC auto
+	 * threshold was always essentially interpreted as if it was rounded up
+	 * to the next multiple 256 of, so we retain this behaviour for now.
 	 */
-	DIR *dir;
-	struct dirent *ent;
-	int auto_threshold;
-	int num_loose = 0;
-	int needed = 0;
-	const unsigned hexsz_loose = the_hash_algo->hexsz - 2;
-	char *path;
-
-	path = repo_git_path(the_repository, "objects/17");
-	dir = opendir(path);
-	free(path);
-	if (!dir)
+	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
+	unsigned long loose_count;
+
+	if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
+						      &loose_count) < 0)
 		return 0;
 
-	auto_threshold = DIV_ROUND_UP(limit, 256);
-	while ((ent = readdir(dir)) != NULL) {
-		if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose ||
-		    ent->d_name[hexsz_loose] != '\0')
-			continue;
-		if (++num_loose > auto_threshold) {
-			needed = 1;
-			break;
-		}
-	}
-	closedir(dir);
-	return needed;
+	return loose_count > auto_threshold;
 }
 
 static struct packed_git *find_base_packs(struct string_list *packs,
diff --git a/object-file.c b/object-file.c
index a3ff7f586c..da67e3c9ff 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source,
 					     NULL, NULL, &data);
 }
 
+int odb_source_loose_approximate_object_count(struct odb_source *source,
+					      unsigned long *out)
+{
+	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
+	unsigned long count = 0;
+	struct dirent *ent;
+	char *path = NULL;
+	DIR *dir = NULL;
+	int ret;
+
+	path = xstrfmt("%s/17", source->path);
+
+	dir = opendir(path);
+	if (!dir) {
+		if (errno == ENOENT) {
+			*out = 0;
+			ret = 0;
+			goto out;
+		}
+
+		ret = error_errno("cannot open object shard '%s'", path);
+		goto out;
+	}
+
+	while ((ent = readdir(dir)) != NULL) {
+		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
+		    ent->d_name[hexsz] != '\0')
+			continue;
+		count++;
+	}
+
+	*out = count * 256;
+	ret = 0;
+
+out:
+	if (dir)
+		closedir(dir);
+	free(path);
+	return ret;
+}
+
 static int append_loose_object(const struct object_id *oid,
 			       const char *path UNUSED,
 			       void *data)
diff --git a/object-file.h b/object-file.h
index ff6da65296..b870ea9fa8 100644
--- a/object-file.h
+++ b/object-file.h
@@ -139,6 +139,19 @@ int odb_source_loose_for_each_object(struct odb_source *source,
 				     void *cb_data,
 				     unsigned flags);
 
+/*
+ * Count the number of loose objects in this source.
+ *
+ * The object count is approximated by opening a single sharding directory for
+ * loose objects and scanning its contents. The result is then extrapolated by
+ * 256. This should generally work as a reasonable estimate given that the
+ * object hash is supposed to be indistinguishable from random.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+int odb_source_loose_approximate_object_count(struct odb_source *source,
+					      unsigned long *out);
+
 /**
  * format_object_header() is a thin wrapper around s xsnprintf() that
  * writes the initial "<type> <obj-len>" part of the loose object

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v2 4/6] object-file: generalize counting objects
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
                     ` (2 preceding siblings ...)
  2026-03-12  8:42   ` [PATCH v2 3/6] object-file: extract logic to approximate object count Patrick Steinhardt
@ 2026-03-12  8:42   ` Patrick Steinhardt
  2026-03-12  8:43   ` [PATCH v2 5/6] odb/source: introduce generic object counting Patrick Steinhardt
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-12  8:42 UTC (permalink / raw)
  To: git; +Cc: Toon Claes, Junio C Hamano

Generalize the function introduced in the preceding commit to not only
be able to approximate the number of loose objects, but to also provide
an accurate count. The behaviour can be toggled via a new flag.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/gc.c  |  5 +++--
 object-file.c | 59 ++++++++++++++++++++++++++++++++++++++---------------------
 object-file.h |  5 +++--
 odb.h         |  9 +++++++++
 4 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/builtin/gc.c b/builtin/gc.c
index a08c7554cb..3a64d28da8 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -474,8 +474,9 @@ static int too_many_loose_objects(int limit)
 	int auto_threshold = DIV_ROUND_UP(limit, 256) * 256;
 	unsigned long loose_count;
 
-	if (odb_source_loose_approximate_object_count(the_repository->objects->sources,
-						      &loose_count) < 0)
+	if (odb_source_loose_count_objects(the_repository->objects->sources,
+					   ODB_COUNT_OBJECTS_APPROXIMATE,
+					   &loose_count) < 0)
 		return 0;
 
 	return loose_count > auto_threshold;
diff --git a/object-file.c b/object-file.c
index da67e3c9ff..569ce6eaed 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1868,40 +1868,57 @@ int odb_source_loose_for_each_object(struct odb_source *source,
 					     NULL, NULL, &data);
 }
 
-int odb_source_loose_approximate_object_count(struct odb_source *source,
-					      unsigned long *out)
+static int count_loose_object(const struct object_id *oid UNUSED,
+			      struct object_info *oi UNUSED,
+			      void *payload)
+{
+	unsigned long *count = payload;
+	(*count)++;
+	return 0;
+}
+
+int odb_source_loose_count_objects(struct odb_source *source,
+				   enum odb_count_objects_flags flags,
+				   unsigned long *out)
 {
 	const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2;
-	unsigned long count = 0;
-	struct dirent *ent;
 	char *path = NULL;
 	DIR *dir = NULL;
 	int ret;
 
-	path = xstrfmt("%s/17", source->path);
+	if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) {
+		unsigned long count = 0;
+		struct dirent *ent;
 
-	dir = opendir(path);
-	if (!dir) {
-		if (errno == ENOENT) {
-			*out = 0;
-			ret = 0;
+		path = xstrfmt("%s/17", source->path);
+
+		dir = opendir(path);
+		if (!dir) {
+			if (errno == ENOENT) {
+				*out = 0;
+				ret = 0;
+				goto out;
+			}
+
+			ret = error_errno("cannot open object shard '%s'", path);
 			goto out;
 		}
 
-		ret = error_errno("cannot open object shard '%s'", path);
-		goto out;
-	}
+		while ((ent = readdir(dir)) != NULL) {
+			if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
+			    ent->d_name[hexsz] != '\0')
+				continue;
+			count++;
+		}
 
-	while ((ent = readdir(dir)) != NULL) {
-		if (strspn(ent->d_name, "0123456789abcdef") != hexsz ||
-		    ent->d_name[hexsz] != '\0')
-			continue;
-		count++;
+		*out = count * 256;
+		ret = 0;
+	} else {
+		*out = 0;
+		ret = odb_source_loose_for_each_object(source, NULL, count_loose_object,
+						       out, 0);
 	}
 
-	*out = count * 256;
-	ret = 0;
-
 out:
 	if (dir)
 		closedir(dir);
diff --git a/object-file.h b/object-file.h
index b870ea9fa8..f8d8805a18 100644
--- a/object-file.h
+++ b/object-file.h
@@ -149,8 +149,9 @@ int odb_source_loose_for_each_object(struct odb_source *source,
  *
  * Returns 0 on success, a negative error code otherwise.
  */
-int odb_source_loose_approximate_object_count(struct odb_source *source,
-					      unsigned long *out);
+int odb_source_loose_count_objects(struct odb_source *source,
+				   enum odb_count_objects_flags flags,
+				   unsigned long *out);
 
 /**
  * format_object_header() is a thin wrapper around s xsnprintf() that
diff --git a/odb.h b/odb.h
index 7a583e3873..e6057477f6 100644
--- a/odb.h
+++ b/odb.h
@@ -500,6 +500,15 @@ int odb_for_each_object(struct object_database *odb,
 			void *cb_data,
 			unsigned flags);
 
+enum odb_count_objects_flags {
+	/*
+	 * Instead of providing an accurate count, allow the number of objects
+	 * to be approximated. Details of how this approximation works are
+	 * subject to the specific source's implementation.
+	 */
+	ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0),
+};
+
 enum {
 	/*
 	 * By default, `odb_write_object()` does not actually write anything

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v2 5/6] odb/source: introduce generic object counting
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
                     ` (3 preceding siblings ...)
  2026-03-12  8:42   ` [PATCH v2 4/6] object-file: generalize counting objects Patrick Steinhardt
@ 2026-03-12  8:43   ` Patrick Steinhardt
  2026-03-12  8:43   ` [PATCH v2 6/6] odb: " Patrick Steinhardt
  2026-03-13 11:52   ` [PATCH v2 0/6] " Toon Claes
  6 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-12  8:43 UTC (permalink / raw)
  To: git; +Cc: Toon Claes, Junio C Hamano

Introduce generic object counting on the object database source level
with a new backend-specific callback function.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 odb/source-files.c | 30 ++++++++++++++++++++++++++++++
 odb/source.h       | 27 +++++++++++++++++++++++++++
 packfile.c         |  4 ++--
 packfile.h         |  1 +
 4 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/odb/source-files.c b/odb/source-files.c
index 14cb9adeca..c08d8993e3 100644
--- a/odb/source-files.c
+++ b/odb/source-files.c
@@ -93,6 +93,35 @@ static int odb_source_files_for_each_object(struct odb_source *source,
 	return 0;
 }
 
+static int odb_source_files_count_objects(struct odb_source *source,
+					  enum odb_count_objects_flags flags,
+					  unsigned long *out)
+{
+	struct odb_source_files *files = odb_source_files_downcast(source);
+	unsigned long count;
+	int ret;
+
+	ret = packfile_store_count_objects(files->packed, flags, &count);
+	if (ret < 0)
+		goto out;
+
+	if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) {
+		unsigned long loose_count;
+
+		ret = odb_source_loose_count_objects(source, flags, &loose_count);
+		if (ret < 0)
+			goto out;
+
+		count += loose_count;
+	}
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
+}
+
 static int odb_source_files_freshen_object(struct odb_source *source,
 					   const struct object_id *oid)
 {
@@ -220,6 +249,7 @@ struct odb_source_files *odb_source_files_new(struct object_database *odb,
 	files->base.read_object_info = odb_source_files_read_object_info;
 	files->base.read_object_stream = odb_source_files_read_object_stream;
 	files->base.for_each_object = odb_source_files_for_each_object;
+	files->base.count_objects = odb_source_files_count_objects;
 	files->base.freshen_object = odb_source_files_freshen_object;
 	files->base.write_object = odb_source_files_write_object;
 	files->base.write_object_stream = odb_source_files_write_object_stream;
diff --git a/odb/source.h b/odb/source.h
index a1fd9dd920..96c906e7a1 100644
--- a/odb/source.h
+++ b/odb/source.h
@@ -142,6 +142,21 @@ struct odb_source {
 			       void *cb_data,
 			       unsigned flags);
 
+	/*
+	 * This callback is expected to count objects in the given object
+	 * database source. The callback function does not have to guarantee
+	 * that only unique objects are counted. The result shall be assigned
+	 * to the `out` pointer.
+	 *
+	 * Accepts `enum odb_count_objects_flag` flags to alter the behaviour.
+	 *
+	 * The callback is expected to return 0 on success, or a negative error
+	 * code otherwise.
+	 */
+	int (*count_objects)(struct odb_source *source,
+			     enum odb_count_objects_flags flags,
+			     unsigned long *out);
+
 	/*
 	 * This callback is expected to freshen the given object so that its
 	 * last access time is set to the current time. This is used to ensure
@@ -333,6 +348,18 @@ static inline int odb_source_for_each_object(struct odb_source *source,
 	return source->for_each_object(source, request, cb, cb_data, flags);
 }
 
+/*
+ * Count the number of objects in the given object database source.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static inline int odb_source_count_objects(struct odb_source *source,
+					   enum odb_count_objects_flags flags,
+					   unsigned long *out)
+{
+	return source->count_objects(source, flags, out);
+}
+
 /*
  * Freshen an object in the object database by updating its timestamp.
  * Returns 1 in case the object has been freshened, 0 in case the object does
diff --git a/packfile.c b/packfile.c
index 1ee5dd3da3..8ee462303a 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1102,6 +1102,7 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor
 }
 
 int packfile_store_count_objects(struct packfile_store *store,
+				 enum odb_count_objects_flags flags UNUSED,
 				 unsigned long *out)
 {
 	struct packfile_list_entry *e;
@@ -1146,10 +1147,9 @@ unsigned long repo_approximate_object_count(struct repository *r)
 
 		odb_prepare_alternates(r->objects);
 		for (source = r->objects->sources; source; source = source->next) {
-			struct odb_source_files *files = odb_source_files_downcast(source);
 			unsigned long c;
 
-			if (!packfile_store_count_objects(files->packed, &c))
+			if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c))
 				count += c;
 		}
 
diff --git a/packfile.h b/packfile.h
index 1da8c729cb..74b6bc58c5 100644
--- a/packfile.h
+++ b/packfile.h
@@ -275,6 +275,7 @@ enum kept_pack_type {
  * Return 0 on success, a negative error code otherwise.
  */
 int packfile_store_count_objects(struct packfile_store *store,
+				 enum odb_count_objects_flags flags,
 				 unsigned long *out);
 
 /*

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v2 6/6] odb: introduce generic object counting
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
                     ` (4 preceding siblings ...)
  2026-03-12  8:43   ` [PATCH v2 5/6] odb/source: introduce generic object counting Patrick Steinhardt
@ 2026-03-12  8:43   ` Patrick Steinhardt
  2026-03-13 11:52   ` [PATCH v2 0/6] " Toon Claes
  6 siblings, 0 replies; 27+ messages in thread
From: Patrick Steinhardt @ 2026-03-12  8:43 UTC (permalink / raw)
  To: git; +Cc: Toon Claes, Junio C Hamano

Similar to the preceding commit, introduce counting of objects on the
object database level, replacing the logic that we have in
`repo_approximate_object_count()`.

Note that the function knows to cache the object count. It's unclear
whether this cache is really required as we shouldn't have that many
cases where we count objects repeatedly. But to be on the safe side the
caching mechanism is retained, with the only excepting being that we
also have to use the passed flags as caching key.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 builtin/gc.c   |  6 +++++-
 commit-graph.c |  3 ++-
 object-name.c  |  6 +++++-
 odb.c          | 37 ++++++++++++++++++++++++++++++++++++-
 odb.h          | 19 ++++++++++++++++---
 packfile.c     | 27 ---------------------------
 packfile.h     |  6 ------
 7 files changed, 64 insertions(+), 40 deletions(-)

diff --git a/builtin/gc.c b/builtin/gc.c
index 3a64d28da8..cb9ca89a97 100644
--- a/builtin/gc.c
+++ b/builtin/gc.c
@@ -574,9 +574,13 @@ static uint64_t total_ram(void)
 static uint64_t estimate_repack_memory(struct gc_config *cfg,
 				       struct packed_git *pack)
 {
-	unsigned long nr_objects = repo_approximate_object_count(the_repository);
+	unsigned long nr_objects;
 	size_t os_cache, heap;
 
+	if (odb_count_objects(the_repository->objects,
+			      ODB_COUNT_OBJECTS_APPROXIMATE, &nr_objects) < 0)
+		return 0;
+
 	if (!pack || !nr_objects)
 		return 0;
 
diff --git a/commit-graph.c b/commit-graph.c
index f8e24145a5..c030003330 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -2607,7 +2607,8 @@ int write_commit_graph(struct odb_source *source,
 			replace = ctx.opts->split_flags & COMMIT_GRAPH_SPLIT_REPLACE;
 	}
 
-	ctx.approx_nr_objects = repo_approximate_object_count(r);
+	if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &ctx.approx_nr_objects) < 0)
+		ctx.approx_nr_objects = 0;
 
 	if (ctx.append && g) {
 		for (i = 0; i < g->num_commits; i++) {
diff --git a/object-name.c b/object-name.c
index 7b14c3bf9b..e5adec4c9d 100644
--- a/object-name.c
+++ b/object-name.c
@@ -837,7 +837,11 @@ int repo_find_unique_abbrev_r(struct repository *r, char *hex,
 	const unsigned hexsz = algo->hexsz;
 
 	if (len < 0) {
-		unsigned long count = repo_approximate_object_count(r);
+		unsigned long count;
+
+		if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &count) < 0)
+			count = 0;
+
 		/*
 		 * Add one because the MSB only tells us the highest bit set,
 		 * not including the value of all the _other_ bits (so "15"
diff --git a/odb.c b/odb.c
index 84a31084d3..350e23f3c0 100644
--- a/odb.c
+++ b/odb.c
@@ -917,6 +917,41 @@ int odb_for_each_object(struct object_database *odb,
 	return 0;
 }
 
+int odb_count_objects(struct object_database *odb,
+		      enum odb_count_objects_flags flags,
+		      unsigned long *out)
+{
+	struct odb_source *source;
+	unsigned long count = 0;
+	int ret;
+
+	if (odb->object_count_valid && odb->object_count_flags == flags) {
+		*out = odb->object_count;
+		return 0;
+	}
+
+	odb_prepare_alternates(odb);
+	for (source = odb->sources; source; source = source->next) {
+		unsigned long c;
+
+		ret = odb_source_count_objects(source, flags, &c);
+		if (ret < 0)
+			goto out;
+
+		count += c;
+	}
+
+	odb->object_count = count;
+	odb->object_count_valid = 1;
+	odb->object_count_flags = flags;
+
+	*out = count;
+	ret = 0;
+
+out:
+	return ret;
+}
+
 void odb_assert_oid_type(struct object_database *odb,
 			 const struct object_id *oid, enum object_type expect)
 {
@@ -1030,7 +1065,7 @@ void odb_reprepare(struct object_database *o)
 	for (source = o->sources; source; source = source->next)
 		odb_source_reprepare(source);
 
-	o->approximate_object_count_valid = 0;
+	o->object_count_valid = 0;
 
 	obj_read_unlock();
 }
diff --git a/odb.h b/odb.h
index e6057477f6..9aee260105 100644
--- a/odb.h
+++ b/odb.h
@@ -110,10 +110,11 @@ struct object_database {
 	/*
 	 * A fast, rough count of the number of objects in the repository.
 	 * These two fields are not meant for direct access. Use
-	 * repo_approximate_object_count() instead.
+	 * odb_count_objects() instead.
 	 */
-	unsigned long approximate_object_count;
-	unsigned approximate_object_count_valid : 1;
+	unsigned long object_count;
+	unsigned object_count_flags;
+	unsigned object_count_valid : 1;
 
 	/*
 	 * Submodule source paths that will be added as additional sources to
@@ -509,6 +510,18 @@ enum odb_count_objects_flags {
 	ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0),
 };
 
+/*
+ * Count the number of objects in the given object database. This object count
+ * may double-count objects that are stored in multiple backends, or which are
+ * stored multiple times in a single backend.
+ *
+ * Returns 0 on success, a negative error code otherwise. The number of objects
+ * will be assigned to the `out` pointer on success.
+ */
+int odb_count_objects(struct object_database *odb,
+		      enum odb_count_objects_flags flags,
+		      unsigned long *out);
+
 enum {
 	/*
 	 * By default, `odb_write_object()` does not actually write anything
diff --git a/packfile.c b/packfile.c
index 8ee462303a..d4de9f3ffe 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1132,33 +1132,6 @@ int packfile_store_count_objects(struct packfile_store *store,
 	return ret;
 }
 
-/*
- * Give a fast, rough count of the number of objects in the repository. This
- * ignores loose objects completely. If you have a lot of them, then either
- * you should repack because your performance will be awful, or they are
- * all unreachable objects about to be pruned, in which case they're not really
- * interesting as a measure of repo size in the first place.
- */
-unsigned long repo_approximate_object_count(struct repository *r)
-{
-	if (!r->objects->approximate_object_count_valid) {
-		struct odb_source *source;
-		unsigned long count = 0;
-
-		odb_prepare_alternates(r->objects);
-		for (source = r->objects->sources; source; source = source->next) {
-			unsigned long c;
-
-			if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c))
-				count += c;
-		}
-
-		r->objects->approximate_object_count = count;
-		r->objects->approximate_object_count_valid = 1;
-	}
-	return r->objects->approximate_object_count;
-}
-
 unsigned long unpack_object_header_buffer(const unsigned char *buf,
 		unsigned long len, enum object_type *type, unsigned long *sizep)
 {
diff --git a/packfile.h b/packfile.h
index 74b6bc58c5..a16ec3950d 100644
--- a/packfile.h
+++ b/packfile.h
@@ -375,12 +375,6 @@ int packfile_store_for_each_object(struct packfile_store *store,
 #define PACKDIR_FILE_GARBAGE 4
 extern void (*report_garbage)(unsigned seen_bits, const char *path);
 
-/*
- * Give a rough count of objects in the repository. This sacrifices accuracy
- * for speed.
- */
-unsigned long repo_approximate_object_count(struct repository *r);
-
 void pack_report(struct repository *repo);
 
 /*

-- 
2.53.0.880.g73c4285caa.dirty


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v2 0/6] odb: introduce generic object counting
  2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
                     ` (5 preceding siblings ...)
  2026-03-12  8:43   ` [PATCH v2 6/6] odb: " Patrick Steinhardt
@ 2026-03-13 11:52   ` Toon Claes
  6 siblings, 0 replies; 27+ messages in thread
From: Toon Claes @ 2026-03-13 11:52 UTC (permalink / raw)
  To: Patrick Steinhardt, git; +Cc: Junio C Hamano, Justin Tobler

Patrick Steinhardt <ps@pks.im> writes:

> Hi,
>
> this small patch series introduces generic object counting for pluggable
> object databases. The series is built on top of d181b9354c (The 13th
> batch, 2026-03-09) with ps/odb-sources at d6fc6fe6f8 (odb/source: make
> `begin_transaction()` function pluggable, 2026-03-05) merged into it.
>
> Changes in v2:
>   - Properly initialize `out` pointer when counting loose objects.
>   - Fix a stale comment.
>   - Fix a commit message type.
>   - Link to v1: https://lore.kernel.org/r/20260310-b4-pks-odb-source-count-objects-v1-0-109e07d425f4@pks.im

Looking at the range-diff, all my concerns are addressed. Thanks!

-- 
Cheers,
Toon

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2026-03-13 11:52 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-10 15:18 [PATCH 0/6] odb: introduce generic object counting Patrick Steinhardt
2026-03-10 15:18 ` [PATCH 1/6] odb: stop including "odb/source.h" Patrick Steinhardt
2026-03-10 15:18 ` [PATCH 2/6] packfile: extract logic to count number of objects Patrick Steinhardt
2026-03-11 12:41   ` Toon Claes
2026-03-11 13:55     ` Patrick Steinhardt
2026-03-10 15:18 ` [PATCH 3/6] object-file: extract logic to approximate object count Patrick Steinhardt
2026-03-10 17:44   ` Junio C Hamano
2026-03-11 12:47   ` Toon Claes
2026-03-11 13:58     ` Patrick Steinhardt
2026-03-10 15:18 ` [PATCH 4/6] object-file: generalize counting objects Patrick Steinhardt
2026-03-11 13:53   ` Toon Claes
2026-03-11 14:01     ` Patrick Steinhardt
2026-03-10 15:18 ` [PATCH 5/6] odb/source: introduce generic object counting Patrick Steinhardt
2026-03-10 17:51   ` Junio C Hamano
2026-03-11  6:44     ` Patrick Steinhardt
2026-03-11 15:03   ` Toon Claes
2026-03-10 15:18 ` [PATCH 6/6] odb: " Patrick Steinhardt
2026-03-11 15:30   ` Toon Claes
2026-03-12  6:57     ` Patrick Steinhardt
2026-03-12  8:42 ` [PATCH v2 0/6] " Patrick Steinhardt
2026-03-12  8:42   ` [PATCH v2 1/6] odb: stop including "odb/source.h" Patrick Steinhardt
2026-03-12  8:42   ` [PATCH v2 2/6] packfile: extract logic to count number of objects Patrick Steinhardt
2026-03-12  8:42   ` [PATCH v2 3/6] object-file: extract logic to approximate object count Patrick Steinhardt
2026-03-12  8:42   ` [PATCH v2 4/6] object-file: generalize counting objects Patrick Steinhardt
2026-03-12  8:43   ` [PATCH v2 5/6] odb/source: introduce generic object counting Patrick Steinhardt
2026-03-12  8:43   ` [PATCH v2 6/6] odb: " Patrick Steinhardt
2026-03-13 11:52   ` [PATCH v2 0/6] " Toon Claes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox