git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue
@ 2025-07-15 14:35 René Scharfe
  2025-07-15 14:51 ` [PATCH 1/3] " René Scharfe
                   ` (4 more replies)
  0 siblings, 5 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-15 14:35 UTC (permalink / raw)
  To: Git List; +Cc: Jeff King

Use prio_queue to improve worst-case performance at the cost of slightly
worse best-case performance.  Then add and use prio_queue_replace() to
recover that loss.

  commit: convert pop_most_recent_commit() to prio_queue
  prio-queue: add prio_queue_replace()
  commit: use prio_queue_replace() in pop_most_recent_commit()

 commit.c                          | 14 ++++++--
 commit.h                          |  8 ++---
 fetch-pack.c                      | 13 +++++---
 object-name.c                     | 10 +++---
 prio-queue.c                      | 45 +++++++++++++++++--------
 prio-queue.h                      |  8 +++++
 t/perf/p1501-rev-parse-oneline.sh | 55 +++++++++++++++++++++++++++++++
 t/unit-tests/u-prio-queue.c       | 23 +++++++++++++
 walker.c                          | 11 ++++---
 9 files changed, 153 insertions(+), 34 deletions(-)
 create mode 100755 t/perf/p1501-rev-parse-oneline.sh

-- 
2.50.1

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-15 14:35 [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue René Scharfe
@ 2025-07-15 14:51 ` René Scharfe
  2025-07-15 19:23   ` Junio C Hamano
                     ` (3 more replies)
  2025-07-15 14:51 ` [PATCH 2/3] prio-queue: add prio_queue_replace() René Scharfe
                   ` (3 subsequent siblings)
  4 siblings, 4 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-15 14:51 UTC (permalink / raw)
  To: Git List; +Cc: Jeff King

pop_most_recent_commit() calls commit_list_insert_by_date(), which and
is itself called in a loop, which can lead to quadratic complexity.
Replace the commit_list with a prio_queue to ensure logarithmic worst
case complexity and convert all three users.

Add a performance test that exercises one of them using a pathological
history that consists of 50% merges and 50% root commits to demonstrate
the speedup:

   Test                          v2.50.1           HEAD
   ----------------------------------------------------------------------
   1501.2: rev-parse ':/65535'   2.48(2.47+0.00)   0.20(0.19+0.00) -91.9%

Alas, sane histories don't benefit from the conversion much, and
traversing Git's own history takes a 1% performance hit on my machine:

   $ hyperfine -w3 -L git ./git_2.50.1,./git '{git} rev-parse :/^Initial.revision'
   Benchmark 1: ./git_2.50.1 rev-parse :/^Initial.revision
     Time (mean ± σ):      1.071 s ±  0.004 s    [User: 1.052 s, System: 0.017 s]
     Range (min … max):    1.067 s …  1.078 s    10 runs

   Benchmark 2: ./git rev-parse :/^Initial.revision
     Time (mean ± σ):      1.079 s ±  0.003 s    [User: 1.060 s, System: 0.017 s]
     Range (min … max):    1.074 s …  1.083 s    10 runs

   Summary
     ./git_2.50.1 rev-parse :/^Initial.revision ran
       1.01 ± 0.00 times faster than ./git rev-parse :/^Initial.revision

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 commit.c                          |  7 ++--
 commit.h                          |  8 ++---
 fetch-pack.c                      | 13 +++++---
 object-name.c                     | 10 +++---
 t/perf/p1501-rev-parse-oneline.sh | 55 +++++++++++++++++++++++++++++++
 walker.c                          | 11 ++++---
 6 files changed, 83 insertions(+), 21 deletions(-)
 create mode 100755 t/perf/p1501-rev-parse-oneline.sh

diff --git a/commit.c b/commit.c
index e915b2b9a1..0200759aaa 100644
--- a/commit.c
+++ b/commit.c
@@ -31,6 +31,7 @@
 #include "parse.h"
 #include "object-file.h"
 #include "object-file-convert.h"
+#include "prio-queue.h"
 
 static struct commit_extra_header *read_commit_extra_header_lines(const char *buf, size_t len, const char **);
 
@@ -738,17 +739,17 @@ void commit_list_sort_by_date(struct commit_list **list)
 	commit_list_sort(list, commit_list_compare_by_date);
 }
 
-struct commit *pop_most_recent_commit(struct commit_list **list,
+struct commit *pop_most_recent_commit(struct prio_queue *queue,
 				      unsigned int mark)
 {
-	struct commit *ret = pop_commit(list);
+	struct commit *ret = prio_queue_get(queue);
 	struct commit_list *parents = ret->parents;
 
 	while (parents) {
 		struct commit *commit = parents->item;
 		if (!repo_parse_commit(the_repository, commit) && !(commit->object.flags & mark)) {
 			commit->object.flags |= mark;
-			commit_list_insert_by_date(commit, list);
+			prio_queue_put(queue, commit);
 		}
 		parents = parents->next;
 	}
diff --git a/commit.h b/commit.h
index 70c870dae4..9630c076d6 100644
--- a/commit.h
+++ b/commit.h
@@ -201,10 +201,10 @@ const char *repo_logmsg_reencode(struct repository *r,
 
 const char *skip_blank_lines(const char *msg);
 
-/** Removes the first commit from a list sorted by date, and adds all
- * of its parents.
- **/
-struct commit *pop_most_recent_commit(struct commit_list **list,
+struct prio_queue;
+
+/* Removes the first commit from a prio_queue and adds its parents. */
+struct commit *pop_most_recent_commit(struct prio_queue *queue,
 				      unsigned int mark);
 
 struct commit *pop_commit(struct commit_list **stack);
diff --git a/fetch-pack.c b/fetch-pack.c
index fa4231fee7..6afe2f964e 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -34,6 +34,7 @@
 #include "commit-graph.h"
 #include "sigchain.h"
 #include "mergesort.h"
+#include "prio-queue.h"
 
 static int transfer_unpack_limit = -1;
 static int fetch_unpack_limit = -1;
@@ -600,7 +601,7 @@ static int find_common(struct fetch_negotiator *negotiator,
 	return count ? retval : 0;
 }
 
-static struct commit_list *complete;
+static struct prio_queue complete = { compare_commits_by_commit_date };
 
 static int mark_complete(const struct object_id *oid)
 {
@@ -608,7 +609,7 @@ static int mark_complete(const struct object_id *oid)
 
 	if (commit && !(commit->object.flags & COMPLETE)) {
 		commit->object.flags |= COMPLETE;
-		commit_list_insert(commit, &complete);
+		prio_queue_put(&complete, commit);
 	}
 	return 0;
 }
@@ -625,9 +626,12 @@ static int mark_complete_oid(const char *refname UNUSED,
 static void mark_recent_complete_commits(struct fetch_pack_args *args,
 					 timestamp_t cutoff)
 {
-	while (complete && cutoff <= complete->item->date) {
+	while (complete.nr) {
+		struct commit *item = prio_queue_peek(&complete);
+		if (item->date < cutoff)
+			break;
 		print_verbose(args, _("Marking %s as complete"),
-			      oid_to_hex(&complete->item->object.oid));
+			      oid_to_hex(&item->object.oid));
 		pop_most_recent_commit(&complete, COMPLETE);
 	}
 }
@@ -797,7 +801,6 @@ static void mark_complete_and_common_ref(struct fetch_negotiator *negotiator,
 		refs_for_each_rawref(get_main_ref_store(the_repository),
 				     mark_complete_oid, NULL);
 		for_each_cached_alternate(NULL, mark_alternate_complete);
-		commit_list_sort_by_date(&complete);
 		if (cutoff)
 			mark_recent_complete_commits(args, cutoff);
 	}
diff --git a/object-name.c b/object-name.c
index 851858975f..ab57722146 100644
--- a/object-name.c
+++ b/object-name.c
@@ -28,6 +28,7 @@
 #include "commit-reach.h"
 #include "date.h"
 #include "object-file-convert.h"
+#include "prio-queue.h"
 
 static int get_oid_oneline(struct repository *r, const char *, struct object_id *,
 			   const struct commit_list *);
@@ -1461,7 +1462,7 @@ static int get_oid_oneline(struct repository *r,
 			   const char *prefix, struct object_id *oid,
 			   const struct commit_list *list)
 {
-	struct commit_list *copy = NULL, **copy_tail = &copy;
+	struct prio_queue copy = { compare_commits_by_commit_date };
 	const struct commit_list *l;
 	int found = 0;
 	int negative = 0;
@@ -1483,9 +1484,9 @@ static int get_oid_oneline(struct repository *r,
 
 	for (l = list; l; l = l->next) {
 		l->item->object.flags |= ONELINE_SEEN;
-		copy_tail = &commit_list_insert(l->item, copy_tail)->next;
+		prio_queue_put(&copy, l->item);
 	}
-	while (copy) {
+	while (copy.nr) {
 		const char *p, *buf;
 		struct commit *commit;
 		int matches;
@@ -1507,7 +1508,7 @@ static int get_oid_oneline(struct repository *r,
 	regfree(&regex);
 	for (l = list; l; l = l->next)
 		clear_commit_marks(l->item, ONELINE_SEEN);
-	free_commit_list(copy);
+	clear_prio_queue(&copy);
 	return found ? 0 : -1;
 }
 
@@ -2061,7 +2062,6 @@ static enum get_oid_result get_oid_with_context_1(struct repository *repo,
 			cb.list = &list;
 			refs_for_each_ref(get_main_ref_store(repo), handle_one_ref, &cb);
 			refs_head_ref(get_main_ref_store(repo), handle_one_ref, &cb);
-			commit_list_sort_by_date(&list);
 			ret = get_oid_oneline(repo, name + 2, oid, list);
 
 			free_commit_list(list);
diff --git a/t/perf/p1501-rev-parse-oneline.sh b/t/perf/p1501-rev-parse-oneline.sh
new file mode 100755
index 0000000000..ae0f553be5
--- /dev/null
+++ b/t/perf/p1501-rev-parse-oneline.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+
+test_description='Test :/ object name notation'
+
+. ./perf-lib.sh
+
+test_perf_fresh_repo
+
+build_history () {
+	local max_level="$1" &&
+	local level="${2:-1}" &&
+	local mark="${3:-1}" &&
+	if test $level -eq $max_level
+	then
+		echo "reset refs/heads/master" &&
+		echo "from $ZERO_OID" &&
+		echo "commit refs/heads/master" &&
+		echo "mark :$mark" &&
+		echo "committer C <c@example.com> 1234567890 +0000" &&
+		echo "data <<EOF" &&
+		echo "$mark" &&
+		echo "EOF"
+	else
+		local level1=$((level+1)) &&
+		local mark1=$((2*mark)) &&
+		local mark2=$((2*mark+1)) &&
+		build_history $max_level $level1 $mark1 &&
+		build_history $max_level $level1 $mark2 &&
+		echo "commit refs/heads/master" &&
+		echo "mark :$mark" &&
+		echo "committer C <c@example.com> 1234567890 +0000" &&
+		echo "data <<EOF" &&
+		echo "$mark" &&
+		echo "EOF" &&
+		echo "from :$mark1" &&
+		echo "merge :$mark2"
+	fi
+}
+
+test_expect_success 'setup' '
+	build_history 16 | git fast-import &&
+	git log --format="%H %s" --reverse >commits &&
+	sed -n -e "s/ .*$//p" -e "q" <commits >expect &&
+	sed -n -e "s/^.* //p" -e "q" <commits >needle
+'
+
+test_perf "rev-parse ':/$(cat needle)'" "
+	git rev-parse ':/$(cat needle)' >actual
+"
+
+test_expect_success 'verify result' '
+	test_cmp expect actual
+'
+
+test_done
diff --git a/walker.c b/walker.c
index b470d43e54..c6d0e20c72 100644
--- a/walker.c
+++ b/walker.c
@@ -14,6 +14,7 @@
 #include "blob.h"
 #include "refs.h"
 #include "progress.h"
+#include "prio-queue.h"
 
 static struct object_id current_commit_oid;
 
@@ -78,7 +79,7 @@ static int process_tree(struct walker *walker, struct tree *tree)
 #define SEEN		(1U << 1)
 #define TO_SCAN		(1U << 2)
 
-static struct commit_list *complete = NULL;
+static struct prio_queue complete = { compare_commits_by_commit_date };
 
 static int process_commit(struct walker *walker, struct commit *commit)
 {
@@ -87,7 +88,10 @@ static int process_commit(struct walker *walker, struct commit *commit)
 	if (repo_parse_commit(the_repository, commit))
 		return -1;
 
-	while (complete && complete->item->date >= commit->date) {
+	while (complete.nr) {
+		struct commit *item = prio_queue_peek(&complete);
+		if (item->date < commit->date)
+			break;
 		pop_most_recent_commit(&complete, COMPLETE);
 	}
 
@@ -233,7 +237,7 @@ static int mark_complete(const char *path UNUSED,
 
 	if (commit) {
 		commit->object.flags |= COMPLETE;
-		commit_list_insert(commit, &complete);
+		prio_queue_put(&complete, commit);
 	}
 	return 0;
 }
@@ -302,7 +306,6 @@ int walker_fetch(struct walker *walker, int targets, char **target,
 	if (!walker->get_recover) {
 		refs_for_each_ref(get_main_ref_store(the_repository),
 				  mark_complete, NULL);
-		commit_list_sort_by_date(&complete);
 	}
 
 	for (i = 0; i < targets; i++) {
-- 
2.50.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 2/3] prio-queue: add prio_queue_replace()
  2025-07-15 14:35 [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue René Scharfe
  2025-07-15 14:51 ` [PATCH 1/3] " René Scharfe
@ 2025-07-15 14:51 ` René Scharfe
  2025-07-16  5:09   ` Jeff King
  2025-07-15 14:51 ` [PATCH 3/3] commit: use prio_queue_replace() in pop_most_recent_commit() René Scharfe
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-07-15 14:51 UTC (permalink / raw)
  To: Git List; +Cc: Jeff King

Add a function to replace the top element of the queue that basically
does the same as prio_queue_get() followed by prio_queue_put(), but
without the work by prio_queue_get() to rebalance the heap.  It can be
used to optimize loops that get one element and then immediately add
another one.  That's common e.g., with commit history traversal, where
we get out a commit and then put in its parents.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 prio-queue.c                | 45 ++++++++++++++++++++++++++-----------
 prio-queue.h                |  8 +++++++
 t/unit-tests/u-prio-queue.c | 23 +++++++++++++++++++
 3 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/prio-queue.c b/prio-queue.c
index ec33ac27db..9748528ce6 100644
--- a/prio-queue.c
+++ b/prio-queue.c
@@ -58,22 +58,10 @@ void prio_queue_put(struct prio_queue *queue, void *thing)
 	}
 }
 
-void *prio_queue_get(struct prio_queue *queue)
+static void sift_down_root(struct prio_queue *queue)
 {
-	void *result;
 	size_t ix, child;
 
-	if (!queue->nr)
-		return NULL;
-	if (!queue->compare)
-		return queue->array[--queue->nr].data; /* LIFO */
-
-	result = queue->array[0].data;
-	if (!--queue->nr)
-		return result;
-
-	queue->array[0] = queue->array[queue->nr];
-
 	/* Push down the one at the root */
 	for (ix = 0; ix * 2 + 1 < queue->nr; ix = child) {
 		child = ix * 2 + 1; /* left */
@@ -86,6 +74,23 @@ void *prio_queue_get(struct prio_queue *queue)
 
 		swap(queue, child, ix);
 	}
+}
+
+void *prio_queue_get(struct prio_queue *queue)
+{
+	void *result;
+
+	if (!queue->nr)
+		return NULL;
+	if (!queue->compare)
+		return queue->array[--queue->nr].data; /* LIFO */
+
+	result = queue->array[0].data;
+	if (!--queue->nr)
+		return result;
+
+	queue->array[0] = queue->array[queue->nr];
+	sift_down_root(queue);
 	return result;
 }
 
@@ -97,3 +102,17 @@ void *prio_queue_peek(struct prio_queue *queue)
 		return queue->array[queue->nr - 1].data;
 	return queue->array[0].data;
 }
+
+void prio_queue_replace(struct prio_queue *queue, void *thing)
+{
+	if (!queue->nr) {
+		prio_queue_put(queue, thing);
+	} else if (!queue->compare) {
+		queue->array[queue->nr - 1].ctr = queue->insertion_ctr++;
+		queue->array[queue->nr - 1].data = thing;
+	} else {
+		queue->array[0].ctr = queue->insertion_ctr++;
+		queue->array[0].data = thing;
+		sift_down_root(queue);
+	}
+}
diff --git a/prio-queue.h b/prio-queue.h
index 38d032636d..da7fad2f1f 100644
--- a/prio-queue.h
+++ b/prio-queue.h
@@ -52,6 +52,14 @@ void *prio_queue_get(struct prio_queue *);
  */
 void *prio_queue_peek(struct prio_queue *);
 
+/*
+ * Replace the "thing" that compares the smallest with a new "thing",
+ * like prio_queue_get()+prio_queue_put() would do, but in a more
+ * efficient way.  Does the same as prio_queue_put() if the queue is
+ * empty.
+ */
+void prio_queue_replace(struct prio_queue *queue, void *thing);
+
 void clear_prio_queue(struct prio_queue *);
 
 /* Reverse the LIFO elements */
diff --git a/t/unit-tests/u-prio-queue.c b/t/unit-tests/u-prio-queue.c
index 145e689c9c..63e58114ae 100644
--- a/t/unit-tests/u-prio-queue.c
+++ b/t/unit-tests/u-prio-queue.c
@@ -13,6 +13,7 @@ static int intcmp(const void *va, const void *vb, void *data UNUSED)
 #define STACK	 -3
 #define GET	 -4
 #define REVERSE  -5
+#define REPLACE  -6
 
 static int show(int *v)
 {
@@ -51,6 +52,15 @@ static void test_prio_queue(int *input, size_t input_size,
 		case REVERSE:
 			prio_queue_reverse(&pq);
 			break;
+		case REPLACE:
+			peek = prio_queue_peek(&pq);
+			cl_assert(i + 1 < input_size);
+			cl_assert(input[i + 1] >= 0);
+			cl_assert(j < result_size);
+			cl_assert_equal_i(result[j], show(peek));
+			j++;
+			prio_queue_replace(&pq, &input[++i]);
+			break;
 		default:
 			prio_queue_put(&pq, &input[i]);
 			break;
@@ -81,6 +91,13 @@ void test_prio_queue__empty(void)
 		   ((int []){ 1, 2, MISSING, 1, 2, MISSING }));
 }
 
+void test_prio_queue__replace(void)
+{
+	TEST_INPUT(((int []){ REPLACE, 6, 2, 4, REPLACE, 5, 7, GET,
+			      REPLACE, 1, DUMP }),
+		   ((int []){ MISSING, 2, 4, 5, 1, 6, 7 }));
+}
+
 void test_prio_queue__stack(void)
 {
 	TEST_INPUT(((int []){ STACK, 8, 1, 5, 4, 6, 2, 3, DUMP }),
@@ -92,3 +109,9 @@ void test_prio_queue__reverse_stack(void)
 	TEST_INPUT(((int []){ STACK, 1, 2, 3, 4, 5, 6, REVERSE, DUMP }),
 		   ((int []){ 1, 2, 3, 4, 5, 6 }));
 }
+
+void test_prio_queue__replace_stack(void)
+{
+	TEST_INPUT(((int []){ STACK, 8, 1, 5, REPLACE, 4, 6, 2, 3, DUMP }),
+		   ((int []){ 5, 3, 2, 6, 4, 1, 8 }));
+}
-- 
2.50.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH 3/3] commit: use prio_queue_replace() in pop_most_recent_commit()
  2025-07-15 14:35 [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue René Scharfe
  2025-07-15 14:51 ` [PATCH 1/3] " René Scharfe
  2025-07-15 14:51 ` [PATCH 2/3] prio-queue: add prio_queue_replace() René Scharfe
@ 2025-07-15 14:51 ` René Scharfe
  2025-07-15 20:43   ` Junio C Hamano
  2025-07-16  0:07 ` [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue Junio C Hamano
  2025-07-18  9:09 ` [PATCH v2 " René Scharfe
  4 siblings, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-07-15 14:51 UTC (permalink / raw)
  To: Git List; +Cc: Jeff King

Optimize pop_most_recent_commit() by adding the first parent using the
more efficient prio_queue_peek() and prio_queue_replace() instead of
prio_queue_get() and prio_queue_put().

On my machine this neutralizes the performance hit it took in Git's own
repository when we converted it to prio_queue two patches ago (git_pq):

   $ hyperfine -w3 -L git ./git_2.50.1,./git_pq,./git '{git} rev-parse :/^Initial.revision'
   Benchmark 1: ./git_2.50.1 rev-parse :/^Initial.revision
     Time (mean ± σ):      1.073 s ±  0.003 s    [User: 1.053 s, System: 0.019 s]
     Range (min … max):    1.069 s …  1.078 s    10 runs

   Benchmark 2: ./git_pq rev-parse :/^Initial.revision
     Time (mean ± σ):      1.077 s ±  0.002 s    [User: 1.057 s, System: 0.018 s]
     Range (min … max):    1.072 s …  1.079 s    10 runs

   Benchmark 3: ./git rev-parse :/^Initial.revision
     Time (mean ± σ):      1.069 s ±  0.003 s    [User: 1.049 s, System: 0.018 s]
     Range (min … max):    1.065 s …  1.074 s    10 runs

   Summary
     ./git rev-parse :/^Initial.revision ran
       1.00 ± 0.00 times faster than ./git_2.50.1 rev-parse :/^Initial.revision
       1.01 ± 0.00 times faster than ./git_pq rev-parse :/^Initial.revision

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 commit.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/commit.c b/commit.c
index 0200759aaa..8244221b30 100644
--- a/commit.c
+++ b/commit.c
@@ -742,17 +742,24 @@ void commit_list_sort_by_date(struct commit_list **list)
 struct commit *pop_most_recent_commit(struct prio_queue *queue,
 				      unsigned int mark)
 {
-	struct commit *ret = prio_queue_get(queue);
+	struct commit *ret = prio_queue_peek(queue);
+	int delete_pending = 1;
 	struct commit_list *parents = ret->parents;
 
 	while (parents) {
 		struct commit *commit = parents->item;
 		if (!repo_parse_commit(the_repository, commit) && !(commit->object.flags & mark)) {
 			commit->object.flags |= mark;
-			prio_queue_put(queue, commit);
+			if (delete_pending)
+				prio_queue_replace(queue, commit);
+			else
+				prio_queue_put(queue, commit);
+			delete_pending = 0;
 		}
 		parents = parents->next;
 	}
+	if (delete_pending)
+		prio_queue_get(queue);
 	return ret;
 }
 
-- 
2.50.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-15 14:51 ` [PATCH 1/3] " René Scharfe
@ 2025-07-15 19:23   ` Junio C Hamano
  2025-07-15 20:47   ` Justin Tobler
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 41+ messages in thread
From: Junio C Hamano @ 2025-07-15 19:23 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Jeff King

René Scharfe <l.s.r@web.de> writes:

> pop_most_recent_commit() calls commit_list_insert_by_date(), which and
> is itself called in a loop, which can lead to quadratic complexity.
> Replace the commit_list with a prio_queue to ensure logarithmic worst
> case complexity and convert all three users.
>
> Add a performance test that exercises one of them using a pathological
> history that consists of 50% merges and 50% root commits to demonstrate
> the speedup:
>
>    Test                          v2.50.1           HEAD
>    ----------------------------------------------------------------------
>    1501.2: rev-parse ':/65535'   2.48(2.47+0.00)   0.20(0.19+0.00) -91.9%
>
> Alas, sane histories don't benefit from the conversion much, and
> traversing Git's own history takes a 1% performance hit on my machine:

;-)

>    $ hyperfine -w3 -L git ./git_2.50.1,./git '{git} rev-parse :/^Initial.revision'
>    Benchmark 1: ./git_2.50.1 rev-parse :/^Initial.revision
>      Time (mean ± σ):      1.071 s ±  0.004 s    [User: 1.052 s, System: 0.017 s]
>      Range (min … max):    1.067 s …  1.078 s    10 runs
>
>    Benchmark 2: ./git rev-parse :/^Initial.revision
>      Time (mean ± σ):      1.079 s ±  0.003 s    [User: 1.060 s, System: 0.017 s]
>      Range (min … max):    1.074 s …  1.083 s    10 runs
>
>    Summary
>      ./git_2.50.1 rev-parse :/^Initial.revision ran
>        1.01 ± 0.00 times faster than ./git rev-parse :/^Initial.revision

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 3/3] commit: use prio_queue_replace() in pop_most_recent_commit()
  2025-07-15 14:51 ` [PATCH 3/3] commit: use prio_queue_replace() in pop_most_recent_commit() René Scharfe
@ 2025-07-15 20:43   ` Junio C Hamano
  2025-07-16  9:38     ` René Scharfe
  0 siblings, 1 reply; 41+ messages in thread
From: Junio C Hamano @ 2025-07-15 20:43 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Jeff King

René Scharfe <l.s.r@web.de> writes:

> Optimize pop_most_recent_commit() by adding the first parent using the
> more efficient prio_queue_peek() and prio_queue_replace() instead of
> prio_queue_get() and prio_queue_put().
>
> On my machine this neutralizes the performance hit it took in Git's own
> repository when we converted it to prio_queue two patches ago (git_pq):

Given that our history has more merges than other projects, and the
_replace() optimization would help primarily single-strand-of-pearls
(and the first parent of a merge), that result is very good.

> diff --git a/commit.c b/commit.c
> index 0200759aaa..8244221b30 100644
> --- a/commit.c
> +++ b/commit.c
> @@ -742,17 +742,24 @@ void commit_list_sort_by_date(struct commit_list **list)
>  struct commit *pop_most_recent_commit(struct prio_queue *queue,
>  				      unsigned int mark)
>  {
> -	struct commit *ret = prio_queue_get(queue);
> +	struct commit *ret = prio_queue_peek(queue);
> +	int delete_pending = 1;

Briefly I was puzzled by the name (I would have called first-parent
since the logic was "we treat first parent specially by using
replace instead of get/put"), but the variable signals "instead of
get to remove the item from the queue, we just peeked, so we need to
remove it later" with its name, which is understandable.

>  	struct commit_list *parents = ret->parents;
>  
>  	while (parents) {
>  		struct commit *commit = parents->item;
>  		if (!repo_parse_commit(the_repository, commit) && !(commit->object.flags & mark)) {
>  			commit->object.flags |= mark;
> -			prio_queue_put(queue, commit);
> +			if (delete_pending)
> +				prio_queue_replace(queue, commit);
> +			else
> +				prio_queue_put(queue, commit);
> +			delete_pending = 0;
>  		}
>  		parents = parents->next;
>  	}
> +	if (delete_pending)
> +		prio_queue_get(queue);
>  	return ret;
>  }

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-15 14:51 ` [PATCH 1/3] " René Scharfe
  2025-07-15 19:23   ` Junio C Hamano
@ 2025-07-15 20:47   ` Justin Tobler
  2025-07-16  9:39     ` René Scharfe
  2025-07-16  5:05   ` Jeff King
  2025-07-16 22:23   ` Junio C Hamano
  3 siblings, 1 reply; 41+ messages in thread
From: Justin Tobler @ 2025-07-15 20:47 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Jeff King

On 25/07/15 04:51PM, René Scharfe wrote:
> pop_most_recent_commit() calls commit_list_insert_by_date(), which and

Did you mean?

s/which and/which/

> is itself called in a loop, which can lead to quadratic complexity.
> Replace the commit_list with a prio_queue to ensure logarithmic worst
> case complexity and convert all three users.

If I understand correctly, `pop_most_recent_commit()` removes the most
recent commit from a list of commits sorted by date and then inserts
each of the removed commit's parents into the list while maintaining
date order. Iterating through `struct commit_list` every time to find
where to insert each parent parent leads to quadratic complexity. For
repositories with many merge commits, this could scale poorly.

> Add a performance test that exercises one of them using a pathological
> history that consists of 50% merges and 50% root commits to demonstrate
> the speedup:
> 
>    Test                          v2.50.1           HEAD
>    ----------------------------------------------------------------------
>    1501.2: rev-parse ':/65535'   2.48(2.47+0.00)   0.20(0.19+0.00) -91.9%
> 
> Alas, sane histories don't benefit from the conversion much, and
> traversing Git's own history takes a 1% performance hit on my machine:

As "normal" repositories don't benefit here, it might be nice to more
explicitly mention the the types of repositories that do benefit.

>    $ hyperfine -w3 -L git ./git_2.50.1,./git '{git} rev-parse :/^Initial.revision'
>    Benchmark 1: ./git_2.50.1 rev-parse :/^Initial.revision
>      Time (mean ± σ):      1.071 s ±  0.004 s    [User: 1.052 s, System: 0.017 s]
>      Range (min … max):    1.067 s …  1.078 s    10 runs
> 
>    Benchmark 2: ./git rev-parse :/^Initial.revision
>      Time (mean ± σ):      1.079 s ±  0.003 s    [User: 1.060 s, System: 0.017 s]
>      Range (min … max):    1.074 s …  1.083 s    10 runs
> 
>    Summary
>      ./git_2.50.1 rev-parse :/^Initial.revision ran
>        1.01 ± 0.00 times faster than ./git rev-parse :/^Initial.revision
> 
> Signed-off-by: René Scharfe <l.s.r@web.de>
> ---
>  commit.c                          |  7 ++--
>  commit.h                          |  8 ++---
>  fetch-pack.c                      | 13 +++++---
>  object-name.c                     | 10 +++---
>  t/perf/p1501-rev-parse-oneline.sh | 55 +++++++++++++++++++++++++++++++
>  walker.c                          | 11 ++++---
>  6 files changed, 83 insertions(+), 21 deletions(-)
>  create mode 100755 t/perf/p1501-rev-parse-oneline.sh
> 
> diff --git a/commit.c b/commit.c
> index e915b2b9a1..0200759aaa 100644
> --- a/commit.c
> +++ b/commit.c
> @@ -31,6 +31,7 @@
>  #include "parse.h"
>  #include "object-file.h"
>  #include "object-file-convert.h"
> +#include "prio-queue.h"
>  
>  static struct commit_extra_header *read_commit_extra_header_lines(const char *buf, size_t len, const char **);
>  
> @@ -738,17 +739,17 @@ void commit_list_sort_by_date(struct commit_list **list)
>  	commit_list_sort(list, commit_list_compare_by_date);
>  }
>  
> -struct commit *pop_most_recent_commit(struct commit_list **list,
> +struct commit *pop_most_recent_commit(struct prio_queue *queue,
>  				      unsigned int mark)
>  {
> -	struct commit *ret = pop_commit(list);
> +	struct commit *ret = prio_queue_get(queue);
>  	struct commit_list *parents = ret->parents;
>  
>  	while (parents) {
>  		struct commit *commit = parents->item;
>  		if (!repo_parse_commit(the_repository, commit) && !(commit->object.flags & mark)) {
>  			commit->object.flags |= mark;
> -			commit_list_insert_by_date(commit, list);
> +			prio_queue_put(queue, commit);
>  		}
>  		parents = parents->next;
>  	}
> diff --git a/commit.h b/commit.h
> index 70c870dae4..9630c076d6 100644
> --- a/commit.h
> +++ b/commit.h
> @@ -201,10 +201,10 @@ const char *repo_logmsg_reencode(struct repository *r,
>  
>  const char *skip_blank_lines(const char *msg);
>  
> -/** Removes the first commit from a list sorted by date, and adds all
> - * of its parents.
> - **/
> -struct commit *pop_most_recent_commit(struct commit_list **list,
> +struct prio_queue;
> +
> +/* Removes the first commit from a prio_queue and adds its parents. */
> +struct commit *pop_most_recent_commit(struct prio_queue *queue,
>  				      unsigned int mark);

Previously, `pop_most_recent_commit()` would ensure commits inserted in
the list were done it date order. Now this depends on how the caller has
configured the `struct prio_queue`. This is fine though as previously
the caller was required to ensure the list was sorted to begin with
otherwise it wouldn't work properly.

-Justin

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-15 14:35 [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue René Scharfe
                   ` (2 preceding siblings ...)
  2025-07-15 14:51 ` [PATCH 3/3] commit: use prio_queue_replace() in pop_most_recent_commit() René Scharfe
@ 2025-07-16  0:07 ` Junio C Hamano
  2025-07-16  5:15   ` Jeff King
  2025-07-18  9:09 ` [PATCH v2 " René Scharfe
  4 siblings, 1 reply; 41+ messages in thread
From: Junio C Hamano @ 2025-07-16  0:07 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Jeff King

René Scharfe <l.s.r@web.de> writes:

> Use prio_queue to improve worst-case performance at the cost of slightly
> worse best-case performance.  Then add and use prio_queue_replace() to
> recover that loss.

Would change in the tiebreaking behaviour (aka sort stability) also
a cost of this change, as this swaps use of sorted linearly linked
list with priority queue?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-15 14:51 ` [PATCH 1/3] " René Scharfe
  2025-07-15 19:23   ` Junio C Hamano
  2025-07-15 20:47   ` Justin Tobler
@ 2025-07-16  5:05   ` Jeff King
  2025-07-16  9:39     ` René Scharfe
  2025-07-16 22:23   ` Junio C Hamano
  3 siblings, 1 reply; 41+ messages in thread
From: Jeff King @ 2025-07-16  5:05 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List

On Tue, Jul 15, 2025 at 04:51:07PM +0200, René Scharfe wrote:

> pop_most_recent_commit() calls commit_list_insert_by_date(), which and
> is itself called in a loop, which can lead to quadratic complexity.
> Replace the commit_list with a prio_queue to ensure logarithmic worst
> case complexity and convert all three users.

I guess I'm cc'd because of my frequent complains about the quadratic
nature of our commit lists? :)

Mostly I asked because I had to look at pop_most_recent_commit() to see
what operation would be made faster here. Looks like it's mostly ":/",
but maybe also fetch's mark_recent_complete_commits()? I guess we might
hit that if you have a huge number of refs?

Anyway, I am in support of the direction regardless.

I actually have a series turning rev_info.commits into a prio_queue
which I need to polish up (mostly just writing commit messages; I've
been running with it for almost 2 years without trouble). Ironically it
does not touch this spot, as these commit lists are formed on their own.

The patch itself looks reasonable. I think here:

> @@ -1461,7 +1462,7 @@ static int get_oid_oneline(struct repository *r,
>  			   const char *prefix, struct object_id *oid,
>  			   const struct commit_list *list)
>  {
> -	struct commit_list *copy = NULL, **copy_tail = &copy;
> +	struct prio_queue copy = { compare_commits_by_commit_date };
>  	const struct commit_list *l;
>  	int found = 0;
>  	int negative = 0;
> @@ -1483,9 +1484,9 @@ static int get_oid_oneline(struct repository *r,
>  
>  	for (l = list; l; l = l->next) {
>  		l->item->object.flags |= ONELINE_SEEN;
> -		copy_tail = &commit_list_insert(l->item, copy_tail)->next;
> +		prio_queue_put(&copy, l->item);
>  	}
> -	while (copy) {
> +	while (copy.nr) {
>  		const char *p, *buf;
>  		struct commit *commit;
>  		int matches;

our callers are always generating and passing in a list. So we could
avoid the overhead of allocating the list in the first place by just
taking a prio_queue. But maybe it gets weird with clearing the
ONELINE_SEEN marks? We make a copy even in the current code so that we
can call clear_commit_marks() on the complete set.

I guess we could add them to an array or something, but that probably
ends up being roughly the same amount of work.

> +build_history () {
> +	local max_level="$1" &&
> +	local level="${2:-1}" &&
> +	local mark="${3:-1}" &&
> +	if test $level -eq $max_level
> +	then
> +		echo "reset refs/heads/master" &&
> +		echo "from $ZERO_OID" &&
> +		echo "commit refs/heads/master" &&
> +		echo "mark :$mark" &&
> +		echo "committer C <c@example.com> 1234567890 +0000" &&
> +		echo "data <<EOF" &&
> +		echo "$mark" &&
> +		echo "EOF"
> +	else
> +		local level1=$((level+1)) &&
> +		local mark1=$((2*mark)) &&
> +		local mark2=$((2*mark+1)) &&
> +		build_history $max_level $level1 $mark1 &&
> +		build_history $max_level $level1 $mark2 &&
> +		echo "commit refs/heads/master" &&
> +		echo "mark :$mark" &&
> +		echo "committer C <c@example.com> 1234567890 +0000" &&
> +		echo "data <<EOF" &&
> +		echo "$mark" &&
> +		echo "EOF" &&
> +		echo "from :$mark1" &&
> +		echo "merge :$mark2"
> +	fi
> +}

This took some brain cycles to decipher. It looks like we'll make
2^$level commits in a filled tree? It might be worth a brief comment
describing the goal (and maybe even giving an example graph drawing for
N=3 or something, though it gets out of hand quickly).

> +test_perf "rev-parse ':/$(cat needle)'" "
> +	git rev-parse ':/$(cat needle)' >actual
> +"

Hmm, usually we frown on putting snippets inside double-quotes because
it's so easy to accidentally interpolate outside of the test_eval. But
maybe this is short enough to be OK. I guess you did it here especially
so that the title is a nice ":/65535" and not the opaque "$(cat
needle)".

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] prio-queue: add prio_queue_replace()
  2025-07-15 14:51 ` [PATCH 2/3] prio-queue: add prio_queue_replace() René Scharfe
@ 2025-07-16  5:09   ` Jeff King
  2025-07-16  9:38     ` René Scharfe
  0 siblings, 1 reply; 41+ messages in thread
From: Jeff King @ 2025-07-16  5:09 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List

On Tue, Jul 15, 2025 at 04:51:22PM +0200, René Scharfe wrote:

> Add a function to replace the top element of the queue that basically
> does the same as prio_queue_get() followed by prio_queue_put(), but
> without the work by prio_queue_get() to rebalance the heap.  It can be
> used to optimize loops that get one element and then immediately add
> another one.  That's common e.g., with commit history traversal, where
> we get out a commit and then put in its parents.

Hmm. But surely we still need to rebalance the heap after adding an
element? And indeed, we still call the new sift_down_root() function.

But I guess we are getting away without the "bubble up" operation that
put would do? So we are doing half as much work (but still big-O the
same)?

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-16  0:07 ` [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue Junio C Hamano
@ 2025-07-16  5:15   ` Jeff King
  2025-07-16  9:38     ` René Scharfe
  2025-07-16 14:49     ` Junio C Hamano
  0 siblings, 2 replies; 41+ messages in thread
From: Jeff King @ 2025-07-16  5:15 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: René Scharfe, Git List

On Tue, Jul 15, 2025 at 05:07:38PM -0700, Junio C Hamano wrote:

> René Scharfe <l.s.r@web.de> writes:
> 
> > Use prio_queue to improve worst-case performance at the cost of slightly
> > worse best-case performance.  Then add and use prio_queue_replace() to
> > recover that loss.
> 
> Would change in the tiebreaking behaviour (aka sort stability) also
> a cost of this change, as this swaps use of sorted linearly linked
> list with priority queue?

The prio_queue uses insertion order as a tie-breaker for stability (with
earlier entries coming first). For building the initial queue from the
list, I think that is obviously fine (we feed them in sorted order,
which the prio queue will retain). For inserting while we walk the list,
we'll produce the same results as long as the original code always
inserted new entries after existing ones (in the case of a tie on commit
date, that is).

And I think that is the case, since commit_list_insert_by_date() does
this:

          while ((p = *pp) != NULL) {
                  if (p->item->date < item->date) {
                          break;
                  }
                  pp = &p->next;
          }
          return commit_list_insert(item, pp);

So we only insert once we have found an item in the list _after_ us,
retaining the same order.

But hopefully somebody can double check my logic, as it is quite
possible I got something reversed above. ;)

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-16  5:15   ` Jeff King
@ 2025-07-16  9:38     ` René Scharfe
  2025-07-19  6:45       ` Jeff King
  2025-07-16 14:49     ` Junio C Hamano
  1 sibling, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-07-16  9:38 UTC (permalink / raw)
  To: Jeff King, Junio C Hamano; +Cc: Git List

On 7/16/25 7:15 AM, Jeff King wrote:
> On Tue, Jul 15, 2025 at 05:07:38PM -0700, Junio C Hamano wrote:
> 
>> René Scharfe <l.s.r@web.de> writes:
>>
>>> Use prio_queue to improve worst-case performance at the cost of slightly
>>> worse best-case performance.  Then add and use prio_queue_replace() to
>>> recover that loss.
>>
>> Would change in the tiebreaking behaviour (aka sort stability) also
>> a cost of this change, as this swaps use of sorted linearly linked
>> list with priority queue?
> 
> The prio_queue uses insertion order as a tie-breaker for stability (with
> earlier entries coming first). For building the initial queue from the
> list, I think that is obviously fine (we feed them in sorted order,
> which the prio queue will retain). For inserting while we walk the list,
> we'll produce the same results as long as the original code always
> inserted new entries after existing ones (in the case of a tie on commit
> date, that is).
> 
> And I think that is the case, since commit_list_insert_by_date() does
> this:
> 
>           while ((p = *pp) != NULL) {
>                   if (p->item->date < item->date) {
>                           break;
>                   }
>                   pp = &p->next;
>           }
>           return commit_list_insert(item, pp);
> 
> So we only insert once we have found an item in the list _after_ us,
> retaining the same order.
> 
> But hopefully somebody can double check my logic, as it is quite
> possible I got something reversed above. ;)
Yes, commit_list_insert_by_date() is stable, as it inserts commits after
ones with the same date.  Items are popped from the top, so this ensures
FIFO behavior for commits with the same date.

prio_queue ensures stability using an ID and favors lower ones, so it
provides the same order.

We should add unit tests for that, no?

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] prio-queue: add prio_queue_replace()
  2025-07-16  5:09   ` Jeff King
@ 2025-07-16  9:38     ` René Scharfe
  2025-07-17  9:20       ` René Scharfe
  0 siblings, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-07-16  9:38 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List

On 7/16/25 7:09 AM, Jeff King wrote:
> On Tue, Jul 15, 2025 at 04:51:22PM +0200, René Scharfe wrote:
> 
>> Add a function to replace the top element of the queue that basically
>> does the same as prio_queue_get() followed by prio_queue_put(), but
>> without the work by prio_queue_get() to rebalance the heap.  It can be
>> used to optimize loops that get one element and then immediately add
>> another one.  That's common e.g., with commit history traversal, where
>> we get out a commit and then put in its parents.
> 
> Hmm. But surely we still need to rebalance the heap after adding an
> element? And indeed, we still call the new sift_down_root() function.

Yes.

> But I guess we are getting away without the "bubble up" operation that
> put would do? So we are doing half as much work (but still big-O the
> same)?

Yes.

I thought about building this optimization into prio_queue_get(), but
that would require prio_queue_peek() and prio_queue_put() to be adjusted
as well and all prio_queue users would have to be either made aware of
the not-fully-heapified state, or prevented from accessing prio_queue
properties like .nr and .array.  Adding a new function seemed like the
simplest and safest way for now.

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 3/3] commit: use prio_queue_replace() in pop_most_recent_commit()
  2025-07-15 20:43   ` Junio C Hamano
@ 2025-07-16  9:38     ` René Scharfe
  0 siblings, 0 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-16  9:38 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git List, Jeff King

On 7/15/25 10:43 PM, Junio C Hamano wrote:
> René Scharfe <l.s.r@web.de> writes:
> 
>> diff --git a/commit.c b/commit.c
>> index 0200759aaa..8244221b30 100644
>> --- a/commit.c
>> +++ b/commit.c
>> @@ -742,17 +742,24 @@ void commit_list_sort_by_date(struct commit_list **list)
>>  struct commit *pop_most_recent_commit(struct prio_queue *queue,
>>  				      unsigned int mark)
>>  {
>> -	struct commit *ret = prio_queue_get(queue);
>> +	struct commit *ret = prio_queue_peek(queue);
>> +	int delete_pending = 1;
> 
> Briefly I was puzzled by the name (I would have called first-parent
> since the logic was "we treat first parent specially by using
> replace instead of get/put"), but the variable signals "instead of
> get to remove the item from the queue, we just peeked, so we need to
> remove it later" with its name, which is understandable.

Indeed, we're just interested in the removal part of prio_queue_get()
here, as we have done the cheap half (the peeking) already.  We don't
have a prio_queue_delete().  Adding one would perhaps add some clarity
here, but also widen the interface and probably not bring much of a
performance gain.

So perhaps calling the variable get_pending like the prio_queue_get()
that we end up invoking would reduce the initial puzzlement?

> 
>>  	struct commit_list *parents = ret->parents;
>>  
>>  	while (parents) {
>>  		struct commit *commit = parents->item;
>>  		if (!repo_parse_commit(the_repository, commit) && !(commit->object.flags & mark)) {
>>  			commit->object.flags |= mark;
>> -			prio_queue_put(queue, commit);
>> +			if (delete_pending)
>> +				prio_queue_replace(queue, commit);
>> +			else
>> +				prio_queue_put(queue, commit);
>> +			delete_pending = 0;
>>  		}
>>  		parents = parents->next;
>>  	}
>> +	if (delete_pending)
>> +		prio_queue_get(queue);
>>  	return ret;
>>  }


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-15 20:47   ` Justin Tobler
@ 2025-07-16  9:39     ` René Scharfe
  0 siblings, 0 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-16  9:39 UTC (permalink / raw)
  To: Justin Tobler; +Cc: Git List, Jeff King

On 7/15/25 10:47 PM, Justin Tobler wrote:
> On 25/07/15 04:51PM, René Scharfe wrote:
>> pop_most_recent_commit() calls commit_list_insert_by_date(), which and
> 
> Did you mean?
> 
> s/which and/which/

Oh, yes.

>> is itself called in a loop, which can lead to quadratic complexity.
>> Replace the commit_list with a prio_queue to ensure logarithmic worst
>> case complexity and convert all three users.
> 
> If I understand correctly, `pop_most_recent_commit()` removes the most
> recent commit from a list of commits sorted by date and then inserts
> each of the removed commit's parents into the list while maintaining
> date order. Iterating through `struct commit_list` every time to find
> where to insert each parent parent leads to quadratic complexity. For
> repositories with many merge commits, this could scale poorly.

Right.

>> Add a performance test that exercises one of them using a pathological
>> history that consists of 50% merges and 50% root commits to demonstrate
>> the speedup:
>>
>>    Test                          v2.50.1           HEAD
>>    ----------------------------------------------------------------------
>>    1501.2: rev-parse ':/65535'   2.48(2.47+0.00)   0.20(0.19+0.00) -91.9%
>>
>> Alas, sane histories don't benefit from the conversion much, and
>> traversing Git's own history takes a 1% performance hit on my machine:
> 
> As "normal" repositories don't benefit here, it might be nice to more
> explicitly mention the the types of repositories that do benefit.

Good idea.

>> diff --git a/commit.h b/commit.h
>> index 70c870dae4..9630c076d6 100644
>> --- a/commit.h
>> +++ b/commit.h
>> @@ -201,10 +201,10 @@ const char *repo_logmsg_reencode(struct repository *r,
>>  
>>  const char *skip_blank_lines(const char *msg);
>>  
>> -/** Removes the first commit from a list sorted by date, and adds all
>> - * of its parents.
>> - **/
>> -struct commit *pop_most_recent_commit(struct commit_list **list,
>> +struct prio_queue;
>> +
>> +/* Removes the first commit from a prio_queue and adds its parents. */
>> +struct commit *pop_most_recent_commit(struct prio_queue *queue,
>>  				      unsigned int mark);
> 
> Previously, `pop_most_recent_commit()` would ensure commits inserted in
> the list were done it date order. Now this depends on how the caller has
> configured the `struct prio_queue`. This is fine though as previously
> the caller was required to ensure the list was sorted to begin with
> otherwise it wouldn't work properly.

Indeed.

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-16  5:05   ` Jeff King
@ 2025-07-16  9:39     ` René Scharfe
  2025-07-17  8:22       ` René Scharfe
  2025-07-19  6:55       ` Jeff King
  0 siblings, 2 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-16  9:39 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List

On 7/16/25 7:05 AM, Jeff King wrote:
> On Tue, Jul 15, 2025 at 04:51:07PM +0200, René Scharfe wrote:
> 
>> pop_most_recent_commit() calls commit_list_insert_by_date(), which and
>> is itself called in a loop, which can lead to quadratic complexity.
>> Replace the commit_list with a prio_queue to ensure logarithmic worst
>> case complexity and convert all three users.
> 
> I guess I'm cc'd because of my frequent complains about the quadratic
> nature of our commit lists? :)

And because you introduced prio_queue.

> Mostly I asked because I had to look at pop_most_recent_commit() to see
> what operation would be made faster here. Looks like it's mostly ":/",
> but maybe also fetch's mark_recent_complete_commits()? I guess we might
> hit that if you have a huge number of refs?

The :/ handling was the easiest to test for me.  fetch_pack() and
walker_fetch() require some server side to set up, which seems not worth
it just to demonstrate quadratic behavior.  Having thousands of refs
would make the list long enough to notice, as would having lots of
merges.

My general idea is to get rid of commit_list_insert_by_date() eventually
to avoid quadratic complexity.

> I actually have a series turning rev_info.commits into a prio_queue
> which I need to polish up (mostly just writing commit messages; I've
> been running with it for almost 2 years without trouble). Ironically it
> does not touch this spot, as these commit lists are formed on their own.

That is not a coincidence.  I had a look at that series and tried to
reach its goals while keeping rev_info.commits a commit_list.  Why?
Mostly being vaguely uncomfortable with prio_queue' memory overhead,
lack of type safety and dual use as a stack.  I still used it, but only
as local variable, not in the central struct rev_info.

Anyway, I failed; revision.c::get_revision_1() took an 8% performance
hit in my versions and none in yours, and I couldn't figure out why.
Perhaps I should revisit it with the new prio_queue_replace() in hand,
hmm..  For now I try to salvage the commit_list_insert_by_date()
replacement work outside of revision.c from that effort. 

> The patch itself looks reasonable. I think here:
> 
>> @@ -1461,7 +1462,7 @@ static int get_oid_oneline(struct repository *r,
>>  			   const char *prefix, struct object_id *oid,
>>  			   const struct commit_list *list)
>>  {
>> -	struct commit_list *copy = NULL, **copy_tail = &copy;
>> +	struct prio_queue copy = { compare_commits_by_commit_date };
>>  	const struct commit_list *l;
>>  	int found = 0;
>>  	int negative = 0;
>> @@ -1483,9 +1484,9 @@ static int get_oid_oneline(struct repository *r,
>>  
>>  	for (l = list; l; l = l->next) {
>>  		l->item->object.flags |= ONELINE_SEEN;
>> -		copy_tail = &commit_list_insert(l->item, copy_tail)->next;
>> +		prio_queue_put(&copy, l->item);
>>  	}
>> -	while (copy) {
>> +	while (copy.nr) {
>>  		const char *p, *buf;
>>  		struct commit *commit;
>>  		int matches;
> 
> our callers are always generating and passing in a list. So we could
> avoid the overhead of allocating the list in the first place by just
> taking a prio_queue. But maybe it gets weird with clearing the
> ONELINE_SEEN marks? We make a copy even in the current code so that we
> can call clear_commit_marks() on the complete set.
> 
> I guess we could add them to an array or something, but that probably
> ends up being roughly the same amount of work.
Right, if get_oid_oneline() gets handed a prio_queue, it would need to
copy all commits to a list or array for marks clearing at the end.  I
don't see a way around having both.  And I doubt there would be much of
a difference between list and array here, but could be convinced by
numbers. ;)

>> +build_history () {
>> +	local max_level="$1" &&
>> +	local level="${2:-1}" &&
>> +	local mark="${3:-1}" &&
>> +	if test $level -eq $max_level
>> +	then
>> +		echo "reset refs/heads/master" &&
>> +		echo "from $ZERO_OID" &&
>> +		echo "commit refs/heads/master" &&
>> +		echo "mark :$mark" &&
>> +		echo "committer C <c@example.com> 1234567890 +0000" &&
>> +		echo "data <<EOF" &&
>> +		echo "$mark" &&
>> +		echo "EOF"
>> +	else
>> +		local level1=$((level+1)) &&
>> +		local mark1=$((2*mark)) &&
>> +		local mark2=$((2*mark+1)) &&
>> +		build_history $max_level $level1 $mark1 &&
>> +		build_history $max_level $level1 $mark2 &&
>> +		echo "commit refs/heads/master" &&
>> +		echo "mark :$mark" &&
>> +		echo "committer C <c@example.com> 1234567890 +0000" &&
>> +		echo "data <<EOF" &&
>> +		echo "$mark" &&
>> +		echo "EOF" &&
>> +		echo "from :$mark1" &&
>> +		echo "merge :$mark2"
>> +	fi
>> +}
> 
> This took some brain cycles to decipher. It looks like we'll make
> 2^$level commits in a filled tree? It might be worth a brief comment
> describing the goal (and maybe even giving an example graph drawing for
> N=3 or something, though it gets out of hand quickly).

The goal is to have lots of merges, to make the list that
pop_most_recent_commit() works on long (each pop adds back two parents).

The script builds a binary tree of merges, with root commits (without
parents) as leaf nodes.  (The nomenclature is a bit weird because we
call the child nodes parent commits.)  So it creates 2^($level-1) root
commits and 2^($level-1)-1 two-way merges stacked on top.

    _1_
   /   \
  2     3
 / \   / \
4   5 6   7

The numbers are the marks; 1 to 3 are merges (have two parents), 4 to 7
are root commits.

>> +test_perf "rev-parse ':/$(cat needle)'" "
>> +	git rev-parse ':/$(cat needle)' >actual
>> +"
> 
> Hmm, usually we frown on putting snippets inside double-quotes because
> it's so easy to accidentally interpolate outside of the test_eval. But
> maybe this is short enough to be OK. I guess you did it here especially
> so that the title is a nice ":/65535" and not the opaque "$(cat
> needle)".
The first one allowed me to check whether the setup step created the
expected number of commits and seems kind of interesting for people
running the test.  The second one could be switched to be evaluated at
test_eval time, but that would make it harder to see that title and
test do the same.  We could strip out the quotes:

   test_perf "rev-parse :/$(cat needle)" '
           git rev-parse :/$(cat needle) >actual
   '

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-16  5:15   ` Jeff King
  2025-07-16  9:38     ` René Scharfe
@ 2025-07-16 14:49     ` Junio C Hamano
  1 sibling, 0 replies; 41+ messages in thread
From: Junio C Hamano @ 2025-07-16 14:49 UTC (permalink / raw)
  To: Jeff King; +Cc: René Scharfe, Git List

Jeff King <peff@peff.net> writes:

> On Tue, Jul 15, 2025 at 05:07:38PM -0700, Junio C Hamano wrote:
>
>> René Scharfe <l.s.r@web.de> writes:
>> 
>> > Use prio_queue to improve worst-case performance at the cost of slightly
>> > worse best-case performance.  Then add and use prio_queue_replace() to
>> > recover that loss.
>> 
>> Would change in the tiebreaking behaviour (aka sort stability) also
>> a cost of this change, as this swaps use of sorted linearly linked
>> list with priority queue?
>
> The prio_queue uses insertion order as a tie-breaker for stability (with
> earlier entries coming first). For building the initial queue from the
> list, I think that is obviously fine (we feed them in sorted order,
> which the prio queue will retain).

OK, then everything looks great.  Thanks.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-15 14:51 ` [PATCH 1/3] " René Scharfe
                     ` (2 preceding siblings ...)
  2025-07-16  5:05   ` Jeff King
@ 2025-07-16 22:23   ` Junio C Hamano
  2025-07-17  8:22     ` René Scharfe
  3 siblings, 1 reply; 41+ messages in thread
From: Junio C Hamano @ 2025-07-16 22:23 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Jeff King

René Scharfe <l.s.r@web.de> writes:

>  t/perf/p1501-rev-parse-oneline.sh | 55 +++++++++++++++++++++++++++++++
>  create mode 100755 t/perf/p1501-rev-parse-oneline.sh

This unfortunately calls for something like.

Tonight's integration cycle I have this on top of your series
in 'seen'.

Subject: [PATCH] fixup! commit: convert pop_most_recent_commit() to prio_queue

 t/meson.build | 1 +
 1 file changed, 1 insertion(+)

diff --git a/t/meson.build b/t/meson.build
index d052fc3e23..b39f6d008d 100644
--- a/t/meson.build
+++ b/t/meson.build
@@ -1117,6 +1117,7 @@ benchmarks = [
   'perf/p1450-fsck.sh',
   'perf/p1451-fsck-skip-list.sh',
   'perf/p1500-graph-walks.sh',
+  'perf/p1501-rev-parse-oneline.sh',
   'perf/p2000-sparse-operations.sh',
   'perf/p3400-rebase.sh',
   'perf/p3404-rebase-interactive.sh',
-- 
2.50.1-447-g1e759a1f67


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-16 22:23   ` Junio C Hamano
@ 2025-07-17  8:22     ` René Scharfe
  0 siblings, 0 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-17  8:22 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git List, Jeff King

On 7/17/25 12:23 AM, Junio C Hamano wrote:
> René Scharfe <l.s.r@web.de> writes:
> 
>>  t/perf/p1501-rev-parse-oneline.sh | 55 +++++++++++++++++++++++++++++++
>>  create mode 100755 t/perf/p1501-rev-parse-oneline.sh
> 
> This unfortunately calls for something like.
> 
> Tonight's integration cycle I have this on top of your series
> in 'seen'.
> 
> Subject: [PATCH] fixup! commit: convert pop_most_recent_commit() to prio_queue
> 
>  t/meson.build | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/t/meson.build b/t/meson.build
> index d052fc3e23..b39f6d008d 100644
> --- a/t/meson.build
> +++ b/t/meson.build
> @@ -1117,6 +1117,7 @@ benchmarks = [
>    'perf/p1450-fsck.sh',
>    'perf/p1451-fsck-skip-list.sh',
>    'perf/p1500-graph-walks.sh',
> +  'perf/p1501-rev-parse-oneline.sh',
>    'perf/p2000-sparse-operations.sh',
>    'perf/p3400-rebase.sh',
>    'perf/p3404-rebase-interactive.sh',

Oh, OK, thanks.  Will include it in v2.

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-16  9:39     ` René Scharfe
@ 2025-07-17  8:22       ` René Scharfe
  2025-07-19  6:55       ` Jeff King
  1 sibling, 0 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-17  8:22 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List

On 7/16/25 11:39 AM, René Scharfe wrote:
> On 7/16/25 7:05 AM, Jeff King wrote:
> 
>> I actually have a series turning rev_info.commits into a prio_queue
>> which I need to polish up (mostly just writing commit messages; I've
>> been running with it for almost 2 years without trouble). Ironically it
>> does not touch this spot, as these commit lists are formed on their own.
> 
> That is not a coincidence.  I had a look at that series and tried to
> reach its goals while keeping rev_info.commits a commit_list.  Why?
> Mostly being vaguely uncomfortable with prio_queue' memory overhead,
> lack of type safety and dual use as a stack.  I still used it, but only
> as local variable, not in the central struct rev_info.
> 
> Anyway, I failed; revision.c::get_revision_1() took an 8% performance
> hit in my versions and none in yours, and I couldn't figure out why.
> Perhaps I should revisit it with the new prio_queue_replace() in hand,
> hmm..
Checked now, and it's still slower.  So I don't see an alternative to
making rev_info.commits a prio_queue.

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] prio-queue: add prio_queue_replace()
  2025-07-16  9:38     ` René Scharfe
@ 2025-07-17  9:20       ` René Scharfe
  2025-07-19  7:02         ` Jeff King
  0 siblings, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-07-17  9:20 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List

On 7/16/25 11:38 AM, René Scharfe wrote:
> On 7/16/25 7:09 AM, Jeff King wrote:
>> On Tue, Jul 15, 2025 at 04:51:22PM +0200, René Scharfe wrote:
>>
>>> Add a function to replace the top element of the queue that basically
>>> does the same as prio_queue_get() followed by prio_queue_put(), but
>>> without the work by prio_queue_get() to rebalance the heap.  It can be
>>> used to optimize loops that get one element and then immediately add
>>> another one.  That's common e.g., with commit history traversal, where
>>> we get out a commit and then put in its parents.
>>
>> Hmm. But surely we still need to rebalance the heap after adding an
>> element? And indeed, we still call the new sift_down_root() function.
> 
> Yes.
> 
>> But I guess we are getting away without the "bubble up" operation that
>> put would do? So we are doing half as much work (but still big-O the
>> same)?
> 
> Yes.
> 
> I thought about building this optimization into prio_queue_get(), but
> that would require prio_queue_peek() and prio_queue_put() to be adjusted
> as well and all prio_queue users would have to be either made aware of
> the not-fully-heapified state, or prevented from accessing prio_queue
> properties like .nr and .array.

Here's what that would like like.  .nr and .array elements are kept
consistent at all times, but the root item is not in heap order after a
prio_queue_get().  That's good enough to enumerate all prio_queue items
like commit-reach.c::queue_has_nonstale() or
negotiator/skipping.c::push_parent() do.

Not sure what other weird patterns people might come up with that
require touching the innards of a prio_queue directly.  I don't even
understand why negotiator/skipping.c::push_parent() does a linear
search for each parent -- isn't that expensive?  Setting an object
flag on prio_queue_put() and clearing it on prio_queue_get() or a
using a commit_slab seem to be better options from a very cursory
glance.

Anyway, here's what doing prio_queue_replace() automatically could look
like.  I almost talked myself into using it now.  Any objections, ideas
on how to make it safer or clearer, other thoughts?

René

---
 prio-queue.c | 52 +++++++++++++++++++++++++++++++++++++++-------------
 prio-queue.h |  1 +
 2 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/prio-queue.c b/prio-queue.c
index ec33ac27db..265663e116 100644
--- a/prio-queue.c
+++ b/prio-queue.c
@@ -34,12 +34,46 @@ void clear_prio_queue(struct prio_queue *queue)
 	queue->nr = 0;
 	queue->alloc = 0;
 	queue->insertion_ctr = 0;
+	queue->sift_down_root_pending = false;
+}
+
+static void sift_down_root(struct prio_queue *queue)
+{
+	size_t ix, child;
+
+	/* Push down the one at the root */
+	for (ix = 0; ix * 2 + 1 < queue->nr; ix = child) {
+		child = ix * 2 + 1; /* left */
+		if (child + 1 < queue->nr &&
+		    compare(queue, child, child + 1) >= 0)
+			child++; /* use right child */
+
+		if (compare(queue, ix, child) <= 0)
+			break;
+
+		swap(queue, child, ix);
+	}
+	queue->sift_down_root_pending = false;
 }
 
 void prio_queue_put(struct prio_queue *queue, void *thing)
 {
 	size_t ix, parent;
 
+	if (queue->sift_down_root_pending) {
+		/*
+		 * Restore the original heap size.  The last item is
+		 * still in the right place.
+		 */
+		queue->nr++;
+
+		/* Now fill the hole at the root with the new item. */
+		queue->array[0].ctr = queue->insertion_ctr++;
+		queue->array[0].data = thing;
+		sift_down_root(queue);
+		return;
+	}
+
 	/* Append at the end */
 	ALLOC_GROW(queue->array, queue->nr + 1, queue->alloc);
 	queue->array[queue->nr].ctr = queue->insertion_ctr++;
@@ -61,31 +95,21 @@ void prio_queue_put(struct prio_queue *queue, void *thing)
 void *prio_queue_get(struct prio_queue *queue)
 {
 	void *result;
-	size_t ix, child;
 
 	if (!queue->nr)
 		return NULL;
 	if (!queue->compare)
 		return queue->array[--queue->nr].data; /* LIFO */
 
+	if (queue->sift_down_root_pending)
+		sift_down_root(queue);
 	result = queue->array[0].data;
 	if (!--queue->nr)
 		return result;
 
 	queue->array[0] = queue->array[queue->nr];
 
-	/* Push down the one at the root */
-	for (ix = 0; ix * 2 + 1 < queue->nr; ix = child) {
-		child = ix * 2 + 1; /* left */
-		if (child + 1 < queue->nr &&
-		    compare(queue, child, child + 1) >= 0)
-			child++; /* use right child */
-
-		if (compare(queue, ix, child) <= 0)
-			break;
-
-		swap(queue, child, ix);
-	}
+	queue->sift_down_root_pending = true;
 	return result;
 }
 
@@ -95,5 +119,7 @@ void *prio_queue_peek(struct prio_queue *queue)
 		return NULL;
 	if (!queue->compare)
 		return queue->array[queue->nr - 1].data;
+	if (queue->sift_down_root_pending)
+		sift_down_root(queue);
 	return queue->array[0].data;
 }
diff --git a/prio-queue.h b/prio-queue.h
index 38d032636d..6d8d268f41 100644
--- a/prio-queue.h
+++ b/prio-queue.h
@@ -32,6 +32,7 @@ struct prio_queue {
 	void *cb_data;
 	size_t alloc, nr;
 	struct prio_queue_entry *array;
+	bool sift_down_root_pending;
 };
 
 /*
-- 
2.50.1


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH v2 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-15 14:35 [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue René Scharfe
                   ` (3 preceding siblings ...)
  2025-07-16  0:07 ` [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue Junio C Hamano
@ 2025-07-18  9:09 ` René Scharfe
  2025-07-18  9:39   ` [PATCH v2 1/3] " René Scharfe
                     ` (4 more replies)
  4 siblings, 5 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-18  9:09 UTC (permalink / raw)
  To: Git List; +Cc: Jeff King, Junio C Hamano, Justin Tobler

Use prio_queue to improve worst-case performance at the cost of slightly
worse best-case performance.  Then add and use prio_queue_replace() to
recover that loss.

Changes since v2:
- Mention that a prio_queue improves performance for merge-heavy
  histories in the commit message.
- Add the new perf script to Meson build file.
- Mention which kind of history we are aiming for and show its shape in
  a comment in the perf script.
- Remove unnecessary quotes and use single quotes for the perf test
  code.
- Rename the variable delete_pending to get_pending to align it with
  the concrete function prio_queue_get() instead of referring to the
  abstract concept of deletion, to improve readability.

  commit: convert pop_most_recent_commit() to prio_queue
  prio-queue: add prio_queue_replace()
  commit: use prio_queue_replace() in pop_most_recent_commit()

 commit.c                          | 14 ++++--
 commit.h                          |  8 ++--
 fetch-pack.c                      | 13 +++---
 object-name.c                     | 10 ++---
 prio-queue.c                      | 45 ++++++++++++++------
 prio-queue.h                      |  8 ++++
 t/meson.build                     |  1 +
 t/perf/p1501-rev-parse-oneline.sh | 71 +++++++++++++++++++++++++++++++
 t/unit-tests/u-prio-queue.c       | 23 ++++++++++
 walker.c                          | 11 +++--
 10 files changed, 170 insertions(+), 34 deletions(-)
 create mode 100755 t/perf/p1501-rev-parse-oneline.sh

Range-diff against v1:
1:  d9c0d13801 ! 1:  5bff885d0f commit: convert pop_most_recent_commit() to prio_queue
    @@ Metadata
      ## Commit message ##
         commit: convert pop_most_recent_commit() to prio_queue
     
    -    pop_most_recent_commit() calls commit_list_insert_by_date(), which and
    -    is itself called in a loop, which can lead to quadratic complexity.
    -    Replace the commit_list with a prio_queue to ensure logarithmic worst
    -    case complexity and convert all three users.
    +    pop_most_recent_commit() calls commit_list_insert_by_date() for parent
    +    commits, which is itself called in a loop.  This can lead to quadratic
    +    complexity if there are many merges.  Replace the commit_list with a
    +    prio_queue to ensure logarithmic worst case complexity and convert all
    +    three users.
     
         Add a performance test that exercises one of them using a pathological
         history that consists of 50% merges and 50% root commits to demonstrate
    @@ object-name.c: static enum get_oid_result get_oid_with_context_1(struct reposito
      
      			free_commit_list(list);
     
    + ## t/meson.build ##
    +@@ t/meson.build: benchmarks = [
    +   'perf/p1450-fsck.sh',
    +   'perf/p1451-fsck-skip-list.sh',
    +   'perf/p1500-graph-walks.sh',
    ++  'perf/p1501-rev-parse-oneline.sh',
    +   'perf/p2000-sparse-operations.sh',
    +   'perf/p3400-rebase.sh',
    +   'perf/p3404-rebase-interactive.sh',
    +
      ## t/perf/p1501-rev-parse-oneline.sh (new) ##
     @@
     +#!/bin/sh
    @@ t/perf/p1501-rev-parse-oneline.sh (new)
     +
     +test_perf_fresh_repo
     +
    ++#
    ++# Creates lots of merges to make history traversal costly.  In
    ++# particular it creates 2^($max_level-1)-1 2-way merges on top of
    ++# 2^($max_level-1) root commits.  E.g., the commit history looks like
    ++# this for a $max_level of 3:
    ++#
    ++#     _1_
    ++#    /   \
    ++#   2     3
    ++#  / \   / \
    ++# 4   5 6   7
    ++#
    ++# The numbers are the fast-import marks, which also are the commit
    ++# messages.  1 is the HEAD commit and a merge, 2 and 3 are also merges,
    ++# 4-7 are the root commits.
    ++#
     +build_history () {
     +	local max_level="$1" &&
     +	local level="${2:-1}" &&
    @@ t/perf/p1501-rev-parse-oneline.sh (new)
     +	sed -n -e "s/^.* //p" -e "q" <commits >needle
     +'
     +
    -+test_perf "rev-parse ':/$(cat needle)'" "
    -+	git rev-parse ':/$(cat needle)' >actual
    -+"
    ++test_perf "rev-parse :/$(cat needle)" '
    ++	git rev-parse :/$(cat needle) >actual
    ++'
     +
     +test_expect_success 'verify result' '
     +	test_cmp expect actual
2:  a4011d850c = 2:  a2e57ca443 prio-queue: add prio_queue_replace()
3:  dc535e8b64 ! 3:  1cbea38524 commit: use prio_queue_replace() in pop_most_recent_commit()
    @@ commit.c: void commit_list_sort_by_date(struct commit_list **list)
      {
     -	struct commit *ret = prio_queue_get(queue);
     +	struct commit *ret = prio_queue_peek(queue);
    -+	int delete_pending = 1;
    ++	int get_pending = 1;
      	struct commit_list *parents = ret->parents;
      
      	while (parents) {
    @@ commit.c: void commit_list_sort_by_date(struct commit_list **list)
      		if (!repo_parse_commit(the_repository, commit) && !(commit->object.flags & mark)) {
      			commit->object.flags |= mark;
     -			prio_queue_put(queue, commit);
    -+			if (delete_pending)
    ++			if (get_pending)
     +				prio_queue_replace(queue, commit);
     +			else
     +				prio_queue_put(queue, commit);
    -+			delete_pending = 0;
    ++			get_pending = 0;
      		}
      		parents = parents->next;
      	}
    -+	if (delete_pending)
    ++	if (get_pending)
     +		prio_queue_get(queue);
      	return ret;
      }
-- 
2.50.1

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-18  9:09 ` [PATCH v2 " René Scharfe
@ 2025-07-18  9:39   ` René Scharfe
  2025-07-21 14:02     ` Lidong Yan
  2025-07-18  9:39   ` [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0 René Scharfe
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-07-18  9:39 UTC (permalink / raw)
  To: Git List; +Cc: Jeff King, Junio C Hamano, Justin Tobler

pop_most_recent_commit() calls commit_list_insert_by_date() for parent
commits, which is itself called in a loop.  This can lead to quadratic
complexity if there are many merges.  Replace the commit_list with a
prio_queue to ensure logarithmic worst case complexity and convert all
three users.

Add a performance test that exercises one of them using a pathological
history that consists of 50% merges and 50% root commits to demonstrate
the speedup:

Test                          v2.50.1           HEAD
----------------------------------------------------------------------
1501.2: rev-parse ':/65535'   2.48(2.47+0.00)   0.20(0.19+0.00) -91.9%

Alas, sane histories don't benefit from the conversion much, and
traversing Git's own history takes a 1% performance hit on my machine:

   $ hyperfine -w3 -L git ./git_2.50.1,./git '{git} rev-parse :/^Initial.revision'
   Benchmark 1: ./git_2.50.1 rev-parse :/^Initial.revision
     Time (mean ± σ):      1.071 s ±  0.004 s    [User: 1.052 s, System: 0.017 s]
     Range (min … max):    1.067 s …  1.078 s    10 runs

   Benchmark 2: ./git rev-parse :/^Initial.revision
     Time (mean ± σ):      1.079 s ±  0.003 s    [User: 1.060 s, System: 0.017 s]
     Range (min … max):    1.074 s …  1.083 s    10 runs

   Summary
     ./git_2.50.1 rev-parse :/^Initial.revision ran
       1.01 ± 0.00 times faster than ./git rev-parse :/^Initial.revision

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 commit.c                          |  7 +--
 commit.h                          |  8 ++--
 fetch-pack.c                      | 13 +++---
 object-name.c                     | 10 ++---
 t/meson.build                     |  1 +
 t/perf/p1501-rev-parse-oneline.sh | 71 +++++++++++++++++++++++++++++++
 walker.c                          | 11 +++--
 7 files changed, 100 insertions(+), 21 deletions(-)
 create mode 100755 t/perf/p1501-rev-parse-oneline.sh

diff --git a/commit.c b/commit.c
index 15115125c3..f4712ad9a7 100644
--- a/commit.c
+++ b/commit.c
@@ -31,6 +31,7 @@
 #include "parse.h"
 #include "object-file.h"
 #include "object-file-convert.h"
+#include "prio-queue.h"
 
 static struct commit_extra_header *read_commit_extra_header_lines(const char *buf, size_t len, const char **);
 
@@ -739,17 +740,17 @@ void commit_list_sort_by_date(struct commit_list **list)
 	commit_list_sort(list, commit_list_compare_by_date);
 }
 
-struct commit *pop_most_recent_commit(struct commit_list **list,
+struct commit *pop_most_recent_commit(struct prio_queue *queue,
 				      unsigned int mark)
 {
-	struct commit *ret = pop_commit(list);
+	struct commit *ret = prio_queue_get(queue);
 	struct commit_list *parents = ret->parents;
 
 	while (parents) {
 		struct commit *commit = parents->item;
 		if (!repo_parse_commit(the_repository, commit) && !(commit->object.flags & mark)) {
 			commit->object.flags |= mark;
-			commit_list_insert_by_date(commit, list);
+			prio_queue_put(queue, commit);
 		}
 		parents = parents->next;
 	}
diff --git a/commit.h b/commit.h
index 70c870dae4..9630c076d6 100644
--- a/commit.h
+++ b/commit.h
@@ -201,10 +201,10 @@ const char *repo_logmsg_reencode(struct repository *r,
 
 const char *skip_blank_lines(const char *msg);
 
-/** Removes the first commit from a list sorted by date, and adds all
- * of its parents.
- **/
-struct commit *pop_most_recent_commit(struct commit_list **list,
+struct prio_queue;
+
+/* Removes the first commit from a prio_queue and adds its parents. */
+struct commit *pop_most_recent_commit(struct prio_queue *queue,
 				      unsigned int mark);
 
 struct commit *pop_commit(struct commit_list **stack);
diff --git a/fetch-pack.c b/fetch-pack.c
index 5e74235fc0..8ad5bf2931 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -34,6 +34,7 @@
 #include "commit-graph.h"
 #include "sigchain.h"
 #include "mergesort.h"
+#include "prio-queue.h"
 
 static int transfer_unpack_limit = -1;
 static int fetch_unpack_limit = -1;
@@ -601,7 +602,7 @@ static int find_common(struct fetch_negotiator *negotiator,
 	return count ? retval : 0;
 }
 
-static struct commit_list *complete;
+static struct prio_queue complete = { compare_commits_by_commit_date };
 
 static int mark_complete(const struct object_id *oid)
 {
@@ -609,7 +610,7 @@ static int mark_complete(const struct object_id *oid)
 
 	if (commit && !(commit->object.flags & COMPLETE)) {
 		commit->object.flags |= COMPLETE;
-		commit_list_insert(commit, &complete);
+		prio_queue_put(&complete, commit);
 	}
 	return 0;
 }
@@ -626,9 +627,12 @@ static int mark_complete_oid(const char *refname UNUSED,
 static void mark_recent_complete_commits(struct fetch_pack_args *args,
 					 timestamp_t cutoff)
 {
-	while (complete && cutoff <= complete->item->date) {
+	while (complete.nr) {
+		struct commit *item = prio_queue_peek(&complete);
+		if (item->date < cutoff)
+			break;
 		print_verbose(args, _("Marking %s as complete"),
-			      oid_to_hex(&complete->item->object.oid));
+			      oid_to_hex(&item->object.oid));
 		pop_most_recent_commit(&complete, COMPLETE);
 	}
 }
@@ -798,7 +802,6 @@ static void mark_complete_and_common_ref(struct fetch_negotiator *negotiator,
 		refs_for_each_rawref(get_main_ref_store(the_repository),
 				     mark_complete_oid, NULL);
 		for_each_cached_alternate(NULL, mark_alternate_complete);
-		commit_list_sort_by_date(&complete);
 		if (cutoff)
 			mark_recent_complete_commits(args, cutoff);
 	}
diff --git a/object-name.c b/object-name.c
index ddafe7f9b1..41930609e3 100644
--- a/object-name.c
+++ b/object-name.c
@@ -28,6 +28,7 @@
 #include "commit-reach.h"
 #include "date.h"
 #include "object-file-convert.h"
+#include "prio-queue.h"
 
 static int get_oid_oneline(struct repository *r, const char *, struct object_id *,
 			   const struct commit_list *);
@@ -1461,7 +1462,7 @@ static int get_oid_oneline(struct repository *r,
 			   const char *prefix, struct object_id *oid,
 			   const struct commit_list *list)
 {
-	struct commit_list *copy = NULL, **copy_tail = &copy;
+	struct prio_queue copy = { compare_commits_by_commit_date };
 	const struct commit_list *l;
 	int found = 0;
 	int negative = 0;
@@ -1483,9 +1484,9 @@ static int get_oid_oneline(struct repository *r,
 
 	for (l = list; l; l = l->next) {
 		l->item->object.flags |= ONELINE_SEEN;
-		copy_tail = &commit_list_insert(l->item, copy_tail)->next;
+		prio_queue_put(&copy, l->item);
 	}
-	while (copy) {
+	while (copy.nr) {
 		const char *p, *buf;
 		struct commit *commit;
 		int matches;
@@ -1507,7 +1508,7 @@ static int get_oid_oneline(struct repository *r,
 	regfree(&regex);
 	for (l = list; l; l = l->next)
 		clear_commit_marks(l->item, ONELINE_SEEN);
-	free_commit_list(copy);
+	clear_prio_queue(&copy);
 	return found ? 0 : -1;
 }
 
@@ -2061,7 +2062,6 @@ static enum get_oid_result get_oid_with_context_1(struct repository *repo,
 			cb.list = &list;
 			refs_for_each_ref(get_main_ref_store(repo), handle_one_ref, &cb);
 			refs_head_ref(get_main_ref_store(repo), handle_one_ref, &cb);
-			commit_list_sort_by_date(&list);
 			ret = get_oid_oneline(repo, name + 2, oid, list);
 
 			free_commit_list(list);
diff --git a/t/meson.build b/t/meson.build
index 1af289425d..660d780dcc 100644
--- a/t/meson.build
+++ b/t/meson.build
@@ -1116,6 +1116,7 @@ benchmarks = [
   'perf/p1450-fsck.sh',
   'perf/p1451-fsck-skip-list.sh',
   'perf/p1500-graph-walks.sh',
+  'perf/p1501-rev-parse-oneline.sh',
   'perf/p2000-sparse-operations.sh',
   'perf/p3400-rebase.sh',
   'perf/p3404-rebase-interactive.sh',
diff --git a/t/perf/p1501-rev-parse-oneline.sh b/t/perf/p1501-rev-parse-oneline.sh
new file mode 100755
index 0000000000..538fa9c404
--- /dev/null
+++ b/t/perf/p1501-rev-parse-oneline.sh
@@ -0,0 +1,71 @@
+#!/bin/sh
+
+test_description='Test :/ object name notation'
+
+. ./perf-lib.sh
+
+test_perf_fresh_repo
+
+#
+# Creates lots of merges to make history traversal costly.  In
+# particular it creates 2^($max_level-1)-1 2-way merges on top of
+# 2^($max_level-1) root commits.  E.g., the commit history looks like
+# this for a $max_level of 3:
+#
+#     _1_
+#    /   \
+#   2     3
+#  / \   / \
+# 4   5 6   7
+#
+# The numbers are the fast-import marks, which also are the commit
+# messages.  1 is the HEAD commit and a merge, 2 and 3 are also merges,
+# 4-7 are the root commits.
+#
+build_history () {
+	local max_level="$1" &&
+	local level="${2:-1}" &&
+	local mark="${3:-1}" &&
+	if test $level -eq $max_level
+	then
+		echo "reset refs/heads/master" &&
+		echo "from $ZERO_OID" &&
+		echo "commit refs/heads/master" &&
+		echo "mark :$mark" &&
+		echo "committer C <c@example.com> 1234567890 +0000" &&
+		echo "data <<EOF" &&
+		echo "$mark" &&
+		echo "EOF"
+	else
+		local level1=$((level+1)) &&
+		local mark1=$((2*mark)) &&
+		local mark2=$((2*mark+1)) &&
+		build_history $max_level $level1 $mark1 &&
+		build_history $max_level $level1 $mark2 &&
+		echo "commit refs/heads/master" &&
+		echo "mark :$mark" &&
+		echo "committer C <c@example.com> 1234567890 +0000" &&
+		echo "data <<EOF" &&
+		echo "$mark" &&
+		echo "EOF" &&
+		echo "from :$mark1" &&
+		echo "merge :$mark2"
+	fi
+}
+
+test_expect_success 'setup' '
+	build_history 16 | git fast-import &&
+	git log --format="%H %s" --reverse >commits &&
+	sed -n -e "s/ .*$//p" -e "q" <commits >expect &&
+	sed -n -e "s/^.* //p" -e "q" <commits >needle
+'
+
+test_perf "rev-parse :/$(cat needle)" '
+	git rev-parse :/$(cat needle) >actual
+'
+
+test_expect_success 'verify result' '
+	test_cmp expect actual
+'
+
+test_done
diff --git a/walker.c b/walker.c
index d131af04c7..8073754517 100644
--- a/walker.c
+++ b/walker.c
@@ -14,6 +14,7 @@
 #include "blob.h"
 #include "refs.h"
 #include "progress.h"
+#include "prio-queue.h"
 
 static struct object_id current_commit_oid;
 
@@ -78,7 +79,7 @@ static int process_tree(struct walker *walker, struct tree *tree)
 #define SEEN		(1U << 1)
 #define TO_SCAN		(1U << 2)
 
-static struct commit_list *complete = NULL;
+static struct prio_queue complete = { compare_commits_by_commit_date };
 
 static int process_commit(struct walker *walker, struct commit *commit)
 {
@@ -87,7 +88,10 @@ static int process_commit(struct walker *walker, struct commit *commit)
 	if (repo_parse_commit(the_repository, commit))
 		return -1;
 
-	while (complete && complete->item->date >= commit->date) {
+	while (complete.nr) {
+		struct commit *item = prio_queue_peek(&complete);
+		if (item->date < commit->date)
+			break;
 		pop_most_recent_commit(&complete, COMPLETE);
 	}
 
@@ -233,7 +237,7 @@ static int mark_complete(const char *path UNUSED,
 
 	if (commit) {
 		commit->object.flags |= COMPLETE;
-		commit_list_insert(commit, &complete);
+		prio_queue_put(&complete, commit);
 	}
 	return 0;
 }
@@ -302,7 +306,6 @@ int walker_fetch(struct walker *walker, int targets, char **target,
 	if (!walker->get_recover) {
 		refs_for_each_ref(get_main_ref_store(the_repository),
 				  mark_complete, NULL);
-		commit_list_sort_by_date(&complete);
 	}
 
 	for (i = 0; i < targets; i++) {
-- 
2.50.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0
  2025-07-18  9:09 ` [PATCH v2 " René Scharfe
  2025-07-18  9:39   ` [PATCH v2 1/3] " René Scharfe
@ 2025-07-18  9:39   ` René Scharfe
  2025-08-03 11:12     ` Johannes Schindelin
  2025-07-18  9:39   ` [PATCH v2 2/3] prio-queue: add prio_queue_replace() René Scharfe
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-07-18  9:39 UTC (permalink / raw)
  To: Git List; +Cc: Jeff King, Junio C Hamano, Justin Tobler

Optimize pop_most_recent_commit() by adding the first parent using the
more efficient prio_queue_peek() and prio_queue_replace() instead of
prio_queue_get() and prio_queue_put().

On my machine this neutralizes the performance hit it took in Git's own
repository when we converted it to prio_queue two patches ago (git_pq):

   $ hyperfine -w3 -L git ./git_2.50.1,./git_pq,./git '{git} rev-parse :/^Initial.revision'
   Benchmark 1: ./git_2.50.1 rev-parse :/^Initial.revision
     Time (mean ± σ):      1.073 s ±  0.003 s    [User: 1.053 s, System: 0.019 s]
     Range (min … max):    1.069 s …  1.078 s    10 runs

   Benchmark 2: ./git_pq rev-parse :/^Initial.revision
     Time (mean ± σ):      1.077 s ±  0.002 s    [User: 1.057 s, System: 0.018 s]
     Range (min … max):    1.072 s …  1.079 s    10 runs

   Benchmark 3: ./git rev-parse :/^Initial.revision
     Time (mean ± σ):      1.069 s ±  0.003 s    [User: 1.049 s, System: 0.018 s]
     Range (min … max):    1.065 s …  1.074 s    10 runs

   Summary
     ./git rev-parse :/^Initial.revision ran
       1.00 ± 0.00 times faster than ./git_2.50.1 rev-parse :/^Initial.revision
       1.01 ± 0.00 times faster than ./git_pq rev-parse :/^Initial.revision

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 commit.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/commit.c b/commit.c
index f4712ad9a7..ea84c8e7f6 100644
--- a/commit.c
+++ b/commit.c
@@ -743,17 +743,24 @@ void commit_list_sort_by_date(struct commit_list **list)
 struct commit *pop_most_recent_commit(struct prio_queue *queue,
 				      unsigned int mark)
 {
-	struct commit *ret = prio_queue_get(queue);
+	struct commit *ret = prio_queue_peek(queue);
+	int get_pending = 1;
 	struct commit_list *parents = ret->parents;
 
 	while (parents) {
 		struct commit *commit = parents->item;
 		if (!repo_parse_commit(the_repository, commit) && !(commit->object.flags & mark)) {
 			commit->object.flags |= mark;
-			prio_queue_put(queue, commit);
+			if (get_pending)
+				prio_queue_replace(queue, commit);
+			else
+				prio_queue_put(queue, commit);
+			get_pending = 0;
 		}
 		parents = parents->next;
 	}
+	if (get_pending)
+		prio_queue_get(queue);
 	return ret;
 }
 
-- 
2.50.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH v2 2/3] prio-queue: add prio_queue_replace()
  2025-07-18  9:09 ` [PATCH v2 " René Scharfe
  2025-07-18  9:39   ` [PATCH v2 1/3] " René Scharfe
  2025-07-18  9:39   ` [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0 René Scharfe
@ 2025-07-18  9:39   ` René Scharfe
  2025-07-19  7:04   ` [PATCH v2 0/3] commit: convert pop_most_recent_commit() to prio_queue Jeff King
  2025-07-22  6:26   ` SZEDER Gábor
  4 siblings, 0 replies; 41+ messages in thread
From: René Scharfe @ 2025-07-18  9:39 UTC (permalink / raw)
  To: Git List; +Cc: Jeff King, Junio C Hamano, Justin Tobler

Add a function to replace the top element of the queue that basically
does the same as prio_queue_get() followed by prio_queue_put(), but
without the work by prio_queue_get() to rebalance the heap.  It can be
used to optimize loops that get one element and then immediately add
another one.  That's common e.g., with commit history traversal, where
we get out a commit and then put in its parents.

Signed-off-by: René Scharfe <l.s.r@web.de>
---
 prio-queue.c                | 45 ++++++++++++++++++++++++++-----------
 prio-queue.h                |  8 +++++++
 t/unit-tests/u-prio-queue.c | 23 +++++++++++++++++++
 3 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/prio-queue.c b/prio-queue.c
index ec33ac27db..9748528ce6 100644
--- a/prio-queue.c
+++ b/prio-queue.c
@@ -58,22 +58,10 @@ void prio_queue_put(struct prio_queue *queue, void *thing)
 	}
 }
 
-void *prio_queue_get(struct prio_queue *queue)
+static void sift_down_root(struct prio_queue *queue)
 {
-	void *result;
 	size_t ix, child;
 
-	if (!queue->nr)
-		return NULL;
-	if (!queue->compare)
-		return queue->array[--queue->nr].data; /* LIFO */
-
-	result = queue->array[0].data;
-	if (!--queue->nr)
-		return result;
-
-	queue->array[0] = queue->array[queue->nr];
-
 	/* Push down the one at the root */
 	for (ix = 0; ix * 2 + 1 < queue->nr; ix = child) {
 		child = ix * 2 + 1; /* left */
@@ -86,6 +74,23 @@ void *prio_queue_get(struct prio_queue *queue)
 
 		swap(queue, child, ix);
 	}
+}
+
+void *prio_queue_get(struct prio_queue *queue)
+{
+	void *result;
+
+	if (!queue->nr)
+		return NULL;
+	if (!queue->compare)
+		return queue->array[--queue->nr].data; /* LIFO */
+
+	result = queue->array[0].data;
+	if (!--queue->nr)
+		return result;
+
+	queue->array[0] = queue->array[queue->nr];
+	sift_down_root(queue);
 	return result;
 }
 
@@ -97,3 +102,17 @@ void *prio_queue_peek(struct prio_queue *queue)
 		return queue->array[queue->nr - 1].data;
 	return queue->array[0].data;
 }
+
+void prio_queue_replace(struct prio_queue *queue, void *thing)
+{
+	if (!queue->nr) {
+		prio_queue_put(queue, thing);
+	} else if (!queue->compare) {
+		queue->array[queue->nr - 1].ctr = queue->insertion_ctr++;
+		queue->array[queue->nr - 1].data = thing;
+	} else {
+		queue->array[0].ctr = queue->insertion_ctr++;
+		queue->array[0].data = thing;
+		sift_down_root(queue);
+	}
+}
diff --git a/prio-queue.h b/prio-queue.h
index 38d032636d..da7fad2f1f 100644
--- a/prio-queue.h
+++ b/prio-queue.h
@@ -52,6 +52,14 @@ void *prio_queue_get(struct prio_queue *);
  */
 void *prio_queue_peek(struct prio_queue *);
 
+/*
+ * Replace the "thing" that compares the smallest with a new "thing",
+ * like prio_queue_get()+prio_queue_put() would do, but in a more
+ * efficient way.  Does the same as prio_queue_put() if the queue is
+ * empty.
+ */
+void prio_queue_replace(struct prio_queue *queue, void *thing);
+
 void clear_prio_queue(struct prio_queue *);
 
 /* Reverse the LIFO elements */
diff --git a/t/unit-tests/u-prio-queue.c b/t/unit-tests/u-prio-queue.c
index 145e689c9c..63e58114ae 100644
--- a/t/unit-tests/u-prio-queue.c
+++ b/t/unit-tests/u-prio-queue.c
@@ -13,6 +13,7 @@ static int intcmp(const void *va, const void *vb, void *data UNUSED)
 #define STACK	 -3
 #define GET	 -4
 #define REVERSE  -5
+#define REPLACE  -6
 
 static int show(int *v)
 {
@@ -51,6 +52,15 @@ static void test_prio_queue(int *input, size_t input_size,
 		case REVERSE:
 			prio_queue_reverse(&pq);
 			break;
+		case REPLACE:
+			peek = prio_queue_peek(&pq);
+			cl_assert(i + 1 < input_size);
+			cl_assert(input[i + 1] >= 0);
+			cl_assert(j < result_size);
+			cl_assert_equal_i(result[j], show(peek));
+			j++;
+			prio_queue_replace(&pq, &input[++i]);
+			break;
 		default:
 			prio_queue_put(&pq, &input[i]);
 			break;
@@ -81,6 +91,13 @@ void test_prio_queue__empty(void)
 		   ((int []){ 1, 2, MISSING, 1, 2, MISSING }));
 }
 
+void test_prio_queue__replace(void)
+{
+	TEST_INPUT(((int []){ REPLACE, 6, 2, 4, REPLACE, 5, 7, GET,
+			      REPLACE, 1, DUMP }),
+		   ((int []){ MISSING, 2, 4, 5, 1, 6, 7 }));
+}
+
 void test_prio_queue__stack(void)
 {
 	TEST_INPUT(((int []){ STACK, 8, 1, 5, 4, 6, 2, 3, DUMP }),
@@ -92,3 +109,9 @@ void test_prio_queue__reverse_stack(void)
 	TEST_INPUT(((int []){ STACK, 1, 2, 3, 4, 5, 6, REVERSE, DUMP }),
 		   ((int []){ 1, 2, 3, 4, 5, 6 }));
 }
+
+void test_prio_queue__replace_stack(void)
+{
+	TEST_INPUT(((int []){ STACK, 8, 1, 5, REPLACE, 4, 6, 2, 3, DUMP }),
+		   ((int []){ 5, 3, 2, 6, 4, 1, 8 }));
+}
-- 
2.50.1

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-16  9:38     ` René Scharfe
@ 2025-07-19  6:45       ` Jeff King
  0 siblings, 0 replies; 41+ messages in thread
From: Jeff King @ 2025-07-19  6:45 UTC (permalink / raw)
  To: René Scharfe; +Cc: Junio C Hamano, Git List

On Wed, Jul 16, 2025 at 11:38:39AM +0200, René Scharfe wrote:

> Yes, commit_list_insert_by_date() is stable, as it inserts commits after
> ones with the same date.  Items are popped from the top, so this ensures
> FIFO behavior for commits with the same date.
> 
> prio_queue ensures stability using an ID and favors lower ones, so it
> provides the same order.
> 
> We should add unit tests for that, no?

Probably wouldn't hurt to do so, but I don't think doing so needs to
hold up your series.

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-16  9:39     ` René Scharfe
  2025-07-17  8:22       ` René Scharfe
@ 2025-07-19  6:55       ` Jeff King
  2025-07-19  6:57         ` Jeff King
  2025-07-19 11:15         ` René Scharfe
  1 sibling, 2 replies; 41+ messages in thread
From: Jeff King @ 2025-07-19  6:55 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List

On Wed, Jul 16, 2025 at 11:39:49AM +0200, René Scharfe wrote:

> On 7/16/25 7:05 AM, Jeff King wrote:
> > On Tue, Jul 15, 2025 at 04:51:07PM +0200, René Scharfe wrote:
> > 
> >> pop_most_recent_commit() calls commit_list_insert_by_date(), which and
> >> is itself called in a loop, which can lead to quadratic complexity.
> >> Replace the commit_list with a prio_queue to ensure logarithmic worst
> >> case complexity and convert all three users.
> > 
> > I guess I'm cc'd because of my frequent complains about the quadratic
> > nature of our commit lists? :)
> 
> And because you introduced prio_queue.

I think that was Junio, but I think I can be counted as a cheerleader
for the topic. :)

> > Mostly I asked because I had to look at pop_most_recent_commit() to see
> > what operation would be made faster here. Looks like it's mostly ":/",
> > but maybe also fetch's mark_recent_complete_commits()? I guess we might
> > hit that if you have a huge number of refs?
> 
> The :/ handling was the easiest to test for me.  fetch_pack() and
> walker_fetch() require some server side to set up, which seems not worth
> it just to demonstrate quadratic behavior.  Having thousands of refs
> would make the list long enough to notice, as would having lots of
> merges.

OK, that makes sense. Just making sure I understand the benefits.

> My general idea is to get rid of commit_list_insert_by_date() eventually
> to avoid quadratic complexity.

Yeah, it's certainly at the root of many such problems we've seen over
the years.

> > I actually have a series turning rev_info.commits into a prio_queue
> > which I need to polish up (mostly just writing commit messages; I've
> > been running with it for almost 2 years without trouble). Ironically it
> > does not touch this spot, as these commit lists are formed on their own.
> 
> That is not a coincidence.  I had a look at that series and tried to
> reach its goals while keeping rev_info.commits a commit_list.  Why?
> Mostly being vaguely uncomfortable with prio_queue' memory overhead,
> lack of type safety and dual use as a stack.  I still used it, but only
> as local variable, not in the central struct rev_info.

Hmm, I would have thought prio_queue had less memory overhead. You're
spending one pointer per entry in a packed array, versus list nodes. But
it's true that it doesn't shrink as items are removed (though that is
something we _could_ implement).

The dual use as a stack actually came in handy for my series, IIRC.
There are spots which use a commit_list but care about a specific order,
and my list/prio_queue conversion helpers use that to create a non-heap
prio_queue that just returns the items in the original order (it's
actually FIFO, but we can get that by reversing).

I dunno. That's kind of horrible when I say it out loud, but it did make
things work. I'm surprised that your attempt ended up with a performance
hit when mine did not. Mine tried not to be clever, and even leaves in
place a few spots where we convert between the two representations to
satisfy various interfaces (with the goal that we'd probably eventually
switch to prio_queue everywhere).

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-19  6:55       ` Jeff King
@ 2025-07-19  6:57         ` Jeff King
  2025-07-19 11:15         ` René Scharfe
  1 sibling, 0 replies; 41+ messages in thread
From: Jeff King @ 2025-07-19  6:57 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List

On Sat, Jul 19, 2025 at 02:55:58AM -0400, Jeff King wrote:

> > That is not a coincidence.  I had a look at that series and tried to
> > reach its goals while keeping rev_info.commits a commit_list.  Why?
> > Mostly being vaguely uncomfortable with prio_queue' memory overhead,
> > lack of type safety and dual use as a stack.  I still used it, but only
> > as local variable, not in the central struct rev_info.
> 
> Hmm, I would have thought prio_queue had less memory overhead. You're
> spending one pointer per entry in a packed array, versus list nodes. But
> it's true that it doesn't shrink as items are removed (though that is
> something we _could_ implement).
> 
> The dual use as a stack actually came in handy for my series, IIRC.
> There are spots which use a commit_list but care about a specific order,
> and my list/prio_queue conversion helpers use that to create a non-heap
> prio_queue that just returns the items in the original order (it's
> actually FIFO, but we can get that by reversing).
> 
> I dunno. That's kind of horrible when I say it out loud, but it did make
> things work. I'm surprised that your attempt ended up with a performance
> hit when mine did not. Mine tried not to be clever, and even leaves in
> place a few spots where we convert between the two representations to
> satisfy various interfaces (with the goal that we'd probably eventually
> switch to prio_queue everywhere).

Oh, re-reading what you wrote again: you left rev_info.commits as a
list, so presumably you were paying more conversion overhead as you
walked (whereas I think in mine the prio_queue becomes the data
structure for the hot path during traversal).

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] prio-queue: add prio_queue_replace()
  2025-07-17  9:20       ` René Scharfe
@ 2025-07-19  7:02         ` Jeff King
  0 siblings, 0 replies; 41+ messages in thread
From: Jeff King @ 2025-07-19  7:02 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List

On Thu, Jul 17, 2025 at 11:20:53AM +0200, René Scharfe wrote:

> > I thought about building this optimization into prio_queue_get(), but
> > that would require prio_queue_peek() and prio_queue_put() to be adjusted
> > as well and all prio_queue users would have to be either made aware of
> > the not-fully-heapified state, or prevented from accessing prio_queue
> > properties like .nr and .array.
> 
> Here's what that would like like.  .nr and .array elements are kept
> consistent at all times, but the root item is not in heap order after a
> prio_queue_get().  That's good enough to enumerate all prio_queue items
> like commit-reach.c::queue_has_nonstale() or
> negotiator/skipping.c::push_parent() do.

Hmm, I agree that _probably_ we'd be fine as long as .nr and .array were
always consistent. It does make me feel a bit dirty to violate the heap
property in a way that callers can see. I guess the argument in favor of
it would be:

  - if you are directly walking over all elements, then almost all
    ordering is out the window. Yes, the root item is supposed to be the
    min, but in a heap the rest of the elements won't be sorted.

  - if you really want things in order from a heap, you'll be calling
    the get() or peek() accessors. And that gives us an opportunity to
    lazily put things in order.

I guess one alternative would be to make the array private and require
some kind of foreach accessor. True struct-field privacy in C sucks.
You have to hide behind a pointer, so there's runtime cost, plus
iteration requires a clunky callback rather than a loop. I guess we
could call it "array_" or something, and provide a foreach macro.

I dunno. I'm on the fence.

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-18  9:09 ` [PATCH v2 " René Scharfe
                     ` (2 preceding siblings ...)
  2025-07-18  9:39   ` [PATCH v2 2/3] prio-queue: add prio_queue_replace() René Scharfe
@ 2025-07-19  7:04   ` Jeff King
  2025-07-22  6:26   ` SZEDER Gábor
  4 siblings, 0 replies; 41+ messages in thread
From: Jeff King @ 2025-07-19  7:04 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Junio C Hamano, Justin Tobler

On Fri, Jul 18, 2025 at 11:09:04AM +0200, René Scharfe wrote:

> Use prio_queue to improve worst-case performance at the cost of slightly
> worse best-case performance.  Then add and use prio_queue_replace() to
> recover that loss.
> 
> Changes since v2:
> [...]

Thanks for fleshing out the test script. This version looks good to me.

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-19  6:55       ` Jeff King
  2025-07-19  6:57         ` Jeff King
@ 2025-07-19 11:15         ` René Scharfe
  2025-07-20  0:03           ` Jeff King
  1 sibling, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-07-19 11:15 UTC (permalink / raw)
  To: Jeff King; +Cc: Git List

On 7/19/25 8:55 AM, Jeff King wrote:
> On Wed, Jul 16, 2025 at 11:39:49AM +0200, René Scharfe wrote:
> 
>> On 7/16/25 7:05 AM, Jeff King wrote:
>>> On Tue, Jul 15, 2025 at 04:51:07PM +0200, René Scharfe wrote:
>>>
>>>> pop_most_recent_commit() calls commit_list_insert_by_date(), which and
>>>> is itself called in a loop, which can lead to quadratic complexity.
>>>> Replace the commit_list with a prio_queue to ensure logarithmic worst
>>>> case complexity and convert all three users.
>>>
>>> I guess I'm cc'd because of my frequent complains about the quadratic
>>> nature of our commit lists? :)
>>
>> And because you introduced prio_queue.
> 
> I think that was Junio, but I think I can be counted as a cheerleader
> for the topic. :)

Ah, sorry.  You did make it stable, though, which allows using it for
backward-compatible history traversal.
>>> I actually have a series turning rev_info.commits into a prio_queue
>>> which I need to polish up (mostly just writing commit messages; I've
>>> been running with it for almost 2 years without trouble). Ironically it
>>> does not touch this spot, as these commit lists are formed on their own.
>>
>> That is not a coincidence.  I had a look at that series and tried to
>> reach its goals while keeping rev_info.commits a commit_list.  Why?
>> Mostly being vaguely uncomfortable with prio_queue' memory overhead,
>> lack of type safety and dual use as a stack.  I still used it, but only
>> as local variable, not in the central struct rev_info.
> 
> Hmm, I would have thought prio_queue had less memory overhead. You're
> spending one pointer per entry in a packed array, versus list nodes. But
> it's true that it doesn't shrink as items are removed (though that is
> something we _could_ implement).

If we just count the net data then a commit_list item has two pointers
and a prio_queue_entry has a pointer and an ID for stability.  That's a
tie.  ALLOC_GROW overallocates by ca. 50%, so that's 25% more on
average for the prio_queue.  No idea what overhead malloc() needs per
allocation, but I guess it's enough to tilt the scale back against
commit_lists.

However, a prio_queue without a comparison function acts as a FIFO
stack, but needs double the amount of memory than a pointer array
without the stability ID would, for the same behavior.

I don't think lack of shrinking causes much of an issue.  I did stumble
over at least one place where using a prio_queue in FIFO mode made the
code slightly but measurably slower than using a commit_list, though,
which could be rectified by using a raw array of pointers.

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-19 11:15         ` René Scharfe
@ 2025-07-20  0:03           ` Jeff King
  2025-07-20  1:22             ` Junio C Hamano
  0 siblings, 1 reply; 41+ messages in thread
From: Jeff King @ 2025-07-20  0:03 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List

On Sat, Jul 19, 2025 at 01:15:28PM +0200, René Scharfe wrote:

> > Hmm, I would have thought prio_queue had less memory overhead. You're
> > spending one pointer per entry in a packed array, versus list nodes. But
> > it's true that it doesn't shrink as items are removed (though that is
> > something we _could_ implement).
> 
> If we just count the net data then a commit_list item has two pointers
> and a prio_queue_entry has a pointer and an ID for stability.  That's a
> tie.  ALLOC_GROW overallocates by ca. 50%, so that's 25% more on
> average for the prio_queue.  No idea what overhead malloc() needs per
> allocation, but I guess it's enough to tilt the scale back against
> commit_lists.

Oh right, I totally forgot about the extra counter.

-Peff

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-20  0:03           ` Jeff King
@ 2025-07-20  1:22             ` Junio C Hamano
  0 siblings, 0 replies; 41+ messages in thread
From: Junio C Hamano @ 2025-07-20  1:22 UTC (permalink / raw)
  To: Jeff King; +Cc: René Scharfe, Git List

Jeff King <peff@peff.net> writes:

> On Sat, Jul 19, 2025 at 01:15:28PM +0200, René Scharfe wrote:
>
>> > Hmm, I would have thought prio_queue had less memory overhead. You're
>> > spending one pointer per entry in a packed array, versus list nodes. But
>> > it's true that it doesn't shrink as items are removed (though that is
>> > something we _could_ implement).
>> 
>> If we just count the net data then a commit_list item has two pointers
>> and a prio_queue_entry has a pointer and an ID for stability.  That's a
>> tie.  ALLOC_GROW overallocates by ca. 50%, so that's 25% more on
>> average for the prio_queue.  No idea what overhead malloc() needs per
>> allocation, but I guess it's enough to tilt the scale back against
>> commit_lists.
>
> Oh right, I totally forgot about the extra counter.

Don't feel bad.  I forgot about it, too, when I gave my "is it
stable?" comment to René's patch.  ;-)

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-18  9:39   ` [PATCH v2 1/3] " René Scharfe
@ 2025-07-21 14:02     ` Lidong Yan
  2025-08-03  9:54       ` René Scharfe
  0 siblings, 1 reply; 41+ messages in thread
From: Lidong Yan @ 2025-07-21 14:02 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Jeff King, Junio C Hamano, Justin Tobler

René Scharfe <l.s.r@web.de> writes:
> 
> +#
> +# Creates lots of merges to make history traversal costly.  In
> +# particular it creates 2^($max_level-1)-1 2-way merges on top of
> +# 2^($max_level-1) root commits.  E.g., the commit history looks like
> +# this for a $max_level of 3:
> +#
> +#     _1_
> +#    /   \
> +#   2     3
> +#  / \   / \
> +# 4   5 6   7
> +#
> +# The numbers are the fast-import marks, which also are the commit
> +# messages.  1 is the HEAD commit and a merge, 2 and 3 are also merges,
> +# 4-7 are the root commits.
> +#

I feel that the reason there's no significant performance improvement is probably
because mostly we are using the priority queue to sort O(siblings) nodes.
For example, in this case, the most time-consuming operation is when the priority
queue or commit list contains 4 and 5, and we then need to insert 6 and 7.

Assuming the maximum number of siblings is W and the number of nodes is N,
the time complexity with a commit list is O(W² × N), while using a priority queue
gives O(W log W × N). Perhaps in many projects, W isn't particularly large,
which results in the performance improvement not being very significant.

- Lidong

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-18  9:09 ` [PATCH v2 " René Scharfe
                     ` (3 preceding siblings ...)
  2025-07-19  7:04   ` [PATCH v2 0/3] commit: convert pop_most_recent_commit() to prio_queue Jeff King
@ 2025-07-22  6:26   ` SZEDER Gábor
  2025-07-22 14:27     ` Junio C Hamano
  4 siblings, 1 reply; 41+ messages in thread
From: SZEDER Gábor @ 2025-07-22  6:26 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Git List, René Scharfe, Jeff King, Justin Tobler

On Fri, Jul 18, 2025 at 11:09:04AM +0200, René Scharfe wrote:
> Use prio_queue to improve worst-case performance at the cost of slightly
> worse best-case performance.  Then add and use prio_queue_replace() to
> recover that loss.
> 
> Changes since v2:
> - Mention that a prio_queue improves performance for merge-heavy
>   histories in the commit message.
> - Add the new perf script to Meson build file.
> - Mention which kind of history we are aiming for and show its shape in
>   a comment in the perf script.
> - Remove unnecessary quotes and use single quotes for the perf test
>   code.
> - Rename the variable delete_pending to get_pending to align it with
>   the concrete function prio_queue_get() instead of referring to the
>   abstract concept of deletion, to improve readability.
> 
>   commit: convert pop_most_recent_commit() to prio_queue
>   prio-queue: add prio_queue_replace()
>   commit: use prio_queue_replace() in pop_most_recent_commit()

The patches in this series were picked up in the wrong order:

  $ git log --topo-order --reverse --oneline -3 e436bc94f3
  36554bf51a commit: convert pop_most_recent_commit() to prio_queue
  304f06e0c0 commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0
  e436bc94f3 prio-queue: add prio_queue_replace()

Note that the order of the second and third patches is reversed.

Since the second patch/third commit introduces a new function used by
the third patch/second commit, this leads to build failure of
304f06e0c0:

      CC commit.o
  commit.c: In function ‘pop_most_recent_commit’:
  commit.c:754:33: error: implicit declaration of function ‘prio_queue_replace’; did you mean ‘prio_queue_reverse’? [-Werror=implicit-function-declaration]
    754 |                                 prio_queue_replace(queue, commit);
        |                                 ^~~~~~~~~~~~~~~~~~
        |                                 prio_queue_reverse
  cc1: all warnings being treated as errors
  make: *** [Makefile:2821: commit.o] Error 1

(And there's that MIME-Version thing at the end of 304f06e0c0's subject
line as well.)


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 0/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-22  6:26   ` SZEDER Gábor
@ 2025-07-22 14:27     ` Junio C Hamano
  0 siblings, 0 replies; 41+ messages in thread
From: Junio C Hamano @ 2025-07-22 14:27 UTC (permalink / raw)
  To: SZEDER Gábor; +Cc: Git List, René Scharfe, Jeff King, Justin Tobler

SZEDER Gábor <szeder.dev@gmail.com> writes:

>   $ git log --topo-order --reverse --oneline -3 e436bc94f3
>   36554bf51a commit: convert pop_most_recent_commit() to prio_queue
>   304f06e0c0 commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0
>   e436bc94f3 prio-queue: add prio_queue_replace()

Funny.  Thanks for spotting.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-07-21 14:02     ` Lidong Yan
@ 2025-08-03  9:54       ` René Scharfe
  2025-08-03 16:48         ` Junio C Hamano
  0 siblings, 1 reply; 41+ messages in thread
From: René Scharfe @ 2025-08-03  9:54 UTC (permalink / raw)
  To: Lidong Yan; +Cc: Git List, Jeff King, Junio C Hamano, Justin Tobler

On 7/21/25 4:02 PM, Lidong Yan wrote:
> René Scharfe <l.s.r@web.de> writes:
>>
>> +#
>> +# Creates lots of merges to make history traversal costly.  In
>> +# particular it creates 2^($max_level-1)-1 2-way merges on top of
>> +# 2^($max_level-1) root commits.  E.g., the commit history looks like
>> +# this for a $max_level of 3:
>> +#
>> +#     _1_
>> +#    /   \
>> +#   2     3
>> +#  / \   / \
>> +# 4   5 6   7
>> +#
>> +# The numbers are the fast-import marks, which also are the commit
>> +# messages.  1 is the HEAD commit and a merge, 2 and 3 are also merges,
>> +# 4-7 are the root commits.
>> +#
> 
> I feel that the reason there's no significant performance improvement is probably
> because mostly we are using the priority queue to sort O(siblings) nodes.
> For example, in this case, the most time-consuming operation is when the priority
> queue or commit list contains 4 and 5, and we then need to insert 6 and 7.
> 
> Assuming the maximum number of siblings is W and the number of nodes is N,
> the time complexity with a commit list is O(W² × N), while using a priority queue
> gives O(W log W × N). Perhaps in many projects, W isn't particularly large,
> which results in the performance improvement not being very significant.

Kinda.  While traversing the history we take a commit from to the
commit_list or prio_queue and put back its parents.  For single-parent
commits this sequence keeps the number of stored items the same.  Merges
increase that number.

We add and retrieve each commit in the (relevant part of) history.  That
takes O(N) and O(1) for the sorted list, and O(log N) and O(log N) for
the prio_queue, where N is the length of the list.

So the best-case history is a string of single-parent commits, keeping
only a single item on the list/queue throughout.  That requires no
sorting or heaping, making the additions and retrievals O(1).  The
overall complexity is then O(N) for both variants, N being the number
of commits in the history.

Worst-case history might be a single merge of all commits -- a
centipede or myriapod?  With all commits on the sorted list we get a
complexity of O(N²) for the traversal, and O(N log N) with a prio_queue.

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0
  2025-07-18  9:39   ` [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0 René Scharfe
@ 2025-08-03 11:12     ` Johannes Schindelin
  2025-08-03 11:33       ` René Scharfe
  0 siblings, 1 reply; 41+ messages in thread
From: Johannes Schindelin @ 2025-08-03 11:12 UTC (permalink / raw)
  To: René Scharfe; +Cc: Git List, Jeff King, Junio C Hamano, Justin Tobler

[-- Attachment #1: Type: text/plain, Size: 574 bytes --]

Hi René,

On Fri, 18 Jul 2025, René Scharfe wrote:

> Optimize pop_most_recent_commit() by adding the first parent using the
> more efficient prio_queue_peek() and prio_queue_replace() instead of
> prio_queue_get() and prio_queue_put().
> 
> [... clipped ...]

I noticed that v2 of this patch not only made it into `next`, but it also
introduced a commit subject suffix (likely unintended?):

  [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0

Maybe that is something you want to look into?

Ciao,
Johannes

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0
  2025-08-03 11:12     ` Johannes Schindelin
@ 2025-08-03 11:33       ` René Scharfe
  0 siblings, 0 replies; 41+ messages in thread
From: René Scharfe @ 2025-08-03 11:33 UTC (permalink / raw)
  To: Johannes Schindelin, Junio C Hamano; +Cc: Git List, Jeff King, Justin Tobler

On 8/3/25 1:12 PM, Johannes Schindelin wrote:
> Hi René,
> 
> On Fri, 18 Jul 2025, René Scharfe wrote:
> 
>> Optimize pop_most_recent_commit() by adding the first parent using the
>> more efficient prio_queue_peek() and prio_queue_replace() instead of
>> prio_queue_get() and prio_queue_put().
>>
>> [... clipped ...]
> 
> I noticed that v2 of this patch not only made it into `next`, but it also
> introduced a commit subject suffix (likely unintended?):
> 
>   [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0
> 
> Maybe that is something you want to look into?

My bad, copy+paste error.  A fixed version made it into master in the
meantime.  Sorry about that, and thanks a lot for correcting it, Junio!

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-08-03  9:54       ` René Scharfe
@ 2025-08-03 16:48         ` Junio C Hamano
  2025-08-04 19:56           ` René Scharfe
  0 siblings, 1 reply; 41+ messages in thread
From: Junio C Hamano @ 2025-08-03 16:48 UTC (permalink / raw)
  To: René Scharfe; +Cc: Lidong Yan, Git List, Jeff King, Justin Tobler

René Scharfe <l.s.r@web.de> writes:

> We add and retrieve each commit in the (relevant part of) history.  That
> takes O(N) and O(1) for the sorted list, and O(log N) and O(log N) for
> the prio_queue, where N is the length of the list.
>
> So the best-case history is a string of single-parent commits, keeping
> only a single item on the list/queue throughout.  That requires no
> sorting or heaping, making the additions and retrievals O(1).  The
> overall complexity is then O(N) for both variants, N being the number
> of commits in the history.
>
> Worst-case history might be a single merge of all commits -- a
> centipede or myriapod?  With all commits on the sorted list we get a
> complexity of O(N²) for the traversal, and O(N log N) with a prio_queue.

In other words, for a typical two-parent merge, we peek the current
one, "replace" it with its first parent and then do the usual "put
and sift it down into place" for the second one.  

I am wondering if there is a more optimization opportunity if we
allowed "put more than one, and then sift all of them down into
place".  In other words, if I told the machinery:

    I am doing this put.  I promise I won't do get until I say "now
    I'll start doing get's, so you are free to delay your internal
    state maintenance and do so immediately before my next 'get'".

and did such put's a few times before I do a 'get', would there be a
way to teach the machinery to take advantage of the promise?

In any case, it is very much welcome to speed up "describe" with a
better data structure and algorithm.  Will queue.

Thanks.



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 1/3] commit: convert pop_most_recent_commit() to prio_queue
  2025-08-03 16:48         ` Junio C Hamano
@ 2025-08-04 19:56           ` René Scharfe
  0 siblings, 0 replies; 41+ messages in thread
From: René Scharfe @ 2025-08-04 19:56 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Lidong Yan, Git List, Jeff King, Justin Tobler

On 8/3/25 6:48 PM, Junio C Hamano wrote:
> René Scharfe <l.s.r@web.de> writes:
> 
>> We add and retrieve each commit in the (relevant part of) history.  That
>> takes O(N) and O(1) for the sorted list, and O(log N) and O(log N) for
>> the prio_queue, where N is the length of the list.
>>
>> So the best-case history is a string of single-parent commits, keeping
>> only a single item on the list/queue throughout.  That requires no
>> sorting or heaping, making the additions and retrievals O(1).  The
>> overall complexity is then O(N) for both variants, N being the number
>> of commits in the history.
>>
>> Worst-case history might be a single merge of all commits -- a
>> centipede or myriapod?  With all commits on the sorted list we get a
>> complexity of O(N²) for the traversal, and O(N log N) with a prio_queue.
> 
> In other words, for a typical two-parent merge, we peek the current
> one, "replace" it with its first parent and then do the usual "put
> and sift it down into place" for the second one.  
> 
> I am wondering if there is a more optimization opportunity if we
> allowed "put more than one, and then sift all of them down into
> place".  In other words, if I told the machinery:
> 
>     I am doing this put.  I promise I won't do get until I say "now
>     I'll start doing get's, so you are free to delay your internal
>     state maintenance and do so immediately before my next 'get'".
> 
> and did such put's a few times before I do a 'get', would there be a
> way to teach the machinery to take advantage of the promise?

Well, we could reestablish the heap at a cost of O(N), which only pays
off if it's less than the O(P log N) needed for regular puts of P
parents, with N being the number of queue elements.  This starts to
lose once queues become too long -- just when an optimization would be
most welcome.  So it seems impractical.

We could replace our binary heap with an algorithm that has O(1)
inserts, like a pairing heap, though.

René


^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2025-08-04 19:56 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-15 14:35 [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue René Scharfe
2025-07-15 14:51 ` [PATCH 1/3] " René Scharfe
2025-07-15 19:23   ` Junio C Hamano
2025-07-15 20:47   ` Justin Tobler
2025-07-16  9:39     ` René Scharfe
2025-07-16  5:05   ` Jeff King
2025-07-16  9:39     ` René Scharfe
2025-07-17  8:22       ` René Scharfe
2025-07-19  6:55       ` Jeff King
2025-07-19  6:57         ` Jeff King
2025-07-19 11:15         ` René Scharfe
2025-07-20  0:03           ` Jeff King
2025-07-20  1:22             ` Junio C Hamano
2025-07-16 22:23   ` Junio C Hamano
2025-07-17  8:22     ` René Scharfe
2025-07-15 14:51 ` [PATCH 2/3] prio-queue: add prio_queue_replace() René Scharfe
2025-07-16  5:09   ` Jeff King
2025-07-16  9:38     ` René Scharfe
2025-07-17  9:20       ` René Scharfe
2025-07-19  7:02         ` Jeff King
2025-07-15 14:51 ` [PATCH 3/3] commit: use prio_queue_replace() in pop_most_recent_commit() René Scharfe
2025-07-15 20:43   ` Junio C Hamano
2025-07-16  9:38     ` René Scharfe
2025-07-16  0:07 ` [PATCH 0/3] commit: convert pop_most_recent_commit() to prio_queue Junio C Hamano
2025-07-16  5:15   ` Jeff King
2025-07-16  9:38     ` René Scharfe
2025-07-19  6:45       ` Jeff King
2025-07-16 14:49     ` Junio C Hamano
2025-07-18  9:09 ` [PATCH v2 " René Scharfe
2025-07-18  9:39   ` [PATCH v2 1/3] " René Scharfe
2025-07-21 14:02     ` Lidong Yan
2025-08-03  9:54       ` René Scharfe
2025-08-03 16:48         ` Junio C Hamano
2025-08-04 19:56           ` René Scharfe
2025-07-18  9:39   ` [PATCH v2 3/3] commit: use prio_queue_replace() in pop_most_recent_commit(),MIME-Version: 1.0 René Scharfe
2025-08-03 11:12     ` Johannes Schindelin
2025-08-03 11:33       ` René Scharfe
2025-07-18  9:39   ` [PATCH v2 2/3] prio-queue: add prio_queue_replace() René Scharfe
2025-07-19  7:04   ` [PATCH v2 0/3] commit: convert pop_most_recent_commit() to prio_queue Jeff King
2025-07-22  6:26   ` SZEDER Gábor
2025-07-22 14:27     ` Junio C Hamano

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).