git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] Fix quadratic performance in rewrite_one.
@ 2008-07-12 18:00 Alexander N. Gavrilov
  2008-07-12 22:55 ` Linus Torvalds
  0 siblings, 1 reply; 3+ messages in thread
From: Alexander N. Gavrilov @ 2008-07-12 18:00 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano

Parent commits are usually older than their children. Thus,
on each iteration of the loop in rewrite_one, add_parents_to_list
traverses all commits previously processed by the loop.
It performs very poorly in case of very long rewrite chains.

Signed-off-by: Alexander Gavrilov <angavrilov@gmail.com>
---
	
	While experimenting with a large Git repository (~10 years of history), I noticed that
	gitk (and git-log) is very slow when asked to show history for a recently created directory:

	$ time git log --no-color --pretty=oneline --parents --boundary -- A_5_Year_Old_Dir | wc
	1620    8042  166759

	real    0m29.969s
	user    0m28.419s
	sys     0m0.304s

	$ time git log --no-color --pretty=oneline --parents --boundary -- A_3_Year_Old_Dir | wc
	66     315    6599
	
	real    1m41.523s
	user    1m38.585s
	sys     0m0.549s

	$ time git log --no-color --pretty=oneline --parents --boundary -- A_1_Year_Old_Dir | wc
	18     108    1898
	
	real    3m33.023s
	user    3m28.817s
	sys     0m0.859s
	
	Profiling showed that most of the time is spent manipulating commit lists:
	
	%   cumulative   self              self     total           
	time   seconds   seconds    calls   s/call   s/call  name    
	98.91    206.21   206.21    76909     0.00     0.00  insert_by_date
	0.36    206.97     0.76   409268     0.00     0.00  find_pack_entry_one
	0.17    207.33     0.36   258208     0.00     0.00  unpack_object_header_gently
	
	After some research and debugging I narrowed it down to this loop in rewrite_one (revision.c):
	
		for (;;) {
			struct commit *p = *pp;
			...
				if (add_parents_to_list(revs, p, &revs->commits) < 0)
			...
			*pp = p->parents->item;
		}
	
	When git-log is called for a recently created directory, all history before its creation ends up
	being processed in this loop. Since add_parents_to_list traverses the list linearly to find the
	point of insertion, which in this case is always at the end of the list, it actually works
	in quadratical time.
	
	I made a fix for it by caching a pointer to the farthest added element. The performance gain
	is obvious, but as it's my first ever change in Git source code, I believe that there is a better
	solution, or, at least, implementation.
	
	$ time git log --no-color --pretty=oneline --parents --boundary -- A_1_Year_Old_Dir | wc
	18     108    1898
	
	real    0m4.101s
	user    0m3.897s
	sys     0m0.123s
	
	Alexander.


 revision.c |   34 ++++++++++++++++++++++++++++------
 1 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/revision.c b/revision.c
index fc66755..5bf8039 100644
--- a/revision.c
+++ b/revision.c
@@ -412,11 +412,31 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
 	commit->object.flags |= TREESAME;
 }
 
-static int add_parents_to_list(struct rev_info *revs, struct commit *commit, struct commit_list **list)
+static void insert_by_date_cached(struct commit *p, struct commit_list **head,
+		    struct commit_list *cached_base, struct commit_list **cache)
+{
+	struct commit_list *new_entry;
+    
+	if (cached_base && p->date < cached_base->item->date) {
+		new_entry = insert_by_date(p, &cached_base->next);
+	}
+	else {
+		new_entry = insert_by_date(p, head);
+	}
+	
+	if (cache && (!*cache || p->date < (*cache)->item->date)) {
+		*cache = new_entry;
+	}
+}
+
+static int add_parents_to_list(struct rev_info *revs, struct commit *commit,
+		    struct commit_list **list, struct commit_list **cache_ptr)
 {
 	struct commit_list *parent = commit->parents;
 	unsigned left_flag;
 
+	struct commit_list *cached_base = cache_ptr ? *cache_ptr : NULL;
+
 	if (commit->object.flags & ADDED)
 		return 0;
 	commit->object.flags |= ADDED;
@@ -445,7 +465,7 @@ static int add_parents_to_list(struct rev_info *revs, struct commit *commit, str
 			if (p->object.flags & SEEN)
 				continue;
 			p->object.flags |= SEEN;
-			insert_by_date(p, list);
+			insert_by_date_cached(p, list, cached_base, cache_ptr);
 		}
 		return 0;
 	}
@@ -470,7 +490,7 @@ static int add_parents_to_list(struct rev_info *revs, struct commit *commit, str
 		p->object.flags |= left_flag;
 		if (!(p->object.flags & SEEN)) {
 			p->object.flags |= SEEN;
-			insert_by_date(p, list);
+			insert_by_date_cached(p, list, cached_base, cache_ptr);
 		}
 		if(revs->first_parent_only)
 			break;
@@ -611,7 +631,7 @@ static int limit_list(struct rev_info *revs)
 
 		if (revs->max_age != -1 && (commit->date < revs->max_age))
 			obj->flags |= UNINTERESTING;
-		if (add_parents_to_list(revs, commit, &list) < 0)
+		if (add_parents_to_list(revs, commit, &list, NULL) < 0)
 			return -1;
 		if (obj->flags & UNINTERESTING) {
 			mark_parents_uninteresting(commit);
@@ -1458,10 +1478,12 @@ enum rewrite_result {
 
 static enum rewrite_result rewrite_one(struct rev_info *revs, struct commit **pp)
 {
+	struct commit_list *cache = NULL;
+
 	for (;;) {
 		struct commit *p = *pp;
 		if (!revs->limited)
-			if (add_parents_to_list(revs, p, &revs->commits) < 0)
+			if (add_parents_to_list(revs, p, &revs->commits, &cache) < 0)
 				return rewrite_one_error;
 		if (p->parents && p->parents->next)
 			return rewrite_one_ok;
@@ -1580,7 +1602,7 @@ static struct commit *get_revision_1(struct rev_info *revs)
 			if (revs->max_age != -1 &&
 			    (commit->date < revs->max_age))
 				continue;
-			if (add_parents_to_list(revs, commit, &revs->commits) < 0)
+			if (add_parents_to_list(revs, commit, &revs->commits, NULL) < 0)
 				return NULL;
 		}
 
-- 
1.5.6.2.24.ge09c4

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [RFC PATCH] Fix quadratic performance in rewrite_one.
  2008-07-12 18:00 [RFC PATCH] Fix quadratic performance in rewrite_one Alexander N. Gavrilov
@ 2008-07-12 22:55 ` Linus Torvalds
  2008-07-13  0:37   ` Alexander Gavrilov
  0 siblings, 1 reply; 3+ messages in thread
From: Linus Torvalds @ 2008-07-12 22:55 UTC (permalink / raw)
  To: Alexander N. Gavrilov; +Cc: git, Junio C Hamano



On Sat, 12 Jul 2008, Alexander N. Gavrilov wrote:
>
> Parent commits are usually older than their children. Thus,
> on each iteration of the loop in rewrite_one, add_parents_to_list
> traverses all commits previously processed by the loop.
> It performs very poorly in case of very long rewrite chains.

Good call, but you don't seem to invalidate the cache when we remove 
things from the list.

The top of the limit_list() loop does that "get top entry from list, an 
free it", and I'm not seeing you invalidating the cache if that entry that 
just got free'd happened to be the cache entry?

Or did I miss some reason why that couldn't happen? 

		Linus

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC PATCH] Fix quadratic performance in rewrite_one.
  2008-07-12 22:55 ` Linus Torvalds
@ 2008-07-13  0:37   ` Alexander Gavrilov
  0 siblings, 0 replies; 3+ messages in thread
From: Alexander Gavrilov @ 2008-07-13  0:37 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: git, Junio C Hamano

On Sunday 13 July 2008 02:55:27 Linus Torvalds wrote:
> On Sat, 12 Jul 2008, Alexander N. Gavrilov wrote:
> > Parent commits are usually older than their children. Thus,
> > on each iteration of the loop in rewrite_one, add_parents_to_list
> > traverses all commits previously processed by the loop.
> > It performs very poorly in case of very long rewrite chains.
>
> Good call, but you don't seem to invalidate the cache when we remove
> things from the list.

The cache is local to rewrite_one, and is invalidated by exiting from that 
function. Other users of add_parents_to_list just pass NULL as cache_ptr, 
thus causing insert_by_date_cached to degenerate into a simple 
insert_by_date.

> The top of the limit_list() loop does that "get top entry from list, an
> free it", and I'm not seeing you invalidating the cache if that entry that
> just got free'd happened to be the cache entry?

This type of workflow can be expected to keep the list relatively short 
(roughly limited by the number of simultaneously existing branches); and if 
it is already long, new entries will probably be added near the beginning 
anyway, so there doesn't seem to be any need to use caching. 

rewrite_one() is special in that it can sometimes walk through thousands of 
commits at once and put them all into the list -- i.e. it is bound not by the 
width of the history, but by its length.

Alexander

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2008-07-13  0:39 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-07-12 18:00 [RFC PATCH] Fix quadratic performance in rewrite_one Alexander N. Gavrilov
2008-07-12 22:55 ` Linus Torvalds
2008-07-13  0:37   ` Alexander Gavrilov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).