From: "Alexander N. Gavrilov" <angavrilov@gmail.com>
To: git@vger.kernel.org
Cc: Junio C Hamano <gitster@pobox.com>
Subject: [RFC PATCH] Fix quadratic performance in rewrite_one.
Date: Sat, 12 Jul 2008 22:00:57 +0400 [thread overview]
Message-ID: <200807122200.58187.angavrilov@gmail.com> (raw)
Parent commits are usually older than their children. Thus,
on each iteration of the loop in rewrite_one, add_parents_to_list
traverses all commits previously processed by the loop.
It performs very poorly in case of very long rewrite chains.
Signed-off-by: Alexander Gavrilov <angavrilov@gmail.com>
---
While experimenting with a large Git repository (~10 years of history), I noticed that
gitk (and git-log) is very slow when asked to show history for a recently created directory:
$ time git log --no-color --pretty=oneline --parents --boundary -- A_5_Year_Old_Dir | wc
1620 8042 166759
real 0m29.969s
user 0m28.419s
sys 0m0.304s
$ time git log --no-color --pretty=oneline --parents --boundary -- A_3_Year_Old_Dir | wc
66 315 6599
real 1m41.523s
user 1m38.585s
sys 0m0.549s
$ time git log --no-color --pretty=oneline --parents --boundary -- A_1_Year_Old_Dir | wc
18 108 1898
real 3m33.023s
user 3m28.817s
sys 0m0.859s
Profiling showed that most of the time is spent manipulating commit lists:
% cumulative self self total
time seconds seconds calls s/call s/call name
98.91 206.21 206.21 76909 0.00 0.00 insert_by_date
0.36 206.97 0.76 409268 0.00 0.00 find_pack_entry_one
0.17 207.33 0.36 258208 0.00 0.00 unpack_object_header_gently
After some research and debugging I narrowed it down to this loop in rewrite_one (revision.c):
for (;;) {
struct commit *p = *pp;
...
if (add_parents_to_list(revs, p, &revs->commits) < 0)
...
*pp = p->parents->item;
}
When git-log is called for a recently created directory, all history before its creation ends up
being processed in this loop. Since add_parents_to_list traverses the list linearly to find the
point of insertion, which in this case is always at the end of the list, it actually works
in quadratical time.
I made a fix for it by caching a pointer to the farthest added element. The performance gain
is obvious, but as it's my first ever change in Git source code, I believe that there is a better
solution, or, at least, implementation.
$ time git log --no-color --pretty=oneline --parents --boundary -- A_1_Year_Old_Dir | wc
18 108 1898
real 0m4.101s
user 0m3.897s
sys 0m0.123s
Alexander.
revision.c | 34 ++++++++++++++++++++++++++++------
1 files changed, 28 insertions(+), 6 deletions(-)
diff --git a/revision.c b/revision.c
index fc66755..5bf8039 100644
--- a/revision.c
+++ b/revision.c
@@ -412,11 +412,31 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
commit->object.flags |= TREESAME;
}
-static int add_parents_to_list(struct rev_info *revs, struct commit *commit, struct commit_list **list)
+static void insert_by_date_cached(struct commit *p, struct commit_list **head,
+ struct commit_list *cached_base, struct commit_list **cache)
+{
+ struct commit_list *new_entry;
+
+ if (cached_base && p->date < cached_base->item->date) {
+ new_entry = insert_by_date(p, &cached_base->next);
+ }
+ else {
+ new_entry = insert_by_date(p, head);
+ }
+
+ if (cache && (!*cache || p->date < (*cache)->item->date)) {
+ *cache = new_entry;
+ }
+}
+
+static int add_parents_to_list(struct rev_info *revs, struct commit *commit,
+ struct commit_list **list, struct commit_list **cache_ptr)
{
struct commit_list *parent = commit->parents;
unsigned left_flag;
+ struct commit_list *cached_base = cache_ptr ? *cache_ptr : NULL;
+
if (commit->object.flags & ADDED)
return 0;
commit->object.flags |= ADDED;
@@ -445,7 +465,7 @@ static int add_parents_to_list(struct rev_info *revs, struct commit *commit, str
if (p->object.flags & SEEN)
continue;
p->object.flags |= SEEN;
- insert_by_date(p, list);
+ insert_by_date_cached(p, list, cached_base, cache_ptr);
}
return 0;
}
@@ -470,7 +490,7 @@ static int add_parents_to_list(struct rev_info *revs, struct commit *commit, str
p->object.flags |= left_flag;
if (!(p->object.flags & SEEN)) {
p->object.flags |= SEEN;
- insert_by_date(p, list);
+ insert_by_date_cached(p, list, cached_base, cache_ptr);
}
if(revs->first_parent_only)
break;
@@ -611,7 +631,7 @@ static int limit_list(struct rev_info *revs)
if (revs->max_age != -1 && (commit->date < revs->max_age))
obj->flags |= UNINTERESTING;
- if (add_parents_to_list(revs, commit, &list) < 0)
+ if (add_parents_to_list(revs, commit, &list, NULL) < 0)
return -1;
if (obj->flags & UNINTERESTING) {
mark_parents_uninteresting(commit);
@@ -1458,10 +1478,12 @@ enum rewrite_result {
static enum rewrite_result rewrite_one(struct rev_info *revs, struct commit **pp)
{
+ struct commit_list *cache = NULL;
+
for (;;) {
struct commit *p = *pp;
if (!revs->limited)
- if (add_parents_to_list(revs, p, &revs->commits) < 0)
+ if (add_parents_to_list(revs, p, &revs->commits, &cache) < 0)
return rewrite_one_error;
if (p->parents && p->parents->next)
return rewrite_one_ok;
@@ -1580,7 +1602,7 @@ static struct commit *get_revision_1(struct rev_info *revs)
if (revs->max_age != -1 &&
(commit->date < revs->max_age))
continue;
- if (add_parents_to_list(revs, commit, &revs->commits) < 0)
+ if (add_parents_to_list(revs, commit, &revs->commits, NULL) < 0)
return NULL;
}
--
1.5.6.2.24.ge09c4
next reply other threads:[~2008-07-12 18:02 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-07-12 18:00 Alexander N. Gavrilov [this message]
2008-07-12 22:55 ` [RFC PATCH] Fix quadratic performance in rewrite_one Linus Torvalds
2008-07-13 0:37 ` Alexander Gavrilov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=200807122200.58187.angavrilov@gmail.com \
--to=angavrilov@gmail.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).