* git-rev-list: add "--dense" flag
@ 2005-10-21 23:40 Linus Torvalds
2005-10-21 23:47 ` Linus Torvalds
` (3 more replies)
0 siblings, 4 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-21 23:40 UTC (permalink / raw)
To: Junio C Hamano, Git Mailing List
This is what the recent git-rev-list changes have all been gearing up for.
When we use a path filter to git-rev-list, the new "--dense" flag asks
git-rev-list to compress the history so that it _only_ contains commits
that change files in the path filter. It also rewrites the parent
information so that tools like "gitk" will see the result as a dense
history tree.
For example, on the current kernel archive:
[torvalds@g5 linux]$ git-rev-list HEAD | wc -l
9904
[torvalds@g5 linux]$ git-rev-list HEAD -- kernel | wc -l
5442
[torvalds@g5 linux]$ git-rev-list --dense HEAD -- kernel | wc -l
356
which shows that while we have almost ten thousand commits, we can prune
down the work to slightly more than half by only following the merges
that are interesting. But further, we can then compress the history to
just 356 entries that actually make changes to the kernel subdirectory.
To see this in action, try something like
gitk --dense -- gitk
to see just the history that affects gitk. Or, to show that true
parallell development still remains parallell, do
gitk --dense -- daemon.c
which shows some parallell commits in the current git tree.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
I'm really happy with how this turned out. It's a bit expensive to run on
big archives, but I _really_ think it's quite spectacular. And likely very
useful indeed.
For example, say you love gitk, but only care about networking changes.
Easy enough, just do
gitk --dense -- net/ include/net/
and off you go. It's not free (we do a _lot_ of tree comparisons), but
dammit, it's fast enough that it's very very useful. The tree comparisons
are done very efficiently.
This is _way_ more powerful than annotate. Interested in just a single
file? Just do
gitk --dense -- kernel/exit.c
and it will show the 17 or so commits that change kernel/exit.c with the
right history (it turns out that there is no parallell development at all
in that file, so in this case it will linearize history entirely).
Damn, I'm good.
diff --git a/rev-list.c b/rev-list.c
index b5dbb9f..5f125fd 100644
--- a/rev-list.c
+++ b/rev-list.c
@@ -11,6 +11,7 @@
#define INTERESTING (1u << 1)
#define COUNTED (1u << 2)
#define SHOWN (1u << 3)
+#define TREECHANGE (1u << 4)
static const char rev_list_usage[] =
"git-rev-list [OPTION] commit-id <commit-id>\n"
@@ -27,6 +28,7 @@ static const char rev_list_usage[] =
" --merge-order [ --show-breaks ]\n"
" --topo-order";
+static int dense = 0;
static int unpacked = 0;
static int bisect_list = 0;
static int tag_objects = 0;
@@ -79,6 +81,26 @@ static void show_commit(struct commit *c
fflush(stdout);
}
+static void rewrite_one(struct commit **pp)
+{
+ for (;;) {
+ struct commit *p = *pp;
+ if (p->object.flags & (TREECHANGE | UNINTERESTING))
+ return;
+ /* Only single-parent commits don't have TREECHANGE */
+ *pp = p->parents->item;
+ }
+}
+
+static void rewrite_parents(struct commit *commit)
+{
+ struct commit_list *parent = commit->parents;
+ while (parent) {
+ rewrite_one(&parent->item);
+ parent = parent->next;
+ }
+}
+
static int filter_commit(struct commit * commit)
{
if (stop_traversal && (commit->object.flags & BOUNDARY))
@@ -95,6 +117,11 @@ static int filter_commit(struct commit *
return STOP;
if (no_merges && (commit->parents && commit->parents->next))
return CONTINUE;
+ if (paths && dense) {
+ if (!(commit->object.flags & TREECHANGE))
+ return CONTINUE;
+ rewrite_parents(commit);
+ }
return DO;
}
@@ -404,6 +431,14 @@ static struct diff_options diff_opt = {
.change = file_change,
};
+static int same_tree(struct tree *t1, struct tree *t2)
+{
+ is_different = 0;
+ if (diff_tree_sha1(t1->object.sha1, t2->object.sha1, "", &diff_opt) < 0)
+ return 0;
+ return !is_different;
+}
+
static struct commit *try_to_simplify_merge(struct commit *commit, struct commit_list *parent)
{
if (!commit->tree)
@@ -415,11 +450,7 @@ static struct commit *try_to_simplify_me
parse_commit(p);
if (!p->tree)
continue;
- is_different = 0;
- if (diff_tree_sha1(commit->tree->object.sha1,
- p->tree->object.sha1, "", &diff_opt) < 0)
- continue;
- if (!is_different)
+ if (same_tree(commit->tree, p->tree))
return p;
}
return NULL;
@@ -485,6 +516,27 @@ static void add_parents_to_list(struct c
}
}
+static void compress_list(struct commit_list *list)
+{
+ while (list) {
+ struct commit *commit = list->item;
+ struct commit_list *parent = commit->parents;
+ list = list->next;
+
+ /*
+ * Exactly one parent? Check if it leaves the tree
+ * unchanged
+ */
+ if (parent && !parent->next) {
+ struct tree *t1 = commit->tree;
+ struct tree *t2 = parent->item->tree;
+ if (!t1 || !t2 || same_tree(t1, t2))
+ continue;
+ }
+ commit->object.flags |= TREECHANGE;
+ }
+}
+
static struct commit_list *limit_list(struct commit_list *list)
{
struct commit_list *newlist = NULL;
@@ -514,6 +566,8 @@ static struct commit_list *limit_list(st
}
if (tree_objects)
mark_edges_uninteresting(newlist);
+ if (paths && dense)
+ compress_list(newlist);
if (bisect_list)
newlist = find_bisection(newlist);
return newlist;
@@ -700,6 +754,10 @@ int main(int argc, const char **argv)
limited = 1;
continue;
}
+ if (!strcmp(arg, "--dense")) {
+ dense = 1;
+ continue;
+ }
if (!strcmp(arg, "--")) {
paths = get_pathspec(prefix, argv + i + 1);
if (paths) {
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-21 23:40 git-rev-list: add "--dense" flag Linus Torvalds
@ 2005-10-21 23:47 ` Linus Torvalds
2005-10-21 23:55 ` Linus Torvalds
` (2 subsequent siblings)
3 siblings, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-21 23:47 UTC (permalink / raw)
To: Junio C Hamano, Git Mailing List
On Fri, 21 Oct 2005, Linus Torvalds wrote:
>
> parallell
Oh, please also fix my spelling. That's one 'l' too many at the end.
Damn, I think I'm a reasonably good speller most of the time, but this
time it shows that Swedish is my first language (where it really _is_
written that way, with four 'l's total).
Linus
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-21 23:40 git-rev-list: add "--dense" flag Linus Torvalds
2005-10-21 23:47 ` Linus Torvalds
@ 2005-10-21 23:55 ` Linus Torvalds
2005-10-22 0:37 ` Petr Baudis
2005-10-25 18:07 ` Jonas Fonseca
3 siblings, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-21 23:55 UTC (permalink / raw)
To: Junio C Hamano, Git Mailing List
On Fri, 21 Oct 2005, Linus Torvalds wrote:
>
> [torvalds@g5 linux]$ git-rev-list --dense HEAD -- kernel | wc -l
> 356
Btw, one additional point.
This took 0.91 seconds to complete on the current kernel history on my
machine with a pretty much fully packed archive, and the memory footprint
was a total of about 12MB.
And it scales pretty well too. On the historical linux archive, which is
three years of history, the same thing takes me just over 12 seconds and
52MB, and that's for the _whole_ history. And it's not just following one
file: it's following that subdirectory.
So it really is pretty damn cool.
Of course, I might have a bug somewhere, but it all _seems_ to work very
well indeed.
Linus
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-21 23:40 git-rev-list: add "--dense" flag Linus Torvalds
2005-10-21 23:47 ` Linus Torvalds
2005-10-21 23:55 ` Linus Torvalds
@ 2005-10-22 0:37 ` Petr Baudis
2005-10-22 0:47 ` Handling renames Petr Baudis
2005-10-22 1:26 ` git-rev-list: add "--dense" flag Linus Torvalds
2005-10-25 18:07 ` Jonas Fonseca
3 siblings, 2 replies; 21+ messages in thread
From: Petr Baudis @ 2005-10-22 0:37 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Junio C Hamano, Git Mailing List
Dear diary, on Sat, Oct 22, 2005 at 01:40:54AM CEST, I got a letter
where Linus Torvalds <torvalds@osdl.org> told me that...
> When we use a path filter to git-rev-list, the new "--dense" flag asks
> git-rev-list to compress the history so that it _only_ contains commits
> that change files in the path filter. It also rewrites the parent
> information so that tools like "gitk" will see the result as a dense
> history tree.
There is no documentation for the --dense flag and it is even missing
from the usage string.
But my main concern is - will it be possible to do the rename detection
here as well? Using --dense instead of explicit diff-tree calls in
cg-log would be nice optimization, but I was about to add support for
optional following of renames for cg-log <filename>. That's really
pretty useful to have, every time I hit the Junio's big scripts rename
I keep repeating that to myself. ;-) Now when core GIT got the comfort
of per-file history, I only hope that it will start to annoy you as well.
--
Petr "Pasky" Baudis
Stuff: http://pasky.or.cz/
VI has two modes: the one in which it beeps and the one in which
it doesn't.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Handling renames.
2005-10-22 0:37 ` Petr Baudis
@ 2005-10-22 0:47 ` Petr Baudis
2005-10-22 1:28 ` Linus Torvalds
2005-10-22 1:26 ` git-rev-list: add "--dense" flag Linus Torvalds
1 sibling, 1 reply; 21+ messages in thread
From: Petr Baudis @ 2005-10-22 0:47 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Junio C Hamano, Git Mailing List
(Having Apr 14 flashbacks? Good memory!)
Dear diary, on Sat, Oct 22, 2005 at 02:37:33AM CEST, I got a letter
where Petr Baudis <pasky@suse.cz> told me that...
> But my main concern is - will it be possible to do the rename detection
> here as well? Using --dense instead of explicit diff-tree calls in
> cg-log would be nice optimization, but I was about to add support for
> optional following of renames for cg-log <filename>. That's really
> pretty useful to have, every time I hit the Junio's big scripts rename
> I keep repeating that to myself. ;-) Now when core GIT got the comfort
> of per-file history, I only hope that it will start to annoy you as well.
After all, this might be as good time to bring this up as any. Heavy
post-1.0 material follows:
How to track renames? I believe the situation has changed in the last
half a year. GIT really is a full-fledged SCM by now (at least its
major part code-wise), and I think it's hopefully becoming obvious that
we need to track renames. I actually decided to skip the whole
discussion why so, because we already _do_ concern ourselves with
renames - that's what the cool diff -M gadget does. And people start to
want to use it for all kind of history-digging stuff (not just for nice
diffs between trees).
So the problem is whether we should make this explicit. diff -M is
only a heuristic and it can go wrong, while it was empirically found
out in other SCMs that people actually don't mind telling their SCM
about renames explicitly - no more than telling it about adds and
removals explicitly. So the user is willing to tell us what precisely
happened and it would be foolish to throw that away and insist on
guessing. Besides, guessing (and even doing that everytime we go
through the history) is fundamentally slow, orders of magnitude more
than just a tree diff.
If I convince you that it is worth tracking the renames explicitly,
"how" is already a minor question. One idea of mine was to add an "edge"
object describing the edge between two trees (that's optimized for
flexible use - some people use GIT for really weird things and perhaps
do not use commits at all; edge between two commits would be optimized
for flexibility in case we will later think of some other cool stuff to
track at the edge):
trees c53f757133bb84a2d87e901c49207e9b7c48e1a6 6bc7aa4f652d0ef49108d9e30a7ea7fbf8e44639
rename git-pull-script\0git-pull.sh\0
copy somefile1\0somefile2\0
rewrite anotherfile1\0anotherfile2\0
(The "rewrite" line might be controversial. And you might want to
merge this with the delta objects during packing, or do something
similarly clever.)
Then you could e.g. pass the edge object ID as the second ID on the
parent line:
parent 99977bd5fdeabbd0608a70e9411c243007ec4ea2 edgeobjectid
--
Petr "Pasky" Baudis
Stuff: http://pasky.or.cz/
VI has two modes: the one in which it beeps and the one in which
it doesn't.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-22 0:37 ` Petr Baudis
2005-10-22 0:47 ` Handling renames Petr Baudis
@ 2005-10-22 1:26 ` Linus Torvalds
2005-10-22 2:56 ` Petr Baudis
2005-10-22 3:55 ` Junio C Hamano
1 sibling, 2 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-22 1:26 UTC (permalink / raw)
To: Petr Baudis; +Cc: Junio C Hamano, Git Mailing List
On Sat, 22 Oct 2005, Petr Baudis wrote:
>
> There is no documentation for the --dense flag and it is even missing
> from the usage string.
I'm not much for docs ;)
The whole path argument also isn't even there (and without paths, --dense
doesn't matter)
> But my main concern is - will it be possible to do the rename detection
> here as well?
Yes. Note that git-rev-list doesn't actually _do_ the diff, it only checks
whether the tree is changed. You still just get a list of commits out of
it, and it is up to you to decide what to do with it.
If you're just going to feed them to diff-tree _anyway_, then you might as
well not even do the dense thing, because quite frankly, you're just doing
extra work.
But let's say that you want to follow a certain filename, what you can do
is basically (fake shell syntax with "goto restart")
rev=HEAD
restart:
git-rev-list $rev --dense --parents -- "$filename" |
while read commit parent1 restofparents
do
if [ "$restofparents" ]; then
.. it's a merge, do whatever it is you do
with merges ..
else
shaold=$(git-ls-tree $parent -- "$filename")
shanew=$(git-ls-tree $commit -- "$filename")
# Did it disappear?
# Maybe it got renamed from something
if [ -z "$shaold" ]; then
old=$(git-diff-tree -M -r $commit |
grep " R .* $filename")
echo "rename? $old"
filename=figure-it-out-from-$old
rev=$parent
goto restart
fi
git-diff-tree -p $commit -- "$filename"
fi
while
or something similar.
In other words: you'll basically have to figure out the renames on your
own, and follow the renaming, but something like the above should do it.
Linus
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: Handling renames.
2005-10-22 0:47 ` Handling renames Petr Baudis
@ 2005-10-22 1:28 ` Linus Torvalds
2005-10-22 1:51 ` Petr Baudis
0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2005-10-22 1:28 UTC (permalink / raw)
To: Petr Baudis; +Cc: Junio C Hamano, Git Mailing List
On Sat, 22 Oct 2005, Petr Baudis wrote:
>
> How to track renames? I believe the situation has changed in the last
> half a year.
I disagree.
Every single thing that said that renames were a bad idea to track when
git started is still equally true.
> If I convince you that it is worth tracking the renames explicitly,
> "how" is already a minor question.
Never. I'm 100% convinced that tracking renames is WRONG WRONG WRONG.
You can follow renames _afterwards_.
Git tracks contents. And I think we've proven that figuring out renames
after-the-fact from those contents is not only doable, but very well
supported already.
I'm convinced that git handles renames better than any other SCM ever.
Exactly because we figure it out when it matters.
Linus
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: Handling renames.
2005-10-22 1:28 ` Linus Torvalds
@ 2005-10-22 1:51 ` Petr Baudis
2005-10-22 2:10 ` Junio C Hamano
2005-10-22 3:23 ` Linus Torvalds
0 siblings, 2 replies; 21+ messages in thread
From: Petr Baudis @ 2005-10-22 1:51 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Junio C Hamano, Git Mailing List
> Every single thing that said that renames were a bad idea to track
> when git started is still equally true.
It'd be good to clarify whether we discuss whether the idea of tracking
renames is good or bad, or whether having the user explicitly specify
renames is better than figuring that out automagically. Your first
comment would indicate the former, but the rest of your reply the
latter.
> You can follow renames _afterwards_.
I can - crudely, but what's the point, if the user is dying to give me
the information.
> Git tracks contents. And I think we've proven that figuring out renames
> after-the-fact from those contents is not only doable, but very well
> supported already.
It's unreliable and it's slow (well, perhaps I should get some numbers
to back that out, but given how it is done I take it for granted). Does
not sound too "very well" to me.
> I'm convinced that git handles renames better than any other SCM ever.
> Exactly because we figure it out when it matters.
It matters at least every time you show per-file history and every time
you merge cross the rename. I think that can be both pretty common if
you ever do the rename. That means you can do an expensive guess
every time you hit that, and the guess can get it wrong, in which case
there is no way around that and you lose.
--
Petr "Pasky" Baudis
Stuff: http://pasky.or.cz/
VI has two modes: the one in which it beeps and the one in which
it doesn't.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: Handling renames.
2005-10-22 1:51 ` Petr Baudis
@ 2005-10-22 2:10 ` Junio C Hamano
2005-10-22 2:49 ` Petr Baudis
2005-10-22 3:23 ` Linus Torvalds
1 sibling, 1 reply; 21+ messages in thread
From: Junio C Hamano @ 2005-10-22 2:10 UTC (permalink / raw)
To: Petr Baudis; +Cc: git
Petr Baudis <pasky@suse.cz> writes:
>> I'm convinced that git handles renames better than any other SCM ever.
>> Exactly because we figure it out when it matters.
>
> It matters at least every time you show per-file history and every time
> you merge cross the rename. I think that can be both pretty common if
> you ever do the rename. That means you can do an expensive guess
> every time you hit that, and the guess can get it wrong, in which case
> there is no way around that and you lose.
I think it is OK for the higher level layer (like your
single-file-history follower) to use what you outlined with
"edges", as either a hint/request from the user and/or a cache
of what the expensive and unreliable thing figured out.
I however do not necessarily think adding the "edges"
information as an optional second item on "parent " line in a
commit is a good idea. Rather, treating the set of "edges" just
like we treat the grafts feel more appropriate to me. IOW,
create .git/info/edges and keep your edge information there.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: Handling renames.
2005-10-22 2:10 ` Junio C Hamano
@ 2005-10-22 2:49 ` Petr Baudis
0 siblings, 0 replies; 21+ messages in thread
From: Petr Baudis @ 2005-10-22 2:49 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
Dear diary, on Sat, Oct 22, 2005 at 04:10:59AM CEST, I got a letter
where Junio C Hamano <junkio@cox.net> told me that...
> I think it is OK for the higher level layer (like your
> single-file-history follower) to use what you outlined with
> "edges", as either a hint/request from the user and/or a cache
> of what the expensive and unreliable thing figured out.
>
> I however do not necessarily think adding the "edges"
> information as an optional second item on "parent " line in a
> commit is a good idea.
Well, it's obviously unflying idea when it doesn't become core part of
GIT. ;-) That won't stop me with Cogito, though...
> Rather, treating the set of "edges" just
> like we treat the grafts feel more appropriate to me. IOW,
> create .git/info/edges and keep your edge information there.
The thing is, you aren't supposed to have a lot of grafts, they are
expected to be rare. OTOH, renames aren't really all that rare, and as
the history goes, you'll accumulate a lot of them. Besides, it's not
just a cache - in case the user manually recorded the rename, it's an
important historical information, as important as e.g. the commit
message. It should be also distributed with the rest of the history
data, not be a per-repository thing. (Actually, this does not mix with
the cache at all - that should be a separate thing, quite possibly in
info/.)
But if GIT won't do this, I will have to do this in some Cogito-specific
way. My preliminary plan is appending something like this to the commit
message:
---
!%$rename foo\x20bar baz\nquux
!%$ being some arbitrary magic literal, and filter this out from cg-log
and such. Also, all Cogito commits adding/removing files but not
renaming any would have a '!%$norenames' line in the commit message, so
that I know that I don't have to do the expensive heuristics because no
rename/copy happened.
--
Petr "Pasky" Baudis
Stuff: http://pasky.or.cz/
VI has two modes: the one in which it beeps and the one in which
it doesn't.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-22 1:26 ` git-rev-list: add "--dense" flag Linus Torvalds
@ 2005-10-22 2:56 ` Petr Baudis
2005-10-22 3:30 ` Linus Torvalds
2005-10-22 3:55 ` Junio C Hamano
1 sibling, 1 reply; 21+ messages in thread
From: Petr Baudis @ 2005-10-22 2:56 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Junio C Hamano, Git Mailing List
Dear diary, on Sat, Oct 22, 2005 at 03:26:27AM CEST, I got a letter
where Linus Torvalds <torvalds@osdl.org> told me that...
> On Sat, 22 Oct 2005, Petr Baudis wrote:
> > There is no documentation for the --dense flag and it is even missing
> > from the usage string.
>
> I'm not much for docs ;)
>
> The whole path argument also isn't even there (and without paths, --dense
> doesn't matter)
Then it should be added to the docs as well. ;-)
Oh well, I guess I can always send a patch when yours will get applied. :-)
> > But my main concern is - will it be possible to do the rename detection
> > here as well?
>
> Yes. Note that git-rev-list doesn't actually _do_ the diff, it only checks
> whether the tree is changed. You still just get a list of commits out of
> it, and it is up to you to decide what to do with it.
Ok, fair enough.
> If you're just going to feed them to diff-tree _anyway_, then you might as
> well not even do the dense thing, because quite frankly, you're just doing
> extra work.
fork() is awfully expensive for me. fork()ing something (supposedly)
trivial (it was stat) slowed my (non-trivial) loop about 4 times. So I
think it will still pay off hugely if I will be able do the diff-tree
only on the interesting commits.
--
Petr "Pasky" Baudis
Stuff: http://pasky.or.cz/
VI has two modes: the one in which it beeps and the one in which
it doesn't.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: Handling renames.
2005-10-22 1:51 ` Petr Baudis
2005-10-22 2:10 ` Junio C Hamano
@ 2005-10-22 3:23 ` Linus Torvalds
1 sibling, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-22 3:23 UTC (permalink / raw)
To: Petr Baudis; +Cc: Junio C Hamano, Git Mailing List
On Sat, 22 Oct 2005, Petr Baudis wrote:
>
> > You can follow renames _afterwards_.
>
> I can - crudely, but what's the point, if the user is dying to give me
> the information.
No the user is NOT.
The fact is, users have not a frigging clue when a rename happens.
I told you before, I'll tell you again: if you depend on users telling you
about renames, you'll get it wrong. You'll get it wrong quite often, in
fact.
This is not something I'm going to discuss again. Go back to all the same
arguments from 6 months ago. I was right then, I'm right now.
Linus
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-22 2:56 ` Petr Baudis
@ 2005-10-22 3:30 ` Linus Torvalds
0 siblings, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-22 3:30 UTC (permalink / raw)
To: Petr Baudis; +Cc: Junio C Hamano, Git Mailing List
On Sat, 22 Oct 2005, Petr Baudis wrote:
>
> > If you're just going to feed them to diff-tree _anyway_, then you might as
> > well not even do the dense thing, because quite frankly, you're just doing
> > extra work.
>
> fork() is awfully expensive for me. fork()ing something (supposedly)
> trivial (it was stat) slowed my (non-trivial) loop about 4 times.
Not fork. More like
git-rev-list .. | git-diff-tree --stdin
If you do it that way (which is quite common - git-whatchanged, git-log
etc), there's no reason to ask for a dense output, because git-diff-tree
can handle the non-dense one as fast as git-rev-list can.
But the --dense option is wonderful for "gitk", and for "slow stuff".
Where "slow" can indeed be a fork, or just an interpreter loop like a
shell/perl script, which would fork/exec git-diff-tree for each commit.
>From a commit-list standpoint
git-rev-list --dense ... -- paths
is the same as
git-rev-list -- paths | git-diff-tree -s -r --stdin -- paths
but it's somewhat more efficient, and more importantly, it rewrites the
parents. So quite often, you'd want to use "--dense" together with
"--parent" so that you get the "rewritten" history. That's how come gitk
didn't need any changes, and it "just worked". It's also how my example
script did the rename detection without having to traverse unnecessary
commits.
Linus
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-22 1:26 ` git-rev-list: add "--dense" flag Linus Torvalds
2005-10-22 2:56 ` Petr Baudis
@ 2005-10-22 3:55 ` Junio C Hamano
2005-10-22 20:37 ` Linus Torvalds
1 sibling, 1 reply; 21+ messages in thread
From: Junio C Hamano @ 2005-10-22 3:55 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Petr Baudis, Git Mailing List
Linus Torvalds <torvalds@osdl.org> writes:
> Yes. Note that git-rev-list doesn't actually _do_ the diff, it only checks
> whether the tree is changed. You still just get a list of commits out of
> it, and it is up to you to decide what to do with it.
>
> If you're just going to feed them to diff-tree _anyway_, then you might as
> well not even do the dense thing, because quite frankly, you're just doing
> extra work.
>
> But let's say that you want to follow a certain filename, what you can do
> is basically (fake shell syntax with "goto restart")
>...
> or something similar.
Isn't that only true because you are not doing more than "have
these paths change" in the new rev-list that already has part of
diff-tree?
If rev-list can optionally be told to detect renames internally
(it has necessary bits after all), it could adjust the set of
paths to follow when it sees something got renamed, either by
replacing the original path given from the command line with its
previous name, or adding its previous name to the set of path
limitters (to cover the copy case as well).
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-22 3:55 ` Junio C Hamano
@ 2005-10-22 20:37 ` Linus Torvalds
0 siblings, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-22 20:37 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Petr Baudis, Git Mailing List
On Fri, 21 Oct 2005, Junio C Hamano wrote:
>
> If rev-list can optionally be told to detect renames internally
> (it has necessary bits after all), it could adjust the set of
> paths to follow when it sees something got renamed, either by
> replacing the original path given from the command line with its
> previous name, or adding its previous name to the set of path
> limitters (to cover the copy case as well).
The problem with renames isn't the straight-line case.
The problem with renames is the merge case. And quite frankly, I don't
know how to handle that sanely.
If everything was straight-line (CVS), renames would be trivial. But
git-rev-list very fundamentally works on something that isn't. So let's
look at the _fundamental_ problem:
- git-rev-list traverses one commit at a time, and it doesn't even _know_
whether it has seem all the parents of that commit during the first
phase.
The first phase is the "limit_list()" thing, which is when we decide
which commits are reachable, uninteresting, and which merges to follow.
Now, think about this: since we don't even know that we've seen all
parents, that pretty much by definition means that we can't know what
has been renamed if we were to track it.
- in the second phase (which is where I do the densification), we do
actually have the full tree and that makes a lot of things a lot easier
to do. When it comes to dense, for example, it means that I know all
the UNINTERESTING logic has already been done, and that all merges have
been simplified.
But in the second phase, we couldn't do rename detection either, since
by then, we've already fixed the list of names as far as merges are
concerned.
And note that this fundamental issue is true _whether_ we have some
explicit rename information in a commit or not.
Git-rev-list has a few additional issues that make it even more
interesting:
- git-rev-list fundamentally is designed for multiple heads. There is no
one special "origin" head. We might not have _one_ end-point, we might
have twenty-five different heads we're looking at at the same time. Try
gitk --all --dense -d -- git-fetch.c
on the current git archive, and be impressed by how well it works (and
yes, you'll see a funny artifact: "--dense" never removes a line of
development entirely, so you'll see a single dot for the two
"unrelated" lines: "gitk" and "to-do" do not share a root with the rest
of them. You'll get a single dot for that root, even if it
obviously doesn't actually change "git-fetch.c").
You'll also very clearly see the "Big tool rename" which created that
name: it will be the first commit after the root that is visible that
way.
- path limiters are fundamentally designed to take a list of paths and
directories. Personally, I find that to be a lot more important than a
single file. That's very efficient the way we do it, but if you were to
_change_ the list of paths, you'd basically have to track those changes
over history.
(This actually happens even with a single path - imagine a merge, and a
rename down one line of the merge. You'll have to keep track of
different files for the different lines of development, and then when
they join together, and the rename only happened in one of them, you'll
have to make an executive decision, or just continue to track _both_)
So what do I propose? I propose that you realize that "git-rev-list" is
just the _building_ block. It does one thing, and one thing only: it
creates revision list.
A user basically _never_ uses git-rev-list directly: it just doesn't make
sense. It's not what git-rev-list is there for. git-rev-list is meant to
be used as a base for doing the real work. And that's how you can use it
for renaming too.
If you think of "git-rev-list --dense -- name" as a fast way to get a set
of commits that affect "name", suddenly it all makes sense. You suddenly
realize that that's a nice building block for figuring out renames. It's
not _all_ of it, but it's a big portion.
To go back to "gitk", let's see what the path limitation shows us. Right
now, doing a
gitk --all --dense -d -- git-fetch.sh
only shows that particular name, and that's by design. Maybe that's what
the user wants? You have to realize that especially if you remember an
_old_ name that may not even exist any more, that's REALLY what you want.
Something that works more like "annotate" is useless, because something
that works like annotate would just say "I don't have that file, I can't
follow renames" and exit.
So the first lesson to learn is that following just pure path-names is
actually meaningful ON ITS OWN! Sometimes you do NOT want to follow
renames.
For example, let's say that you used to work on git 4 months ago, but gave
up, since it was too rough for you. But you played around with it, and now
you're back, and you have an old patch that you want to re-apply, but it
doesn't talk about "git-fetch.sh", it talks about "git-fetch-script". So
you do
gitk --dense -- git-fetch-script
and voila, it does the right thing, and top of the thing is "Big tool
rename", which tells you _exactly_ what happened to that PATHNAME.
See? Static pathnames are important!
Now, this does show that when you _do_ care about renames, "gitk" right
now doesn't help you very much (no offense to gitk - how could it know
that git-rev-list would give it pathname detection one day?). Let's
go back to the original example, and see what we could do to make gitk
more useful..
gitk --all --dense -d -- git-fetch.sh
Go and select that "Big tool rename" thing, and start looking for the
rename..
You won't find it. Why? You'll see all-new files, no renames. It turns out
that "gitk" follows the "parent" thing a bit _too_ slavishly, which is
correct on one level, but in this case with "--dense" it turns out that
what you want to do is see only what _that_ commit does, not what that
commit did relative to its parent (which was the initial revision).
So while "--dense" made gitk work even without any changes, it's clear
that the new capability means that gitk might want to have a new button:
"show diff against 'fake parent'" and "show diff against 'real parent'".
If you want the global view, the default gitk behaviour is correct (it
will show a "valid" diff - you'll see everything that changed between the
points it shows). But in a rename-centric world, you want the _local_
change to that commit, and right now gitk can't show you that.
So for trackign renames, we probably want that as a helper to gitk.
Also, gitk has a "re-read references" button, but if you track renames,
you probably want to do more than re-read them: you want to re-DO them
with the "Big rename" as the new head (forget the old references
entirely), and with the name list changed. New functionality (possibly
you'd like to havea "New wiew" button, which actually starts a new gitk
so that you can see both of them). Right now you'd have to do it by hand:
gitk --dense 215a7ad1ef790467a4cd3f0dcffbd6e5f04c38f7 -- git-fetch-script
(where 215a.. is the thing you get when you select the "Big tool rename")
You'd also probably like to have some way to limit the names shown for the
git-diff-tree in gitk.
In short, the new "gitk --dense -- filename" doesn't help you nearly as
much as it _could_. But when you squint a bit, I think you'll see how it's
quite possible to do...
Linus
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
@ 2005-10-23 19:30 Marco Costalba
0 siblings, 0 replies; 21+ messages in thread
From: Marco Costalba @ 2005-10-23 19:30 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Git Mailing List
Linus Torvalds wrote:
>
>And it scales pretty well too. On the historical linux archive, which is
>three years of history, the same thing takes me just over 12 seconds and
>52MB, and that's for the _whole_ history. And it's not just following one
>file: it's following that subdirectory.
>
>So it really is pretty damn cool.
>
Yes, it is. Very powerful and useful tool indeed. IMHO kudos!
>Of course, I might have a bug somewhere, but it all _seems_ to work very
>well indeed.
>
The only bug I found is in qgit ;-) that failed to correctly handle --dense option.
It is now fixed in my GIT archive: http://digilander.libero.it/mcostalba/qgit.git
Marco
__________________________________
Yahoo! FareChase: Search multiple travel sites in one click.
http://farechase.yahoo.com
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-21 23:40 git-rev-list: add "--dense" flag Linus Torvalds
` (2 preceding siblings ...)
2005-10-22 0:37 ` Petr Baudis
@ 2005-10-25 18:07 ` Jonas Fonseca
2005-10-25 18:23 ` Linus Torvalds
3 siblings, 1 reply; 21+ messages in thread
From: Jonas Fonseca @ 2005-10-25 18:07 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Junio C Hamano, Git Mailing List
Linus Torvalds <torvalds@osdl.org> wrote Fri, Oct 21, 2005:
> To see this in action, try something like
>
> gitk --dense -- gitk
>
> to see just the history that affects gitk.
Is the initial commit supposed to be listed when the file has been added
later? I would expect it to only list until (and including) the commit
where the file was introduced.
prompt > git-rev-list --dense HEAD -- compat/mmap.c
f48000fcbe1009c18f1cc46e56cde2cb632071fa
730d48a2ef88a7fb7aa4409d40b1e6964a93267f
e83c5163316f89bfbde7d9ab23ca2e25604af290
prompt > git-cat-file commit e83c
tree 2b5bfdf7798569e0b59b16eb9602d5fa572d6038
author Linus Torvalds <torvalds@ppc970.osdl.org> 1112911993 -0700
committer Linus Torvalds <torvalds@ppc970.osdl.org> 1112911993 -0700
Initial revision of "git", the information manager from hell
Interestingly, for the special gitk, the correct commit is the last rev
listed.
--
Jonas Fonseca
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-25 18:07 ` Jonas Fonseca
@ 2005-10-25 18:23 ` Linus Torvalds
2005-10-25 18:40 ` Jonas Fonseca
2005-10-25 18:50 ` Linus Torvalds
0 siblings, 2 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-25 18:23 UTC (permalink / raw)
To: Jonas Fonseca; +Cc: Junio C Hamano, Git Mailing List
On Tue, 25 Oct 2005, Jonas Fonseca wrote:
>
> Is the initial commit supposed to be listed when the file has been added
> later?
Right now --dense will _always_ show the root commit. I didn't do the
logic that does the diff against an empty tree. I was lazy.
This patch does that, and may or may not work.
Does this match what you expected?
Linus
----
diff --git a/rev-list.c b/rev-list.c
index 5f125fd..038dae2 100644
--- a/rev-list.c
+++ b/rev-list.c
@@ -439,6 +439,30 @@ static int same_tree(struct tree *t1, st
return !is_different;
}
+static int same_tree_as_empty(struct tree *t1)
+{
+ int retval;
+ void *tree;
+ struct tree_desc empty, real;
+
+ if (!t1)
+ return 0;
+
+ tree = read_object_with_reference(t1->object.sha1, "tree", &real.size, NULL);
+ if (!tree)
+ return 0;
+ real.buf = tree;
+
+ empty.buf = "";
+ empty.size = 0;
+
+ is_different = 0;
+ retval = diff_tree(&empty, &real, "", &diff_opt);
+ free(tree);
+
+ return !retval && !is_different;
+}
+
static struct commit *try_to_simplify_merge(struct commit *commit, struct commit_list *parent)
{
if (!commit->tree)
@@ -523,11 +547,17 @@ static void compress_list(struct commit_
struct commit_list *parent = commit->parents;
list = list->next;
+ if (!parent) {
+ if (!same_tree_as_empty(commit->tree))
+ commit->object.flags |= TREECHANGE;
+ continue;
+ }
+
/*
* Exactly one parent? Check if it leaves the tree
* unchanged
*/
- if (parent && !parent->next) {
+ if (!parent->next) {
struct tree *t1 = commit->tree;
struct tree *t2 = parent->item->tree;
if (!t1 || !t2 || same_tree(t1, t2))
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-25 18:23 ` Linus Torvalds
@ 2005-10-25 18:40 ` Jonas Fonseca
2005-10-25 18:52 ` Linus Torvalds
2005-10-25 18:50 ` Linus Torvalds
1 sibling, 1 reply; 21+ messages in thread
From: Jonas Fonseca @ 2005-10-25 18:40 UTC (permalink / raw)
To: Linus Torvalds; +Cc: Junio C Hamano, Git Mailing List
Linus Torvalds <torvalds@osdl.org> wrote Tue, Oct 25, 2005:
> On Tue, 25 Oct 2005, Jonas Fonseca wrote:
> >
> > Is the initial commit supposed to be listed when the file has been added
> > later?
>
> Right now --dense will _always_ show the root commit. I didn't do the
> logic that does the diff against an empty tree. I was lazy.
>
> This patch does that, and may or may not work.
Without the workaround below it segfaults.
> Does this match what you expected?
Yes, thanks.
diff --git a/rev-list.c b/rev-list.c
index 5f125fd..82ec656 100644
--- a/rev-list.c
+++ b/rev-list.c
@@ -87,6 +87,7 @@ static void rewrite_one(struct commit **
struct commit *p = *pp;
if (p->object.flags & (TREECHANGE | UNINTERESTING))
return;
+ if (!p->parents) return;
/* Only single-parent commits don't have TREECHANGE */
*pp = p->parents->item;
}
--
Jonas Fonseca
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-25 18:23 ` Linus Torvalds
2005-10-25 18:40 ` Jonas Fonseca
@ 2005-10-25 18:50 ` Linus Torvalds
1 sibling, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-25 18:50 UTC (permalink / raw)
To: Jonas Fonseca; +Cc: Junio C Hamano, Git Mailing List
On Tue, 25 Oct 2005, Linus Torvalds wrote:
>
> Right now --dense will _always_ show the root commit. I didn't do the
> logic that does the diff against an empty tree. I was lazy.
>
> This patch does that, and may or may not work.
Never mind. It's incorrect.
It's -close- to being correct, but it will SIGSEGV because now the root
won't necessarily have the TREECHANGED flag, so now it can follow the root
down to its parents (which is a NULL pointer - that's the definition of a
root commit, of course).
It needs to remove the parent pointers that become NULL.
This patch is even slightly tested, and might do a better job.
[ Sorry for sending untested crap out, but I have a fairly good track
record in general. Testing is for wimps ]
Linus
---
diff --git a/rev-list.c b/rev-list.c
index 5f125fd..edf3b37 100644
--- a/rev-list.c
+++ b/rev-list.c
@@ -81,23 +81,28 @@ static void show_commit(struct commit *c
fflush(stdout);
}
-static void rewrite_one(struct commit **pp)
+static int rewrite_one(struct commit **pp)
{
for (;;) {
struct commit *p = *pp;
if (p->object.flags & (TREECHANGE | UNINTERESTING))
- return;
- /* Only single-parent commits don't have TREECHANGE */
+ return 0;
+ if (!p->parents)
+ return -1;
*pp = p->parents->item;
}
}
static void rewrite_parents(struct commit *commit)
{
- struct commit_list *parent = commit->parents;
- while (parent) {
- rewrite_one(&parent->item);
- parent = parent->next;
+ struct commit_list **pp = &commit->parents;
+ while (*pp) {
+ struct commit_list *parent = *pp;
+ if (rewrite_one(&parent->item) < 0) {
+ *pp = parent->next;
+ continue;
+ }
+ pp = &parent->next;
}
}
@@ -439,6 +444,30 @@ static int same_tree(struct tree *t1, st
return !is_different;
}
+static int same_tree_as_empty(struct tree *t1)
+{
+ int retval;
+ void *tree;
+ struct tree_desc empty, real;
+
+ if (!t1)
+ return 0;
+
+ tree = read_object_with_reference(t1->object.sha1, "tree", &real.size, NULL);
+ if (!tree)
+ return 0;
+ real.buf = tree;
+
+ empty.buf = "";
+ empty.size = 0;
+
+ is_different = 0;
+ retval = diff_tree(&empty, &real, "", &diff_opt);
+ free(tree);
+
+ return retval >= 0 && !is_different;
+}
+
static struct commit *try_to_simplify_merge(struct commit *commit, struct commit_list *parent)
{
if (!commit->tree)
@@ -523,11 +552,17 @@ static void compress_list(struct commit_
struct commit_list *parent = commit->parents;
list = list->next;
+ if (!parent) {
+ if (!same_tree_as_empty(commit->tree))
+ commit->object.flags |= TREECHANGE;
+ continue;
+ }
+
/*
* Exactly one parent? Check if it leaves the tree
* unchanged
*/
- if (parent && !parent->next) {
+ if (!parent->next) {
struct tree *t1 = commit->tree;
struct tree *t2 = parent->item->tree;
if (!t1 || !t2 || same_tree(t1, t2))
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: git-rev-list: add "--dense" flag
2005-10-25 18:40 ` Jonas Fonseca
@ 2005-10-25 18:52 ` Linus Torvalds
0 siblings, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2005-10-25 18:52 UTC (permalink / raw)
To: Jonas Fonseca; +Cc: Junio C Hamano, Git Mailing List
On Tue, 25 Oct 2005, Jonas Fonseca wrote:
>
> Without the workaround below it segfaults.
Yes, but your patch only partly helps. It fixes the SIGSEGV, but it still
needs to remove the "parents" pointer when the parent ends up NULL.
The patch I just sent out should be better. I think.
Linus
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2005-10-25 18:52 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-10-21 23:40 git-rev-list: add "--dense" flag Linus Torvalds
2005-10-21 23:47 ` Linus Torvalds
2005-10-21 23:55 ` Linus Torvalds
2005-10-22 0:37 ` Petr Baudis
2005-10-22 0:47 ` Handling renames Petr Baudis
2005-10-22 1:28 ` Linus Torvalds
2005-10-22 1:51 ` Petr Baudis
2005-10-22 2:10 ` Junio C Hamano
2005-10-22 2:49 ` Petr Baudis
2005-10-22 3:23 ` Linus Torvalds
2005-10-22 1:26 ` git-rev-list: add "--dense" flag Linus Torvalds
2005-10-22 2:56 ` Petr Baudis
2005-10-22 3:30 ` Linus Torvalds
2005-10-22 3:55 ` Junio C Hamano
2005-10-22 20:37 ` Linus Torvalds
2005-10-25 18:07 ` Jonas Fonseca
2005-10-25 18:23 ` Linus Torvalds
2005-10-25 18:40 ` Jonas Fonseca
2005-10-25 18:52 ` Linus Torvalds
2005-10-25 18:50 ` Linus Torvalds
-- strict thread matches above, loose matches on Subject: below --
2005-10-23 19:30 Marco Costalba
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).