Git development
 help / color / mirror / Atom feed
* Re: [PATCH] add a diff-files command (revised and cleaned up)
From: Nicolas Pitre @ 2005-04-28 17:35 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Junio C Hamano, git
In-Reply-To: <Pine.LNX.4.58.0504280950340.18901@ppc970.osdl.org>

On Thu, 28 Apr 2005, Linus Torvalds wrote:

> HOWEVER, there clearly is a separate problem, which is what "show-files"  
> currently does very badly (and not at all in some cases), which is the 
> "ok, what about the _other_ files?"

Right.  And that's the problem I'm trying to solve.

What about this patch then?

=====

Give show-files the ability to process exclusion patterns.

This can be used with the famous dontdiff file as follows to find out 
about uncommitted files just like dontdiff is used with the diff 
command:

    show-files --others --exclude-from=dontdiff

and the exclude list can be reversed with the --ignored switch.

Signed-off-by: Nicolas Pitre <nico@cam.org>

--- a/show-files.c
+++ b/show-files.c
@@ -6,6 +6,7 @@
  * Copyright (C) Linus Torvalds, 2005
  */
 #include <dirent.h>
+#include <fnmatch.h>
 
 #include "cache.h"
 
@@ -17,6 +18,70 @@ static int show_stage = 0;
 static int show_unmerged = 0;
 static int line_terminator = '\n';
 
+static int nr_excludes;
+static const char **excludes;
+static int excludes_alloc;
+
+static void add_exclude(const char *string)
+{
+	if (nr_excludes == excludes_alloc) {
+		excludes_alloc = alloc_nr(excludes_alloc);
+		excludes = realloc(excludes, excludes_alloc*sizeof(char *));
+	}
+	excludes[nr_excludes++] = string;
+}
+
+static void add_excludes_from_file(const char *fname)
+{
+	int fd, i;
+	long size;
+	char *buf, *entry;
+
+	fd = open(fname, O_RDONLY);
+	if (fd < 0)
+		goto err;
+	size = lseek(fd, 0, SEEK_END);
+	if (size < 0)
+		goto err;
+	lseek(fd, 0, SEEK_SET);
+	if (size == 0) {
+		close(fd);
+		return;
+	}
+	buf = xmalloc(size);
+	if (read(fd, buf, size) != size)
+		goto err;
+	close(fd);
+
+	entry = buf;
+	for (i = 0; i < size; i++) {
+		if (buf[i] == '\n') {
+			if (entry != buf + i) {
+				buf[i] = 0;
+				add_exclude(entry);
+			}
+			entry = buf + i + 1;
+		}
+	}
+	return;
+
+err:	perror(fname);
+	exit(1);
+}
+
+static int excluded(const char *pathname)
+{
+	int i;
+	if (nr_excludes) {
+		const char *basename = strrchr(pathname, '/');
+		basename = (basename) ? basename+1 : pathname;
+		for (i = 0; i < nr_excludes; i++)
+			if (fnmatch(excludes[i], basename, 0) == 0)
+				return 1;
+	}
+	return 0;
+}
+
 static const char **dir;
 static int nr_dir;
 static int dir_alloc;
@@ -59,6 +124,8 @@ static void read_directory(const char *p
 
 			if (de->d_name[0] == '.')
 				continue;
+			if (excluded(de->d_name) != show_ignored)
+				continue;
 			len = strlen(de->d_name);
 			memcpy(fullname + baselen, de->d_name, len+1);
 
@@ -101,17 +168,17 @@ static void show_files(void)
 	int i;
 
 	/* For cached/deleted files we don't need to even do the readdir */
-	if (show_others | show_ignored) {
+	if (show_others) {
 		read_directory(".", "", 0);
 		qsort(dir, nr_dir, sizeof(char *), cmp_name);
-	}
-	if (show_others) {
 		for (i = 0; i < nr_dir; i++)
 			printf("%s%c", dir[i], line_terminator);
 	}
 	if (show_cached | show_stage) {
 		for (i = 0; i < active_nr; i++) {
 			struct cache_entry *ce = active_cache[i];
+			if (excluded(ce->name) != show_ignored)
+				continue;
 			if (show_unmerged && !ce_stage(ce))
 				continue;
 			if (!show_stage)
@@ -130,14 +197,13 @@ static void show_files(void)
 		for (i = 0; i < active_nr; i++) {
 			struct cache_entry *ce = active_cache[i];
 			struct stat st;
+			if (excluded(ce->name) != show_ignored)
+				continue;
 			if (!stat(ce->name, &st))
 				continue;
 			printf("%s%c", ce->name, line_terminator);
 		}
 	}
-	if (show_ignored) {
-		/* We don't have any "ignore" list yet */
-	}
 }
 
 int main(int argc, char **argv)
@@ -179,11 +245,34 @@ int main(int argc, char **argv)
 			continue;
 		}
 
-		usage("show-files [-z] (--[cached|deleted|others|ignored|stage])*");
+		if (!strcmp(arg, "-x") && i+1 < argc) {
+			add_exclude(argv[++i]);
+			continue;
+		}
+		if (!strncmp(arg, "--exclude=", 10)) {
+			add_exclude(arg+10);
+			continue;
+		}
+		if (!strcmp(arg, "-X") && i+1 < argc) {
+			add_excludes_from_file(argv[++i]);
+			continue;
+		}
+		if (!strncmp(arg, "--exclude-from=", 15)) {
+			add_excludes_from_file(arg+15);
+			continue;
+		}
+
+		usage("show-files [-z] (--[cached|deleted|others|stage])* "
+		      "[ --ignored [--exclude=<pattern>] [--exclude-from=<file>) ]");
+	}
+
+	if (show_ignored && !nr_excludes) {
+		fprintf(stderr, "%s: --ignored needs some exclude pattern\n", argv[0]);
+		exit(1);
 	}
 
 	/* With no flags, we default to showing the cached files */
-	if (!(show_stage | show_deleted | show_others | show_ignored | show_unmerged))
+	if (!(show_stage | show_deleted | show_others | show_unmerged))
 		show_cached = 1;
 
 	read_cache();

^ permalink raw reply

* Re: Finding file revisions
From: Thomas Glanzmann @ 2005-04-28 17:22 UTC (permalink / raw)
  To: git
In-Reply-To: <12c511ca050428101070e12e74@mail.gmail.com>

Hello,

> Looks very useful.  Would it be possible to display the date (from the
> commit) instead of the 40-hex-char blobname (but have the link still
> point to the blob).  Like this:

First of all there is a date on the site and second I think the sha1
hash much more useful than the date.

	Thomas

^ permalink raw reply

* Re: [PATCH 0/3] Merge leftover bits
From: Junio C Hamano @ 2005-04-28 17:20 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: git
In-Reply-To: <Pine.LNX.4.58.0504281016390.18901@ppc970.osdl.org>

>>>>> "LT" == Linus Torvalds <torvalds@osdl.org> writes:

LT> On Thu, 28 Apr 2005, Junio C Hamano wrote:
>> 
>> This series is a resend of various fixes rediffed against your
>> HEAD this morning.

LT> I actually just applied (and pushed out) your older versions.

What I prepared and was going to send is just a rediff against
2f97813870c73a89b673ea7882f2a078d25c2dcd which did not have
these three bits, so if you have not forgotten about them then
it's fine.  I'll not re-send them.

LT> Can you check my current tree, especially if you made some
LT> improvements in the meantime?

No improvements in the meantime but I'll let you know if I see
any problems.


^ permalink raw reply

* Re: [PATCH] add a diff-files command
From: Junio C Hamano @ 2005-04-28 17:16 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Linus Torvalds, git
In-Reply-To: <Pine.LNX.4.62.0504281238130.14033@localhost.localdomain>

>>>>> "NP" == Nicolas Pitre <nico@cam.org> writes:

NP> ...  And yesterday I realized that the (currently unimplemented) 
NP> --ignore switch to show-files, combined with the exclusion pattern list, 
NP> would be more logical than teaching show-diff (which I still think is a 
NP> misnomer in the context of the other diff tools) about files unknown to 
NP> the cache.  The patch to show-files is also much smaller and logical.

I agree wholeheartedly with both counts.  (1) Linus and I
discussed briefly about renaming show-diff to diff-files but it
is on hold, waiting for a big wholesale rename.  (2) the logical
place for the -X and -x is "show-files --ignore".


^ permalink raw reply

* Re: [PATCH 0/3] Merge leftover bits
From: Linus Torvalds @ 2005-04-28 17:17 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git
In-Reply-To: <7voeby968l.fsf@assigned-by-dhcp.cox.net>



On Thu, 28 Apr 2005, Junio C Hamano wrote:
>
> This series is a resend of various fixes rediffed against your
> HEAD this morning.

I actually just applied (and pushed out) your older versions.

Can you check my current tree, especially if you made some improvements in 
the meantime?

		Linus

^ permalink raw reply

* [PATCH 0/3] Merge leftover bits
From: Junio C Hamano @ 2005-04-28 17:11 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: git

This series is a resend of various fixes rediffed against your
HEAD this morning.

    [PATCH 1/3] diff-cache.c compilation warning fix.
    [PATCH 2/3] diff.c: clean temporary files
    [PATCH 3/3] diff-tree-helper: ignore unmerged path outside specification.


^ permalink raw reply

* Re: Finding file revisions
From: Tony Luck @ 2005-04-28 17:10 UTC (permalink / raw)
  To: Kay Sievers; +Cc: Chris Mason, Linus Torvalds, git
In-Reply-To: <1114706099.4212.25.camel@localhost.localdomain>

> Not really happy clicky, but ... :)
> 
> Look at the (history) link:
>   http://ehlo.org/~kay/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=fb3b4ebc0be618dbcc2326482a83c920d51af7de

Looks very useful.  Would it be possible to display the date (from the
commit) instead of
the 40-hex-char blobname (but have the link still point to the blob). 
Like this:

2005-04-27 [PATCH] USB: MODALIAS change for bcdDevice
2005-04-26 Merge with
kernel.org:/pub/scm/linux/kernel/git/gregkh/driver-2.6.git/
2005-04-26 Merge with kernel.org:/pub/scm/linux/kernel/git/gregkh/aoe-2.6.git/

That way you'd trade some screen space that is filled with hex numbers for some
useful information.  Dates could either be absolute (as in my
example), or relative
("4 hours ago", "2 weeks ago", etc.)

-Tony

^ permalink raw reply

* Re: [PATCH] add a diff-files command (revised and cleaned up)
From: Linus Torvalds @ 2005-04-28 17:08 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, git
In-Reply-To: <Pine.LNX.4.62.0504272239420.14033@localhost.localdomain>



On Wed, 27 Apr 2005, Nicolas Pitre wrote:
>
> In the same spirit as diff-tree and diff-cache, here is a diff-files 
> command that processes differences between the index cache and the 
> working directory content.

I really think the current "show-diff" does that very well, and what 
you're doing is really different.

I think this thing is really a replacement for "show-files", which is a 
piece of crap (hey, I wrote it, but I don't have to be proud of it), and 
which really was meant to be more of what your diff-files is.

The thing is, I really don't want the "core" diff-xxx programs to worry 
about exclude patterns, and current directory contents. They do one thing, 
and one thing only: compare the files they were explicitly told to 
compare. 

HOWEVER, there clearly is a separate problem, which is what "show-files"  
currently does very badly (and not at all in some cases), which is the 
"ok, what about the _other_ files?"

And once you start talking about files that are _not_ mentioned in the
index, now you really do have something totally different, and now it does
need to be able to have exclude patterns to know to avoid object files and
other crud that we know we're not interested in.

But for the crud we don't know about, we're not really interested in the
diff against something we _do_ know about. So I think that the whole
"--others" and "--all" thing is wrong (yeah, yeah, it was me that started
it with show-files), and that this thing should always _only_ look at
files that aren't mentioned in the index file (ie "others" is always
enabled, and "all" is pointless).

Because those are special files: they are files we don't know what to do
with (conversely, files that _are_ mentioned in the index but don't 
actually seem to show up are interesting for the exact same reason).

That set of files is interesting for several reasons:

 - maybe we're about to check something in. We want to know whether maybe 
   we've forgotten to "add" a file or "remove" a file.

 - is it a file we've lost track of, and if so, does it look anything like 
   some _other_ file in the index. This happens if you do a "mv", and 
   don't tell the SCM about it: git doesn't care (it looks like a remove 
   and add), but it would be good to have a tool that warns about it and 
   then it would be nice if it could actually say "files xxx and yyy seem 
   to have gone away, but I see new files aaa and bbb, and it looks like 
   bbb might be similar to yyy, and aaa looks a lot like the old xxx"

This was all stuff that "show-files" was kind of supposed to work up to, 
but I just couldn't find it in myself to be interested enough.

		Linus

^ permalink raw reply

* Re: Finding file revisions
From: Chris Mason @ 2005-04-28 17:05 UTC (permalink / raw)
  To: Daniel Barkalow; +Cc: git, David Woodhouse, Kay Sievers
In-Reply-To: <Pine.LNX.4.21.0504281147500.30848-100000@iabervon.org>

On Thursday 28 April 2005 12:08, Daniel Barkalow wrote:
> On Wed, 27 Apr 2005, Chris Mason wrote:
> > I haven't seen a tool yet to find which changeset modified a given file,
> > so I whipped up something.  The basic idea is to:
>
> What is the answer supposed to be in the presence of merges? It seems like
> you shouldn't report the merge that brought in the change, but rather
> (assuming it's available) the changeset that originally made it.

Based on comments from Linus I did make it a little more merge aware.  But 
since my tool was just to tide me over until someone fixed things in gui 
form, I didn't want to kill off too many brain cells coding it.

It sounds as though David's script already has more merge brains than mine, 
and the git web stuff is pretty slick.  So it seems I didn't look hard enough 
before...

-chris

^ permalink raw reply

* Re: [PATCH] add a diff-files command
From: Nicolas Pitre @ 2005-04-28 16:56 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Linus Torvalds, git
In-Reply-To: <7vr7gu97xq.fsf@assigned-by-dhcp.cox.net>

On Thu, 28 Apr 2005, Junio C Hamano wrote:

> If you want to see if working tree has some junk other than
> those listed in dontdiff, wouldn't this be sufficient?
> 
>   $ show-files --others | grep -f dontdiff

Well, it would work if the dontdiff file other people are maintaining 
was made up of regexps.  But it is made of shell wildcard patterns meant 
to be used with the -X switch of the diff command.

> Again, "checking for potentially uncommitted" files is what
> you use show-files --others for, not show-diff.

Indeed.  And yesterday I realized that the (currently unimplemented) 
--ignore switch to show-files, combined with the exclusion pattern list, 
would be more logical than teaching show-diff (which I still think is a 
misnomer in the context of the other diff tools) about files unknown to 
the cache.  The patch to show-files is also much smaller and logical.

BTW, I don't do this out of pure enthusiasm but rather trying to make my 
own workflow with the Linux kernel source tree more efficient in the 
context of git usage.  My pure coding enthusiasm lies somewhere else.  


Nicolas

^ permalink raw reply

* Re: Finding file revisions
From: Kay Sievers @ 2005-04-28 16:34 UTC (permalink / raw)
  To: Chris Mason; +Cc: Linus Torvalds, git
In-Reply-To: <200504280745.05505.mason@suse.com>

On Thu, 2005-04-28 at 07:45 -0400, Chris Mason wrote:
> On Wednesday 27 April 2005 18:19, Linus Torvalds wrote:
> > On Wed, 27 Apr 2005, Chris Mason wrote:
> > > So, new prog attached.  New usage:
> > >
> > > file-changes [-c commit_id] [-s commit_id] file ...
> > >
> > > -c is the commit where you want to start searching
> > > -s is the commit where you want to stop searching
> >
> > Your script will do some funky stuff, because you incorrectly think that
> > the rev-list is sorted linearly. It's not. It's sorted in a rough
> > chronological order, but you really can't do the "last" vs "cur" thing
> > that you do, because two commits after each other in the rev-list listing
> > may well be from two totally different branches, so when you compare one
> > tree against the other, you're really doing something pretty nonsensical.
> 
> One more rev that should work as you suggested.  Here's the example output 
> from a cogito changeset with merges.  I print the diff-tree lines once for each 
> matching parent and then print the commit once.  It's very primitive, but
> hopefully some day someone will make a gui with happy clicky buttons
> for changesets and filerevs.

Not really happy clicky, but ... :)

Look at the (history) link:
  http://ehlo.org/~kay/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=fb3b4ebc0be618dbcc2326482a83c920d51af7de

Kay


^ permalink raw reply

* Re: [PATCH] add a diff-files command
From: Junio C Hamano @ 2005-04-28 16:34 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Linus Torvalds, git
In-Reply-To: <Pine.LNX.4.62.0504272031330.14033@localhost.localdomain>

>>>>> "NP" == Nicolas Pitre <nico@cam.org> writes:

Having thought about it more, although I praise your enthusiasm
to improve git, I suspect your diff-files is a solution to a
problem that does not exist.

NP> It also has the ability to accept exclude file patterns with
NP> -x and even a file containing a list of patterns to exclude
NP> with -X.  This is especially useful to use the famous
NP> dontdiff file when looking for uncommitted files in a
NP> compiled kernel tree.

If you want to see if working tree has some junk other than
those listed in dontdiff, wouldn't this be sufficient?

  $ show-files --others | grep -f dontdiff

NP> First, show-diff doesn't handle files in the work tree which
NP> are not listed in the cache.

That's the whole point of git (and show-diff).  If it is not
listed in the cache, it does not exist.

NP> ...  So trust me it _is_ pretty damn useful, unless you
NP> always run "make clean" on your kernel tree before checking
NP> for potentially uncommitted files then recompile everything
NP> afterwards which is a hassle.

Why do you need to "make clean"?  Is it because otherwise you
would get lots of output for things that are listed in dontdiff
but not listed in the cache, and the uncommitted but not listed
in dontdiff file that you care about would get lost in the
noise?

Earlier you complained show-diff does _not_ look at what are not
listed in the cache.  Now you want to exclude garbage that comes
out of it because you have tons of stuff that are not listed in
the cache, which implies show-diff _does_ look at what are not
listed in the cache.  Either you are contradicting yourself or I
am confused.

The truth is, as you said earlier, show-diff does not look at
them, so I do not understand what problem you are trying to
describe here.

Again, "checking for potentially uncommitted" files is what
you use show-files --others for, not show-diff.


^ permalink raw reply

* Re: I'm missing isofs.h
From: Linus Torvalds @ 2005-04-28 16:28 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Petr Baudis, Andrew Morton, git
In-Reply-To: <7vvf6698zq.fsf@assigned-by-dhcp.cox.net>



On Thu, 28 Apr 2005, Junio C Hamano wrote:
> 
> Is /dev/null convention OK with you?

Absolutely. In fact, I prefer it, but I end up just using standard "patch 
-p1" in the end, so..

> Here is an example of diffstat getting confused:

diffstat is _way_ too easily confused by various things. I've seen it
claim "no files" just because the diff had some headers that confused it.  
And yes, you should always tell it to use "-p1" to get the right
pathnames, otherwise it does nonsensical things (if all the diffs happen
to be in "drivers/usb/" it ends up deciding that that's just a common
prefix, and won't actually show it at all).

However, I'm surprised that it's confused by /dev/null. Usually the 
confusion comes from the stuff _after_ the name (ie adding the "mode" etc 
is what I'd have expected to confuse it).

One way to un-confuse diffstat is to add the "Index: " line. I'm not
actually much of a fan of Index: lines myself, and I'd rather somebody
fixed diffstat, but they _do_ work around diffstat problems.

		Linus

^ permalink raw reply

* Re: I'm missing isofs.h
From: Junio C Hamano @ 2005-04-28 16:11 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Petr Baudis, Andrew Morton, git
In-Reply-To: <Pine.LNX.4.58.0504280740450.18901@ppc970.osdl.org>

>>>>> "LT" == Linus Torvalds <torvalds@osdl.org> writes:

LT> On Wed, 27 Apr 2005, Junio C Hamano wrote:
>> 
>> Linus & Andrew, is the above (second) format acceptable for the
>> kernel work?

LT> The only thing my stuff needs is that it's "-p1" format, ...

Is /dev/null convention OK with you?  I know it is OK for patch
and cg-patch, both of which have built-in knowledge of the
convention, but other tools may get confused.  Here is an
example of diffstat getting confused:

    $ jit-snap -v linus-mirror:0 Makefile comm-z.c
    # - [PATCH] diff-tree -p implies diff-tree -p -r
    # + JIT: indent help text from jit-snap.
    --- k/Makefile  (mode:100644)
    +++ l/Makefile  (mode:100644)
    @@ -7,7 +7,8 @@
     # BREAK YOUR LOCAL DIFFS! show-diff and anything us.....
     # break unless your underlying filesystem supports .....
    ...
    --- /dev/null
    +++ l/comm-z.c  (mode:100644)
    @@ -0,0 +1,101 @@
    +#include <stdio.h>
    +#include <string.h>
    ...

    $ jit-snap -v linus-mirror:0 Makefile comm-z.c | diffstat
    Makefile   |   15 +++++++--
    l/comm-z.c |  101 ++++++++++++++++++++++++++++++++++.....
     2 files changed, 114 insertions(+), 2 deletions(-)

LT> ... but I don't care 
LT> if the prefix is the sha1 tree-name, or "a/" and "b/" or anything else (I 
LT> think the current thing that the built-in stuff defaults to is a bit 
LT> strange. "k/" and "l/"? I understand "a/" and "b/", and I'd even get "x/" 
LT> and "y/" or "old/" and "new/", but starting counting at "l" is strange ;)

I am _very_ glad somebody finally has noticed and voiced the
puzzlement.  It is meant to be a gentle reminder that the tool's
primary purpose is to support development of l-k ;-).

I do not mind if you told me to change them to a/ or b/ (or do
it yourself---it is a single character change in the source) if
you want.


^ permalink raw reply

* Re: Finding file revisions
From: Daniel Barkalow @ 2005-04-28 16:08 UTC (permalink / raw)
  To: Chris Mason; +Cc: git
In-Reply-To: <200504271251.00635.mason@suse.com>

On Wed, 27 Apr 2005, Chris Mason wrote:

> I haven't seen a tool yet to find which changeset modified a given file, so 
> I whipped up something.  The basic idea is to:

What is the answer supposed to be in the presence of merges? It seems like
you shouldn't report the merge that brought in the change, but rather
(assuming it's available) the changeset that originally made it.

That is:

go through the history tree:
  if a commit has a parent with a different version:
    if it also has a parent with the same version as the child, ignore the
      different parent(s) and enqueue the same parent(s)
    otherwise, report it (for a single head, it's the original change; for
      a merge, it merged two changes to the file)
  otherwise, enqueue all the parents

Sorting by time is probably not useful, because there must be some source
of the current version, and all paths going back, after ignoring versions
that were replaced by it in a merge, must go back to that source, so
depth-first search is fastest. (If there are multiple possible solutions,
then it means that multiple people applied the same patch, and any of them
should do).

This should be easy in C, but difficult in something that isn't generating
the history info itself.

	-Daniel
*This .sig left intentionally blank*


^ permalink raw reply

* Re: Finding file revisions
From: Thomas Gleixner @ 2005-04-28 16:47 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Chris Mason, git
In-Reply-To: <Pine.LNX.4.58.0504280815120.18901@ppc970.osdl.org>

On Thu, 2005-04-28 at 08:24 -0700, Linus Torvalds wrote:
> I disagree. I'm not database allergic, I just don't believe in the notion 
> that databases solve all the worlds problems.

I never claimed, they did

> You just made creating a commit etc much slower. You now have to update 
> per-file information that you never updated before, and look at 
> information that git simply doesn't _care_ about. 

I did not say that such a feature should be included in git itself.
That was never my intention.

> what? We've had that. It's called RCS/SCCS/CVS, and it's a piece of total
> and absolute crap. Exactly because single-file revisions simply do not
> matter.

I agree that RCS is crap for distributed development, but seeing a
change in a file in the correct context is quite helpful at times.

> If you want to use a database, go wild. But use it as a _cache_. Then you 
> can build up the database of file revisions "after the fact", and always 
> know that your database is not the real data, it's just an index, and can 
> be thrown away and regenerated at will.

That's all I want to use it for. Exactly for tracking information over
various repos and longer time intervals.

tglx



^ permalink raw reply

* Re: [darcs-devel] Re: Darcs-git pulling from the Linux repo: a Linux VM question
From: Juliusz Chroboczek @ 2005-04-28 15:36 UTC (permalink / raw)
  To: Git Mailing List, darcs-devel
In-Reply-To: <20050428113947.GC9422@abridgegame.org>

> When we're desperate, we'll special-case the initial commit, but currently
> I'm sure we can pretty easily adjust things by making the git-tree-reading
> lazy,

Just to make it clear: reading the git tree is lazy.  The problem is
somewhere in the higher layers, probably in pull_cmd.

There's also another problem: reading the git tree takes 220MB.  Then
Darcs allocates a further 500MB without calling my code at all.  (Some
of it is doubtless due to linesPS, that should be more than a handful
of megabytes.)

                                        Juliusz

^ permalink raw reply

* Re: Git fork removal?
From: Daniel Barkalow @ 2005-04-28 15:29 UTC (permalink / raw)
  To: Petr Baudis; +Cc: git
In-Reply-To: <20050428091039.GI8612@pasky.ji.cz>

On Thu, 28 Apr 2005, Petr Baudis wrote:

> Dear diary, on Thu, Apr 28, 2005 at 04:47:24AM CEST, I got a letter
> where Daniel Barkalow <barkalow@iabervon.org> told me that...
>
> > The thing that I think requires the symlinks is step 2, which requires
> > that there be somewhere I can run git and have it able to see a pair of
> > unrelated local heads and the relevant trees.
> 
> Just do cg-pull barkalow, to get the latest changes from that repository
> (perhaps clone should inherit branches information?).

You can pull from local repositories? I think that having the same concept
for remote repositories and for different lines of local development is
what confuses users.

> But if you want Linus to pull from your tree, you generally want it to
> be clean - that is, you want to manage clean separation (as Pavel Machek
> describes in his document).

I'll be using rpush and he'd have to use http-pull (I don't have rsync
set up); both of these only transfer the reachable objects, so cleanliness
isn't an issue.

> That is another advantage of hardlinking -
> you don't get any unrelated stuff in if you don't explicitly pull it, so
> you can keep your for-linus branch clean. I'd do cg-diff linus:this in
> the barkalow branch instead to keep this property.

But that doesn't work; when I'm preparing the second patch in the series,
I want to compare linus+patch 1 against barkalow, so that I'm looking at
what's left to split. That's why I need to have the unrelated heads, not
just the linus head and my head based on it. If I go back to linus each
time, it's more work making the patches and I don't have an easy way of
telling whether I've included the same part twice or missed a part.

	-Daniel
*This .sig left intentionally blank*


^ permalink raw reply

* Re: Finding file revisions
From: Linus Torvalds @ 2005-04-28 15:24 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Chris Mason, git
In-Reply-To: <1114627268.20916.8.camel@tglx.tec.linutronix.de>



On Wed, 27 Apr 2005, Thomas Gleixner wrote:
>
> On Wed, 2005-04-27 at 10:34 -0700, Linus Torvalds wrote:
> > 
> > With more history, "rev-list" should do basically the right thing: it will
> > be constant-time for _recent_ commits, and it is linear time in how far
> > back you want to go. Which seems quite reasonable.
> 
> Which is quite horrible, if you have a 500k+ blobs repo.

It's _not_ linear in blobs. It doesn't care at all about them, in fact. 

It's linear in how many revisions you go backwards. And I claim that you 
can't do any better than that, without doing _really_ bad things.

> I know you are database allergic, but there a database is the correct
> solution.

I disagree. I'm not database allergic, I just don't believe in the notion 
that databases solve all the worlds problems.

> Having stored all the relations of those file/tree/commit
> blobs in a database it takes <20ms to have a list of all those file
> blobs in historical order with some context information retrieved.

.. and such an SCM will _suck_ for anything else.

You just made creating a commit etc much slower. You now have to update 
per-file information that you never updated before, and look at 
information that git simply doesn't _care_ about. 

Right now, when we create a new version, it's pretty much instantaneous.  
Exactly because we do not look at a _single_ file, and we don't care how
they changed from the "previous" version. We just write out the knowledge
about what the files are now.

Doing a database of file changes would absolutely _suck_. Anybody who
thinks that databases are magically faster than not using a database
doesn't understand basic physics. Things don't go faster just because you
call it a database. Things go faster by _doing_less_.

Normally, a database does less by keeping indexes etc around, and the 
indexes require less work than the data itself. But git _does_ all of that 
already. Git very much _is_ a database, it's just a specialized one.

I dare you to show me wrong. I don't _care_ if you can show the revision
history of a single file in 20ms. The easiest way to do that is with a
delta format, where the file information basically is single-file in the
first place, and you just open the file and print out the results. Guess
what? We've had that. It's called RCS/SCCS/CVS, and it's a piece of total
and absolute crap. Exactly because single-file revisions simply do not
matter.

If you want to use a database, go wild. But use it as a _cache_. Then you 
can build up the database of file revisions "after the fact", and always 
know that your database is not the real data, it's just an index, and can 
be thrown away and regenerated at will.

That way you don't add overhead to the stuff that actually matters, and 
that git does a lot better than a general-purpose database could ever do.

			Linus

^ permalink raw reply

* RE: A shortcoming of the git repo format
From: Barry Silverman @ 2005-04-28 15:08 UTC (permalink / raw)
  To: Linus Torvalds, Tom Lord; +Cc: hpa, git
In-Reply-To: <Pine.LNX.4.58.0504271722260.18901@ppc970.osdl.org>

>>In contrast, with git, I'm totally uninterested in anything that doesn't
>>make my kernel work go faster or more smoothly, and does so _today_. Which
>>makes me a cantancerous old bastard, and bit the heads off anybody who
>>isn't focused on that one thing.

Focus is the totally operative word here!

If you really want to feel good about the world, re-read the initial set of
git postings that Linus made on April 7th:
http://kerneltrap.org/node/4982

Contrast the picture today with the fact that three weeks ago:
April 7:
1) the kernel workflow was at a standstill
2) git was just a totally unproven concept in Linus' head, that could have
ended up as a band-aid while a REAL SCM (...sound of choking from the
wings...) was chosen
3) the performance issues in dealing with both the size of the kernel
project, and the velocity of the changes were completely up in the air

Today:
1) the kernel workflow has restarted, and has already made its first
milestone
2) git is solid in architecture, is maintained and updated by a proven set
of developers, and has been demonstrated to have all the performance
necessary going forward
3) the primary traffic on the mailing list is related to tactical issues -
not architecture, or strategy, or big-ticket item stuff - with the
occasional flame about "renames" ;-)

Are there any large strategic issues left to be resolved for git?, or is it
just a matter of getting all the kernel developers over the learning curve,
and iterating the details of the workflow to make everyone maximally
productive?

How long do you think it will take for the kernel workflow to get back to
its height during the BK days?

The achievement of going from a complete standstill, to full velocity kernel
workflow production in a couple of months has got to be something everyone
involved should be intensely proud of.
Thanks, Linus, for being such a "cantancerous old bastard". I don't think it
could have happened if you were anything but....

Barry Silverman


^ permalink raw reply

* Re: I'm missing isofs.h
From: Linus Torvalds @ 2005-04-28 14:42 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Petr Baudis, Andrew Morton, git
In-Reply-To: <7vhdhra2sg.fsf@assigned-by-dhcp.cox.net>



On Wed, 27 Apr 2005, Junio C Hamano wrote:
> 
> Linus & Andrew, is the above (second) format acceptable for the
> kernel work?

The only thing my stuff needs is that it's "-p1" format, but I don't care 
if the prefix is the sha1 tree-name, or "a/" and "b/" or anything else (I 
think the current thing that the built-in stuff defaults to is a bit 
strange. "k/" and "l/"? I understand "a/" and "b/", and I'd even get "x/" 
and "y/" or "old/" and "new/", but starting counting at "l" is strange ;)

		Linus

^ permalink raw reply

* Re: The criss-cross merge case
From: Adam J. Richter @ 2005-04-28 14:25 UTC (permalink / raw)
  To: ry102; +Cc: barkalow, bram, droundry, git, tupshin

On 2005-04-28, Benedikt Schmidt wrote:
>AFAIK the paper mentioned in the GNU diff sources [1] is an improvement
>to an earlier paper by the same author titled
>"A File Comparison Program" - Miller, Myers - 1985.
[...]
>[1] http://citeseer.ist.psu.edu/myers86ond.html

	Monotone apparently uses a further acceleration of that algorithm
from the 1989 paper, also co-authored by Myers, "An O(NP) Sequence
Comparison Algorithm" by Sun Wu, Udi Manber, and Gene Myers.
http://www.eecs.berkeley.edu/~gene/Papers/np_diff.pdf .  The Monotone
implementation was apparently a port of an implementation originally
written in Scheme by Aubrey Jaffer.

	I don't fully understand the 1989 paper, but I get the
general impression that it is a small change to the previous algorithm
(the one in GNU diff) that might be a 30 line patch if someone
got around to submitting it, and seems to make the code run more
than twice as fast in practice.  One of these days, I will probably get
around to coding up a patch to GNU diff if nobody beats me to it.

	Making diff run faster may have at least one potentially useful
benefit for merging.  A faster diff makes it more practical to run diff
on smaller units of comparison.  I posted a note here before about
converting the input files to diff3 to have just one character per
line, and then undoing that transformation of the result to produce
a character based merge that seemed to work pretty well in the
couple of tests that I tried.

                    __     ______________ 
Adam J. Richter        \ /
adam@yggdrasil.com      | g g d r a s i l

^ permalink raw reply

* RT[3/3]: Reverse lookup of SHA1 references
From: Kris Shannon @ 2005-04-28 13:54 UTC (permalink / raw)
  To: GIT Mailing List
In-Reply-To: <5d4799ae0504280559109cd00e@mail.gmail.com>

There are a number of places where you want to find all the objects
which reference this particular object. AIUI this is not currently an easy
task. Some thought should be put into how to make these reverse
lookups fast.

The other two random thoughts would benefit greatly from a relationship
cache.

Umm.... I was going to write some more,  but I've gotta go :(

More thoughts later...

-- 
Kris Shannon <kris.shannon.kernel@gmail.com>

^ permalink raw reply

* RT[1/3]: Alternate Encodings (esp. Delta Compression)
From: Kris Shannon @ 2005-04-28 13:47 UTC (permalink / raw)
  To: GIT Mailing List
In-Reply-To: <5d4799ae0504280559109cd00e@mail.gmail.com>

If a format is defined for representing delta compression then
it would be prudent to make sure that it could be used for
encoding both forward and backward deltas (not necessarily
in the same delta :) These deltas could then be kept in the 
objects directory (i.e. 00/a29c403e751c2a2a61eb24fa2249c8956d1c80.xdelta)

Doing delta compression of old versions is something that should
be done manually (the subversion people have empirical data
to back that up I think but I can't seem to find a link ATM)
A command for wiping old versions from a repository to save space
could be altered to replace the files with their xdelta equivalents
for reduced space savings but still keeping a full history.

Using delta compression of the new versions (against the old) for
efficient bandwidth consumption is another possible area.  If these
deltas are produced on the fly,  they could be cached in the objects
directory.

These two different use cases are IMO a good argument for
using this as a convention even if it doesn't become a part of git's
core (i.e. changing read_sha1_file to transparently expand xdeltas)

If you add .xdelta it would follow that other encodings might be useful,
and added to the objects directory in the same way.

-- 
Kris Shannon <kris.shannon.kernel@gmail.com>

^ permalink raw reply

* Re: A shortcoming of the git repo format
From: David Woodhouse @ 2005-04-28 13:39 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Linus Torvalds, Git Mailing List
In-Reply-To: <426FD3EE.5000404@zytor.com>

On Wed, 2005-04-27 at 11:03 -0700, H. Peter Anvin wrote:
> > To find the email address, look for the first '<'. To find the date, look 
> > for the first '>'. Those characters are not allowed in the name or the 
> > email, so they act as well-defined delimeters.
> > 
> 
> That's true for email addresses,

Not in general. You can have just about any character, including @, <
and >, in either a display-name or a local-part.

For git we actually _remove_ any instances of '<' and '>' from both
'AUTHOR_NAME' and 'AUTHOR_EMAIL', so what you say becomes true.

I still say these shouldn't be considered email addresses, any more than
the 'user@host.domain' you see when you connect to an IRC server is
considered an IP address.

-- 
dwmw2


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox