* [PATCH] Respect core.autocrlf when preparing temporary files for external diff
[not found] <cover.1237635609u.git.johannes.schindelin@gmx.de>
@ 2009-03-21 11:42 ` Johannes Schindelin
2009-03-21 17:02 ` Michael J Gruber
2009-03-21 19:35 ` Junio C Hamano
0 siblings, 2 replies; 14+ messages in thread
From: Johannes Schindelin @ 2009-03-21 11:42 UTC (permalink / raw)
To: git, gitster; +Cc: Sebastian Schuberth
When preparing temporary files for an external diff, the files should be
handled as if they were worktree files.
This makes things consistent for the case when one side of the diff was
found in the current working directory (and therefore creating a temporary
file could be avoided altogether).
This fixes msysGit issue 177.
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
---
diff.c | 13 ++++++++++---
t/t4020-diff-external.sh | 16 ++++++++++++++++
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/diff.c b/diff.c
index 75d9fab..699ae6a 100644
--- a/diff.c
+++ b/diff.c
@@ -1946,17 +1946,23 @@ void diff_free_filespec_data(struct diff_filespec *s)
s->cnt_data = NULL;
}
-static void prep_temp_blob(struct diff_tempfile *temp,
+static void prep_temp_blob(const char *path, struct diff_tempfile *temp,
void *blob,
unsigned long size,
const unsigned char *sha1,
int mode)
{
int fd;
+ struct strbuf buf = STRBUF_INIT;
fd = git_mkstemp(temp->tmp_path, PATH_MAX, ".diff_XXXXXX");
if (fd < 0)
die("unable to create temp-file: %s", strerror(errno));
+ if (convert_to_working_tree(path,
+ (const char *)blob, (size_t)size, &buf)) {
+ blob = buf.buf;
+ size = buf.len;
+ }
if (write_in_full(fd, blob, size) != size)
die("unable to write temp-file");
close(fd);
@@ -1964,6 +1970,7 @@ static void prep_temp_blob(struct diff_tempfile *temp,
strcpy(temp->hex, sha1_to_hex(sha1));
temp->hex[40] = 0;
sprintf(temp->mode, "%06o", mode);
+ strbuf_release(&buf);
}
static struct diff_tempfile *prepare_temp_file(const char *name,
@@ -2004,7 +2011,7 @@ static struct diff_tempfile *prepare_temp_file(const char *name,
die("readlink(%s)", name);
if (ret == sizeof(buf))
die("symlink too long: %s", name);
- prep_temp_blob(temp, buf, ret,
+ prep_temp_blob(name, temp, buf, ret,
(one->sha1_valid ?
one->sha1 : null_sha1),
(one->sha1_valid ?
@@ -2030,7 +2037,7 @@ static struct diff_tempfile *prepare_temp_file(const char *name,
else {
if (diff_populate_filespec(one, 0))
die("cannot read data blob for %s", one->path);
- prep_temp_blob(temp, one->data, one->size,
+ prep_temp_blob(name, temp, one->data, one->size,
one->sha1, one->mode);
}
return temp;
diff --git a/t/t4020-diff-external.sh b/t/t4020-diff-external.sh
index 281680d..f8c99f1 100755
--- a/t/t4020-diff-external.sh
+++ b/t/t4020-diff-external.sh
@@ -136,4 +136,20 @@ test_expect_success 'GIT_EXTERNAL_DIFF with more than one changed files' '
GIT_EXTERNAL_DIFF=echo git diff
'
+echo "#!$SHELL_PATH" >fake-diff.sh
+cat >> fake-diff.sh <<\EOF
+cat $2 >> crlfed.txt
+EOF
+chmod a+x fake-diff.sh
+
+keep_only_cr () {
+ tr -dc '\015'
+}
+
+test_expect_success 'external diff with autocrlf = true' '
+ git config core.autocrlf true &&
+ GIT_EXTERNAL_DIFF=./fake-diff.sh git diff &&
+ test $(wc -l < crlfed.txt) = $(cat crlfed.txt | keep_only_cr | wc -c)
+'
+
test_done
--
1.6.2.1.493.g67cf3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-21 11:42 ` [PATCH] Respect core.autocrlf when preparing temporary files for external diff Johannes Schindelin
@ 2009-03-21 17:02 ` Michael J Gruber
2009-03-21 19:35 ` Junio C Hamano
1 sibling, 0 replies; 14+ messages in thread
From: Michael J Gruber @ 2009-03-21 17:02 UTC (permalink / raw)
To: Johannes Schindelin; +Cc: git, gitster, Sebastian Schuberth
Johannes Schindelin venit, vidit, dixit 21.03.2009 12:42:
> When preparing temporary files for an external diff, the files should be
> handled as if they were worktree files.
>
> This makes things consistent for the case when one side of the diff was
> found in the current working directory (and therefore creating a temporary
> file could be avoided altogether).
>
> This fixes msysGit issue 177.
>
> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
> ---
> diff.c | 13 ++++++++++---
> t/t4020-diff-external.sh | 16 ++++++++++++++++
> 2 files changed, 26 insertions(+), 3 deletions(-)
>
> diff --git a/diff.c b/diff.c
> index 75d9fab..699ae6a 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -1946,17 +1946,23 @@ void diff_free_filespec_data(struct diff_filespec *s)
> s->cnt_data = NULL;
> }
>
> -static void prep_temp_blob(struct diff_tempfile *temp,
> +static void prep_temp_blob(const char *path, struct diff_tempfile *temp,
> void *blob,
> unsigned long size,
> const unsigned char *sha1,
> int mode)
> {
> int fd;
> + struct strbuf buf = STRBUF_INIT;
>
> fd = git_mkstemp(temp->tmp_path, PATH_MAX, ".diff_XXXXXX");
> if (fd < 0)
> die("unable to create temp-file: %s", strerror(errno));
> + if (convert_to_working_tree(path,
> + (const char *)blob, (size_t)size, &buf)) {
> + blob = buf.buf;
> + size = buf.len;
> + }
> if (write_in_full(fd, blob, size) != size)
> die("unable to write temp-file");
> close(fd);
> @@ -1964,6 +1970,7 @@ static void prep_temp_blob(struct diff_tempfile *temp,
> strcpy(temp->hex, sha1_to_hex(sha1));
> temp->hex[40] = 0;
> sprintf(temp->mode, "%06o", mode);
> + strbuf_release(&buf);
> }
>
> static struct diff_tempfile *prepare_temp_file(const char *name,
> @@ -2004,7 +2011,7 @@ static struct diff_tempfile *prepare_temp_file(const char *name,
> die("readlink(%s)", name);
> if (ret == sizeof(buf))
> die("symlink too long: %s", name);
> - prep_temp_blob(temp, buf, ret,
> + prep_temp_blob(name, temp, buf, ret,
> (one->sha1_valid ?
> one->sha1 : null_sha1),
> (one->sha1_valid ?
> @@ -2030,7 +2037,7 @@ static struct diff_tempfile *prepare_temp_file(const char *name,
> else {
> if (diff_populate_filespec(one, 0))
> die("cannot read data blob for %s", one->path);
> - prep_temp_blob(temp, one->data, one->size,
> + prep_temp_blob(name, temp, one->data, one->size,
> one->sha1, one->mode);
> }
> return temp;
> diff --git a/t/t4020-diff-external.sh b/t/t4020-diff-external.sh
> index 281680d..f8c99f1 100755
> --- a/t/t4020-diff-external.sh
> +++ b/t/t4020-diff-external.sh
> @@ -136,4 +136,20 @@ test_expect_success 'GIT_EXTERNAL_DIFF with more than one changed files' '
> GIT_EXTERNAL_DIFF=echo git diff
> '
>
> +echo "#!$SHELL_PATH" >fake-diff.sh
> +cat >> fake-diff.sh <<\EOF
> +cat $2 >> crlfed.txt
> +EOF
> +chmod a+x fake-diff.sh
> +
> +keep_only_cr () {
> + tr -dc '\015'
> +}
> +
> +test_expect_success 'external diff with autocrlf = true' '
> + git config core.autocrlf true &&
> + GIT_EXTERNAL_DIFF=./fake-diff.sh git diff &&
> + test $(wc -l < crlfed.txt) = $(cat crlfed.txt | keep_only_cr | wc -c)
> +'
> +
> test_done
Nice! This fixes also an issue I reported earlier: When
diff.foo.textconv is set, the textconv filter is fed with cleaned files
some times, with smudged files other times. Now it's always fed with
smudged files. Back then I even suspected crlf problems.
Michael
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-21 11:42 ` [PATCH] Respect core.autocrlf when preparing temporary files for external diff Johannes Schindelin
2009-03-21 17:02 ` Michael J Gruber
@ 2009-03-21 19:35 ` Junio C Hamano
2009-03-21 23:54 ` Johannes Schindelin
1 sibling, 1 reply; 14+ messages in thread
From: Junio C Hamano @ 2009-03-21 19:35 UTC (permalink / raw)
To: Johannes Schindelin; +Cc: git, Sebastian Schuberth
Johannes Schindelin <johannes.schindelin@gmx.de> writes:
> When preparing temporary files for an external diff, the files should be
> handled as if they were worktree files.
I do not think so. convert_to_working_tree() aka "smudge" means you would
be feeding crap like $Id$ expansion to the external diff, which we chose
not to do quite on purpose.
I think the right solution is to filter inside GIT_EXTERNAL_DIFF *if* the
diff tool you are dispatching into from the external diff script wants to
take smudged representation. I know we don't have such a command right
now, but something like
git filter --take-attr-for-path=$path --direction=out <clean >smudged
git filter --take-attr-for-path=$path --direction=in >clean ><mudged
to invoke convert_to_working_tree() and convert_to_git() filters would be
sufficient.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-21 19:35 ` Junio C Hamano
@ 2009-03-21 23:54 ` Johannes Schindelin
2009-03-22 0:03 ` Junio C Hamano
0 siblings, 1 reply; 14+ messages in thread
From: Johannes Schindelin @ 2009-03-21 23:54 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git, Sebastian Schuberth
Hi,
On Sat, 21 Mar 2009, Junio C Hamano wrote:
> Johannes Schindelin <johannes.schindelin@gmx.de> writes:
>
> > When preparing temporary files for an external diff, the files should
> > be handled as if they were worktree files.
>
> I do not think so. convert_to_working_tree() aka "smudge" means you
> would be feeding crap like $Id$ expansion to the external diff, which we
> chose not to do quite on purpose.
You might have missed me mentioning that we often can do without temporary
files, taking the working directory copies right away?
And if you think about it, it makes complete sense.
Ciao,
Dscho
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-21 23:54 ` Johannes Schindelin
@ 2009-03-22 0:03 ` Junio C Hamano
2009-03-22 0:41 ` Junio C Hamano
0 siblings, 1 reply; 14+ messages in thread
From: Junio C Hamano @ 2009-03-22 0:03 UTC (permalink / raw)
To: Johannes Schindelin; +Cc: git, Sebastian Schuberth
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> On Sat, 21 Mar 2009, Junio C Hamano wrote:
>
>> Johannes Schindelin <johannes.schindelin@gmx.de> writes:
>>
>> > When preparing temporary files for an external diff, the files should
>> > be handled as if they were worktree files.
>>
>> I do not think so. convert_to_working_tree() aka "smudge" means you
>> would be feeding crap like $Id$ expansion to the external diff, which we
>> chose not to do quite on purpose.
>
> You might have missed me mentioning that we often can do without temporary
> files, taking the working directory copies right away?
>
> And if you think about it, it makes complete sense.
Not "complete".
What is at issue is how consistent the codepath that calls out to an
external diff should be with the rest of git that solely work with the
"clean" version of blob contents. If you value consistency, I would say
that it is valid to argue that letting it borrow from a work tree is a
bug. It should be always feeding a clean version.
But if you think that we do not care really about the correctness of the
external diff codepath, because it is a mechanism used mostly for giving
output to humans (as opposed to feeding the output of the external diff
program back to programs to be processed further) , I can understand why
you think it is easier to the external programs if "smudged" version is
fed to them.
Not just I can _understand_, but I think I could be pursuaded to agree
with that position, iff you try harder.
But the assumption/rationale behind this change needs to be spelled out.
In essence, we are sacrificing purity and correctness because we consider
ease of building external diff filter is more important.
Or something like that.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 0:03 ` Junio C Hamano
@ 2009-03-22 0:41 ` Junio C Hamano
2009-03-22 6:10 ` Jeff King
0 siblings, 1 reply; 14+ messages in thread
From: Junio C Hamano @ 2009-03-22 0:41 UTC (permalink / raw)
To: Johannes Schindelin; +Cc: git, Sebastian Schuberth, Jeff King
Junio C Hamano <gitster@pobox.com> writes:
> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>
>> On Sat, 21 Mar 2009, Junio C Hamano wrote:
>>
>>> Johannes Schindelin <johannes.schindelin@gmx.de> writes:
>>>
>>> > When preparing temporary files for an external diff, the files should
>>> > be handled as if they were worktree files.
>>>
>>> I do not think so. convert_to_working_tree() aka "smudge" means you
>>> would be feeding crap like $Id$ expansion to the external diff, which we
>>> chose not to do quite on purpose.
>>
>> You might have missed me mentioning that we often can do without temporary
>> files, taking the working directory copies right away?
>>
>> And if you think about it, it makes complete sense.
>
> Not "complete".
>
> What is at issue is how consistent the codepath that calls out to an
> external diff should be with the rest of git that solely work with the
> "clean" version of blob contents. If you value consistency, I would say
> that it is valid to argue that letting it borrow from a work tree is a
> bug. It should be always feeding a clean version.
>
> But if you think that we do not care really about the correctness of the
> external diff codepath, because it is a mechanism used mostly for giving
> output to humans (as opposed to feeding the output of the external diff
> program back to programs to be processed further) , I can understand why
> you think it is easier to the external programs if "smudged" version is
> fed to them.
>
> Not just I can _understand_, but I think I could be pursuaded to agree
> with that position, iff you try harder.
>
> But the assumption/rationale behind this change needs to be spelled out.
> In essence, we are sacrificing purity and correctness because we consider
> ease of building external diff filter is more important.
>
> Or something like that.
The situation is worse than I thought, and I am surely glad that this was
brought into my attention. I think diff.c::reuse_worktree_file() should
always say "Don't" when we know convert_to_working_tree() is not a no-op.
I have a suspicion that:
(1) borrowing from the work tree may not be buying us very much these
days; the introduction of the logic predates not just "clean/smudge"
feature but packfiles. Back then, the tradeoff was between opening a
loose object file, deflating and writing out a temporary and reusing
a file in the work tree. These days, most of the time the former
would be to reconstruct a blob from the pack data that is already
mapped, and performance characteristics would be different.
(2) having to check if convert_to_working_tree() is a no-op or not may be
a lot more costly than any performance gain we get from borrowing
from the work tree.
Here is a patch to get rid of the borrowing.
You can work on top of this patch to add the convert_to_working_tree()
call to prep_temp_blob(). Such a change would also affect the "textconv"
codepath and the result will start feeding smudged contents to the
"textconv" filter, and I think the argument "external tools prefer to be
fed smudged contents, not clean ones, because that is the representation
tailored for the platform" would hold even more stronger in that context.
diff.c | 69 ++-------------------------------------------------------------
1 files changed, 3 insertions(+), 66 deletions(-)
diff --git a/diff.c b/diff.c
index 75d9fab..4d71ebc 100644
--- a/diff.c
+++ b/diff.c
@@ -1746,67 +1746,6 @@ void fill_filespec(struct diff_filespec *spec, const unsigned char *sha1,
}
}
-/*
- * Given a name and sha1 pair, if the index tells us the file in
- * the work tree has that object contents, return true, so that
- * prepare_temp_file() does not have to inflate and extract.
- */
-static int reuse_worktree_file(const char *name, const unsigned char *sha1, int want_file)
-{
- struct cache_entry *ce;
- struct stat st;
- int pos, len;
-
- /* We do not read the cache ourselves here, because the
- * benchmark with my previous version that always reads cache
- * shows that it makes things worse for diff-tree comparing
- * two linux-2.6 kernel trees in an already checked out work
- * tree. This is because most diff-tree comparisons deal with
- * only a small number of files, while reading the cache is
- * expensive for a large project, and its cost outweighs the
- * savings we get by not inflating the object to a temporary
- * file. Practically, this code only helps when we are used
- * by diff-cache --cached, which does read the cache before
- * calling us.
- */
- if (!active_cache)
- return 0;
-
- /* We want to avoid the working directory if our caller
- * doesn't need the data in a normal file, this system
- * is rather slow with its stat/open/mmap/close syscalls,
- * and the object is contained in a pack file. The pack
- * is probably already open and will be faster to obtain
- * the data through than the working directory. Loose
- * objects however would tend to be slower as they need
- * to be individually opened and inflated.
- */
- if (!FAST_WORKING_DIRECTORY && !want_file && has_sha1_pack(sha1))
- return 0;
-
- len = strlen(name);
- pos = cache_name_pos(name, len);
- if (pos < 0)
- return 0;
- ce = active_cache[pos];
-
- /*
- * This is not the sha1 we are looking for, or
- * unreusable because it is not a regular file.
- */
- if (hashcmp(sha1, ce->sha1) || !S_ISREG(ce->ce_mode))
- return 0;
-
- /*
- * If ce matches the file in the work tree, we can reuse it.
- */
- if (ce_uptodate(ce) ||
- (!lstat(name, &st) && !ce_match_stat(ce, &st, 0)))
- return 1;
-
- return 0;
-}
-
static int populate_from_stdin(struct diff_filespec *s)
{
struct strbuf buf = STRBUF_INIT;
@@ -1861,8 +1800,7 @@ int diff_populate_filespec(struct diff_filespec *s, int size_only)
if (S_ISGITLINK(s->mode))
return diff_populate_gitlink(s, size_only);
- if (!s->sha1_valid ||
- reuse_worktree_file(s->path, s->sha1, 0)) {
+ if (!s->sha1_valid) {
struct strbuf buf = STRBUF_INIT;
struct stat st;
int fd;
@@ -1988,8 +1926,7 @@ static struct diff_tempfile *prepare_temp_file(const char *name,
remove_tempfile_installed = 1;
}
- if (!one->sha1_valid ||
- reuse_worktree_file(name, one->sha1, 1)) {
+ if (!one->sha1_valid) {
struct stat st;
if (lstat(name, &st) < 0) {
if (errno == ENOENT)
@@ -2011,7 +1948,7 @@ static struct diff_tempfile *prepare_temp_file(const char *name,
one->mode : S_IFLNK));
}
else {
- /* we can borrow from the file in the work tree */
+ /* we use the file in the work tree */
temp->name = name;
if (!one->sha1_valid)
strcpy(temp->hex, sha1_to_hex(null_sha1));
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 0:41 ` Junio C Hamano
@ 2009-03-22 6:10 ` Jeff King
2009-03-22 7:18 ` Junio C Hamano
0 siblings, 1 reply; 14+ messages in thread
From: Jeff King @ 2009-03-22 6:10 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Johannes Schindelin, git, Sebastian Schuberth
On Sat, Mar 21, 2009 at 05:41:52PM -0700, Junio C Hamano wrote:
> I have a suspicion that:
>
> (1) borrowing from the work tree may not be buying us very much these
> days; the introduction of the logic predates not just "clean/smudge"
> feature but packfiles. Back then, the tradeoff was between opening a
> loose object file, deflating and writing out a temporary and reusing
> a file in the work tree. These days, most of the time the former
> would be to reconstruct a blob from the pack data that is already
> mapped, and performance characteristics would be different.
I don't think this is the case. We only reuse the worktree file when we
have already read the index, which means the main user is "git diff
--cached". If a file's SHA1 is the same in the HEAD and the index, then
we don't need to run the diff at all. But if it is, then that implies
content newly added for a commit, which is probably not in a packfile,
but rather a loose object.
Try something like this with and without your patch:
cd linux-2.6 &&
git ls-files |
while read f; do
echo trivial change >>$f
done
git add -u
git diff --cached
I get about 10.5s with stock git, and 13.5s with your patch (actually,
the first run with without reuse_worktree_file is doubly painful -- it
touches the disk for all of the newly written loose objects).
So I think it does make a difference now. The question, though, is:
> (2) having to check if convert_to_working_tree() is a no-op or not may be
> a lot more costly than any performance gain we get from borrowing
> from the work tree.
...how expensive is the check for convert_to_working_tree? It should
just be a gitattributes lookup, shouldn't it? Which:
a. we are doing anyway and caching
b. which takes a fraction of a second (try "time git ls-files | git
check-attr --stdin diff >/dev/null", which should give a
worst-case).
To be fair, though, the diff I outlined above is pretty ridiculous.
Worktree reuse saved me three seconds, but out of 26683 files. Even if
you had 1000 files in your commit, that's still only .1s.
Where I think it _would_ make an impact is in crazy large-file
repositories. Where the I/O cost of dumping a 200MB avi into a tempfile
is actually meaningful. But for that, even the current code is not very
good; it is optimized for many small files. For a few large files, you
are probably better off reading the index from disk in order to make
that optimization for other cases (like tree to tree diff, where one of
the trees happens to have the same contents for the file as HEAD).
> You can work on top of this patch to add the convert_to_working_tree()
> call to prep_temp_blob(). Such a change would also affect the "textconv"
> codepath and the result will start feeding smudged contents to the
> "textconv" filter, and I think the argument "external tools prefer to be
> fed smudged contents, not clean ones, because that is the representation
> tailored for the platform" would hold even more stronger in that context.
In practice, I don't think it will affect textconv users much. The point
of textconv is to munge ugly binary files into something humans can
read. I'm not sure if people are actually smudging those (though I guess
people have talked about smuding openoffice files, so anything is
possible...).
Anyway, I was planning to make a patch to always feed textconv the
_clean_ version of each file. My thinking was:
1. Then tools get a consistent view of the data across platforms.
I.e., my textconv munger or external diff script will work no
matter what you think the working tree should look like.
2. The tool may want the clean version, or it may want the smudged
version. Or it may be able to operate on either. If we give it a
format it doesn't like, it will have to undo whatever we did.
For most cases, we start with the clean file (i.e., from a tree or
from the index). If we hand out the clean file and the script
doesn't like it, it pays the cost to smudge once. If we hand it the
smudged file and the script doesn't like it, we pay the cost to
smudge _and_ the script pays the cost to clean.
But I have to admit that I have never once used a clean/smudge filter in
an actual project. So I am coming at this purely from a hypothetical
position. I do think the format that any scripts get should be
consistent (i.e., always clean or always smudged), and textconv and
extdiff should use the same format.
One thing that I have considered that may make it moot is adding a
"fast" extdiff/textconv interface: instead of writing temp files, call
the script with the quick information (like sha-1 and filename), and let
it ask git for the data as appropriate.
In my case, I'm motivated by making textconv faster for my media repo;
the textconv script just displays the exif (and similar) metadata for
the files. So right now for each diffed file it writes out the entire
large file to a tempfile, reads the exif data, and prints it. If it got
just the sha-1 it could:
- use its own caching layer (which my metadata reading tools already
have) to avoid looking at the file at all, since it knows the
textconv for a particular sha-1 is immutable
- when it _does_ have to read, it can stream only as much data as it
needs to get the metadata and never touch the disk at all. I.e., by
piping from "git cat-file".
Given such an interface, it would also be trivial to ask cat-file for
your data with or without smudging, as appropriate. Which is not that
different from what you suggested earlier in the thread with "git
filter", except that instead of smudging the existing tempfile, we skip
the part where we write out the uninteresting bit. :)
So it's a little more work on the part of the script-writer (who now has
to know about getting the data from git instead of just opening some
files), but it can potentially be a lot faster. And of course I would
retain the original interface both for historical reasons and because it
is simpler to use; this would be diff.*.textconv-quick or similar.
> diff.c | 69 ++-------------------------------------------------------------
> 1 files changed, 3 insertions(+), 66 deletions(-)
For some reason, with your patch the tempfiles are created with mode
0005 for me (whereas they are usually 0505), which makes open() in the
called script unhappy. Looking over the patch text, though, I have no
idea what change could be causing that.
-Peff
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 6:10 ` Jeff King
@ 2009-03-22 7:18 ` Junio C Hamano
2009-03-22 7:46 ` Jeff King
0 siblings, 1 reply; 14+ messages in thread
From: Junio C Hamano @ 2009-03-22 7:18 UTC (permalink / raw)
To: Jeff King; +Cc: Johannes Schindelin, git, Sebastian Schuberth
Jeff King <peff@peff.net> writes:
> ...how expensive is the check for convert_to_working_tree? It should
> just be a gitattributes lookup, shouldn't it? Which:
>
> a. we are doing anyway and caching
>
> b. which takes a fraction of a second (try "time git ls-files | git
> check-attr --stdin diff >/dev/null", which should give a
> worst-case).
Ok. Although I already queued the removal to 'pu' for tonight's pushout
and it is way too late to revert that, I think I didn't have to remove the
function. The codepath that lets you cheat by borrowing from the checkout
runs convert_to_git() when it borrows, and if you are seeing a meaningful
optimization even with that overhead, perhaps it would be worth keeping.
There is another check that should be there but is missing from the
current implementation of reuse_worktree_file(), other than the "is
convert_to_working_tree() a no-op for this path?" check. The last check
in the function would say "Yeah, we can reuse it" if the ce is marked
"assume unchanged"; we do not want to blindly reuse the file from the work
tree in that case.
> Anyway, I was planning to make a patch to always feed textconv the
> _clean_ version of each file. My thinking was:
>
> 1. Then tools get a consistent view of the data across platforms.
> I.e., my textconv munger or external diff script will work no
> matter what you think the working tree should look like.
>
> 2. The tool may want the clean version, or it may want the smudged
> version. Or it may be able to operate on either. If we give it a
> format it doesn't like, it will have to undo whatever we did.
>
> For most cases, we start with the clean file (i.e., from a tree or
> from the index). If we hand out the clean file and the script
> doesn't like it, it pays the cost to smudge once. If we hand it the
> smudged file and the script doesn't like it, we pay the cost to
> smudge _and_ the script pays the cost to clean.
While the purist in me says #1 above is the right argument to make for
feeding "clean" version, I suspect that the textconv or extdiff tools more
often are not made from scratch and ported across platforms than are
cobbled up together out of tools the script writer finds on his platform.
I suspect that Dscho's "a tempfile should look like a checkout" would be
much friendlier to them in practice for this reason.
> For some reason, with your patch the tempfiles are created with mode
> 0005 for me (whereas they are usually 0505), which makes open() in the
> called script unhappy. Looking over the patch text, though, I have no
> idea what change could be causing that.
Neither 0005 nor 0505 sounds correct to me; shouldn't they be 0600 or
something like that?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 7:18 ` Junio C Hamano
@ 2009-03-22 7:46 ` Jeff King
2009-03-22 15:30 ` Sebastian Schuberth
2009-03-23 0:39 ` Junio C Hamano
0 siblings, 2 replies; 14+ messages in thread
From: Jeff King @ 2009-03-22 7:46 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Johannes Schindelin, git, Sebastian Schuberth
On Sun, Mar 22, 2009 at 12:18:33AM -0700, Junio C Hamano wrote:
> Ok. Although I already queued the removal to 'pu' for tonight's pushout
> and it is way too late to revert that, I think I didn't have to remove the
> function. The codepath that lets you cheat by borrowing from the checkout
> runs convert_to_git() when it borrows, and if you are seeing a meaningful
> optimization even with that overhead, perhaps it would be worth keeping.
I certainly haven't done exhaustive tests. Obviously the one I did was a
bit contrived. I just think it makes sense to have numbers rather than
saying "this probably doesn't do anything anymore".
> While the purist in me says #1 above is the right argument to make for
> feeding "clean" version, I suspect that the textconv or extdiff tools more
> often are not made from scratch and ported across platforms than are
> cobbled up together out of tools the script writer finds on his platform.
> I suspect that Dscho's "a tempfile should look like a checkout" would be
> much friendlier to them in practice for this reason.
I think you and I have about the same feeling on this, then. As somebody
who does not actually use smudge/clean filters at all, I am willing to
defer to Dscho's opinion, which is based on practical experience.
> > For some reason, with your patch the tempfiles are created with mode
> > 0005 for me (whereas they are usually 0505), which makes open() in the
> > called script unhappy. Looking over the patch text, though, I have no
> > idea what change could be causing that.
>
> Neither 0005 nor 0505 sounds correct to me; shouldn't they be 0600 or
> something like that?
Argh, sorry, of course I should have realized something was wrong with
my baseline when I saw 0505. I was building on a half-finished WIP
related to textconv, which obviously is broken. Sorry for the noise.
-Peff
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 7:46 ` Jeff King
@ 2009-03-22 15:30 ` Sebastian Schuberth
2009-03-22 21:59 ` Junio C Hamano
2009-03-23 0:39 ` Junio C Hamano
1 sibling, 1 reply; 14+ messages in thread
From: Sebastian Schuberth @ 2009-03-22 15:30 UTC (permalink / raw)
To: Jeff King; +Cc: Junio C Hamano, Johannes Schindelin, git
>> While the purist in me says #1 above is the right argument to make for
>> feeding "clean" version, I suspect that the textconv or extdiff tools more
>> often are not made from scratch and ported across platforms than are
>> cobbled up together out of tools the script writer finds on his platform.
>> I suspect that Dscho's "a tempfile should look like a checkout" would be
>> much friendlier to them in practice for this reason.
>
> I think you and I have about the same feeling on this, then. As somebody
> who does not actually use smudge/clean filters at all, I am willing to
> defer to Dscho's opinion, which is based on practical experience.
Me being the reporter of the original msysGit issue #177, I'd like to
clarify that my intention not necessarily was to make
"core.autocrlf=true" affect temporary files (i.e. to "smudge" them),
but to ensure that the files fed into "git diff" are always generated
/ acquired in a consistent way, so that they are in fact comparable.
I'd also be happy with a solution that always feeds clean files into
"git diff", although that would probably mean that we could not reuse
working tree files if "core.autocrlf=true" is set.
Maybe it's a good idea to look at how gitk displays the diff, for an
orientation. If the diff gitk shows is based on smudged files, git
diff should probably also always be fed with smudged files, and if the
diff gitk shows is based on clean files, git diff should probably also
always be fed with clean files.
--
Sebastian Schuberth
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 15:30 ` Sebastian Schuberth
@ 2009-03-22 21:59 ` Junio C Hamano
2009-03-22 22:48 ` Sebastian Schuberth
0 siblings, 1 reply; 14+ messages in thread
From: Junio C Hamano @ 2009-03-22 21:59 UTC (permalink / raw)
To: Sebastian Schuberth; +Cc: Jeff King, Johannes Schindelin, git
Sebastian Schuberth <sschuberth@gmail.com> writes:
> Me being the reporter of the original msysGit issue #177, I'd like to
> clarify that my intention not necessarily was to make
> "core.autocrlf=true" affect temporary files (i.e. to "smudge" them),
> but to ensure that the files fed into "git diff" are always generated
> / acquired in a consistent way, so that they are in fact comparable.
Thanks. I think everybody involved in the thread is in agreement with
that.
> I'd also be happy with a solution that always feeds clean files into
> "git diff", although that would probably mean that we could not reuse
> working tree files if "core.autocrlf=true" is set.
When we generate diff internally, even when we borrow from the work tree,
we clean it before using. See diff_populate_filespec(), ll.1900-1915.
Borrowing done by diff_tempfile(), which currently does not run clean, and
the call to prep_temp_blob() in ll.2030-2035 that gives a temporary file
without convert_to_working_tree() are inconsistent, as pointed out by you
and Dscho.
If you run "git diff <filename>" after cloning, I expect that no temporary
files are involved, _unless_ you have some settings that force "git diff"
not to use the internal diff. Do you use GIT_EXTERNAL_DIFF? Do you use
"textconv" attribute? What external program do you invoke from these
mechanisms, and what does it expect to see as its input?
The discussion in the last few messages in this thread speculates that the
external programs are more likely to expect representations suitable in
the work tree, aka "smudged", than "clean" one. It would be nice to get a
datapoint from you as the original reporter to confirm or refute that
speculation.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 21:59 ` Junio C Hamano
@ 2009-03-22 22:48 ` Sebastian Schuberth
2009-03-22 23:12 ` Junio C Hamano
0 siblings, 1 reply; 14+ messages in thread
From: Sebastian Schuberth @ 2009-03-22 22:48 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Jeff King, Johannes Schindelin, git
> If you run "git diff <filename>" after cloning, I expect that no temporary
> files are involved, _unless_ you have some settings that force "git diff"
> not to use the internal diff. Do you use GIT_EXTERNAL_DIFF? Do you use
> "textconv" attribute? What external program do you invoke from these
> mechanisms, and what does it expect to see as its input?
As I prefer graphical diff tools, I do not use internal diff, but
Beyond Compare [1]. In order to do that I have configured
diff.external to point to a wrapper script that contains the following
lines:
---8<---
#!/bin/bash
# diff is called with 7 parameters:
# path old-file old-hex old-mode new-file new-hex new-mode
"C:/Program Files/Beyond Compare 3/BCompare.exe" "$2" "$5" | cat
---8<---
As Beyond Compare is a stand-alone diff / merge tool, it expects to be
working on regular files in the file system. And to be hostest, I did
not know about the "textconv" attribute until now.
> The discussion in the last few messages in this thread speculates that the
> external programs are more likely to expect representations suitable in
> the work tree, aka "smudged", than "clean" one. It would be nice to get a
> datapoint from you as the original reporter to confirm or refute that
> speculation.
I agree to the speculations. IMHO calling an external diff tool with
two revisions of a file should result in the same as e.g. checking out
the two revisions in two different working trees and then launching
the user's external diff tool on the two working tree files.
[1] http://www.scootersoftware.com/
--
Sebastian Schuberth
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 22:48 ` Sebastian Schuberth
@ 2009-03-22 23:12 ` Junio C Hamano
0 siblings, 0 replies; 14+ messages in thread
From: Junio C Hamano @ 2009-03-22 23:12 UTC (permalink / raw)
To: Sebastian Schuberth; +Cc: Jeff King, Johannes Schindelin, git
Sebastian Schuberth <sschuberth@gmail.com> writes:
>> The discussion in the last few messages in this thread speculates that the
>> external programs are more likely to expect representations suitable in
>> the work tree, aka "smudged", than "clean" one. It would be nice to get a
>> datapoint from you as the original reporter to confirm or refute that
>> speculation.
>
> I agree to the speculations. IMHO calling an external diff tool with
> two revisions of a file should result in the same as e.g. checking out
> the two revisions in two different working trees and then launching
> the user's external diff tool on the two working tree files.
Ok.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Respect core.autocrlf when preparing temporary files for external diff
2009-03-22 7:46 ` Jeff King
2009-03-22 15:30 ` Sebastian Schuberth
@ 2009-03-23 0:39 ` Junio C Hamano
1 sibling, 0 replies; 14+ messages in thread
From: Junio C Hamano @ 2009-03-23 0:39 UTC (permalink / raw)
To: Jeff King; +Cc: Johannes Schindelin, git, Sebastian Schuberth
Jeff King <peff@peff.net> writes:
> On Sun, Mar 22, 2009 at 12:18:33AM -0700, Junio C Hamano wrote:
>
>> Ok. Although I already queued the removal to 'pu' for tonight's pushout
>> and it is way too late to revert that, I think I didn't have to remove the
>> function. The codepath that lets you cheat by borrowing from the checkout
>> runs convert_to_git() when it borrows, and if you are seeing a meaningful
>> optimization even with that overhead, perhaps it would be worth keeping.
>
> I certainly haven't done exhaustive tests. Obviously the one I did was a
> bit contrived. I just think it makes sense to have numbers rather than
> saying "this probably doesn't do anything anymore".
>
>> While the purist in me says #1 above is the right argument to make for
>> feeding "clean" version, I suspect that the textconv or extdiff tools more
>> often are not made from scratch and ported across platforms than are
>> cobbled up together out of tools the script writer finds on his platform.
>> I suspect that Dscho's "a tempfile should look like a checkout" would be
>> much friendlier to them in practice for this reason.
>
> I think you and I have about the same feeling on this, then. As somebody
> who does not actually use smudge/clean filters at all, I am willing to
> defer to Dscho's opinion, which is based on practical experience.
Thanks for a sanity check.
I've split these as two unrelated issues, and have queued
(1) Dscho's patch, that always feeds smudged representation to the
external diff and textconv filter;
(2) A change to reuse_worktree_file() that says "don't reuse" for paths
with CE_VALID set.
in 'pu'.
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2009-03-23 0:41 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <cover.1237635609u.git.johannes.schindelin@gmx.de>
2009-03-21 11:42 ` [PATCH] Respect core.autocrlf when preparing temporary files for external diff Johannes Schindelin
2009-03-21 17:02 ` Michael J Gruber
2009-03-21 19:35 ` Junio C Hamano
2009-03-21 23:54 ` Johannes Schindelin
2009-03-22 0:03 ` Junio C Hamano
2009-03-22 0:41 ` Junio C Hamano
2009-03-22 6:10 ` Jeff King
2009-03-22 7:18 ` Junio C Hamano
2009-03-22 7:46 ` Jeff King
2009-03-22 15:30 ` Sebastian Schuberth
2009-03-22 21:59 ` Junio C Hamano
2009-03-22 22:48 ` Sebastian Schuberth
2009-03-22 23:12 ` Junio C Hamano
2009-03-23 0:39 ` Junio C Hamano
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).