* [PATCH 1/2] diffcore-rename: support rename cache
@ 2008-11-07 14:35 Nguyễn Thái Ngọc Duy
2008-11-07 14:35 ` [PATCH 2/2] diffcore-rename: add config option to allow to cache renames Nguyễn Thái Ngọc Duy
2008-11-07 22:21 ` [PATCH 1/2] diffcore-rename: support rename cache Yann Dirson
0 siblings, 2 replies; 10+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2008-11-07 14:35 UTC (permalink / raw)
To: git; +Cc: Nguyễn Thái Ngọc Duy
This patch teaches diffcore_rename() to look into
$GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
With proper cache, there should be no available entry for estimation
after exact matching.
Rename caching is per commit. I don't think arbitrary tree-tree caching
is worth it.
$GIT_DIR/rename-cache fans out like $GIT_DIR/objects. Each file
corresponds to one commit. Its content consists of lines like this
<Destination SHA-1> <SPC> <Source SHA-1> <SPC> <Score in decimal> <NL>
This can be used to:
- Make --find-copies-harder practically usable for moderate-size
repositories. The first "git show" on a linux kernel commit was 5.3
sec, it then went down to 0.13 sec.
- Give git-svn a chance to (locally) import explicit renames from
Subversion
- People may correct rename results for better diff, if automatic
rename detection is not good enough.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
diff.h | 2 +
diffcore-rename.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++-
log-tree.c | 2 +
t/t4030-rename-cache.sh | 55 ++++++++++++++++++
4 files changed, 199 insertions(+), 2 deletions(-)
create mode 100755 t/t4030-rename-cache.sh
diff --git a/diff.h b/diff.h
index a49d865..8b68f6f 100644
--- a/diff.h
+++ b/diff.h
@@ -110,6 +110,8 @@ struct diff_options {
add_remove_fn_t add_remove;
diff_format_fn_t format_callback;
void *format_callback_data;
+
+ struct commit *commit;
};
enum color_diff {
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 168a95b..598cc8d 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -5,6 +5,7 @@
#include "diff.h"
#include "diffcore.h"
#include "hash.h"
+#include "commit.h"
/* Table of rename/copy destinations */
@@ -409,13 +410,130 @@ static void record_if_better(struct diff_score m[], struct diff_score *o)
m[worst] = *o;
}
+struct cached_filepair {
+ unsigned char dst[20];
+ unsigned char src[20];
+ int score;
+};
+
+static int free_cached_filepair(void *p)
+{
+ free(p);
+ return 0;
+}
+
+static void load_rename_cache(struct diff_queue_struct *q,
+ struct diff_queue_struct *cacheq,
+ struct diff_options *options)
+{
+ char *sha1_hex;
+ FILE *fp;
+ struct hash_table filepair_table;
+ struct hash_table src_table;
+ struct cached_filepair *pp;
+ int i, hash;
+ static int no_cache_available = -1;
+ struct stat st;
+ char *path;
+
+ if (no_cache_available == -1)
+ no_cache_available = stat(git_path("rename-cache"), &st) || !S_ISDIR(st.st_mode);
+
+ /* return soon so we don't need to waste CPU */
+ if (no_cache_available > 0)
+ return;
+
+
+ /* src_table initialization */
+ init_hash(&src_table);
+ for (i = 0; i < q->nr; i++) {
+ struct diff_filepair *p = q->queue[i];
+ if (DIFF_FILE_VALID(p->one)) {
+ unsigned int hash = hash_filespec(p->one);
+ insert_hash(hash, p, &src_table);
+ }
+ }
+
+ /* filepair_table initialization */
+ init_hash(&filepair_table);
+ sha1_hex = sha1_to_hex(options->commit->object.sha1);
+ path = git_path("rename-cache/%c%c/%s",sha1_hex[0], sha1_hex[1], sha1_hex+2);
+ if (stat(path, &st))
+ fp = NULL;
+ else
+ fp = fopen(path, "r");
+ if (fp) {
+ char src_sha1_hex[41], dst_sha1_hex[41];
+ struct cached_filepair p;
+
+ src_sha1_hex[40] = dst_sha1_hex[40] = '\0';
+ while (fscanf(fp, "%40c %40c %d\n", dst_sha1_hex, src_sha1_hex, &p.score) == 3) {
+ if (get_sha1_hex(src_sha1_hex, p.src) ||
+ get_sha1_hex(dst_sha1_hex, p.dst))
+ break;
+
+ pp = xmalloc(sizeof(*pp));
+ memcpy(pp, &p, sizeof(*pp));
+ memcpy(&hash, p.dst, sizeof(hash));
+ insert_hash(hash, pp, &filepair_table);
+ }
+ fclose(fp);
+ }
+
+ for (i = 0; i < q->nr; i++) {
+ struct diff_filepair *p = q->queue[i];
+ struct diff_filepair *dp, *src;
+
+ /* find remote_dst */
+ if (DIFF_FILE_VALID(p->one) ||
+ !DIFF_FILE_VALID(p->two) ||
+ (options->single_follow && strcmp(options->single_follow, p->two->path)))
+ continue;
+
+ memcpy(&hash, p->two->sha1, sizeof(hash));
+ pp = lookup_hash(hash, &filepair_table);
+ if (!pp || memcmp(p->two->sha1, pp->dst, 20))
+ continue;
+
+ /* create pair */
+ if (is_null_sha1(pp->src)) {
+ if (DIFF_FILE_VALID(p->one))
+ continue;
+ diff_q(cacheq, p);
+ q->queue[i] = NULL;
+ continue;
+ }
+
+ memcpy(&hash, pp->src, sizeof(hash));
+ src = lookup_hash(hash, &src_table);
+ if (!src || memcmp(pp->src, src->one->sha1, 20))
+ continue;
+
+ src->one->rename_used++;
+ src->one->count++;
+ p->two->count++;
+
+ dp = diff_queue(NULL, src->one, p->two);
+ dp->renamed_pair = 1;
+ dp->score = pp->score;
+
+ diff_q(cacheq, dp);
+ q->queue[i] = NULL;
+ diff_free_filepair(p);
+ }
+
+ for_each_hash(&filepair_table, free_cached_filepair);
+ free_hash(&src_table);
+ free_hash(&filepair_table);
+}
+
void diffcore_rename(struct diff_options *options)
{
int detect_rename = options->detect_rename;
int minimum_score = options->rename_score;
int rename_limit = options->rename_limit;
struct diff_queue_struct *q = &diff_queued_diff;
- struct diff_queue_struct outq;
+ struct diff_queue_struct outq, cacheq;
struct diff_score *mx;
int i, j, rename_count;
int num_create, num_src, dst_cnt;
@@ -423,8 +541,19 @@ void diffcore_rename(struct diff_options *options)
if (!minimum_score)
minimum_score = DEFAULT_RENAME_SCORE;
+ cacheq.queue = NULL;
+ cacheq.nr = cacheq.alloc = 0;
+
+ if (detect_rename && options->commit)
+ load_rename_cache(q, &cacheq, options);
+
for (i = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
+
+ /* was consumed by rename cache */
+ if (!p)
+ continue;
+
if (!DIFF_FILE_VALID(p->one)) {
if (!DIFF_FILE_VALID(p->two))
continue; /* unmerged */
@@ -563,10 +692,17 @@ void diffcore_rename(struct diff_options *options)
*/
outq.queue = NULL;
outq.nr = outq.alloc = 0;
- for (i = 0; i < q->nr; i++) {
+ for (i = j = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
struct diff_filepair *pair_to_free = NULL;
+ if (!p) {
+ if (j >= cacheq.nr)
+ die("Internal error: running out of cacheq.");
+ diff_q(&outq, cacheq.queue[j++]);
+ continue;
+ }
+
if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
/*
* Creation
@@ -635,6 +771,8 @@ void diffcore_rename(struct diff_options *options)
diff_debug_queue("done copying original", &outq);
free(q->queue);
+ if (cacheq.queue)
+ free(cacheq.queue);
*q = outq;
diff_debug_queue("done collapsing", q);
diff --git a/log-tree.c b/log-tree.c
index cec3c06..a67ef6d 100644
--- a/log-tree.c
+++ b/log-tree.c
@@ -518,6 +518,7 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
log.commit = commit;
log.parent = NULL;
opt->loginfo = &log;
+ opt->diffopt.commit = commit;
shown = log_tree_diff(opt, commit, &log);
if (!shown && opt->loginfo && opt->always_show_header) {
@@ -527,5 +528,6 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
}
opt->loginfo = NULL;
maybe_flush_or_die(stdout, "stdout");
+ opt->diffopt.commit = NULL;
return shown;
}
diff --git a/t/t4030-rename-cache.sh b/t/t4030-rename-cache.sh
new file mode 100755
index 0000000..0d8390c
--- /dev/null
+++ b/t/t4030-rename-cache.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+#
+# Copyright (c) 2008 Nguyen Thai Ngoc Duy
+#
+
+test_description='Test diff rename cache'
+. ./test-lib.sh
+
+cat >expected <<EOF
+ create mode 100644 c
+ copy a => d (100%)
+EOF
+test_expect_success 'setup' '
+ for i in a b c
+ do
+ echo $i > $i
+ done
+ cp a d
+ A_SHA1=$(git hash-object a)
+ B_SHA1=$(git hash-object b)
+ C_SHA1=$(git hash-object c)
+ D_SHA1=$(git hash-object d)
+ git add a b
+ git commit -m first
+ git add c d
+ git commit -m second
+ git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+ test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => c (100%)
+ copy a => d (100%)
+EOF
+test_expect_success 'load rename pair cache' '
+ P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") &&
+ mkdir -p $(dirname $P)
+ echo $C_SHA1 $A_SHA1 60000 >> $P
+ git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+ test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => c (100%)
+ create mode 100644 d
+EOF
+test_expect_success 'load create pair cache' '
+ P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") &&
+ mkdir -p $(dirname $P)
+ echo $D_SHA1 0000000000000000000000000000000000000000 0 >> $P
+ git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+ test_cmp expected result
+'
+
+test_done
--
1.6.0.3.802.g47c38
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 2/2] diffcore-rename: add config option to allow to cache renames
2008-11-07 14:35 [PATCH 1/2] diffcore-rename: support rename cache Nguyễn Thái Ngọc Duy
@ 2008-11-07 14:35 ` Nguyễn Thái Ngọc Duy
2008-11-07 22:21 ` [PATCH 1/2] diffcore-rename: support rename cache Yann Dirson
1 sibling, 0 replies; 10+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2008-11-07 14:35 UTC (permalink / raw)
To: git; +Cc: Nguyễn Thái Ngọc Duy
If diff.cacherenames is true, then renames will be cached to
$GIT_DIR/rename-cache. By default, it will not overwrite existing
cache. Add --refresh-rename-cache to overwrite.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
if git-svn is going to use this, then perhaps we should add a rule to prevent
overwriting certain cache files with .keep files, so that git-svn generated cache
does not get lost
Documentation/config.txt | 5 ++++
Documentation/diff-options.txt | 5 ++++
diff.c | 12 +++++++++++
diff.h | 2 +
diffcore-rename.c | 41 ++++++++++++++++++++++++++++++++++++++++
t/t4030-rename-cache.sh | 27 ++++++++++++++++++++++++++
6 files changed, 92 insertions(+), 0 deletions(-)
diff --git a/Documentation/config.txt b/Documentation/config.txt
index 29369d0..81160d3 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -630,6 +630,11 @@ diff.renames::
will enable basic rename detection. If set to "copies" or
"copy", it will detect copies, as well.
+diff.cacherenames::
+ Tells git to automatically cache renames when detected. The
+ cache resides in $GIT_DIR/rename-cache, which is used by git
+ if exists.
+
fetch.unpackLimit::
If the number of objects fetched over the git native
transfer is below this
diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
index c62b45c..d477a40 100644
--- a/Documentation/diff-options.txt
+++ b/Documentation/diff-options.txt
@@ -102,6 +102,11 @@ endif::git-format-patch[]
Turn off rename detection, even when the configuration
file gives the default to do so.
+--refresh-rename-cache::
+ By default, when git finds a cached version of a commit, it
+ will not overwrite the cache. This option makes git overwrite
+ old cache or create a new one.
+
--check::
Warn if changes introduce trailing whitespace
or an indent that uses a space before a tab. Exits with
diff --git a/diff.c b/diff.c
index e368fef..1d65bd9 100644
--- a/diff.c
+++ b/diff.c
@@ -26,6 +26,7 @@ int diff_use_color_default = -1;
static const char *external_diff_cmd_cfg;
int diff_auto_refresh_index = 1;
static int diff_mnemonic_prefix;
+static int diff_cache_renames;
static char diff_colors[][COLOR_MAXLEN] = {
"\033[m", /* reset */
@@ -109,6 +110,11 @@ int git_diff_basic_config(const char *var, const char *value, void *cb)
return 0;
}
+ if (!strcmp(var, "diff.cacherenames")) {
+ diff_cache_renames = git_config_bool(var, value);
+ return 0;
+ }
+
if (!prefixcmp(var, "diff.color.") || !prefixcmp(var, "color.diff.")) {
int slot = parse_diff_color_slot(var, 11);
if (!value)
@@ -2248,6 +2254,8 @@ int diff_setup_done(struct diff_options *options)
if (options->detect_rename && options->rename_limit < 0)
options->rename_limit = diff_rename_limit_default;
+ if (options->detect_rename)
+ options->cache_renames = diff_cache_renames;
if (options->setup & DIFF_SETUP_USE_CACHE) {
if (!active_cache)
/* read-cache does not die even when it fails
@@ -2415,6 +2423,10 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
DIFF_OPT_SET(options, RELATIVE_NAME);
options->prefix = arg + 11;
}
+ else if (!strcmp(arg, "--refresh-rename-cache")) {
+ options->cache_renames = 1;
+ options->refresh_rename_cache = 1;
+ }
/* xdiff options */
else if (!strcmp(arg, "-w") || !strcmp(arg, "--ignore-all-space"))
diff --git a/diff.h b/diff.h
index 8b68f6f..eb97955 100644
--- a/diff.h
+++ b/diff.h
@@ -85,6 +85,8 @@ struct diff_options {
int pickaxe_opts;
int rename_score;
int rename_limit;
+ int cache_renames;
+ int refresh_rename_cache;
int warn_on_too_large_rename;
int dirstat_percent;
int setup;
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 598cc8d..2b87e4e 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -527,6 +527,44 @@ static void load_rename_cache(struct diff_queue_struct *q,
free_hash(&filepair_table);
}
+static void save_rename_cache(struct diff_queue_struct *outq,
+ struct diff_options *options)
+{
+ int i;
+ FILE *fp = NULL;
+ struct stat st;
+
+ for (i = 0;i < outq->nr; i++) {
+ struct diff_filepair *dp = outq->queue[i];
+
+ if (!(dp->renamed_pair || /* rename pair */
+ (!DIFF_FILE_VALID(dp->one) && DIFF_FILE_VALID(dp->two)))) /* create pair */
+ continue;
+
+ if (!fp) {
+ char *sha1 = sha1_to_hex(options->commit->object.sha1);
+ char *path = git_path("rename-cache/%c%c/%s", sha1[0], sha1[1], sha1+2);
+
+ /* Already cached. If not force refresh, move on */
+ if (!stat(path, &st) && !options->refresh_rename_cache)
+ return;
+
+ safe_create_leading_directories(path);
+ fp = fopen(path, "w");
+
+ if (!fp)
+ return;
+ }
+
+ fprintf(fp, "%s ", sha1_to_hex(dp->two->sha1));
+ fprintf(fp, "%s %d\n",
+ sha1_to_hex(DIFF_FILE_VALID(dp->one) ? dp->one->sha1 : null_sha1),
+ dp->score);
+ }
+ if (fp)
+ fclose(fp);
+}
+
void diffcore_rename(struct diff_options *options)
{
int detect_rename = options->detect_rename;
@@ -770,6 +808,9 @@ void diffcore_rename(struct diff_options *options)
}
diff_debug_queue("done copying original", &outq);
+ if (options->commit && options->cache_renames)
+ save_rename_cache(&outq, options);
+
free(q->queue);
if (cacheq.queue)
free(cacheq.queue);
diff --git a/t/t4030-rename-cache.sh b/t/t4030-rename-cache.sh
index 0d8390c..24d3667 100755
--- a/t/t4030-rename-cache.sh
+++ b/t/t4030-rename-cache.sh
@@ -52,4 +52,31 @@ test_expect_success 'load create pair cache' '
test_cmp expected result
'
+cat >expected <<EOF
+f2ad6c76f0115a6ba5b00456a849810e7ec0af20 0000000000000000000000000000000000000000 0
+78981922613b2afb6025042ff6bd878ac1994e85 78981922613b2afb6025042ff6bd878ac1994e85 60000
+EOF
+test_expect_success 'save rename cache' '
+ P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") &&
+ rm -r .git/rename-cache
+ git config diff.cacherenames true
+ git show --summary -C -M --find-copies-harder > /dev/null
+ test_cmp expected $P
+'
+
+test_expect_success 'subsequent command does not change cache' '
+ P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") &&
+ echo corrupted >> $P
+ ! test_cmp expected $P &&
+ git show --summary -C -M --find-copies-harder HEAD > /dev/null &&
+ ! test_cmp expected $P
+'
+
+test_expect_success 'overwrite cache with --refresh-rename-cache' '
+ P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") &&
+ ! test_cmp expected $P &&
+ git show --summary -C -M --find-copies-harder --refresh-rename-cache HEAD > /dev/null &&
+ test_cmp expected $P
+'
+
test_done
--
1.6.0.3.802.g47c38
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] diffcore-rename: support rename cache
2008-11-07 14:35 [PATCH 1/2] diffcore-rename: support rename cache Nguyễn Thái Ngọc Duy
2008-11-07 14:35 ` [PATCH 2/2] diffcore-rename: add config option to allow to cache renames Nguyễn Thái Ngọc Duy
@ 2008-11-07 22:21 ` Yann Dirson
2008-11-07 23:17 ` Junio C Hamano
1 sibling, 1 reply; 10+ messages in thread
From: Yann Dirson @ 2008-11-07 22:21 UTC (permalink / raw)
To: Nguyễn Thái Ngọc Duy; +Cc: git
On Fri, Nov 07, 2008 at 09:35:32PM +0700, Nguyễn Thái Ngọc Duy wrote:
> This patch teaches diffcore_rename() to look into
> $GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
> With proper cache, there should be no available entry for estimation
> after exact matching.
This is something I have thought about in the past, good to see that
implemented :)
> Rename caching is per commit. I don't think abitrary tree-tree caching
> is worth it.
That could be a nice complement to my directory-rename patch.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] diffcore-rename: support rename cache
2008-11-07 22:21 ` [PATCH 1/2] diffcore-rename: support rename cache Yann Dirson
@ 2008-11-07 23:17 ` Junio C Hamano
2008-11-08 4:01 ` Nguyen Thai Ngoc Duy
0 siblings, 1 reply; 10+ messages in thread
From: Junio C Hamano @ 2008-11-07 23:17 UTC (permalink / raw)
To: Yann Dirson; +Cc: Nguyễn Thái Ngọc Duy, git
Yann Dirson <ydirson@altern.org> writes:
> On Fri, Nov 07, 2008 at 09:35:32PM +0700, Nguyễn Thái Ngọc Duy wrote:
>> This patch teaches diffcore_rename() to look into
>> $GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
>> With proper cache, there should be no available entry for estimation
>> after exact matching.
>
> This is something I have thought about in the past, good to see that
> implemented :)
>
>> Rename caching is per commit. I don't think abitrary tree-tree caching
>> is worth it.
>
> That could be a nice complement to my directory-rename patch.
Has anybody thought about interaction between that caching and pathspec
limited operation?
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] diffcore-rename: support rename cache
2008-11-07 23:17 ` Junio C Hamano
@ 2008-11-08 4:01 ` Nguyen Thai Ngoc Duy
2008-11-08 9:24 ` Yann Dirson
0 siblings, 1 reply; 10+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2008-11-08 4:01 UTC (permalink / raw)
To: Junio C Hamano; +Cc: Yann Dirson, git
On 11/8/08, Junio C Hamano <gitster@pobox.com> wrote:
> Yann Dirson <ydirson@altern.org> writes:
>
> > On Fri, Nov 07, 2008 at 09:35:32PM +0700, Nguyễn Thái Ngọc Duy wrote:
> >> This patch teaches diffcore_rename() to look into
> >> $GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
> >> With proper cache, there should be no available entry for estimation
> >> after exact matching.
> >
> > This is something I have thought about in the past, good to see that
> > implemented :)
> >
> >> Rename caching is per commit. I don't think abitrary tree-tree caching
> >> is worth it.
> >
> > That could be a nice complement to my directory-rename patch.
>
>
> Has anybody thought about interaction between that caching and pathspec
> limited operation?
>
I didn't. But I think all out-of-pathspec diff pairs are removed
before it reaches diffcore_rename() so the cache has nothing to do
with it (except it still loads full cache for a commit).
--
Duy
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] diffcore-rename: support rename cache
2008-11-08 4:01 ` Nguyen Thai Ngoc Duy
@ 2008-11-08 9:24 ` Yann Dirson
2008-11-08 9:29 ` Nguyen Thai Ngoc Duy
2008-11-08 11:47 ` Jeff King
0 siblings, 2 replies; 10+ messages in thread
From: Yann Dirson @ 2008-11-08 9:24 UTC (permalink / raw)
To: Nguyen Thai Ngoc Duy; +Cc: Junio C Hamano, git
On Sat, Nov 08, 2008 at 11:01:20AM +0700, Nguyen Thai Ngoc Duy wrote:
> On 11/8/08, Junio C Hamano <gitster@pobox.com> wrote:
> > Yann Dirson <ydirson@altern.org> writes:
> >
> > > On Fri, Nov 07, 2008 at 09:35:32PM +0700, Nguyễn Thái Ngọc Duy wrote:
> > >> This patch teaches diffcore_rename() to look into
> > >> $GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
> > >> With proper cache, there should be no available entry for estimation
> > >> after exact matching.
> > >
> > > This is something I have thought about in the past, good to see that
> > > implemented :)
> > >
> > >> Rename caching is per commit. I don't think abitrary tree-tree caching
> > >> is worth it.
> > >
> > > That could be a nice complement to my directory-rename patch.
> >
> >
> > Has anybody thought about interaction between that caching and pathspec
> > limited operation?
> >
>
> I didn't. But I think all out-of-pathspec diff pairs are removed
> before it reaches diffcore_rename() so the cache has nothing to do
> with it (except it still loads full cache for a commit).
Well, it could be that an out-of-pathspec pair would have a better
score than an in-pathspec one. Maybe cache recording should be turned
off when doing pathspec limitation ?
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] diffcore-rename: support rename cache
2008-11-08 9:24 ` Yann Dirson
@ 2008-11-08 9:29 ` Nguyen Thai Ngoc Duy
2008-11-08 11:47 ` Jeff King
1 sibling, 0 replies; 10+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2008-11-08 9:29 UTC (permalink / raw)
To: Yann Dirson; +Cc: Junio C Hamano, git
On 11/8/08, Yann Dirson <ydirson@altern.org> wrote:
> On Sat, Nov 08, 2008 at 11:01:20AM +0700, Nguyen Thai Ngoc Duy wrote:
> > On 11/8/08, Junio C Hamano <gitster@pobox.com> wrote:
> > > Yann Dirson <ydirson@altern.org> writes:
> > >
> > > > On Fri, Nov 07, 2008 at 09:35:32PM +0700, Nguyễn Thái Ngọc Duy wrote:
> > > >> This patch teaches diffcore_rename() to look into
> > > >> $GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
> > > >> With proper cache, there should be no available entry for estimation
> > > >> after exact matching.
> > > >
> > > > This is something I have thought about in the past, good to see that
> > > > implemented :)
> > > >
> > > >> Rename caching is per commit. I don't think abitrary tree-tree caching
> > > >> is worth it.
> > > >
> > > > That could be a nice complement to my directory-rename patch.
> > >
> > >
> > > Has anybody thought about interaction between that caching and pathspec
> > > limited operation?
> > >
> >
> > I didn't. But I think all out-of-pathspec diff pairs are removed
> > before it reaches diffcore_rename() so the cache has nothing to do
> > with it (except it still loads full cache for a commit).
>
>
> Well, it could be that an out-of-pathspec pair would have a better
> score than an in-pathspec one. Maybe cache recording should be turned
> off when doing pathspec limitation ?
Right, recording should be turned off or something. Let me see..
--
Duy
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] diffcore-rename: support rename cache
2008-11-08 9:24 ` Yann Dirson
2008-11-08 9:29 ` Nguyen Thai Ngoc Duy
@ 2008-11-08 11:47 ` Jeff King
2008-11-08 12:00 ` Nguyen Thai Ngoc Duy
1 sibling, 1 reply; 10+ messages in thread
From: Jeff King @ 2008-11-08 11:47 UTC (permalink / raw)
To: Yann Dirson; +Cc: Nguyen Thai Ngoc Duy, Junio C Hamano, git
On Sat, Nov 08, 2008 at 10:24:10AM +0100, Yann Dirson wrote:
> Well, it could be that an out-of-pathspec pair would have a better
> score than an in-pathspec one. Maybe cache recording should be turned
> off when doing pathspec limitation ?
One thing I notice is that the cache works at the level of "here is the
best rename for this commit." Maybe it could go down a level and say
"here is the inexact rename score between these blobs". Then you would
still find the best score between two blobs each time, but save the
really computationally intensive part (which is comparing the actual
_content_ of the blobs).
That should work in the face of path limiting or any other option,
because it is caching something immutable: this is the similarity score
between two pieces of content. And then you get arbitrary tree-to-tree
speedups for free, since such a cache would be valid for every commit.
The downsides are:
- your cache is potentially bigger, since you are caching the score of
every pair you look at, instead of just "good" pairs (OTOH, you are
not doing a per-commit cache, which helps reduce the size)
- you can still "lie" about a score to pre-seed imported SVN renames,
but such lying will actually apply to all commits.
-Peff
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] diffcore-rename: support rename cache
2008-11-08 11:47 ` Jeff King
@ 2008-11-08 12:00 ` Nguyen Thai Ngoc Duy
2008-11-09 2:04 ` Jeff King
0 siblings, 1 reply; 10+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2008-11-08 12:00 UTC (permalink / raw)
To: Jeff King; +Cc: Yann Dirson, Junio C Hamano, git
On 11/8/08, Jeff King <peff@peff.net> wrote:
> On Sat, Nov 08, 2008 at 10:24:10AM +0100, Yann Dirson wrote:
>
> > Well, it could be that an out-of-pathspec pair would have a better
> > score than an in-pathspec one. Maybe cache recording should be turned
> > off when doing pathspec limitation ?
>
>
> One thing I notice is that the cache works at the level of "here is the
> best rename for this commit." Maybe it could go down a level and say
> "here is the inexact rename score between these blobs". Then you would
> still find the best score between two blobs each time, but save the
> really computationally intensive part (which is comparing the actual
> _content_ of the blobs).
> That should work in the face of path limiting or any other option,
> because it is caching something immutable: this is the similarity score
> between two pieces of content. And then you get arbitrary tree-to-tree
> speedups for free, since such a cache would be valid for every commit.
I did that and realized the cost was not from each diff, in
--find-copies-harder case, but from the number of diffs you had to do.
Even with exact matching on linux-2.6.git, it could take significant
time (it was about 5 minutes in no-cache case, 1 minute without exact
match cache, and less than 1 sec if everything is cached).
>
> The downsides are:
>
> - your cache is potentially bigger, since you are caching the score of
> every pair you look at, instead of just "good" pairs (OTOH, you are
> not doing a per-commit cache, which helps reduce the size)
It is huge if you accidentally add --find-copies-harder to your
command, considering that every new file will be compared against
every file in the tree (about 25k).
> - you can still "lie" about a score to pre-seed imported SVN renames,
> but such lying will actually apply to all commits.
--
Duy
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 1/2] diffcore-rename: support rename cache
2008-11-08 12:00 ` Nguyen Thai Ngoc Duy
@ 2008-11-09 2:04 ` Jeff King
0 siblings, 0 replies; 10+ messages in thread
From: Jeff King @ 2008-11-09 2:04 UTC (permalink / raw)
To: Nguyen Thai Ngoc Duy; +Cc: Yann Dirson, Junio C Hamano, git
On Sat, Nov 08, 2008 at 07:00:10PM +0700, Nguyen Thai Ngoc Duy wrote:
> > The downsides are:
> >
> > - your cache is potentially bigger, since you are caching the score of
> > every pair you look at, instead of just "good" pairs (OTOH, you are
> > not doing a per-commit cache, which helps reduce the size)
>
> It is huge if you accidentially add --find-copies-harder to your
> command, considering that every new file will be compared against
> every files in tree (about 25k).
Hmm, yeah. I was thinking you might be able to do some kind of cut-off
on the caching (i.e., don't bother storing anything that didn't come
close). But you can't safely assume that because an entry isn't there,
it isn't worth seeing (since it might also just not have been computed
yet). You could still organize by commit, and then each commit is either
fully computed or not. But then you still have a pathspec problem.
One thing you could do is just compute the rename score between all
pairs, even if a pathspec is given, limit it to values over "0.5" (or
something low, but that eliminates the totally uninteresting cases), and
then store that as the complete cache for that commit (or tree pair, if
you want to support that).
Then you would have the full information and could do an arbitrary
pathspec limit on it. If you wanted to set the rename threshold below
0.5, then we would have to recompute without the cache (but in practice,
that should be rare).
The real downside is that you pay for the whole-tree detection when you
have asked for a pathspec (but only the first time, after which you can
always generate from cache).
Just thinking out loud...
-Peff
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2008-11-09 2:06 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-11-07 14:35 [PATCH 1/2] diffcore-rename: support rename cache Nguyễn Thái Ngọc Duy
2008-11-07 14:35 ` [PATCH 2/2] diffcore-rename: add config option to allow to cache renames Nguyễn Thái Ngọc Duy
2008-11-07 22:21 ` [PATCH 1/2] diffcore-rename: support rename cache Yann Dirson
2008-11-07 23:17 ` Junio C Hamano
2008-11-08 4:01 ` Nguyen Thai Ngoc Duy
2008-11-08 9:24 ` Yann Dirson
2008-11-08 9:29 ` Nguyen Thai Ngoc Duy
2008-11-08 11:47 ` Jeff King
2008-11-08 12:00 ` Nguyen Thai Ngoc Duy
2008-11-09 2:04 ` Jeff King
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox