From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org, Junio C Hamano <gitster@pobox.com>,
Yann Dirson <ydirson@altern.org>
Cc: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v2 1/2] diffcore-rename: support rename cache
Date: Sat, 8 Nov 2008 18:27:32 +0700 [thread overview]
Message-ID: <1226143653-3758-1-git-send-email-pclouds@gmail.com> (raw)
This patch teaches diffcore_rename() to look into
$GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
With proper cache, there should be no available entry for estimation
after exact matching.
Rename caching is per commit. I don't think abitrary tree-tree caching
is worth it.
$GIT_DIR/rename-cache spans out like $GIT_DIR/objects. Each file
corresponds to one commit. Its content consists of lines like this
<Destination SHA-1> <SPC> <Source SHA-1> <SPC> <Score in decimal> <NL>
This can be used to:
- Make --find-copies-harder pratically usable for moderate-size
repositories. The first "git show" on a linux kernel commit was 5.3
sec, it then went down to 0.13 sec.
- Give git-svn a chance to (locally) import explicit renames from
Subversion
- People may correct rename results for better diff, if automatic
rename detection is not good enough.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
> > > Has anybody thought about interaction between that caching and pathspec
> > > limited operation?
> > >
> >
> > I didn't. But I think all out-of-pathspec diff pairs are removed
> > before it reaches diffcore_rename() so the cache has nothing to do
> > with it (except it still loads full cache for a commit).
>
>
> Well, it could be that an out-of-pathspec pair would have a better
> score than an in-pathspec one. Maybe cache recording should be turned
> off when doing pathspec limitation ?
Changes from v1:
- rebased to next to avoid conflict
- no longer generate cache if pathspec is used
diff.h | 2 +
diffcore-rename.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++-
log-tree.c | 2 +
t/t4031-rename-cache.sh | 56 ++++++++++++++++++
4 files changed, 200 insertions(+), 2 deletions(-)
create mode 100755 t/t4031-rename-cache.sh
diff --git a/diff.h b/diff.h
index 42582ed..64a1edd 100644
--- a/diff.h
+++ b/diff.h
@@ -111,6 +111,8 @@ struct diff_options {
add_remove_fn_t add_remove;
diff_format_fn_t format_callback;
void *format_callback_data;
+
+ struct commit *commit;
};
enum color_diff {
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 168a95b..598cc8d 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -5,6 +5,7 @@
#include "diff.h"
#include "diffcore.h"
#include "hash.h"
+#include "commit.h"
/* Table of rename/copy destinations */
@@ -409,13 +410,130 @@ static void record_if_better(struct diff_score m[], struct diff_score *o)
m[worst] = *o;
}
+struct cached_filepair {
+ unsigned char dst[20];
+ unsigned char src[20];
+ int score;
+};
+
+static int free_cached_filepair(void *p)
+{
+ free(p);
+ return 0;
+}
+
+static void load_rename_cache(struct diff_queue_struct *q,
+ struct diff_queue_struct *cacheq,
+ struct diff_options *options)
+{
+ char *sha1_hex;
+ FILE *fp;
+ struct hash_table filepair_table;
+ struct hash_table src_table;
+ struct cached_filepair *pp;
+ int i, hash;
+ static int no_cache_available = -1;
+ struct stat st;
+ char *path;
+
+ if (no_cache_available == -1)
+ no_cache_available = stat(git_path("rename-cache"), &st) || !S_ISDIR(st.st_mode);
+
+ /* return soon so we don't need to waste CPU */
+ if (no_cache_available > 0)
+ return;
+
+
+ /* src_table initialization */
+ init_hash(&src_table);
+ for (i = 0; i < q->nr; i++) {
+ struct diff_filepair *p = q->queue[i];
+ if (DIFF_FILE_VALID(p->one)) {
+ unsigned int hash = hash_filespec(p->one);
+ insert_hash(hash, p, &src_table);
+ }
+ }
+
+ /* filepair_table initialization */
+ init_hash(&filepair_table);
+ sha1_hex = sha1_to_hex(options->commit->object.sha1);
+ path = git_path("rename-cache/%c%c/%s",sha1_hex[0], sha1_hex[1], sha1_hex+2);
+ if (stat(path, &st))
+ fp = NULL;
+ else
+ fp = fopen(path, "r");
+ if (fp) {
+ char src_sha1_hex[41], dst_sha1_hex[41];
+ struct cached_filepair p;
+
+ src_sha1_hex[40] = dst_sha1_hex[40] = '\0';
+ while (fscanf(fp, "%40c %40c %d\n", dst_sha1_hex, src_sha1_hex, &p.score) == 3) {
+ if (get_sha1_hex(src_sha1_hex, p.src) ||
+ get_sha1_hex(dst_sha1_hex, p.dst))
+ break;
+
+ pp = xmalloc(sizeof(*pp));
+ memcpy(pp, &p, sizeof(*pp));
+ memcpy(&hash, p.dst, sizeof(hash));
+ insert_hash(hash, pp, &filepair_table);
+ }
+ fclose(fp);
+ }
+
+ for (i = 0; i < q->nr; i++) {
+ struct diff_filepair *p = q->queue[i];
+ struct diff_filepair *dp, *src;
+
+ /* find remote_dst */
+ if (DIFF_FILE_VALID(p->one) ||
+ !DIFF_FILE_VALID(p->two) ||
+ (options->single_follow && strcmp(options->single_follow, p->two->path)))
+ continue;
+
+ memcpy(&hash, p->two->sha1, sizeof(hash));
+ pp = lookup_hash(hash, &filepair_table);
+ if (!pp || memcmp(p->two->sha1, pp->dst, 20))
+ continue;
+
+ /* create pair */
+ if (is_null_sha1(pp->src)) {
+ if (DIFF_FILE_VALID(p->one))
+ continue;
+ diff_q(cacheq, p);
+ q->queue[i] = NULL;
+ continue;
+ }
+
+ memcpy(&hash, pp->src, sizeof(hash));
+ src = lookup_hash(hash, &src_table);
+ if (!src || memcmp(pp->src, src->one->sha1, 20))
+ continue;
+
+ src->one->rename_used++;
+ src->one->count++;
+ p->two->count++;
+
+ dp = diff_queue(NULL, src->one, p->two);
+ dp->renamed_pair = 1;
+ dp->score = pp->score;
+
+ diff_q(cacheq, dp);
+ q->queue[i] = NULL;
+ diff_free_filepair(p);
+ }
+
+ for_each_hash(&filepair_table, free_cached_filepair);
+ free_hash(&src_table);
+ free_hash(&filepair_table);
+}
+
void diffcore_rename(struct diff_options *options)
{
int detect_rename = options->detect_rename;
int minimum_score = options->rename_score;
int rename_limit = options->rename_limit;
struct diff_queue_struct *q = &diff_queued_diff;
- struct diff_queue_struct outq;
+ struct diff_queue_struct outq, cacheq;
struct diff_score *mx;
int i, j, rename_count;
int num_create, num_src, dst_cnt;
@@ -423,8 +541,19 @@ void diffcore_rename(struct diff_options *options)
if (!minimum_score)
minimum_score = DEFAULT_RENAME_SCORE;
+ cacheq.queue = NULL;
+ cacheq.nr = cacheq.alloc = 0;
+
+ if (detect_rename && options->commit)
+ load_rename_cache(q, &cacheq, options);
+
for (i = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
+
+ /* was consumed by rename cache */
+ if (!p)
+ continue;
+
if (!DIFF_FILE_VALID(p->one)) {
if (!DIFF_FILE_VALID(p->two))
continue; /* unmerged */
@@ -563,10 +692,17 @@ void diffcore_rename(struct diff_options *options)
*/
outq.queue = NULL;
outq.nr = outq.alloc = 0;
- for (i = 0; i < q->nr; i++) {
+ for (i = j = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
struct diff_filepair *pair_to_free = NULL;
+ if (!p) {
+ if (j >= cacheq.nr)
+ die("Internal error: running out of cacheq.");
+ diff_q(&outq, cacheq.queue[j++]);
+ continue;
+ }
+
if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
/*
* Creation
@@ -635,6 +771,8 @@ void diffcore_rename(struct diff_options *options)
diff_debug_queue("done copying original", &outq);
free(q->queue);
+ if (cacheq.queue)
+ free(cacheq.queue);
*q = outq;
diff_debug_queue("done collapsing", q);
diff --git a/log-tree.c b/log-tree.c
index 5444f08..040e095 100644
--- a/log-tree.c
+++ b/log-tree.c
@@ -522,6 +522,7 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
log.commit = commit;
log.parent = NULL;
opt->loginfo = &log;
+ opt->diffopt.commit = commit;
shown = log_tree_diff(opt, commit, &log);
if (!shown && opt->loginfo && opt->always_show_header) {
@@ -531,5 +532,6 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
}
opt->loginfo = NULL;
maybe_flush_or_die(stdout, "stdout");
+ opt->diffopt.commit = NULL;
return shown;
}
diff --git a/t/t4031-rename-cache.sh b/t/t4031-rename-cache.sh
new file mode 100755
index 0000000..f7c53fd
--- /dev/null
+++ b/t/t4031-rename-cache.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+#
+# Copyright (c) 2008 Nguyen Thai Ngoc Duy
+#
+
+test_description='Test diff rename cache'
+. ./test-lib.sh
+
+cat >expected <<EOF
+ create mode 100644 sub/c
+ copy a => sub/d (100%)
+EOF
+test_expect_success 'setup' '
+ echo a > a
+ echo b > b
+ mkdir sub
+ echo c > sub/c
+ cp a sub/d
+ A_SHA1=$(git hash-object a)
+ B_SHA1=$(git hash-object b)
+ C_SHA1=$(git hash-object sub/c)
+ D_SHA1=$(git hash-object sub/d)
+ git add a b
+ test_tick
+ git commit -m first
+ git add sub
+ git commit -m second
+ git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+ test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => sub/c (100%)
+ copy a => sub/d (100%)
+EOF
+test_expect_success 'load rename pair cache' '
+ P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,")
+ mkdir -p $(dirname $P)
+ echo $C_SHA1 $A_SHA1 60000 >> $P
+ git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+ test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => sub/c (100%)
+ create mode 100644 sub/d
+EOF
+test_expect_success 'load create pair cache' '
+ P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,")
+ mkdir -p $(dirname $P)
+ echo $D_SHA1 0000000000000000000000000000000000000000 0 >> $P
+ git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+ test_cmp expected result
+'
+
+test_done
--
1.6.0.3.892.g83538
next reply other threads:[~2008-11-08 11:30 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-11-08 11:27 Nguyễn Thái Ngọc Duy [this message]
2008-11-08 11:27 ` [PATCH v2 2/2] diffcore-rename: add config option to allow to cache renames Nguyễn Thái Ngọc Duy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1226143653-3758-1-git-send-email-pclouds@gmail.com \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=ydirson@altern.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox