All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/2] diffcore-rename: support rename cache
@ 2008-11-08 11:27 Nguyễn Thái Ngọc Duy
  2008-11-08 11:27 ` [PATCH v2 2/2] diffcore-rename: add config option to allow to cache renames Nguyễn Thái Ngọc Duy
  0 siblings, 1 reply; 2+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2008-11-08 11:27 UTC (permalink / raw)
  To: git, Junio C Hamano, Yann Dirson; +Cc: Nguyễn Thái Ngọc Duy

This patch teaches diffcore_rename() to look into
$GIT_DIR/rename-cache and make use of it to recreate diff_filepair.
With proper cache, there should be no available entry for estimation
after exact matching.

Rename caching is per commit. I don't think abitrary tree-tree caching
is worth it.

$GIT_DIR/rename-cache spans out like $GIT_DIR/objects. Each file
corresponds to one commit. Its content consists of lines like this

<Destination SHA-1> <SPC> <Source SHA-1> <SPC> <Score in decimal> <NL>

This can be used to:

 - Make --find-copies-harder pratically usable for moderate-size
   repositories. The first "git show" on a linux kernel commit was 5.3
   sec, it then went down to 0.13 sec.
 - Give git-svn a chance to (locally) import explicit renames from
   Subversion
 - People may correct rename results for better diff, if automatic
   rename detection is not good enough.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 >  > > Has anybody thought about interaction between that caching and pathspec
 >  > >  limited operation?
 >  > >
 >  >
 >  > I didn't. But I think all out-of-pathspec diff pairs are removed
 >  > before it reaches diffcore_rename() so the cache has nothing to do
 >  > with it (except it still loads full cache for a commit).
 >
 >
 > Well, it could be that an out-of-pathspec pair would have a better
 >  score than an in-pathspec one.  Maybe cache recording should be turned
 >  off when doing pathspec limitation ?

 Changes from v1:
  - rebased to next to avoid conflict
  - no longer generate cache if pathspec is used

 diff.h                  |    2 +
 diffcore-rename.c       |  142 ++++++++++++++++++++++++++++++++++++++++++++++-
 log-tree.c              |    2 +
 t/t4031-rename-cache.sh |   56 ++++++++++++++++++
 4 files changed, 200 insertions(+), 2 deletions(-)
 create mode 100755 t/t4031-rename-cache.sh

diff --git a/diff.h b/diff.h
index 42582ed..64a1edd 100644
--- a/diff.h
+++ b/diff.h
@@ -111,6 +111,8 @@ struct diff_options {
 	add_remove_fn_t add_remove;
 	diff_format_fn_t format_callback;
 	void *format_callback_data;
+
+	struct commit *commit;
 };
 
 enum color_diff {
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 168a95b..598cc8d 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -5,6 +5,7 @@
 #include "diff.h"
 #include "diffcore.h"
 #include "hash.h"
+#include "commit.h"
 
 /* Table of rename/copy destinations */
 
@@ -409,13 +410,130 @@ static void record_if_better(struct diff_score m[], struct diff_score *o)
 		m[worst] = *o;
 }
 
+struct cached_filepair {
+	unsigned char dst[20];
+	unsigned char src[20];
+	int score;
+};
+
+static int free_cached_filepair(void *p)
+{
+	free(p);
+	return 0;
+}
+
+static void load_rename_cache(struct diff_queue_struct *q,
+			      struct diff_queue_struct *cacheq,
+			      struct diff_options *options)
+{
+	char *sha1_hex;
+	FILE *fp;
+	struct hash_table filepair_table;
+	struct hash_table src_table;
+	struct cached_filepair *pp;
+	int i, hash;
+	static int no_cache_available = -1;
+	struct stat st;
+	char *path;
+
+	if (no_cache_available == -1)
+		no_cache_available = stat(git_path("rename-cache"), &st) || !S_ISDIR(st.st_mode);
+
+	/* return soon so we don't need to waste CPU */
+	if (no_cache_available > 0)
+		return;
+
+
+	/* src_table initialization */
+	init_hash(&src_table);
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		if (DIFF_FILE_VALID(p->one)) {
+			unsigned int hash = hash_filespec(p->one);
+			insert_hash(hash, p, &src_table);
+		}
+	}
+
+	/* filepair_table initialization */
+	init_hash(&filepair_table);
+	sha1_hex = sha1_to_hex(options->commit->object.sha1);
+	path = git_path("rename-cache/%c%c/%s",sha1_hex[0], sha1_hex[1], sha1_hex+2);
+	if (stat(path, &st))
+		fp = NULL;
+	else
+		fp = fopen(path, "r");
+	if (fp) {
+		char src_sha1_hex[41], dst_sha1_hex[41];
+		struct cached_filepair p;
+
+		src_sha1_hex[40] = dst_sha1_hex[40] = '\0';
+		while (fscanf(fp, "%40c %40c %d\n", dst_sha1_hex, src_sha1_hex, &p.score) == 3) {
+			if (get_sha1_hex(src_sha1_hex, p.src) ||
+			    get_sha1_hex(dst_sha1_hex, p.dst))
+				break;
+
+			pp = xmalloc(sizeof(*pp));
+			memcpy(pp, &p, sizeof(*pp));
+			memcpy(&hash, p.dst, sizeof(hash));
+			insert_hash(hash, pp, &filepair_table);
+		}
+		fclose(fp);
+	}
+
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		struct diff_filepair *dp, *src;
+
+		/* find remote_dst */
+		if (DIFF_FILE_VALID(p->one) ||
+		    !DIFF_FILE_VALID(p->two) ||
+		    (options->single_follow && strcmp(options->single_follow, p->two->path)))
+			continue;
+
+		memcpy(&hash, p->two->sha1, sizeof(hash));
+		pp = lookup_hash(hash, &filepair_table);
+		if (!pp || memcmp(p->two->sha1, pp->dst, 20))
+			continue;
+
+		/* create pair */
+		if (is_null_sha1(pp->src)) {
+			if (DIFF_FILE_VALID(p->one))
+				continue;
+			diff_q(cacheq, p);
+			q->queue[i] = NULL;
+			continue;
+		}
+
+		memcpy(&hash, pp->src, sizeof(hash));
+		src = lookup_hash(hash, &src_table);
+		if (!src || memcmp(pp->src, src->one->sha1, 20))
+			continue;
+
+		src->one->rename_used++;
+		src->one->count++;
+		p->two->count++;
+
+		dp = diff_queue(NULL, src->one, p->two);
+		dp->renamed_pair = 1;
+		dp->score = pp->score;
+
+		diff_q(cacheq, dp);
+		q->queue[i] = NULL;
+		diff_free_filepair(p);
+	}
+
+	for_each_hash(&filepair_table, free_cached_filepair);
+	free_hash(&src_table);
+	free_hash(&filepair_table);
+}
+
 void diffcore_rename(struct diff_options *options)
 {
 	int detect_rename = options->detect_rename;
 	int minimum_score = options->rename_score;
 	int rename_limit = options->rename_limit;
 	struct diff_queue_struct *q = &diff_queued_diff;
-	struct diff_queue_struct outq;
+	struct diff_queue_struct outq, cacheq;
 	struct diff_score *mx;
 	int i, j, rename_count;
 	int num_create, num_src, dst_cnt;
@@ -423,8 +541,19 @@ void diffcore_rename(struct diff_options *options)
 	if (!minimum_score)
 		minimum_score = DEFAULT_RENAME_SCORE;
 
+	cacheq.queue = NULL;
+	cacheq.nr = cacheq.alloc = 0;
+
+	if (detect_rename && options->commit)
+		load_rename_cache(q, &cacheq, options);
+
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
+
+		/* was consumed by rename cache */
+		if (!p)
+			continue;
+
 		if (!DIFF_FILE_VALID(p->one)) {
 			if (!DIFF_FILE_VALID(p->two))
 				continue; /* unmerged */
@@ -563,10 +692,17 @@ void diffcore_rename(struct diff_options *options)
 	 */
 	outq.queue = NULL;
 	outq.nr = outq.alloc = 0;
-	for (i = 0; i < q->nr; i++) {
+	for (i = j = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
 		struct diff_filepair *pair_to_free = NULL;
 
+		if (!p) {
+			if (j >= cacheq.nr)
+				die("Internal error: running out of cacheq.");
+			diff_q(&outq, cacheq.queue[j++]);
+			continue;
+		}
+
 		if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
 			/*
 			 * Creation
@@ -635,6 +771,8 @@ void diffcore_rename(struct diff_options *options)
 	diff_debug_queue("done copying original", &outq);
 
 	free(q->queue);
+	if (cacheq.queue)
+		free(cacheq.queue);
 	*q = outq;
 	diff_debug_queue("done collapsing", q);
 
diff --git a/log-tree.c b/log-tree.c
index 5444f08..040e095 100644
--- a/log-tree.c
+++ b/log-tree.c
@@ -522,6 +522,7 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
 	log.commit = commit;
 	log.parent = NULL;
 	opt->loginfo = &log;
+	opt->diffopt.commit = commit;
 
 	shown = log_tree_diff(opt, commit, &log);
 	if (!shown && opt->loginfo && opt->always_show_header) {
@@ -531,5 +532,6 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
 	}
 	opt->loginfo = NULL;
 	maybe_flush_or_die(stdout, "stdout");
+	opt->diffopt.commit = NULL;
 	return shown;
 }
diff --git a/t/t4031-rename-cache.sh b/t/t4031-rename-cache.sh
new file mode 100755
index 0000000..f7c53fd
--- /dev/null
+++ b/t/t4031-rename-cache.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+#
+# Copyright (c) 2008 Nguyen Thai Ngoc Duy
+#
+
+test_description='Test diff rename cache'
+. ./test-lib.sh
+
+cat >expected <<EOF
+ create mode 100644 sub/c
+ copy a => sub/d (100%)
+EOF
+test_expect_success 'setup' '
+	echo a > a
+	echo b > b
+	mkdir sub
+	echo c > sub/c
+	cp a sub/d
+	A_SHA1=$(git hash-object a)
+	B_SHA1=$(git hash-object b)
+	C_SHA1=$(git hash-object sub/c)
+	D_SHA1=$(git hash-object sub/d)
+	git add a b
+	test_tick
+	git commit -m first
+	git add sub
+	git commit -m second
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+	test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => sub/c (100%)
+ copy a => sub/d (100%)
+EOF
+test_expect_success 'load rename pair cache' '
+	P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,")
+	mkdir -p $(dirname $P)
+	echo $C_SHA1 $A_SHA1 60000 >> $P
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+	test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => sub/c (100%)
+ create mode 100644 sub/d
+EOF
+test_expect_success 'load create pair cache' '
+	P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,")
+	mkdir -p $(dirname $P)
+	echo $D_SHA1 0000000000000000000000000000000000000000 0 >> $P
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result
+	test_cmp expected result
+'
+
+test_done
-- 
1.6.0.3.892.g83538

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2008-11-08 11:30 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-11-08 11:27 [PATCH v2 1/2] diffcore-rename: support rename cache Nguyễn Thái Ngọc Duy
2008-11-08 11:27 ` [PATCH v2 2/2] diffcore-rename: add config option to allow to cache renames Nguyễn Thái Ngọc Duy

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.