All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH 1/2] diffcore-rename: support rename cache
Date: Fri,  7 Nov 2008 21:35:32 +0700	[thread overview]
Message-ID: <1226068533-10152-1-git-send-email-pclouds@gmail.com> (raw)

This patch teaches diffcore_rename() to look into
$GIT_DIR/rename-cache and use it to recreate diff_filepair entries.
With a complete cache, no entries should be left for similarity
estimation after exact matching.

Rename caching is per commit. I don't think arbitrary tree-to-tree
caching is worth it.

$GIT_DIR/rename-cache fans out like $GIT_DIR/objects. Each file
corresponds to one commit. Its content consists of lines like this:

<Destination SHA-1> <SPC> <Source SHA-1> <SPC> <Score in decimal> <NL>

This can be used to:

 - Make --find-copies-harder practically usable for moderate-size
   repositories. The first "git show" on a Linux kernel commit took
   5.3 sec; it then went down to 0.13 sec.
 - Give git-svn a chance to (locally) import explicit renames from
   Subversion
 - Let people correct rename results for a better diff, if automatic
   rename detection is not good enough.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 diff.h                  |    2 +
 diffcore-rename.c       |  142 ++++++++++++++++++++++++++++++++++++++++++++++-
 log-tree.c              |    2 +
 t/t4030-rename-cache.sh |   55 ++++++++++++++++++
 4 files changed, 199 insertions(+), 2 deletions(-)
 create mode 100755 t/t4030-rename-cache.sh

diff --git a/diff.h b/diff.h
index a49d865..8b68f6f 100644
--- a/diff.h
+++ b/diff.h
@@ -110,6 +110,8 @@ struct diff_options {
 	add_remove_fn_t add_remove;
 	diff_format_fn_t format_callback;
 	void *format_callback_data;
+
+	struct commit *commit;
 };
 
 enum color_diff {
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 168a95b..598cc8d 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -5,6 +5,7 @@
 #include "diff.h"
 #include "diffcore.h"
 #include "hash.h"
+#include "commit.h"
 
 /* Table of rename/copy destinations */
 
@@ -409,13 +410,130 @@ static void record_if_better(struct diff_score m[], struct diff_score *o)
 		m[worst] = *o;
 }
 
+struct cached_filepair {	/* one parsed line of a per-commit rename-cache file */
+	unsigned char dst[20];	/* binary SHA-1 of the destination, first field on the line */
+	unsigned char src[20];	/* binary SHA-1 of the source; the null SHA-1 marks a creation */
+	int score;		/* score exactly as read from the line (decimal) */
+};
+
+static int free_cached_filepair(void *p)	/* callback handed to for_each_hash() */
+{
+	free(p);	/* p is a cached_filepair that load_rename_cache() xmalloc'ed */
+	return 0;
+}
+
+/*
+ * Pair up creations in q from $GIT_DIR/rename-cache for options->commit.
+ * A consumed q->queue[i] becomes NULL; its replacement is appended to cacheq.
+ * Does nothing when the rename-cache directory does not exist.
+ */
+static void load_rename_cache(struct diff_queue_struct *q,
+			      struct diff_queue_struct *cacheq,
+			      struct diff_options *options)
+{
+	char *sha1_hex;
+	FILE *fp;
+	struct hash_table filepair_table;
+	struct hash_table src_table;
+	struct cached_filepair *pp;
+	int i;
+	unsigned int hash;
+	static int no_cache_available = -1;
+	struct stat st;
+	char *path;
+
+	if (no_cache_available == -1)
+		no_cache_available = stat(git_path("rename-cache"), &st) || !S_ISDIR(st.st_mode);
+
+	/* return early so we don't waste CPU when there is no cache directory */
+	if (no_cache_available > 0)
+		return;
+
+	/* src_table: every pair with a valid source side, keyed by its hash */
+	init_hash(&src_table);
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		if (DIFF_FILE_VALID(p->one))
+			insert_hash(hash_filespec(p->one), p, &src_table);
+	}
+
+	/* filepair_table: cached entries keyed by the leading bytes of dst */
+	init_hash(&filepair_table);
+	sha1_hex = sha1_to_hex(options->commit->object.sha1);
+	path = git_path("rename-cache/%c%c/%s",sha1_hex[0], sha1_hex[1], sha1_hex+2);
+	fp = fopen(path, "r");	/* NULL just means nothing cached for this commit */
+	if (fp) {
+		char src_sha1_hex[41], dst_sha1_hex[41];
+		struct cached_filepair p;
+
+		src_sha1_hex[40] = dst_sha1_hex[40] = '\0';
+		while (fscanf(fp, "%40c %40c %d\n", dst_sha1_hex, src_sha1_hex, &p.score) == 3) {
+			if (get_sha1_hex(src_sha1_hex, p.src) ||
+			    get_sha1_hex(dst_sha1_hex, p.dst))
+				break;
+
+			pp = xmalloc(sizeof(*pp));
+			memcpy(pp, &p, sizeof(*pp));
+			memcpy(&hash, p.dst, sizeof(hash));
+			/* non-NULL: hash already present; free the copy, don't leak it */
+			if (insert_hash(hash, pp, &filepair_table))
+				free(pp);
+		}
+		fclose(fp);
+	}
+
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		struct diff_filepair *dp, *src;
+
+		/* only creations qualify; with single_follow set, only that path */
+		if (DIFF_FILE_VALID(p->one) ||
+		    !DIFF_FILE_VALID(p->two) ||
+		    (options->single_follow && strcmp(options->single_follow, p->two->path)))
+			continue;
+
+		memcpy(&hash, p->two->sha1, sizeof(hash));
+		pp = lookup_hash(hash, &filepair_table);
+		if (!pp || memcmp(p->two->sha1, pp->dst, 20))
+			continue;
+
+		/* a null source SHA-1 keeps the entry as a plain creation */
+		if (is_null_sha1(pp->src)) {
+			diff_q(cacheq, p);
+			q->queue[i] = NULL;
+			continue;
+		}
+
+		memcpy(&hash, pp->src, sizeof(hash));
+		src = lookup_hash(hash, &src_table);
+		if (!src || memcmp(pp->src, src->one->sha1, 20))
+			continue;
+
+		src->one->rename_used++;
+		src->one->count++;
+		p->two->count++;
+
+		dp = diff_queue(NULL, src->one, p->two);
+		dp->renamed_pair = 1;
+		dp->score = pp->score;
+
+		diff_q(cacheq, dp);
+		q->queue[i] = NULL;
+		diff_free_filepair(p);
+	}
+
+	for_each_hash(&filepair_table, free_cached_filepair);
+	free_hash(&src_table);
+	free_hash(&filepair_table);
+}
+
 void diffcore_rename(struct diff_options *options)
 {
 	int detect_rename = options->detect_rename;
 	int minimum_score = options->rename_score;
 	int rename_limit = options->rename_limit;
 	struct diff_queue_struct *q = &diff_queued_diff;
-	struct diff_queue_struct outq;
+	struct diff_queue_struct outq, cacheq;
 	struct diff_score *mx;
 	int i, j, rename_count;
 	int num_create, num_src, dst_cnt;
@@ -423,8 +541,19 @@ void diffcore_rename(struct diff_options *options)
 	if (!minimum_score)
 		minimum_score = DEFAULT_RENAME_SCORE;
 
+	cacheq.queue = NULL;
+	cacheq.nr = cacheq.alloc = 0;
+
+	if (detect_rename && options->commit)
+		load_rename_cache(q, &cacheq, options);
+
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
+
+		/* was consumed by rename cache */
+		if (!p)
+			continue;
+
 		if (!DIFF_FILE_VALID(p->one)) {
 			if (!DIFF_FILE_VALID(p->two))
 				continue; /* unmerged */
@@ -563,10 +692,17 @@ void diffcore_rename(struct diff_options *options)
 	 */
 	outq.queue = NULL;
 	outq.nr = outq.alloc = 0;
-	for (i = 0; i < q->nr; i++) {
+	for (i = j = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
 		struct diff_filepair *pair_to_free = NULL;
 
+		if (!p) {
+			if (j >= cacheq.nr)
+				die("Internal error: running out of cacheq.");
+			diff_q(&outq, cacheq.queue[j++]);
+			continue;
+		}
+
 		if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
 			/*
 			 * Creation
@@ -635,6 +771,8 @@ void diffcore_rename(struct diff_options *options)
 	diff_debug_queue("done copying original", &outq);
 
 	free(q->queue);
+	if (cacheq.queue)
+		free(cacheq.queue);
 	*q = outq;
 	diff_debug_queue("done collapsing", q);
 
diff --git a/log-tree.c b/log-tree.c
index cec3c06..a67ef6d 100644
--- a/log-tree.c
+++ b/log-tree.c
@@ -518,6 +518,7 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
 	log.commit = commit;
 	log.parent = NULL;
 	opt->loginfo = &log;
+	opt->diffopt.commit = commit;
 
 	shown = log_tree_diff(opt, commit, &log);
 	if (!shown && opt->loginfo && opt->always_show_header) {
@@ -527,5 +528,6 @@ int log_tree_commit(struct rev_info *opt, struct commit *commit)
 	}
 	opt->loginfo = NULL;
 	maybe_flush_or_die(stdout, "stdout");
+	opt->diffopt.commit = NULL;
 	return shown;
 }
diff --git a/t/t4030-rename-cache.sh b/t/t4030-rename-cache.sh
new file mode 100755
index 0000000..0d8390c
--- /dev/null
+++ b/t/t4030-rename-cache.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+#
+# Copyright (c) 2008 Nguyen Thai Ngoc Duy
+#
+
+test_description='Test diff rename cache'
+. ./test-lib.sh
+
+cat >expected <<EOF
+ create mode 100644 c
+ copy a => d (100%)
+EOF
+test_expect_success 'setup' '
+	for i in a b c
+	do
+		echo $i > $i
+	done &&
+	cp a d && # d duplicates a, so plain copy detection pairs them
+	A_SHA1=$(git hash-object a) &&
+	B_SHA1=$(git hash-object b) &&
+	C_SHA1=$(git hash-object c) &&
+	D_SHA1=$(git hash-object d) &&
+	git add a b &&
+	git commit -m first &&
+	git add c d &&
+	git commit -m second &&
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result &&
+	test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => c (100%)
+ copy a => d (100%)
+EOF
+test_expect_success 'load rename pair cache' '
+	P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") &&
+	mkdir -p $(dirname $P) &&
+	echo $C_SHA1 $A_SHA1 60000 >> $P && # cache line: dst src score
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result &&
+	test_cmp expected result
+'
+
+cat >expected <<EOF
+ copy a => c (100%)
+ create mode 100644 d
+EOF
+test_expect_success 'load create pair cache' '
+	P=.git/rename-cache/$(git rev-parse HEAD|sed "s,\(..\)\(.*\),\1/\2,") &&
+	mkdir -p $(dirname $P) &&
+	echo $D_SHA1 0000000000000000000000000000000000000000 0 >> $P && # null src: keep d as a creation
+	git show --pretty=oneline --summary -C -M --find-copies-harder HEAD|sed 1d > result &&
+	test_cmp expected result
+'
+
+test_done
-- 
1.6.0.3.802.g47c38

             reply	other threads:[~2008-11-07 14:37 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-11-07 14:35 Nguyễn Thái Ngọc Duy [this message]
2008-11-07 14:35 ` [PATCH 2/2] diffcore-rename: add config option to allow to cache renames Nguyễn Thái Ngọc Duy
2008-11-07 22:21 ` [PATCH 1/2] diffcore-rename: support rename cache Yann Dirson
2008-11-07 23:17   ` Junio C Hamano
2008-11-08  4:01     ` Nguyen Thai Ngoc Duy
2008-11-08  9:24       ` Yann Dirson
2008-11-08  9:29         ` Nguyen Thai Ngoc Duy
2008-11-08 11:47         ` Jeff King
2008-11-08 12:00           ` Nguyen Thai Ngoc Duy
2008-11-09  2:04             ` Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1226068533-10152-1-git-send-email-pclouds@gmail.com \
    --to=pclouds@gmail.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.