git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Johannes Schindelin <Johannes.Schindelin@gmx.de>
To: Jeff King <peff@peff.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>,
	"Shawn O. Pearce" <spearce@spearce.org>,
	git@vger.kernel.org, govindsalinas <govindsalinas@yahoo.com>,
	gitster@pobox.com
Subject: Re: [PATCH] diffcore-rename: favour identical basenames
Date: Fri, 22 Jun 2007 02:14:43 +0100 (BST)	[thread overview]
Message-ID: <Pine.LNX.4.64.0706220214250.4059@racer.site> (raw)
In-Reply-To: <20070621131915.GD4487@coredump.intra.peff.net>

Hi,

On Thu, 21 Jun 2007, Jeff King wrote:

> I think something like a Levenshtein distance between the full pathnames 
> would give good results, and would cover almost every situation that the 
> basename heuristic would (there are a few exceptions, like getting 
> "file.c" from either "file2.c" or "foo/file.c", but that seems kind of 
> pathological).

Well, now you only have to test if it makes sense:

-- snipsnap --
[PATCH] diffcore-rename: replace basename_same() heuristics by Levenshtein

Instead of insisting on identical basenames, try the levenshtein
distance.

Basically, if there are multiple rename source candidates, take the
one with the smallest Levenshtein distance.

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
---

	The dangerous thing is that the score can get negative now.

 Makefile          |    4 ++--
 diffcore-rename.c |   42 +++++++++++++++---------------------------
 levenshtein.c     |   39 +++++++++++++++++++++++++++++++++++++++
 levenshtein.h     |    6 ++++++
 4 files changed, 62 insertions(+), 29 deletions(-)
 create mode 100644 levenshtein.c
 create mode 100644 levenshtein.h

diff --git a/Makefile b/Makefile
index 74b69fb..e015833 100644
--- a/Makefile
+++ b/Makefile
@@ -303,12 +303,12 @@ LIB_H = \
 	run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h \
 	tree-walk.h log-tree.h dir.h path-list.h unpack-trees.h builtin.h \
 	utf8.h reflog-walk.h patch-ids.h attr.h decorate.h progress.h \
-	mailmap.h remote.h
+	mailmap.h remote.h levenshtein.h
 
 DIFF_OBJS = \
 	diff.o diff-lib.o diffcore-break.o diffcore-order.o \
 	diffcore-pickaxe.o diffcore-rename.o tree-diff.o combine-diff.o \
-	diffcore-delta.o log-tree.o
+	diffcore-delta.o log-tree.o levenshtein.o
 
 LIB_OBJS = \
 	blob.o commit.o connect.o csum-file.o cache-tree.o base85.o \
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 79c984c..41448c9 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -4,6 +4,7 @@
 #include "cache.h"
 #include "diff.h"
 #include "diffcore.h"
+#include "levenshtein.h"
 
 /* Table of rename/copy destinations */
 
@@ -119,21 +120,6 @@ static int is_exact_match(struct diff_filespec *src,
 	return 0;
 }
 
-static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
-{
-	int src_len = strlen(src->path), dst_len = strlen(dst->path);
-	while (src_len && dst_len) {
-		char c1 = src->path[--src_len];
-		char c2 = dst->path[--dst_len];
-		if (c1 != c2)
-			return 0;
-		if (c1 == '/')
-			return 1;
-	}
-	return (!src_len || src->path[src_len - 1] == '/') &&
-		(!dst_len || dst->path[dst_len - 1] == '/');
-}
-
 struct diff_score {
 	int src; /* index in rename_src */
 	int dst; /* index in rename_dst */
@@ -201,11 +187,9 @@ static int estimate_similarity(struct diff_filespec *src,
 	 */
 	if (!dst->size)
 		score = 0; /* should not happen */
-	else {
-		score = (int)(src_copied * MAX_SCORE / max_size);
-		if (basename_same(src, dst))
-			score++;
-	}
+	else
+		score = (int)(src_copied * MAX_SCORE / max_size)
+			- levenshtein(src->path, dst->path);
 	return score;
 }
 
@@ -313,20 +297,24 @@ void diffcore_rename(struct diff_options *options)
 			if (rename_dst[i].pair)
 				continue; /* dealt with an earlier round */
 			for (j = 0; j < rename_src_nr; j++) {
-				int k;
+				int k, distance;
 				struct diff_filespec *one = rename_src[j].one;
 				if (!is_exact_match(one, two, contents_too))
 					continue;
 
+				distance = levenshtein(one->path, two->path);
 				/* see if there is a basename match, too */
 				for (k = j; k < rename_src_nr; k++) {
+					int d2;
 					one = rename_src[k].one;
-					if (basename_same(one, two) &&
-						is_exact_match(one, two,
-							contents_too)) {
-						j = k;
-						break;
-					}
+					if (!is_exact_match(one, two,
+								contents_too))
+						continue;
+					d2 = levenshtein(one->path, two->path);
+					if (d2 > distance)
+						continue;
+					distance = d2;
+					j = k;
 				}
 
 				record_rename_pair(i, j, (int)MAX_SCORE);
diff --git a/levenshtein.c b/levenshtein.c
new file mode 100644
index 0000000..80ef860
--- /dev/null
+++ b/levenshtein.c
@@ -0,0 +1,39 @@
+#include "cache.h"
+#include "levenshtein.h"
+
+int levenshtein(const char *string1, const char *string2)
+{
+	int len1 = strlen(string1), len2 = strlen(string2);
+	int *row1 = xmalloc(sizeof(int) * (len2 + 1));
+	int *row2 = xmalloc(sizeof(int) * (len2 + 1));
+	int i, j;
+
+	for (j = 1; j <= len2; j++)
+		row1[j] = j;
+	for (i = 0; i < len1; i++) {
+		int *dummy;
+
+		row2[0] = i + 1;
+		for (j = 0; j < len2; j++) {
+			/* substitution */
+			row2[j + 1] = row1[j] + (string1[i] != string2[j]);
+			/* insertion */
+			if (row2[j + 1] > row1[j + 1] + 1)
+				row2[j + 1] = row1[j + 1] + 1;
+			/* deletion */
+			if (row2[j + 1] > row2[j] + 1)
+				row2[j + 1] = row2[j] + 1;
+		}
+
+		dummy = row1;
+		row1 = row2;
+		row2 = dummy;
+	}
+
+	i = row1[len2];
+	free(row1);
+	free(row2);
+
+	return i;
+}
+
diff --git a/levenshtein.h b/levenshtein.h
new file mode 100644
index 0000000..74a6626
--- /dev/null
+++ b/levenshtein.h
@@ -0,0 +1,6 @@
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+int levenshtein(const char *string1, const char *string2);
+
+#endif
-- 
1.5.2.2.2822.g027a6-dirty

  parent reply	other threads:[~2007-06-22  1:14 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-06-21  3:06 Basename matching during rename/copy detection Shawn O. Pearce
2007-06-21  3:13 ` Junio C Hamano
2007-06-21  8:00   ` Andy Parkins
2007-06-21  8:07     ` Junio C Hamano
2007-06-21  9:50       ` Andy Parkins
2007-06-21 11:52         ` Johannes Schindelin
2007-06-21 12:44           ` Andy Parkins
2007-06-21 12:53             ` Matthieu Moy
2007-06-21 13:10               ` Jeff King
2007-06-21 13:18               ` Johannes Schindelin
2007-06-21 13:25                 ` Matthieu Moy
2007-06-21 13:52                   ` Johannes Schindelin
2007-06-21 15:37                     ` Steven Grimm
2007-06-21 15:53                       ` Johannes Schindelin
2007-06-21 16:57                         ` Steven Grimm
2007-06-21 13:22             ` Johannes Schindelin
2007-06-21  3:42 ` Linus Torvalds
2007-06-21 11:52   ` [PATCH] diffcore-rename: favour identical basenames Johannes Schindelin
2007-06-21 13:19     ` Jeff King
2007-06-21 14:03       ` Johannes Schindelin
2007-06-21 16:20       ` Linus Torvalds
2007-06-21 17:52         ` Junio C Hamano
2007-06-21 18:24           ` Linus Torvalds
2007-06-22 15:19         ` Andy Parkins
2007-06-22 15:28           ` Johannes Schindelin
2007-06-22 17:51             ` Aidan Van Dyk
2007-06-22  1:14       ` Johannes Schindelin [this message]
2007-06-22  5:41         ` Jeff King
2007-06-22 10:22           ` Johannes Schindelin
2007-06-22  7:17         ` Johannes Sixt
2007-06-22 10:39           ` Johannes Schindelin
2007-06-22 10:52             ` 100% (was: [PATCH] diffcore-rename: favour identical basenames) David Kastrup
2007-06-22 12:49               ` Johannes Schindelin
     [not found]                 ` <86abusi1fw.fsf@lola.quinscape.zz>
2007-06-23  1:31                   ` 100% Johannes Schindelin
2007-06-23 10:18                     ` 100% René Scharfe
2007-06-23 10:56                       ` 100% Johannes Schindelin
2007-06-23 11:41                         ` 100% René Scharfe
2007-06-23 12:00                           ` 100% Johannes Schindelin
2007-06-23 12:11                             ` 100% René Scharfe
2007-06-23 12:21                               ` 100% Johannes Schindelin
2007-06-24 22:23                                 ` 100% René Scharfe
2007-06-23 19:33                         ` 100% Junio C Hamano
2007-06-23 20:41                           ` 100% Johannes Schindelin
2007-06-23  5:44     ` [PATCH] diffcore-rename: favour identical basenames Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Pine.LNX.4.64.0706220214250.4059@racer.site \
    --to=johannes.schindelin@gmx.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=govindsalinas@yahoo.com \
    --cc=peff@peff.net \
    --cc=spearce@spearce.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).