git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Karsten Blees <karsten.blees@gmail.com>
To: Git List <git@vger.kernel.org>, Junio C Hamano <gitster@pobox.com>
Cc: Thomas Rast <tr@thomasrast.ch>,
	Jens Lehmann <Jens.Lehmann@web.de>,
	Karsten Blees <karsten.blees@gmail.com>
Subject: [PATCH v4 05/14] diffcore-rename.c: simplify finding exact renames
Date: Thu, 07 Nov 2013 15:36:45 +0100	[thread overview]
Message-ID: <527BA57D.9070400@gmail.com> (raw)
In-Reply-To: <527BA483.6040803@gmail.com>

The find_exact_renames function currently only uses the hash table for
grouping, i.e.:

1. add sources
2. add destinations
3. iterate all buckets, per bucket:
4. split sources from destinations
5. iterate destinations, per destination:
6. iterate sources to find best match

This can be simplified by utilizing the lookup functionality of the hash
table, i.e.:

1. add sources
2. iterate destinations, per destination:
3. lookup sources matching the current destination
4. iterate sources to find best match

This saves several iterations and file_similarity allocations for the
destinations.

Signed-off-by: Karsten Blees <blees@dcon.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 diffcore-rename.c | 75 +++++++++++++++----------------------------------------
 1 file changed, 20 insertions(+), 55 deletions(-)

diff --git a/diffcore-rename.c b/diffcore-rename.c
index 008a60c..cfeb408 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -243,7 +243,7 @@ static int score_compare(const void *a_, const void *b_)
 }
 
 struct file_similarity {
-	int src_dst, index;
+	int index;
 	struct diff_filespec *filespec;
 	struct file_similarity *next;
 };
@@ -260,25 +260,21 @@ static unsigned int hash_filespec(struct diff_filespec *filespec)
 	return hash;
 }
 
-static int find_identical_files(struct file_similarity *src,
-				struct file_similarity *dst,
+static int find_identical_files(struct hash_table *srcs,
+				int dst_index,
 				struct diff_options *options)
 {
 	int renames = 0;
 
-	/*
-	 * Walk over all the destinations ...
-	 */
-	do {
-	struct diff_filespec *target = dst->filespec;
+	struct diff_filespec *target = rename_dst[dst_index].two;
 	struct file_similarity *p, *best;
 	int i = 100, best_score = -1;
 
 	/*
-	 * .. to find the best source match
+	 * Find the best source match for specified destination.
 	 */
 	best = NULL;
-	for (p = src; p; p = p->next) {
+	for (p = lookup_hash(hash_filespec(target), srcs); p; p = p->next) {
 		int score;
 		struct diff_filespec *source = p->filespec;
 
@@ -307,61 +303,28 @@ static int find_identical_files(struct file_similarity *src,
 			break;
 	}
 	if (best) {
-		record_rename_pair(dst->index, best->index, MAX_SCORE);
+		record_rename_pair(dst_index, best->index, MAX_SCORE);
 		renames++;
 	}
-	} while ((dst = dst->next) != NULL);
 	return renames;
 }
 
-static void free_similarity_list(struct file_similarity *p)
+static int free_similarity_list(void *p, void *unused)
 {
 	while (p) {
 		struct file_similarity *entry = p;
-		p = p->next;
+		p = entry->next;
 		free(entry);
 	}
+	return 0;
 }
 
-static int find_same_files(void *ptr, void *data)
-{
-	int ret;
-	struct file_similarity *p = ptr;
-	struct file_similarity *src = NULL, *dst = NULL;
-	struct diff_options *options = data;
-
-	/* Split the hash list up into sources and destinations */
-	do {
-		struct file_similarity *entry = p;
-		p = p->next;
-		if (entry->src_dst < 0) {
-			entry->next = src;
-			src = entry;
-		} else {
-			entry->next = dst;
-			dst = entry;
-		}
-	} while (p);
-
-	/*
-	 * If we have both sources *and* destinations, see if
-	 * we can match them up
-	 */
-	ret = (src && dst) ? find_identical_files(src, dst, options) : 0;
-
-	/* Free the hashes and return the number of renames found */
-	free_similarity_list(src);
-	free_similarity_list(dst);
-	return ret;
-}
-
-static void insert_file_table(struct hash_table *table, int src_dst, int index, struct diff_filespec *filespec)
+static void insert_file_table(struct hash_table *table, int index, struct diff_filespec *filespec)
 {
 	void **pos;
 	unsigned int hash;
 	struct file_similarity *entry = xmalloc(sizeof(*entry));
 
-	entry->src_dst = src_dst;
 	entry->index = index;
 	entry->filespec = filespec;
 	entry->next = NULL;
@@ -385,24 +348,26 @@ static void insert_file_table(struct hash_table *table, int src_dst, int index,
  */
 static int find_exact_renames(struct diff_options *options)
 {
-	int i;
+	int i, renames = 0;
 	struct hash_table file_table;
 
+	/* Add all sources to the hash table */
 	init_hash(&file_table);
-	preallocate_hash(&file_table, rename_src_nr + rename_dst_nr);
+	preallocate_hash(&file_table, rename_src_nr);
 	for (i = 0; i < rename_src_nr; i++)
-		insert_file_table(&file_table, -1, i, rename_src[i].p->one);
+		insert_file_table(&file_table, i, rename_src[i].p->one);
 
+	/* Walk the destinations and find best source match */
 	for (i = 0; i < rename_dst_nr; i++)
-		insert_file_table(&file_table, 1, i, rename_dst[i].two);
+		renames += find_identical_files(&file_table, i, options);
 
-	/* Find the renames */
-	i = for_each_hash(&file_table, find_same_files, options);
+	/* Free source file_similarity chains */
+	for_each_hash(&file_table, free_similarity_list, options);
 
 	/* .. and free the hash data structure */
 	free_hash(&file_table);
 
-	return i;
+	return renames;
 }
 
 #define NUM_CANDIDATE_PER_DST 4
-- 
1.8.4.msysgit.0.12.g88f5ed0

  parent reply	other threads:[~2013-11-07 14:36 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-11-07 14:32 [PATCH v4 00/14] New hash table implementation Karsten Blees
2013-11-07 14:33 ` [PATCH v4 01/14] submodule: don't access the .gitmodules cache entry after removing it Karsten Blees
2013-11-07 22:27   ` Heiko Voigt
2013-11-07 14:34 ` [PATCH v4 02/14] add a hashtable implementation that supports O(1) removal Karsten Blees
2013-11-07 21:40   ` Junio C Hamano
2013-11-08 10:27     ` Karsten Blees
2013-11-08 16:45       ` Philip Oakley
2013-11-08 17:08       ` Junio C Hamano
2013-11-13 16:37         ` Karsten Blees
2013-11-07 14:35 ` [PATCH v4 03/14] buitin/describe.c: use new hash map implementation Karsten Blees
2013-11-07 14:36 ` [PATCH v4 04/14] diffcore-rename.c: move code around to prepare for the next patch Karsten Blees
2013-11-07 14:36 ` Karsten Blees [this message]
2013-11-07 14:37 ` [PATCH v4 06/14] diffcore-rename.c: use new hash map implementation Karsten Blees
2013-11-07 14:38 ` [PATCH v4 07/14] name-hash.c: use new hash map implementation for directories Karsten Blees
2013-11-07 14:38 ` [PATCH v4 08/14] name-hash.c: remove unreferenced directory entries Karsten Blees
2013-11-07 14:39 ` [PATCH v4 09/14] name-hash.c: use new hash map implementation for cache entries Karsten Blees
2013-11-07 14:39 ` [PATCH v4 10/14] name-hash.c: remove cache entries instead of marking them CE_UNHASHED Karsten Blees
2013-11-07 14:40 ` [PATCH v4 11/14] remove old hash.[ch] implementation Karsten Blees
2013-11-07 14:43 ` [PATCH v4 12/14] fix 'git update-index --verbose --again' output Karsten Blees
2013-11-07 22:12   ` Junio C Hamano
2013-11-08 10:27     ` Karsten Blees
2013-11-07 14:44 ` [PATCH v4 13/14] builtin/update-index.c: cleanup update_one Karsten Blees
2013-11-07 21:40   ` Junio C Hamano
2013-11-08 10:27     ` Karsten Blees
2013-11-07 14:45 ` [PATCH v4 14/14] read-cache.c: fix memory leaks caused by removed cache entries Karsten Blees
2013-11-07 21:40   ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=527BA57D.9070400@gmail.com \
    --to=karsten.blees@gmail.com \
    --cc=Jens.Lehmann@web.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=tr@thomasrast.ch \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).