RE: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* RE: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
       [not found] <[PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines>
@ 2019-04-07 21:46 ` michael
  2019-04-07 21:52   ` David Kastrup
  2019-04-09 15:56   ` Barret Rhoden
  0 siblings, 2 replies; 9+ messages in thread
From: michael @ 2019-04-07 21:46 UTC (permalink / raw)
  To: git
  Cc: Jeff King, Stefan Beller, Jeff Smith, Junio C Hamano,
	René Scharfe, Ævar Arnfjörð Bjarmason,
	David Kastrup, Johannes Schindelin, Barret Rhoden,
	Michael Platings

From: Michael Platings <michael@platin.gs>

Hi Barret,
This is the updated fuzzy matching algorithm, sorry for the delay. It does
highlight a bug in the calculation for the number of lines ("int nr_parent_lines
 = e->num_lines - delta;") - if you apply the patch, build it, then try to
./git blame --ignore-rev <the patch commit ID> blame.c then you'll get a segfault
because nr_parent_lines is a negative number. I haven't had time to investigate further
but I have confirmed that the bug is not due to my patch.

The matching algorithm might not be obvious so it could do with more commenting.
In the mean time I hope the tests will make the intent clear. In particular I
want to avoid lines being reordered, because for the interesting use cases
usually sequences are unchanged even if they shift across different lines.

Regarding the existing implementation I've got to say I find it unhelpful
marking "unblameable" lines with a 000000 commit ID. That commit ID already has
a meaning - lines that aren't yet committed. Further, the purpose of ignoring
commits should be to avoid obscuring other useful information, not to absolutely
refuse to show that commit at all. If there's no other commit to show then it's
harmless to show the commit that would otherwise be ignored.

- How about matching *outside* the parent's diff hunk?
I'd like to know what the use case would be for that. For the use case of
looking "through" a reformatting or renaming commit I think it would be unhelpful.

- Fix up this commit + message.  I'd be up for splitting it more,
particularly if Michael wants his contributions/fingerprinting in his
own commit.
Thanks, maybe once we've got things into a robust state.

-Michael
---
 Makefile                           |   1 +
 blame.c                            |  16 +-
 fuzzy.c                            | 346 +++++++++++++++++++++++++++++
 fuzzy.h                            |  18 ++
 t/t8014-blame-ignore-revs-fuzzy.sh | 333 +++++++++++++++++++++++++++
 5 files changed, 712 insertions(+), 2 deletions(-)
 create mode 100644 fuzzy.c
 create mode 100644 fuzzy.h
 create mode 100755 t/t8014-blame-ignore-revs-fuzzy.sh

diff --git a/Makefile b/Makefile
index 3e03290d8f..4725060c54 100644
--- a/Makefile
+++ b/Makefile
@@ -893,6 +893,7 @@ LIB_OBJS += fetch-object.o
 LIB_OBJS += fetch-pack.o
 LIB_OBJS += fsck.o
 LIB_OBJS += fsmonitor.o
+LIB_OBJS += fuzzy.o
 LIB_OBJS += gettext.o
 LIB_OBJS += gpg-interface.o
 LIB_OBJS += graph.o
diff --git a/blame.c b/blame.c
index d20c13e6f8..0a7c231102 100644
--- a/blame.c
+++ b/blame.c
@@ -9,6 +9,7 @@
 #include "blame.h"
 #include "alloc.h"
 #include "commit-slab.h"
+#include "fuzzy.h"
 
 define_commit_slab(blame_suspects, struct blame_origin *);
 static struct blame_suspects blame_suspects;
@@ -928,15 +929,26 @@ static void guess_line_blames(struct blame_entry *e,
 {
 	int nr_parent_lines = e->num_lines - delta;
 
+	int *matching_lines = fuzzy_find_matching_lines(parent->file.ptr,
+							target->file.ptr,
+							parent->line_starts,
+							target->line_starts,
+							e->s_lno + offset,
+							e->s_lno,
+							nr_parent_lines,
+							e->num_lines);
+
 	for (int i = 0; i < e->num_lines; i++) {
-		if (i < nr_parent_lines) {
+		if (matching_lines[i] >= 0) {
 			line_blames[i].is_parent = 1;
-			line_blames[i].s_lno = e->s_lno + i + offset;
+			line_blames[i].s_lno = matching_lines[i];
 		} else {
 			line_blames[i].is_parent = 0;
 			line_blames[i].s_lno = e->s_lno + i;
 		}
 	}
+
+	free(matching_lines);
 }
 
 /*
diff --git a/fuzzy.c b/fuzzy.c
new file mode 100644
index 0000000000..b5ecb921b2
--- /dev/null
+++ b/fuzzy.c
@@ -0,0 +1,346 @@
+#include "fuzzy.h"
+#include <ctype.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "git-compat-util.h"
+#include "hashmap.h"
+
+struct fingerprint {
+	struct hashmap map;
+	struct hashmap_entry *entries;
+};
+
+static void get_fingerprint(struct fingerprint *result,
+			    const char *line_begin,
+			    const char *line_end) {
+	unsigned hash;
+	char c0, c1;
+	int map_entry_count = line_end - line_begin - 1;
+	struct hashmap_entry *entry = malloc(map_entry_count *
+					     sizeof(struct hashmap_entry));
+	hashmap_init(&result->map, NULL, NULL, map_entry_count);
+	result->entries = entry;
+	for (const char *p = line_begin; p + 1 < line_end; ++p, ++entry) {
+		c0 = *p;
+		c1 = *(p + 1);
+		/* Ignore whitespace pairs */
+		if (isspace(c0) && isspace(c1))
+			continue;
+		hash = tolower(c0) | (tolower(c1) << 8);
+		hashmap_entry_init(entry, hash);
+		hashmap_put(&result->map, entry);
+	}
+}
+
+static void free_fingerprint(struct fingerprint *f) {
+	hashmap_free(&f->map, 0);
+	free(f->entries);
+}
+
+static int fingerprint_similarity(struct fingerprint *a,
+				  struct fingerprint *b) {
+	int intersection = 0;
+	struct hashmap_iter iter;
+	struct hashmap_entry *entry;
+	hashmap_iter_init(&b->map, &iter);
+
+	while ((entry = hashmap_iter_next(&iter))) {
+		if (hashmap_get(&a->map, entry, NULL)) {
+			++intersection;
+		}
+	}
+	return intersection;
+}
+
+static void get_line_fingerprints(struct fingerprint *fingerprints,
+				  const char *content,
+				  const int *line_starts,
+				  long chunk_start,
+				  long chunk_length) {
+	int i;
+	const char *linestart, *lineend;
+	line_starts += chunk_start;
+	for (i = 0; i != chunk_length; ++i) {
+		linestart = content + line_starts[i];
+		lineend = content + line_starts[i + 1];
+		get_fingerprint(fingerprints + i, linestart, lineend);
+	}
+}
+
+static int get_closest_line(int start_a,
+			    int chunk_line_b,
+			    int closest_line_calc_offset1,
+			    int closest_line_calc_offset2,
+			    int closest_line_calc_numerator,
+			    int closest_line_calc_denominator) {
+	return ((chunk_line_b + closest_line_calc_offset1) * 2 + 1) *
+		closest_line_calc_numerator /
+		(closest_line_calc_denominator * 2) +
+		closest_line_calc_offset2 - start_a;
+}
+
+#define CERTAIN_NOTHING_MATCHES -2
+#define CERTAINTY_NOT_CALCULATED -1
+
+static void find_best_line_matches(const int max_search_distance,
+				   int start_a,
+				   int length_a,
+				   int chunk_line_b,
+				   const int *similarities,
+				   int *certainties,
+				   int *second_best_result,
+				   int *result,
+				   int closest_line_calc_offset1,
+				   int closest_line_calc_offset2,
+				   int closest_line_calc_numerator,
+				   int closest_line_calc_denominator) {
+
+	int i, search_start, search_end, closest_line_a, similarity,
+		best_similarity = 0, second_best_similarity = 0,
+		best_similarity_index = 0, second_best_similarity_index = 0;
+
+	if (certainties[chunk_line_b] != CERTAINTY_NOT_CALCULATED)
+		return;
+
+	closest_line_a = get_closest_line(start_a,
+					  chunk_line_b,
+					  closest_line_calc_offset1,
+					  closest_line_calc_offset2,
+					  closest_line_calc_numerator,
+					  closest_line_calc_denominator);
+
+	search_start = closest_line_a - max_search_distance;
+	if (search_start < 0)
+		search_start = 0;
+
+	search_end = closest_line_a + max_search_distance + 1;
+	if (search_end > length_a)
+		search_end = length_a;
+
+	for (i = search_start; i < search_end; ++i) {
+		similarity = similarities[(i - closest_line_a) +
+			max_search_distance +
+			chunk_line_b * (max_search_distance * 2 + 1)];
+		if (similarity > best_similarity) {
+			second_best_similarity = best_similarity;
+			second_best_similarity_index = best_similarity_index;
+			best_similarity = similarity;
+			best_similarity_index = i;
+		}
+		else if (similarity > second_best_similarity) {
+			second_best_similarity = similarity;
+			second_best_similarity_index = i;
+		}
+	}
+
+	if (best_similarity == 0) {
+		certainties[chunk_line_b] = CERTAIN_NOTHING_MATCHES;
+		result[chunk_line_b] = -1;
+	}
+	else {
+		certainties[chunk_line_b] = best_similarity * 2 -
+			second_best_similarity;
+		result[chunk_line_b] = start_a + best_similarity_index;
+		second_best_result[chunk_line_b] =
+			start_a + second_best_similarity_index;
+	}
+}
+
+/*
+ * This finds the line that we can match with the most confidence, and
+ * uses it as a partition. It then calls itself on the lines on either side of
+ * that partition. In this way we avoid lines appearing out of order, and
+ * retain a sensible line ordering.
+ */
+static void fuzzy_find_matching_lines_recurse(
+	const int max_search_distance,
+	int start_a, int start_b,
+	int length_a, int length_b,
+	const int *similarities,
+	int *certainties,
+	int *second_best_result,
+	int *result,
+	int closest_line_calc_offset1,
+	int closest_line_calc_offset2,
+	int closest_line_calc_numerator,
+	int closest_line_calc_denominator) {
+
+	int i, barrier, invalidate_extent, offset_b,
+		second_half_start_a, second_half_start_b,
+		second_half_length_a, second_half_length_b,
+		most_certain_line = -1,
+		most_certain_line_certainty = -1;
+
+	for (i = 0; i < length_b; ++i) {
+		find_best_line_matches(max_search_distance,
+				       start_a,
+				       length_a,
+				       i,
+				       similarities,
+				       certainties,
+				       second_best_result,
+				       result,
+				       closest_line_calc_offset1,
+				       closest_line_calc_offset2,
+				       closest_line_calc_numerator,
+				       closest_line_calc_denominator);
+
+		if (certainties[i] > most_certain_line_certainty) {
+			most_certain_line_certainty = certainties[i];
+			most_certain_line = i;
+		}
+	}
+
+	if (most_certain_line == -1) {
+		return;
+	}
+
+	/* Invalidate results that may be affected by the choice of pivot. */
+	barrier = result[most_certain_line];
+	invalidate_extent = most_certain_line - max_search_distance;
+	if (invalidate_extent < 0)
+		invalidate_extent = 0;
+	for (i = most_certain_line - 1; i >= invalidate_extent; --i) {
+		if (certainties[i] >= 0 &&
+		    (result[i] > barrier || second_best_result[i] > barrier)) {
+			    certainties[i] = CERTAINTY_NOT_CALCULATED;
+			    barrier = result[i];
+			    invalidate_extent = i - max_search_distance;
+			    if (invalidate_extent < 0)
+				    invalidate_extent = 0;
+		    }
+	}
+
+	barrier = result[most_certain_line];
+	invalidate_extent = most_certain_line + max_search_distance + 1;
+	if (invalidate_extent > length_b)
+		invalidate_extent = length_b;
+	for (i = most_certain_line + 1; i < invalidate_extent; ++i) {
+		if (certainties[i] >= 0 &&
+		    (result[i] < barrier || second_best_result[i] < barrier)) {
+			    certainties[i] = CERTAINTY_NOT_CALCULATED;
+			    barrier = result[i];
+			    invalidate_extent = i + max_search_distance + 1;
+			    if (invalidate_extent > length_b)
+				    invalidate_extent = length_b;
+		    }
+	}
+
+	if (most_certain_line > 0) {
+		fuzzy_find_matching_lines_recurse(
+			max_search_distance,
+			start_a, start_b,
+			result[most_certain_line] + 1 - start_a,
+			most_certain_line, similarities,
+			certainties, second_best_result, result,
+			closest_line_calc_offset1, closest_line_calc_offset2,
+			closest_line_calc_numerator,
+			closest_line_calc_denominator);
+	}
+	if (most_certain_line + 1 < length_b) {
+		second_half_start_a = result[most_certain_line];
+		offset_b = most_certain_line + 1;
+		second_half_start_b = start_b + offset_b;
+		second_half_length_a =
+			length_a + start_a - second_half_start_a;
+		second_half_length_b =
+			length_b + start_b - second_half_start_b;
+		fuzzy_find_matching_lines_recurse(
+			max_search_distance,
+			second_half_start_a, second_half_start_b,
+			second_half_length_a, second_half_length_b,
+			similarities +
+				offset_b * (max_search_distance * 2 + 1),
+			certainties + offset_b,
+			second_best_result + offset_b, result + offset_b,
+			closest_line_calc_offset1 + offset_b,
+			closest_line_calc_offset2,
+			closest_line_calc_numerator,
+			closest_line_calc_denominator);
+	}
+}
+
+int *fuzzy_find_matching_lines(const char *content_a,
+			       const char *content_b,
+			       const int *line_starts_a,
+			       const int *line_starts_b,
+			       int start_a,
+			       int start_b,
+			       int length_a,
+			       int length_b) {
+
+	int i, j, closest_line_a, line_a, *result, *second_best_result,
+		*certainties, *similarities, *similarity;
+	struct fingerprint *fingerprints_a, fingerprint_b;
+
+	int max_search_distance = 10;
+	if (max_search_distance >= length_a)
+		max_search_distance = length_a - 1;
+
+	result = malloc(sizeof(int) * length_b);
+	second_best_result = malloc(sizeof(int) * length_b);
+	certainties = malloc(sizeof(int) * length_b);
+	similarities = malloc(sizeof(int) * length_b *
+			      (max_search_distance * 2 + 1));
+
+	for (i = 0; i < length_b; ++i) {
+		result[i] = -1;
+		second_best_result[i] = -1;
+		certainties[i] = CERTAINTY_NOT_CALCULATED;
+	}
+
+	fingerprints_a = malloc(sizeof(struct fingerprint) * length_a);
+
+	get_line_fingerprints(fingerprints_a, content_a,
+			      line_starts_a,
+			      start_a, length_a);
+
+	for (i = 0; i < length_b; ++i) {
+		get_fingerprint(&fingerprint_b,
+				content_b + line_starts_b[i + start_b],
+				content_b + line_starts_b[i + start_b + 1]);
+
+		closest_line_a = get_closest_line(start_a, i, 0, start_a,
+						  length_a, length_b);
+
+		for (j = -max_search_distance; j <= max_search_distance; ++j) {
+			similarity = similarities + j + max_search_distance +
+				i * (max_search_distance * 2 + 1);
+			line_a = closest_line_a + j;
+			if (line_a < 0 || line_a >= length_a) {
+				*similarity = -1;
+			}
+			else {
+				*similarity = fingerprint_similarity(
+					&fingerprint_b,
+					fingerprints_a + line_a) *
+					(1000 - abs(j));
+			}
+		}
+
+		free_fingerprint(&fingerprint_b);
+	}
+
+	for (i = 0; i < length_a; ++i) {
+		free_fingerprint(fingerprints_a + i);
+	}
+
+	free(fingerprints_a);
+
+	fuzzy_find_matching_lines_recurse(max_search_distance,
+					  start_a, start_b,
+					  length_a, length_b,
+					  similarities,
+					  certainties,
+					  second_best_result,
+					  result,
+					  0, start_a, length_a, length_b);
+
+	free(similarities);
+	free(certainties);
+	free(second_best_result);
+
+	return result;
+}
+
diff --git a/fuzzy.h b/fuzzy.h
new file mode 100644
index 0000000000..bd6d86ae45
--- /dev/null
+++ b/fuzzy.h
@@ -0,0 +1,18 @@
+#ifndef FUZZY_H
+#define FUZZY_H
+
+/*
+ * Find line numbers in "a" that match with lines in "b"
+ * Returns an array of either line indices or -1 where no match is found.
+ * The returned array must be free()d after use.
+ */
+int *fuzzy_find_matching_lines(const char *content_a,
+			       const char *content_b,
+			       const int *line_starts_a,
+			       const int *line_starts_b,
+			       int start_a,
+			       int start_b,
+			       int length_a,
+			       int length_b);
+
+#endif
diff --git a/t/t8014-blame-ignore-revs-fuzzy.sh b/t/t8014-blame-ignore-revs-fuzzy.sh
new file mode 100755
index 0000000000..1537a2b92c
--- /dev/null
+++ b/t/t8014-blame-ignore-revs-fuzzy.sh
@@ -0,0 +1,333 @@
+#!/bin/sh
+
+test_description='git blame ignore a specific revision'
+. ./test-lib.sh
+
+pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/'
+
+file_count=11
+
+# Each test is composed of 4 variables:
+# titleN - the test name
+# aN - the initial content
+# bN - the final content
+# expectedN - the line numbers from aN that we expect git blame
+#             on bN to identify, or "Final" if bN itself should
+#             be identified as the origin of that line.
+
+title1="Expand lines"
+cat <<EOF >a1
+aaa
+bbb
+ccc
+ddd
+eee
+EOF
+cat <<EOF >b1
+aaa
+bbbx
+bbbx
+ccc
+dddx
+dddx
+eee
+EOF
+cat <<EOF >expected1
+1
+2
+2
+3
+4
+4
+5
+EOF
+
+title2="Combine 3 lines into 2"
+cat <<EOF >a2
+if ((maxgrow==0) ||
+	( single_line_field && (field->dcols < maxgrow)) ||
+	(!single_line_field && (field->drows < maxgrow)))
+EOF
+cat <<EOF >b2
+if ((maxgrow == 0) || (single_line_field && (field->dcols < maxgrow)) ||
+	(!single_line_field && (field->drows < maxgrow))) {
+EOF
+cat <<EOF >expected2
+2
+3
+EOF
+
+title3="Add curly brackets"
+cat <<EOF >a3
+	if (rows) *rows = field->rows;
+	if (cols) *cols = field->cols;
+	if (frow) *frow = field->frow;
+	if (fcol) *fcol = field->fcol;
+EOF
+cat <<EOF >b3
+	if (rows) {
+		*rows = field->rows;
+	}
+	if (cols) {
+		*cols = field->cols;
+	}
+	if (frow) {
+		*frow = field->frow;
+	}
+	if (fcol) {
+		*fcol = field->fcol;
+	}
+EOF
+cat <<EOF >expected3
+1
+1
+Final
+2
+2
+Final
+3
+3
+Final
+4
+4
+Final
+EOF
+
+
+title4="Combine many lines and change case"
+cat <<EOF >a4
+for(row=0,pBuffer=field->buf;
+	row<height;
+	row++,pBuffer+=width )
+{
+	if ((len = (int)( After_End_Of_Data( pBuffer, width ) - pBuffer )) > 0)
+	{
+		wmove( win, row, 0 );
+		waddnstr( win, pBuffer, len );
+EOF
+cat <<EOF >b4
+for (Row = 0, PBuffer = field->buf; Row < Height; Row++, PBuffer += Width) {
+	if ((Len = (int)(afterEndOfData(PBuffer, Width) - PBuffer)) > 0) {
+		wmove(win, Row, 0);
+		waddnstr(win, PBuffer, Len);
+EOF
+cat <<EOF >expected4
+1
+5
+7
+8
+EOF
+
+title5="Rename and combine lines"
+cat <<EOF >a5
+bool need_visual_update = ((form != (FORM *)0)      &&
+	(form->status & _POSTED) &&
+	(form->current==field));
+
+if (need_visual_update)
+	Synchronize_Buffer(form);
+
+if (single_line_field)
+{
+	growth = field->cols * amount;
+	if (field->maxgrow)
+		growth = Minimum(field->maxgrow - field->dcols,growth);
+	field->dcols += growth;
+	if (field->dcols == field->maxgrow)
+EOF
+cat <<EOF >b5
+bool NeedVisualUpdate = ((Form != (FORM *)0) && (Form->status & _POSTED) &&
+	(Form->current == field));
+
+if (NeedVisualUpdate) {
+	synchronizeBuffer(Form);
+}
+
+if (SingleLineField) {
+	Growth = field->cols * amount;
+	if (field->maxgrow) {
+		Growth = Minimum(field->maxgrow - field->dcols, Growth);
+	}
+	field->dcols += Growth;
+	if (field->dcols == field->maxgrow) {
+EOF
+cat <<EOF >expected5
+1
+3
+4
+5
+6
+Final
+7
+8
+10
+11
+12
+Final
+13
+14
+EOF
+
+# Both lines match identically so position must be used to tie-break.
+title6="Same line twice"
+cat <<EOF >a6
+abc
+abc
+EOF
+cat <<EOF >b6
+abcd
+abcd
+EOF
+cat <<EOF >expected6
+1
+2
+EOF
+
+title7="Enforce line order"
+cat <<EOF >a7
+abcdef
+ghijkl
+ab
+EOF
+cat <<EOF >b7
+ghijk
+abcd
+EOF
+cat <<EOF >expected7
+2
+3
+EOF
+
+title8="Expand lines and rename variables"
+cat <<EOF >a8
+int myFunction(int ArgumentOne, Thing *ArgTwo, Blah XuglyBug) {
+	Squiggle FabulousResult = squargle(ArgumentOne, *ArgTwo,
+		XuglyBug) + EwwwGlobalWithAReallyLongNameYepTooLong;
+	return FabulousResult * 42;
+}
+EOF
+cat <<EOF >b8
+int myFunction(int argument_one, Thing *arg_asdfgh,
+	Blah xugly_bug) {
+	Squiggle fabulous_result = squargle(argument_one,
+		*arg_asdfgh, xugly_bug)
+		+ g_ewww_global_with_a_really_long_name_yep_too_long;
+	return fabulous_result * 42;
+}
+EOF
+cat <<EOF >expected8
+1
+1
+2
+3
+3
+4
+5
+EOF
+
+title9="Two close matches versus one less close match"
+cat <<EOF >a9
+abcdef
+abcdef
+ghijkl
+EOF
+cat <<EOF >b9
+gh
+abcdefx
+EOF
+cat <<EOF >expected9
+Final
+2
+EOF
+
+# The first line of b matches best with the last line of a, but the overall
+# match is better if we match it with the the first line of a.
+title10="Piggy in the middle"
+cat <<EOF >a10
+abcdefg
+ijklmn
+abcdefgh
+EOF
+cat <<EOF >b10
+abcdefghx
+ijklm
+EOF
+cat <<EOF >expected10
+1
+2
+EOF
+
+title11="No trailing newline"
+printf "abc\ndef" >a11
+printf "abx\nstu" >b11
+cat <<EOF >expected11
+1
+Final
+EOF
+
+test_expect_success setup '
+	{ for ((i=1;i<=$file_count;i++))
+	do
+		# Append each line in a separate commit to make it easy to
+		# check which original line the blame output relates to.
+
+		line_count=0 &&
+		{ while IFS= read line
+		do
+			line_count=$((line_count+1)) &&
+			echo "$line" >>"$i" &&
+			git add "$i" &&
+			test_tick &&
+			GIT_AUTHOR_NAME="$line_count" git commit -m "$line_count"
+		done } <"a$i"
+	done } &&
+
+	{ for ((i=1;i<=$file_count;i++))
+	do
+		# Overwrite the files with the final content.
+		cp b$i $i &&
+		git add $i
+	done } &&
+	test_tick &&
+
+	# Commit the final content all at once so it can all be
+	# referred to with the same commit ID.
+	GIT_AUTHOR_NAME=Final git commit -m Final &&
+
+	IGNOREME=$(git rev-parse HEAD)
+'
+
+for ((i=1;i<=$file_count;i++)); do
+	title="title$i"
+	test_expect_success "${!title}" \
+	"git blame --ignore-rev $IGNOREME $i | sed -e \"$pick_author\" >actual && test_cmp expected$i actual"
+done
+
+# This invoked a null pointer dereference when the chunk callback was called
+# with a zero length parent chunk and there were no more suspects.
+test_expect_success 'Diff chunks with no suspects' '
+	test_write_lines xy1 A B C xy1 >file &&
+	git add file &&
+	test_tick &&
+	GIT_AUTHOR_NAME=1 git commit -m 1 &&
+
+	test_write_lines xy2 A B xy2 C xy2 >file &&
+	git add file &&
+	test_tick &&
+	GIT_AUTHOR_NAME=2 git commit -m 2 &&
+	REV_2=$(git rev-parse HEAD) &&
+
+	test_write_lines xy3 A >file &&
+	git add file &&
+	test_tick &&
+	GIT_AUTHOR_NAME=3 git commit -m 3 &&
+	REV_3=$(git rev-parse HEAD) &&
+
+	test_write_lines 1 1 >expected &&
+
+	git blame --ignore-rev $REV_2 --ignore-rev $REV_3 file | sed -e "$pick_author" >actual &&
+
+	test_cmp expected actual
+	'
+
+test_done
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
  2019-04-07 21:46 ` [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines michael
@ 2019-04-07 21:52   ` David Kastrup
  2019-04-08  9:48     ` Michael Platings
  2019-04-09 15:56   ` Barret Rhoden
  1 sibling, 1 reply; 9+ messages in thread
From: David Kastrup @ 2019-04-07 21:52 UTC (permalink / raw)
  To: michael
  Cc: git, Jeff King, Stefan Beller, Jeff Smith, Junio C Hamano,
	René Scharfe, Ævar Arnfjörð Bjarmason,
	Johannes Schindelin, Barret Rhoden

michael@platin.gs writes:

> From: Michael Platings <michael@platin.gs>
>
> Hi Barret,
> This is the updated fuzzy matching algorithm, sorry for the delay. It does
> highlight a bug in the calculation for the number of lines ("int nr_parent_lines
>  = e->num_lines - delta;") - if you apply the patch, build it, then try to
> ./git blame --ignore-rev <the patch commit ID> blame.c then you'll get a segfault
> because nr_parent_lines is a negative number. I haven't had time to investigate further
> but I have confirmed that the bug is not due to my patch.

If you segfault with the patch and don't segfault with the patch, there
is not much of a point in declaring this "somebody else's problem", is
there?  It has to be fixed anyway in order to make the patch get in.

Or am I fundamentally misunderstanding something here?

-- 
David Kastrup

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
  2019-04-07 21:52   ` David Kastrup
@ 2019-04-08  9:48     ` Michael Platings
  2019-04-08 16:03       ` Barret Rhoden
  0 siblings, 1 reply; 9+ messages in thread
From: Michael Platings @ 2019-04-08  9:48 UTC (permalink / raw)
  To: David Kastrup
  Cc: Git mailing list, Jeff King, Stefan Beller, Jeff Smith,
	Junio C Hamano, René Scharfe,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin,
	Barret Rhoden

Hi David,
You also get an out-of-memory error with the patch Barret posted at
the start of this thread.
I'm sorry you interpreted my message as declaring it somebody else's
problem, that definitely wasn't my intent. I merely ran out of time to
investigate further and I figure Barret is going to be interested in
this issue and would prefer that I let him know sooner rather than
later.
-Michael
(resending in plain text, sorry for the spam again)

On Sun, 7 Apr 2019 at 23:52, David Kastrup <dak@gnu.org> wrote:
>
> michael@platin.gs writes:
>
> > From: Michael Platings <michael@platin.gs>
> >
> > Hi Barret,
> > This is the updated fuzzy matching algorithm, sorry for the delay. It does
> > highlight a bug in the calculation for the number of lines ("int nr_parent_lines
> >  = e->num_lines - delta;") - if you apply the patch, build it, then try to
> > ./git blame --ignore-rev <the patch commit ID> blame.c then you'll get a segfault
> > because nr_parent_lines is a negative number. I haven't had time to investigate further
> > but I have confirmed that the bug is not due to my patch.
>
> If you segfault with the patch and don't segfault with the patch, there
> is not much of a point in declaring this "somebody else's problem", is
> there?  It has to be fixed anyway in order to make the patch get in.
>
> Or am I fundamentally misunderstanding something here?
>
> --
> David Kastrup

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
  2019-04-08  9:48     ` Michael Platings
@ 2019-04-08 16:03       ` Barret Rhoden
  2019-04-09 15:38         ` Junio C Hamano
  0 siblings, 1 reply; 9+ messages in thread
From: Barret Rhoden @ 2019-04-08 16:03 UTC (permalink / raw)
  To: Michael Platings, David Kastrup
  Cc: Git mailing list, Jeff King, Stefan Beller, Jeff Smith,
	Junio C Hamano, René Scharfe,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

On 4/8/19 5:48 AM, Michael Platings wrote:
> Hi David,
> You also get an out-of-memory error with the patch Barret posted at
> the start of this thread.

I think I see the issue, and will fix it when I repost the patch set.

Barret

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
  2019-04-08 16:03       ` Barret Rhoden
@ 2019-04-09 15:38         ` Junio C Hamano
  0 siblings, 0 replies; 9+ messages in thread
From: Junio C Hamano @ 2019-04-09 15:38 UTC (permalink / raw)
  To: Barret Rhoden
  Cc: Michael Platings, David Kastrup, Git mailing list, Jeff King,
	Stefan Beller, Jeff Smith, René Scharfe,
	Ævar Arnfjörð Bjarmason, Johannes Schindelin

Barret Rhoden <brho@google.com> writes:

> On 4/8/19 5:48 AM, Michael Platings wrote:
>> Hi David,
>> You also get an out-of-memory error with the patch Barret posted at
>> the start of this thread.
>
> I think I see the issue, and will fix it when I repost the patch set.

Thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
  2019-04-07 21:46 ` [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines michael
  2019-04-07 21:52   ` David Kastrup
@ 2019-04-09 15:56   ` Barret Rhoden
  2019-04-09 19:10     ` Barret Rhoden
  1 sibling, 1 reply; 9+ messages in thread
From: Barret Rhoden @ 2019-04-09 15:56 UTC (permalink / raw)
  To: michael, git
  Cc: Jeff King, Stefan Beller, Jeff Smith, Junio C Hamano,
	René Scharfe, Ævar Arnfjörð Bjarmason,
	David Kastrup, Johannes Schindelin

Hi -

On 4/7/19 5:46 PM, michael@platin.gs wrote:
> From: Michael Platings <michael@platin.gs>
> 
> Hi Barret,
> This is the updated fuzzy matching algorithm, sorry for the delay. It does
> highlight a bug in the calculation for the number of lines ("int nr_parent_lines
>   = e->num_lines - delta;") - if you apply the patch, build it, then try to
> ./git blame --ignore-rev <the patch commit ID> blame.c then you'll get a segfault
> because nr_parent_lines is a negative number. I haven't had time to investigate further
> but I have confirmed that the bug is not due to my patch.

Although I couldn't recreate this, I saw how it could happen.  I have an 
updated version that fixes it.  Short version: I just pass through 
parent_len instead of trying to recreate it with 'delta.'  Recreating 
can go awry because e->num_lines may be less than a chunk size.

> The matching algorithm might not be obvious so it could do with more commenting.
> In the mean time I hope the tests will make the intent clear. In particular I
> want to avoid lines being reordered, because for the interesting use cases
> usually sequences are unchanged even if they shift across different lines.

I thought you wanted to detect similarity across potentially reordered 
lines.  E.g.

commit a 10) #include <sys/a.h>
commit b 11) #include <foo/b.h>
commit c 12) #include <bar/c.h>
commit d 13) #include <dir/d.h>

Where commit X (to be ignored) alphabetized them (or something):

commit X 10) #include <bar/c.h>
commit X 11) #include <dir/d.h>
commit X 12) #include <foo/b.h>
commit X 13) #include <sys/a.h>

In this case, you want to find the line number in the parent that 
corresponds to each line in the target (X is the target, in the relevant 
blame code).  The mapping is: (target -> parent)

10 -> 12
11 -> 13
12 -> 11
13 -> 10

And the parent's line numbers are out of order.  But that's fine - at 
least with the infrastructure from my patch, since it sorts the ignored 
queue at the end of blame_chunk().  The rule in blame.c is that blame 
entries attached to a suspect need to be sorted by s_lno.  (At least 
based on a comment, and my reading of the code).

Maybe we have a different definition of reordering or are having 
different issues regarding line numbers?  TBH, I didn't follow the 
recursion and partitioning strategy too closely.

> Regarding the existing implementation I've got to say I find it unhelpful
> marking "unblameable" lines with a 000000 commit ID. That commit ID already has
> a meaning - lines that aren't yet committed. Further, the purpose of ignoring
> commits should be to avoid obscuring other useful information, not to absolutely
> refuse to show that commit at all. If there's no other commit to show then it's
> harmless to show the commit that would otherwise be ignored.

For me, if I tell git blame to not tell me about a commit, then I don't 
want to hear about it.  My typical blame session involves running blame, 
then digging up the commit it pointed to.  If it points to the commit I 
already told it to not show, then it'll just annoy me.   All 0s made 
sense to me as in "don't bother looking this up."

If you *did* want to see the ignored commit, then I'd run it with 
--ignore-revs-file="", to disable ignore.

That being said, I realize this is a preference of mine, and I can make 
a blame config option to control this.  Probably 
blame.maskIgnoredCommits or something.

> - How about matching *outside* the parent's diff hunk?
> I'd like to know what the use case would be for that. For the use case of
> looking "through" a reformatting or renaming commit I think it would be unhelpful.

I noticed that I have some commits where a hunk in a diff is broken up 
into multiple calls to blame_chunk.  So a better way to prhase it would 
be "looking outside a blame_chunk()'s chunk."

For example, I have a hunk like this (modified to show the contiguous 
lines of context, subtraction, addition):

@@ -256,99 +260,99 @@
   3 context
-83 subtract
+23 addition
   1 context
- 2 sub
+16 add
   1 context
- 5 sub
+ 6 add
   1 context
+45 add
   3 context

blame_chunk() sees that as four separate calls, broken up by the lines 
of context:

	blame_chunk tlno 262 off -4 same 285 plen 83 enl 23
	blame_chunk tlno 286 off 56 same 302 plen 2 enl 16
	blame_chunk tlno 303 off 42 same 309 plen 5 enl 6
	blame_chunk tlno 310 off 41 same 355 plen 0 enl 45

For a lot of those lines, the source of the change was in another diff 
hunk - especially that 45 line addition with nothing from the parent.

Part of the reason for that large diff is whitespace changes, so I can 
run blame with -w.  But there are other scenarios.  Here's another one: 
the header file alphabetizing case:

@@ -4,9 +4,9 @@
   3 context
- 1 #include <header.h>
   2 context (other headers)
+ 1 #include <header.h>
   3 context

That shows up as two blame chunks:
	blame_chunk tlno 6 off 0 same 6 plen 1 enl 0
	blame_chunk tlno 8 off 1 same 9 plen 0 enl 1

What's funny about it is that if the commit we're ignoring did more 
damage to the file (changed every line, for instance), it'd be easier to 
find the right line in the parent, since we'd have the entire diff hunk.

Anyway, being able to look outside the current blame_chunk would help in 
those scenarios.  Specifically, I'm talking about letting blame_chunk() 
point anywhere in the parent.  Right now, it can only look in the 
parent's part of the chunk passed to blame_chunk, which can be 
relatively small.

Barret

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
  2019-04-09 15:56   ` Barret Rhoden
@ 2019-04-09 19:10     ` Barret Rhoden
  0 siblings, 0 replies; 9+ messages in thread
From: Barret Rhoden @ 2019-04-09 19:10 UTC (permalink / raw)
  To: michael, git
  Cc: Jeff King, Stefan Beller, Jeff Smith, Junio C Hamano,
	René Scharfe, Ævar Arnfjörð Bjarmason,
	David Kastrup, Johannes Schindelin

On 4/9/19 11:56 AM, Barret Rhoden wrote:
> Anyway, being able to look outside the current blame_chunk would help in 
> those scenarios.  Specifically, I'm talking about letting blame_chunk() 
> point anywhere in the parent.  Right now, it can only look in the 
> parent's part of the chunk passed to blame_chunk, which can be 
> relatively small.

I hacked up the ability to look outside of a diff chunk.  The change to 
the heuristic-independent part of the code was very minor, both in code 
and in performance.

The change to make the fingerprinting algorithm from my RFC patch look 
at the entire parent was pretty minor too - I can also cache the 
fingerprints.  The main drawback is performance, but Michael's new 
fingerprinting code alleviates this.

Here's a quick analysis.  When run on a 1000 line C file, with large 
changes from an ignored commit, after the file has been paged in (so, 
run twice):

not ignoring at all:
-------------
real	0m0.062s
user	0m0.042s
sys	0m0.021s

scan only in the parent chunk:
----------------------------
real	0m0.097s
user	0m0.085s
sys	0m0.012s

scan parent chunk, scan entire parent on failure:
-------------------------
real	0m1.773s
user	0m1.752s
sys	0m0.021s

scan the entire parent:
-----------------------
real	0m3.049s
user	0m3.024s
sys	0m0.024s

Scanning the parent chunk first helped a lot.  Scanning the entire 
parent is O(nr_parent_lines * nr_lines_changed).  In my test file, 
that's about 1000 * 600.

It still takes a little while even when checking the parent chunk first. 
  Let's call that one the 'smaller scan.'

Caching the fingerprints (meaning, calculate once and attach to the 
blame_origin) didn't help much here.  It's actually worse, possibly due 
to fingerprinting more than I needed to.

smaller scan, without caching:
----------------
real	0m1.651s
user	0m1.626s
sys	0m0.025s

smaller scan, with caching:
----------------
real	0m1.774s
user	0m1.753s
sys	0m0.021s

Let's try Michael's new fingerprinting code (get_fingerprint() and 
fingerprint_similarity())

smaller scan, caching:
-------------------
real	0m0.240s
user	0m0.215s
sys	0m0.025s

smaller scan, no caching:
----------------------
real	0m0.295s
user	0m0.266s
sys	0m0.029s

full parent scan, caching:
--------------------------
real	0m0.377s
user	0m0.356s
sys	0m0.021s

full parent scan, no caching:
-----------------------------
real	0m0.458s
user	0m0.430s
sys	0m0.028s

And for completenesss,

scan only in the parent chunk:
------------------------------
real	0m0.072s
user	0m0.048s
sys	0m0.024s

So first off, Michael's new fingerprinting is much better.  Caching the 
fingerprints also helps.  Overall, it's fast enough now that I don't 
notice the delay.

I'll reroll the patchset with the ability to find lines in the entire 
parent, and see what you all think.

Barret

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v5 0/6] blame: add the ability to ignore commits
@ 2019-04-03 16:02 Barret Rhoden
  2019-04-03 16:02 ` [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines Barret Rhoden
  0 siblings, 1 reply; 9+ messages in thread
From: Barret Rhoden @ 2019-04-03 16:02 UTC (permalink / raw)
  To: git
  Cc: Ævar Arnfjörð Bjarmason, David Kastrup, Jeff King,
	Jeff Smith, Johannes Schindelin, Junio C Hamano,
	René Scharfe, Stefan Beller, Michael Platings

This patch set adds the ability to ignore a set of commits and their
changes when blaming.  This can be used to ignore a commit deemed 'not
interesting,' such as reformatting.

The last patch in the series is an RFC, and the others could be merged
without it.  That last patch changes the heuristic by which ignored
lines are attributed to specific lines in the parent commit.  This
increases the likelihood of blaming the 'right' commit, where 'right' is
subjective.

The last patch needs a little work still - there's a TODO section in
its commit message.  It includes some of Michael's code, so if we are
going to keep it, I'd like to sort out authorship correctly.

v4 -> v5
v4: https://public-inbox.org/git/20190226170648.211847-1-brho@google.com/
- Changed the handling of blame_entries from ignored commits so that you
  can use any algorithm you want to map lines from the diff chunk to
  different parts of the parent commit.
- fill_origin_blob() optionally can track the offsets of the start of
  every line, similar to what we do in the scoreboard for the final
  file.  This can be used by the matching algorithm.  It has no effect
  if you are not ignoring commits.
- RFC of a fuzzy/fingerprinting heuristic, based on Michael Platings RFC
  at https://public-inbox.org/git/20190324235020.49706-2-michael@platin.gs/
- Made the tests that detect unblamable entries more resilient to
  different heuristics.
- Fixed a few bugs:
	- tests were not grepping the line number from --line-porcelain
	  correctly.
	- In the old version, when I passed the "upper" part of the
	  blame entry to the target and marked unblamable, the suspect
	  was incorrectly marked as the parent.  The s_lno was also in
	  the parent's address space.

v3 -> v4
v3: https://public-inbox.org/git/20190212222722.240676-1-brho@google.com/
- Cleaned up the tests, especially removing usage of sed -i.
- Squashed the 'tests' commit into the other blame commits.  Let me know
  if you'd like further squashing.

v2 -> v3
v2: https://public-inbox.org/git/20190117202919.157326-1-brho@google.com/
- SHA-1 -> "object name", and fixed other comments
- Changed error string for oidset_parse_file()
- Adjusted existing fsck tests to handle those string changes
- Return hash of all zeros for lines we know we cannot identify
- Allow repeated options for blame.ignoreRevsFile and
  --ignore-revs-file.  An empty file name resets the list.  Config
  options are parsed before the command line options.
- Rebased to master
- Added regression tests

v1 -> v2
v1: https://public-inbox.org/git/20190107213013.231514-1-brho@google.com/
- extracted the skiplist from fsck to avoid duplicating code
- overhauled the interface and options
- split out markIgnoredFiles
- handled merges

Barret Rhoden (6):
  Move init_skiplist() outside of fsck
  blame: use a helper function in blame_chunk()
  blame: optionally track the line starts during fill_blame_origin()
  blame: add the ability to ignore commits and their changes
  blame: add a config option to mark ignored lines
  RFC blame: use a fingerprint heuristic to match ignored lines

 Documentation/blame-options.txt |  16 ++
 Documentation/config/blame.txt  |  11 +
 Documentation/git-blame.txt     |   1 +
 blame.c                         | 384 +++++++++++++++++++++++++++-----
 blame.h                         |   6 +
 builtin/blame.c                 |  51 +++++
 fsck.c                          |  37 +--
 oidset.c                        |  35 +++
 oidset.h                        |   8 +
 t/t5504-fetch-receive-strict.sh |  14 +-
 t/t8013-blame-ignore-revs.sh    | 201 +++++++++++++++++
 11 files changed, 665 insertions(+), 99 deletions(-)
 create mode 100755 t/t8013-blame-ignore-revs.sh

-- 
2.21.0.392.gf8f6787159e-goog


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
  2019-04-03 16:02 [PATCH v5 0/6] blame: add the ability to ignore commits Barret Rhoden
@ 2019-04-03 16:02 ` Barret Rhoden
  2019-04-04 16:37   ` SZEDER Gábor
  0 siblings, 1 reply; 9+ messages in thread
From: Barret Rhoden @ 2019-04-03 16:02 UTC (permalink / raw)
  To: git
  Cc: Michael Platings, Ævar Arnfjörð Bjarmason,
	David Kastrup, Jeff King, Jeff Smith, Johannes Schindelin,
	Junio C Hamano, René Scharfe, Stefan Beller

This replaces the heuristic used to identify lines from ignored commits
with one that finds likely candidate lines in the parent's version of
the file.

The old heuristic simply assigned lines in the target to the same line
number (plus offset) in the parent.  The new function uses
fingerprinting based on the sum of the bitwise-ORed bits in a string.

The fingerprint code and the idea to use them for blame
came from Michael Platings <michael@platin.gs>.

For each line in the target, guess_line_blames() finds the best line in
the parent, above a magic threshold (1 byte pair, currently).  Ties are
broken by proximity of the parent line number to the target's line.

Here's an example of the difference between algorithms.  Consider a file
with four commits:

	commit-a 11) void new_func_1(void *x, void *y);
	commit-b 12) void new_func_2(void *x, void *y);
	commit-c 13) some_line_c
	commit-d 14) some_line_d

After a commit 'X', we have:

	commit-X 11) void new_func_1(void *x,
	commit-X 12)                 void *y);
	commit-X 13) void new_func_2(void *x,
	commit-X 14)                 void *y);
	commit-c 15) some_line_c
	commit-d 16) some_line_d

When we blame-ignored with the old algorithm, we get:

	commit-a 11) void new_func_1(void *x,
	commit-b 12)                 void *y);
	00000000 13) void new_func_2(void *x,
	00000000 14)                 void *y);
	commit-c 15) some_line_c
	commit-d 16) some_line_d

Where commit-b is blamed for 12 instead of 13.  With the fingerprint
algorithm, we get:

	commit-a 11) void new_func_1(void *x,
	commit-b 12)                 void *y);
	commit-b 13) void new_func_2(void *x,
	commit-b 14)                 void *y);
	commit-c 15) some_line_c
	commit-d 16) some_line_d

Note both lines 12 and 14 are given to commit b.  Their match is above
the FINGERPRINT_THRESHOLD, and they tied.  Specifically, parent lines 11
and 12 both match these lines.  The algorithm chose parent line 12,
since that was closest to the target line numbers of 12 and 14.

If we increase the threshold, say to 10, those two lines won't match,
and will be treated as 'unblamable.'

TODO:
- Is '1' a decent threshold?  Add another user option?

- Can we decrease that FINGERPRINT_LENGTH?  Or do something about the
TODO about using a set data structure?

- How about matching *outside* the parent's diff hunk?  Right now, this
just looks in the parent's hunk for a match.  But a better match may be
elsewhere in the file.  This might happen when the diff has too small of
a -M.  If we do this, then consider caching the fingerprints for a
parent's entire file, since multiple target blame_entries may check the
entire parent's space.  Also, if we do this, we probably need to sort the
parent's blame list (again), since the spliced entries point outside of
the diff hunk's range in the parent's address space.

- If we never intend to match outside the parent's diff hunk, then we
might be able to short-circuit guess_line_blames() when the parent's
chunk length is 0.  Doing this somewhat limits the algorithms, and would
be a performance optimization, which I didn't want to do without numbers
saying we needed it.

- Fix up this commit + message.  I'd be up for splitting it more,
particularly if Michael wants his contributions/fingerprinting in his
own commit.

Suggested-by: Michael Platings <michael@platin.gs>
Signed-off-by: Barret Rhoden <brho@google.com>
---
 blame.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 87 insertions(+), 5 deletions(-)

diff --git a/blame.c b/blame.c
index c06cbd906658..50511a300059 100644
--- a/blame.c
+++ b/blame.c
@@ -915,27 +915,109 @@ static int are_lines_adjacent(struct blame_line_tracker *first,
 	       first->s_lno + 1 == second->s_lno;
 }
 
-/*
- * This cheap heuristic assigns lines in the chunk to their relative location in
- * the parent's chunk.  Any additional lines are left with the target.
+/* https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
+static int bitcount(uint32_t v)
+{
+	v = v - ((v >> 1) & 0x55555555u);
+	v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
+	return (((v + (v >> 4)) & 0xf0f0f0fu) * 0x1010101u) >> 24;
+}
+
+#define FINGERPRINT_LENGTH (8 * 256)
+#define FINGERPRINT_THRESHOLD 1
+/* This is just a bitset indicating which byte pairs are present.
+ * e.g. the string "good goo" has pairs "go", "oo", "od", "d ", " g"
+ * String similarity is calculated as a bitwise or and counting the set bits.
+ *
+ * TODO for the string lengths we typically deal with, this would probably be
+ * implemented more efficiently with a set data structure.
  */
+struct fingerprint {
+	uint32_t bits[FINGERPRINT_LENGTH];
+};
+
+static void get_fingerprint(struct fingerprint *result, const char *line_begin,
+			    const char *line_end)
+{
+	for (const char *p = line_begin; p + 1 < line_end; ++p) {
+		unsigned int c = tolower(*p) | (tolower(*(p + 1)) << 8);
+
+		result->bits[c >> 5] |= 1u << (c & 0x1f);
+	}
+}
+
+static int fingerprint_similarity(const struct fingerprint *a,
+				  const struct fingerprint *b)
+{
+	int intersection = 0;
+
+	for (int i = 0; i < FINGERPRINT_LENGTH; ++i)
+		intersection += bitcount(a->bits[i] & b->bits[i]);
+	return intersection;
+}
+
+static void get_chunk_fingerprints(struct fingerprint *fingerprints,
+				   const char *content,
+				   const int *line_starts,
+				   long chunk_start,
+				   long chunk_length)
+{
+	line_starts += chunk_start;
+	for (int i = 0; i != chunk_length; ++i) {
+		const char *linestart = content + line_starts[i];
+		const char *lineend = content + line_starts[i + 1];
+
+		get_fingerprint(fingerprints + i, linestart, lineend);
+	}
+}
+
 static void guess_line_blames(struct blame_entry *e,
 			      struct blame_origin *parent,
 			      struct blame_origin *target,
 			      int offset, int delta,
 			      struct blame_line_tracker *line_blames)
 {
+	struct fingerprint *fp_parent, *fp_target;
 	int nr_parent_lines = e->num_lines - delta;
 
+	fp_parent = xcalloc(sizeof(struct fingerprint), nr_parent_lines);
+	fp_target = xcalloc(sizeof(struct fingerprint), e->num_lines);
+
+	get_chunk_fingerprints(fp_parent, parent->file.ptr,
+			       parent->line_starts,
+			       e->s_lno + offset, nr_parent_lines);
+	get_chunk_fingerprints(fp_target, target->file.ptr,
+			       target->line_starts,
+			       e->s_lno, e->num_lines);
+
 	for (int i = 0; i < e->num_lines; i++) {
-		if (i < nr_parent_lines) {
+		int best_sim_val = FINGERPRINT_THRESHOLD;
+		int best_sim_idx = -1;
+		int sim;
+
+		for (int j = 0; j < nr_parent_lines; j++) {
+			sim = fingerprint_similarity(&fp_target[i],
+						     &fp_parent[j]);
+			if (sim < best_sim_val)
+				continue;
+			/* Break ties with the closest-to-target line number */
+			if (sim == best_sim_val && best_sim_idx != -1 &&
+			    abs(best_sim_idx - i) < abs(j - i))
+				continue;
+			best_sim_val = sim;
+			best_sim_idx = j;
+		}
+		if (best_sim_idx >= 0) {
 			line_blames[i].is_parent = 1;
-			line_blames[i].s_lno = e->s_lno + i + offset;
+			line_blames[i].s_lno = e->s_lno + offset + best_sim_idx;
 		} else {
 			line_blames[i].is_parent = 0;
 			line_blames[i].s_lno = e->s_lno + i;
 		}
 	}
+
+	free(fp_parent);
+	free(fp_target);
 }
 
 /*
-- 
2.21.0.392.gf8f6787159e-goog


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines
  2019-04-03 16:02 ` [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines Barret Rhoden
@ 2019-04-04 16:37   ` SZEDER Gábor
  0 siblings, 0 replies; 9+ messages in thread
From: SZEDER Gábor @ 2019-04-04 16:37 UTC (permalink / raw)
  To: Barret Rhoden
  Cc: git, Michael Platings, Ævar Arnfjörð Bjarmason,
	David Kastrup, Jeff King, Jeff Smith, Johannes Schindelin,
	Junio C Hamano, René Scharfe, Stefan Beller

On Wed, Apr 03, 2019 at 12:02:07PM -0400, Barret Rhoden wrote:
> diff --git a/blame.c b/blame.c
> index c06cbd906658..50511a300059 100644
> --- a/blame.c
> +++ b/blame.c
> @@ -915,27 +915,109 @@ static int are_lines_adjacent(struct blame_line_tracker *first,
>  	       first->s_lno + 1 == second->s_lno;
>  }
>  
> -/*
> - * This cheap heuristic assigns lines in the chunk to their relative location in
> - * the parent's chunk.  Any additional lines are left with the target.
> +/* https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
> +static int bitcount(uint32_t v)
> +{
> +	v = v - ((v >> 1) & 0x55555555u);
> +	v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
> +	return (((v + (v >> 4)) & 0xf0f0f0fu) * 0x1010101u) >> 24;
> +}
> +
> +#define FINGERPRINT_LENGTH (8 * 256)
> +#define FINGERPRINT_THRESHOLD 1
> +/* This is just a bitset indicating which byte pairs are present.
> + * e.g. the string "good goo" has pairs "go", "oo", "od", "d ", " g"
> + * String similarity is calculated as a bitwise or and counting the set bits.
> + *
> + * TODO for the string lengths we typically deal with, this would probably be
> + * implemented more efficiently with a set data structure.
>   */
> +struct fingerprint {
> +	uint32_t bits[FINGERPRINT_LENGTH];
> +};
> +
> +static void get_fingerprint(struct fingerprint *result, const char *line_begin,
> +			    const char *line_end)
> +{
> +	for (const char *p = line_begin; p + 1 < line_end; ++p) {

We still stick to C89, which doesn't support for loop initial
declarations yet.  Please declare the loop variable as a regular local
variable.  This also applies to the several 'for (int i = 0; ...)'
loops in the functions below.

> +		unsigned int c = tolower(*p) | (tolower(*(p + 1)) << 8);
> +
> +		result->bits[c >> 5] |= 1u << (c & 0x1f);
> +	}
> +}
> +
> +static int fingerprint_similarity(const struct fingerprint *a,
> +				  const struct fingerprint *b)
> +{
> +	int intersection = 0;
> +
> +	for (int i = 0; i < FINGERPRINT_LENGTH; ++i)
> +		intersection += bitcount(a->bits[i] & b->bits[i]);
> +	return intersection;
> +}
> +
> +static void get_chunk_fingerprints(struct fingerprint *fingerprints,
> +				   const char *content,
> +				   const int *line_starts,
> +				   long chunk_start,
> +				   long chunk_length)
> +{
> +	line_starts += chunk_start;
> +	for (int i = 0; i != chunk_length; ++i) {
> +		const char *linestart = content + line_starts[i];
> +		const char *lineend = content + line_starts[i + 1];
> +
> +		get_fingerprint(fingerprints + i, linestart, lineend);
> +	}
> +}
> +
>  static void guess_line_blames(struct blame_entry *e,
>  			      struct blame_origin *parent,
>  			      struct blame_origin *target,
>  			      int offset, int delta,
>  			      struct blame_line_tracker *line_blames)
>  {
> +	struct fingerprint *fp_parent, *fp_target;
>  	int nr_parent_lines = e->num_lines - delta;
>  
> +	fp_parent = xcalloc(sizeof(struct fingerprint), nr_parent_lines);
> +	fp_target = xcalloc(sizeof(struct fingerprint), e->num_lines);
> +
> +	get_chunk_fingerprints(fp_parent, parent->file.ptr,
> +			       parent->line_starts,
> +			       e->s_lno + offset, nr_parent_lines);
> +	get_chunk_fingerprints(fp_target, target->file.ptr,
> +			       target->line_starts,
> +			       e->s_lno, e->num_lines);
> +
>  	for (int i = 0; i < e->num_lines; i++) {
> -		if (i < nr_parent_lines) {
> +		int best_sim_val = FINGERPRINT_THRESHOLD;
> +		int best_sim_idx = -1;
> +		int sim;
> +
> +		for (int j = 0; j < nr_parent_lines; j++) {
> +			sim = fingerprint_similarity(&fp_target[i],
> +						     &fp_parent[j]);
> +			if (sim < best_sim_val)
> +				continue;
> +			/* Break ties with the closest-to-target line number */
> +			if (sim == best_sim_val && best_sim_idx != -1 &&
> +			    abs(best_sim_idx - i) < abs(j - i))
> +				continue;
> +			best_sim_val = sim;
> +			best_sim_idx = j;
> +		}
> +		if (best_sim_idx >= 0) {
>  			line_blames[i].is_parent = 1;
> -			line_blames[i].s_lno = e->s_lno + i + offset;
> +			line_blames[i].s_lno = e->s_lno + offset + best_sim_idx;
>  		} else {
>  			line_blames[i].is_parent = 0;
>  			line_blames[i].s_lno = e->s_lno + i;
>  		}
>  	}
> +
> +	free(fp_parent);
> +	free(fp_target);
>  }
>  
>  /*
> -- 
> 2.21.0.392.gf8f6787159e-goog
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-04-09 19:10 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <[PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines>
2019-04-07 21:46 ` [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines michael
2019-04-07 21:52   ` David Kastrup
2019-04-08  9:48     ` Michael Platings
2019-04-08 16:03       ` Barret Rhoden
2019-04-09 15:38         ` Junio C Hamano
2019-04-09 15:56   ` Barret Rhoden
2019-04-09 19:10     ` Barret Rhoden
2019-04-03 16:02 [PATCH v5 0/6] blame: add the ability to ignore commits Barret Rhoden
2019-04-03 16:02 ` [PATCH v5 6/6] RFC blame: use a fingerprint heuristic to match ignored lines Barret Rhoden
2019-04-04 16:37   ` SZEDER Gábor

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).