git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCHv3] diff.c: emit moved lines with a different color
@ 2016-09-04 23:42 Stefan Beller
  2016-09-06  1:17 ` Jacob Keller
  0 siblings, 1 reply; 7+ messages in thread
From: Stefan Beller @ 2016-09-04 23:42 UTC (permalink / raw)
  To: git; +Cc: jnareb, gitster, jacob.keller, Stefan Beller, Stefan Beller

When we color the diff, we'll mark moved lines with a different color.

This is achieved by doing a two passes over the diff. The first pass
will inspect each line of the diff and store the removed lines and the
added lines in its own hash map.
The second pass will check for each added line if that is found in the
set of removed lines. If so it will color the added line differently as
with the new `moved-new` color mode. For each removed line we check the
set of added lines and if found emit that with the new color `moved-old`.

When detecting the moved lines, we cannot just rely on a line being equal,
but we need to take the context into account to detect when the moves were
reordered as we do not want to color moved but per-mutated lines.
To do that we use the hash of the preceding line.

This patch was motivated by e.g. reviewing 3b0c4200 ("apply: move
libified code from builtin/apply.c to apply.{c,h}", 2016-08-08)

Signed-off-by: Stefan Beller <sbeller@google.com>
---

 * fixed memleaks
 * only run the new code when needed.
 * renamed variables to fit in better.


 Documentation/config.txt               |   5 +-
 contrib/completion/git-completion.bash |   2 +
 diff.c                                 | 200 +++++++++++++++++++++++++++++++--
 diff.h                                 |   5 +-
 4 files changed, 202 insertions(+), 10 deletions(-)

diff --git a/Documentation/config.txt b/Documentation/config.txt
index 0bcb679..f4f51c2 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -980,8 +980,9 @@ color.diff.<slot>::
 	of `context` (context text - `plain` is a historical synonym),
 	`meta` (metainformation), `frag`
 	(hunk header), 'func' (function in hunk header), `old` (removed lines),
-	`new` (added lines), `commit` (commit headers), or `whitespace`
-	(highlighting whitespace errors).
+	`new` (added lines), `commit` (commit headers), `whitespace`
+	(highlighting whitespace errors), `moved-old` (removed lines that
+	reappear), `moved-new` (added lines that were removed elsewhere).
 
 color.decorate.<slot>::
 	Use customized color for 'git log --decorate' output.  `<slot>` is one
diff --git a/contrib/completion/git-completion.bash b/contrib/completion/git-completion.bash
index 9c8f738..b558d12 100644
--- a/contrib/completion/git-completion.bash
+++ b/contrib/completion/git-completion.bash
@@ -2115,6 +2115,8 @@ _git_config ()
 		color.diff.old
 		color.diff.plain
 		color.diff.whitespace
+		color.diff.moved-old
+		color.diff.moved-new
 		color.grep
 		color.grep.context
 		color.grep.filename
diff --git a/diff.c b/diff.c
index 534c12e..7fde014 100644
--- a/diff.c
+++ b/diff.c
@@ -18,6 +18,7 @@
 #include "ll-merge.h"
 #include "string-list.h"
 #include "argv-array.h"
+#include "git-compat-util.h"
 
 #ifdef NO_FAST_WORKING_DIRECTORY
 #define FAST_WORKING_DIRECTORY 0
@@ -42,6 +43,16 @@ static int diff_dirstat_permille_default = 30;
 static struct diff_options default_diff_options;
 static long diff_algorithm;
 
+static struct hashmap *moved_add;
+static struct hashmap *moved_del;
+static int hash_prev_added;
+static int hash_prev_removed;
+struct moved_entry {
+	struct hashmap_entry ent;
+	char *line;
+	int hash_prev_line;
+};
+
 static char diff_colors[][COLOR_MAXLEN] = {
 	GIT_COLOR_RESET,
 	GIT_COLOR_NORMAL,	/* CONTEXT */
@@ -52,6 +63,8 @@ static char diff_colors[][COLOR_MAXLEN] = {
 	GIT_COLOR_YELLOW,	/* COMMIT */
 	GIT_COLOR_BG_RED,	/* WHITESPACE */
 	GIT_COLOR_NORMAL,	/* FUNCINFO */
+	GIT_COLOR_BLUE,		/* NEW MOVED */
+	GIT_COLOR_MAGENTA,	/* OLD MOVED */
 };
 
 static int parse_diff_color_slot(const char *var)
@@ -72,6 +85,10 @@ static int parse_diff_color_slot(const char *var)
 		return DIFF_WHITESPACE;
 	if (!strcasecmp(var, "func"))
 		return DIFF_FUNCINFO;
+	if (!strcasecmp(var, "moved-old"))
+		return DIFF_FILE_MOVED_OLD;
+	if (!strcasecmp(var, "moved-new"))
+		return DIFF_FILE_MOVED_NEW;
 	return -1;
 }
 
@@ -287,6 +304,25 @@ int git_diff_basic_config(const char *var, const char *value, void *cb)
 	return git_default_config(var, value, cb);
 }
 
+static int moved_entry_cmp(const struct moved_entry *a,
+			   const struct moved_entry *b,
+			   const void *unused)
+{
+	return strcmp(a->line, b->line) &&
+	       a->hash_prev_line == b->hash_prev_line;
+}
+
+static struct moved_entry *prepare_entry(const char *line,
+					 unsigned long len,
+					 int hash_prev_line)
+{
+	struct moved_entry *ret = xmalloc(sizeof(*ret));
+	ret->ent.hash = memhash(line, len) ^ hash_prev_line;
+	ret->line = xmemdupz(line, len);
+	ret->hash_prev_line = hash_prev_line;
+	return ret;
+}
+
 static char *quote_two(const char *one, const char *two)
 {
 	int need_one = quote_c_style(one, NULL, NULL, 1);
@@ -537,16 +573,38 @@ static void emit_add_line(const char *reset,
 			  struct emit_callback *ecbdata,
 			  const char *line, int len)
 {
+	enum color_diff color = DIFF_FILE_NEW;
+	unsigned ws_error_highlight = WSEH_NEW;
+
+	if (ecbdata->opt->color_moved) {
+		int hash = memhash(line, len);
+		struct moved_entry *keydata = prepare_entry(line, len,
+							    hash_prev_added);
+		if (hashmap_get(moved_del, keydata, keydata))
+			color = DIFF_FILE_MOVED_NEW;
+		hash_prev_added = hash;
+	}
 	emit_line_checked(reset, ecbdata, line, len,
-			  DIFF_FILE_NEW, WSEH_NEW, '+');
+			  color, ws_error_highlight, '+');
 }
 
 static void emit_del_line(const char *reset,
 			  struct emit_callback *ecbdata,
 			  const char *line, int len)
 {
+	enum color_diff color = DIFF_FILE_OLD;
+	unsigned ws_error_highlight = WSEH_OLD;
+
+	if (ecbdata->opt->color_moved) {
+		int hash = memhash(line, len);
+		struct moved_entry *keydata = prepare_entry(line, len,
+							    hash_prev_removed);
+		if (hashmap_get(moved_add, keydata, keydata))
+			color = DIFF_FILE_MOVED_OLD;
+		hash_prev_removed = hash;
+	}
 	emit_line_checked(reset, ecbdata, line, len,
-			  DIFF_FILE_OLD, WSEH_OLD, '-');
+			  color, ws_error_highlight, '-');
 }
 
 static void emit_context_line(const char *reset,
@@ -555,6 +613,11 @@ static void emit_context_line(const char *reset,
 {
 	emit_line_checked(reset, ecbdata, line, len,
 			  DIFF_CONTEXT, WSEH_CONTEXT, ' ');
+	if (ecbdata->opt->color_moved) {
+		int h = memhash(line, len);
+		hash_prev_removed = h;
+		hash_prev_added = h;
+	}
 }
 
 static void emit_hunk_header(struct emit_callback *ecbdata,
@@ -1323,6 +1386,41 @@ static void fn_out_consume(void *priv, char *line, unsigned long len)
 	}
 }
 
+static void fn_prepare_consume(void *priv, char *line, unsigned long len)
+{
+	struct moved_entry *d;
+	int hash = memhash(line + 1, len - 1);
+
+	switch (line[0]) {
+	case ' ':
+		hashmap_add(moved_del, prepare_entry(line + 1, len - 1, 0));
+		hashmap_add(moved_add, prepare_entry(line + 1, len - 1, 0));
+		hash_prev_added = hash;
+		hash_prev_removed = hash;
+		break;
+	case '+':
+		if (hash_prev_added) {
+			d = prepare_entry(line + 1, len - 1, hash_prev_added);
+			hashmap_add(moved_add, d);
+		}
+		hash_prev_added = hash;
+		hash_prev_removed = 0;
+		break;
+	case '-':
+		if (hash_prev_removed) {
+			d = prepare_entry(line + 1, len - 1, hash_prev_removed);
+			hashmap_add(moved_del, d);
+		}
+		hash_prev_added = 0;
+		hash_prev_removed = hash;
+		break;
+	default:
+		hash_prev_added = 0;
+		hash_prev_removed = 0;
+		break;
+	}
+}
+
 static char *pprint_rename(const char *a, const char *b)
 {
 	const char *old = a;
@@ -2279,6 +2377,56 @@ struct userdiff_driver *get_textconv(struct diff_filespec *one)
 	return userdiff_get_textconv(one->driver);
 }
 
+static void prepare_moved_lines(struct diff_filepair *p, struct diff_options *o)
+{
+	mmfile_t mf1, mf2;
+	xpparam_t xpp;
+	xdemitconf_t xecfg;
+	struct emit_callback ecbdata;
+	struct diff_filespec *one = p->one;
+	struct diff_filespec *two = p->two;
+	struct userdiff_driver *textconv_one = NULL;
+	struct userdiff_driver *textconv_two = NULL;
+
+	if ((!one->mode || S_ISGITLINK(one->mode)) ||
+	    (!two->mode || S_ISGITLINK(two->mode)))
+		return;
+
+	if (DIFF_OPT_TST(o, ALLOW_TEXTCONV)) {
+		textconv_one = get_textconv(one);
+		textconv_two = get_textconv(two);
+	}
+
+	if (!DIFF_OPT_TST(o, TEXT) &&
+	    ( (!textconv_one && diff_filespec_is_binary(one)) ||
+	      (!textconv_two && diff_filespec_is_binary(two)) ))
+	      return;
+
+	mf1.size = fill_textconv(textconv_one, one, &mf1.ptr);
+	mf2.size = fill_textconv(textconv_two, two, &mf2.ptr);
+
+	memset(&xpp, 0, sizeof(xpp));
+	memset(&xecfg, 0, sizeof(xecfg));
+	memset(&ecbdata, 0, sizeof(ecbdata));
+
+	xpp.flags = o->xdl_opts;
+	xecfg.ctxlen = 1;
+
+	if (o->word_diff)
+		init_diff_words_data(&ecbdata, o, one, two);
+	if (xdi_diff_outf(&mf1, &mf2, fn_prepare_consume, &ecbdata,
+			  &xpp, &xecfg))
+		die("unable to generate generate moved color output for %s",
+		    one->path);
+	if (o->word_diff)
+		free_diff_words_data(&ecbdata);
+	if (textconv_one)
+		free(mf1.ptr);
+	if (textconv_two)
+		free(mf2.ptr);
+	xdiff_clear_find_func(&xecfg);
+}
+
 static void builtin_diff(const char *name_a,
 			 const char *name_b,
 			 struct diff_filespec *one,
@@ -3295,6 +3443,7 @@ void diff_setup(struct diff_options *options)
 	options->xdl_opts |= diff_algorithm;
 	if (diff_compaction_heuristic)
 		DIFF_XDL_SET(options, COMPACTION_HEURISTIC);
+	options->color_moved = options->use_color;
 
 	options->orderfile = diff_order_file_cfg;
 
@@ -3413,6 +3562,9 @@ void diff_setup_done(struct diff_options *options)
 
 	if (DIFF_OPT_TST(options, FOLLOW_RENAMES) && options->pathspec.nr != 1)
 		die(_("--follow requires exactly one pathspec"));
+
+	if (!options->use_color || external_diff())
+		options->color_moved = 0;
 }
 
 static int opt_arg(const char *arg, int arg_short, const char *arg_long, int *val)
@@ -4622,6 +4774,43 @@ void diff_warn_rename_limit(const char *varname, int needed, int degraded_cc)
 		warning(rename_limit_advice, varname, needed);
 }
 
+static void diff_flush_format_patch(struct diff_options *o)
+{
+	int i;
+	struct diff_queue_struct *q = &diff_queued_diff;
+	if (o->color_moved) {
+		moved_add = xmalloc(sizeof(*moved_add));
+		moved_del = xmalloc(sizeof(*moved_del));
+		hashmap_init(moved_add, (hashmap_cmp_fn)moved_entry_cmp, 0);
+		hashmap_init(moved_del, (hashmap_cmp_fn)moved_entry_cmp, 0);
+
+		for (i = 0; i < q->nr; i++) {
+			struct diff_filepair *p = q->queue[i];
+			if (check_pair_status(p) && !diff_unmodified_pair(p))
+				prepare_moved_lines(p, o);
+		}
+	}
+
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		if (check_pair_status(p))
+			diff_flush_patch(p, o);
+	}
+
+	if (o->color_moved) {
+		struct hashmap_iter iter;
+		struct moved_entry *e;
+		hashmap_iter_init(moved_add, &iter);
+		while ((e = hashmap_iter_next(&iter)))
+			free(e->line);
+		hashmap_iter_init(moved_del, &iter);
+		while ((e = hashmap_iter_next(&iter)))
+			free(e->line);
+		hashmap_free(moved_add, 1);
+		hashmap_free(moved_del, 1);
+	}
+}
+
 void diff_flush(struct diff_options *options)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
@@ -4696,6 +4885,7 @@ void diff_flush(struct diff_options *options)
 		if (!options->file)
 			die_errno("Could not open /dev/null");
 		options->close_file = 1;
+		options->color_moved = 0;
 		for (i = 0; i < q->nr; i++) {
 			struct diff_filepair *p = q->queue[i];
 			if (check_pair_status(p))
@@ -4716,11 +4906,7 @@ void diff_flush(struct diff_options *options)
 			}
 		}
 
-		for (i = 0; i < q->nr; i++) {
-			struct diff_filepair *p = q->queue[i];
-			if (check_pair_status(p))
-				diff_flush_patch(p, options);
-		}
+		diff_flush_format_patch(options);
 	}
 
 	if (output_format & DIFF_FORMAT_CALLBACK)
diff --git a/diff.h b/diff.h
index 7883729..b0331b9 100644
--- a/diff.h
+++ b/diff.h
@@ -122,6 +122,7 @@ struct diff_options {
 	unsigned int filter;
 
 	int use_color;
+	int color_moved;
 	int context;
 	int interhunkcontext;
 	int break_opt;
@@ -189,7 +190,9 @@ enum color_diff {
 	DIFF_FILE_NEW = 5,
 	DIFF_COMMIT = 6,
 	DIFF_WHITESPACE = 7,
-	DIFF_FUNCINFO = 8
+	DIFF_FUNCINFO = 8,
+	DIFF_FILE_MOVED_NEW = 9,
+	DIFF_FILE_MOVED_OLD = 10
 };
 const char *diff_get_color(int diff_use_color, enum color_diff ix);
 #define diff_get_color_opt(o, ix) \
-- 
2.10.0.1.ga424fb3


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCHv3] diff.c: emit moved lines with a different color
  2016-09-04 23:42 [PATCHv3] diff.c: emit moved lines with a different color Stefan Beller
@ 2016-09-06  1:17 ` Jacob Keller
  2016-09-06  2:20   ` Stefan Beller
  0 siblings, 1 reply; 7+ messages in thread
From: Jacob Keller @ 2016-09-06  1:17 UTC (permalink / raw)
  To: Stefan Beller
  Cc: Git mailing list, Jakub Narębski, Junio C Hamano,
	Stefan Beller

On Sun, Sep 4, 2016 at 4:42 PM, Stefan Beller <stefanbeller@gmail.com> wrote:
> When we color the diff, we'll mark moved lines with a different color.
>

Excellent idea. This is a very neat way to show extra information
without cluttering the diff output.

> This is achieved by doing a two passes over the diff. The first pass
> will inspect each line of the diff and store the removed lines and the
> added lines in its own hash map.
> The second pass will check for each added line if that is found in the
> set of removed lines. If so it will color the added line differently as
> with the new `moved-new` color mode. For each removed line we check the
> set of added lines and if found emit that with the new color `moved-old`.
>

Makes sense.

> When detecting the moved lines, we cannot just rely on a line being equal,
> but we need to take the context into account to detect when the moves were
> reordered as we do not want to color moved but per-mutated lines.
> To do that we use the hash of the preceding line.

Also makes sense.

>
> This patch was motivated by e.g. reviewing 3b0c4200 ("apply: move
> libified code from builtin/apply.c to apply.{c,h}", 2016-08-08)
>

Yes, this would be quite helpful.

> diff --git a/Documentation/config.txt b/Documentation/config.txt
> index 0bcb679..f4f51c2 100644
> --- a/Documentation/config.txt
> +++ b/Documentation/config.txt
> @@ -980,8 +980,9 @@ color.diff.<slot>::
>         of `context` (context text - `plain` is a historical synonym),
>         `meta` (metainformation), `frag`
>         (hunk header), 'func' (function in hunk header), `old` (removed lines),
> -       `new` (added lines), `commit` (commit headers), or `whitespace`
> -       (highlighting whitespace errors).
> +       `new` (added lines), `commit` (commit headers), `whitespace`
> +       (highlighting whitespace errors), `moved-old` (removed lines that
> +       reappear), `moved-new` (added lines that were removed elsewhere).
>

I liked Junio's "Moved from" and "moved to" but I think moved old and
moved new are ok as well.

> @@ -287,6 +304,25 @@ int git_diff_basic_config(const char *var, const char *value, void *cb)
>         return git_default_config(var, value, cb);
>  }
>
> +static int moved_entry_cmp(const struct moved_entry *a,
> +                          const struct moved_entry *b,
> +                          const void *unused)
> +{
> +       return strcmp(a->line, b->line) &&
> +              a->hash_prev_line == b->hash_prev_line;

So we're only comparing them if they match and have a matching
previous line? That seems pretty reasonable to reduce the cost of
computing exact copied sequences.

> +       if (ecbdata->opt->color_moved) {
> +               int h = memhash(line, len);
> +               hash_prev_removed = h;
> +               hash_prev_added = h;
> +       }
>  }

If I understand, this is to ensure that we don't keep re-hashing each
line right?

Thanks,
Jake

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv3] diff.c: emit moved lines with a different color
  2016-09-06  1:17 ` Jacob Keller
@ 2016-09-06  2:20   ` Stefan Beller
  2016-09-06  6:58     ` Jacob Keller
  2016-09-07 17:54     ` Junio C Hamano
  0 siblings, 2 replies; 7+ messages in thread
From: Stefan Beller @ 2016-09-06  2:20 UTC (permalink / raw)
  To: Jacob Keller
  Cc: Stefan Beller, Git mailing list, Jakub Narębski,
	Junio C Hamano

On Mon, Sep 5, 2016 at 6:17 PM, Jacob Keller <jacob.keller@gmail.com> wrote:
> On Sun, Sep 4, 2016 at 4:42 PM, Stefan Beller <stefanbeller@gmail.com> wrote:
>> When we color the diff, we'll mark moved lines with a different color.
>>
>
> Excellent idea. This is a very neat way to show extra information
> without cluttering the diff output.
>
>> This is achieved by doing a two passes over the diff. The first pass
>> will inspect each line of the diff and store the removed lines and the
>> added lines in its own hash map.
>> The second pass will check for each added line if that is found in the
>> set of removed lines. If so it will color the added line differently as
>> with the new `moved-new` color mode. For each removed line we check the
>> set of added lines and if found emit that with the new color `moved-old`.
>>
>
> Makes sense.
>
>> When detecting the moved lines, we cannot just rely on a line being equal,
>> but we need to take the context into account to detect when the moves were
>> reordered as we do not want to color moved but per-mutated lines.
>> To do that we use the hash of the preceding line.
>
> Also makes sense.
>
>>
>> This patch was motivated by e.g. reviewing 3b0c4200 ("apply: move
>> libified code from builtin/apply.c to apply.{c,h}", 2016-08-08)
>>
>
> Yes, this would be quite helpful.
>
>> diff --git a/Documentation/config.txt b/Documentation/config.txt
>> index 0bcb679..f4f51c2 100644
>> --- a/Documentation/config.txt
>> +++ b/Documentation/config.txt
>> @@ -980,8 +980,9 @@ color.diff.<slot>::
>>         of `context` (context text - `plain` is a historical synonym),
>>         `meta` (metainformation), `frag`
>>         (hunk header), 'func' (function in hunk header), `old` (removed lines),
>> -       `new` (added lines), `commit` (commit headers), or `whitespace`
>> -       (highlighting whitespace errors).
>> +       `new` (added lines), `commit` (commit headers), `whitespace`
>> +       (highlighting whitespace errors), `moved-old` (removed lines that
>> +       reappear), `moved-new` (added lines that were removed elsewhere).
>>
>
> I liked Junio's "Moved from" and "moved to" but I think moved old and
> moved new are ok as well.

as we do not want to see dashes ('moved-old'), I think I'l go with
"movedfrom" and "movedto".

>
>> @@ -287,6 +304,25 @@ int git_diff_basic_config(const char *var, const char *value, void *cb)
>>         return git_default_config(var, value, cb);
>>  }
>>
>> +static int moved_entry_cmp(const struct moved_entry *a,
>> +                          const struct moved_entry *b,
>> +                          const void *unused)
>> +{
>> +       return strcmp(a->line, b->line) &&
>> +              a->hash_prev_line == b->hash_prev_line;
>
> So we're only comparing them if they match and have a matching
> previous line? That seems pretty reasonable to reduce the cost of
> computing exact copied sequences.
>
>> +       if (ecbdata->opt->color_moved) {
>> +               int h = memhash(line, len);
>> +               hash_prev_removed = h;
>> +               hash_prev_added = h;
>> +       }
>>  }
>
> If I understand, this is to ensure that we don't keep re-hashing each
> line right?

No, this is to ensure we have the context sensitivity of one prior line.

In the collection phase we look at each line of the patch and make a hash of it.
Then we store the hash temporarily (think of a state machine that goes line by
line and always keeps the hash of the last line)

What we store in the hashmaps is the hash(current line) ^
hash(previous applicable line).
With previous applicable line I mean any line starting with " " or "+"
when the current
line starts with "+" and " " or "-" when the current line starts with "-".

When going through the second pass and actually emitting colored lines
we only find matches in the hash map if the current line AND the previous line
match as we lookup by hash code, i.e. if we have a moved line, but the
previous line
changed we do not find it in the hashmap and we don't color it, such
that the reviewer
can spot a permutation.

So in the second phase we also need to have access to previous line, so maybe
we could also go with just taking the line with us instead of 2 hash codes.
But that implementation detail seems like a trade off to me, where I'd lean
on keeping the hashes around as lines may be very long in bad cases, whereas
the hashcode is short and it is a cheap hash.

(I am referring to http://i.imgur.com/MnaSZ1D.png where in the malicious
case all lines were moved to there as well, but permutated)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv3] diff.c: emit moved lines with a different color
  2016-09-06  2:20   ` Stefan Beller
@ 2016-09-06  6:58     ` Jacob Keller
  2016-09-07 17:54     ` Junio C Hamano
  1 sibling, 0 replies; 7+ messages in thread
From: Jacob Keller @ 2016-09-06  6:58 UTC (permalink / raw)
  To: Stefan Beller
  Cc: Stefan Beller, Git mailing list, Jakub Narębski,
	Junio C Hamano

On Mon, Sep 5, 2016 at 7:20 PM, Stefan Beller <sbeller@google.com> wrote:
>> If I understand, this is to ensure that we don't keep re-hashing each
>> line right?
>
> No, this is to ensure we have the context sensitivity of one prior line.
>
> In the collection phase we look at each line of the patch and make a hash of it.
> Then we store the hash temporarily (think of a state machine that goes line by
> line and always keeps the hash of the last line)
>
> What we store in the hashmaps is the hash(current line) ^
> hash(previous applicable line).
> With previous applicable line I mean any line starting with " " or "+"
> when the current
> line starts with "+" and " " or "-" when the current line starts with "-".
>
> When going through the second pass and actually emitting colored lines
> we only find matches in the hash map if the current line AND the previous line
> match as we lookup by hash code, i.e. if we have a moved line, but the
> previous line
> changed we do not find it in the hashmap and we don't color it, such
> that the reviewer
> can spot a permutation.
>
> So in the second phase we also need to have access to previous line, so maybe
> we could also go with just taking the line with us instead of 2 hash codes.
> But that implementation detail seems like a trade off to me, where I'd lean
> on keeping the hashes around as lines may be very long in bad cases, whereas
> the hashcode is short and it is a cheap hash.
>
> (I am referring to http://i.imgur.com/MnaSZ1D.png where in the malicious
> case all lines were moved to there as well, but permutated)

Good that clears up a lot of my questions. It seems like a reasonable
algorithm, and I like the example of per-muted lines.

I think if we can find some real-world data on how much slower this
is, we can find out what kind of configuration we want to give it? It
seems like something incredibly helpful to me, but having a way out is
nice incase it ends up being too costly. I suspect that will only be
the case for a large patch.

Thanks,
Jake

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv3] diff.c: emit moved lines with a different color
  2016-09-06  2:20   ` Stefan Beller
  2016-09-06  6:58     ` Jacob Keller
@ 2016-09-07 17:54     ` Junio C Hamano
  2016-09-07 18:02       ` Stefan Beller
  1 sibling, 1 reply; 7+ messages in thread
From: Junio C Hamano @ 2016-09-07 17:54 UTC (permalink / raw)
  To: Stefan Beller
  Cc: Jacob Keller, Stefan Beller, Git mailing list,
	Jakub Narębski

Stefan Beller <sbeller@google.com> writes:

> as we do not want to see dashes ('moved-old'), I think I'l go with
> "movedfrom" and "movedto".

OK.  They would be color.diff.movedFrom and color.diff.movedTo in
the doc, and "movedfrom" and "movedto" in the code (as the caller
already downcased them for you to strcmp()).

> When going through the second pass and actually emitting colored lines
> we only find matches in the hash map if the current line AND the previous line
> match as we lookup by hash code, i.e. if we have a moved line, but the
> previous line
> changed we do not find it in the hashmap and we don't color it, such
> that the reviewer
> can spot a permutation.

Hmph.  Does this have impact on a line that was at the beginning or
the end of a file that got moved to the beginning or the end of a
file (four permutations, of 9 if you throw in "middle"), because
some cases it does not have a "previous" line?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv3] diff.c: emit moved lines with a different color
  2016-09-07 17:54     ` Junio C Hamano
@ 2016-09-07 18:02       ` Stefan Beller
  2016-09-07 21:21         ` Jacob Keller
  0 siblings, 1 reply; 7+ messages in thread
From: Stefan Beller @ 2016-09-07 18:02 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Jacob Keller, Stefan Beller, Git mailing list,
	Jakub Narębski

On Wed, Sep 7, 2016 at 10:54 AM, Junio C Hamano <gitster@pobox.com> wrote:
> Stefan Beller <sbeller@google.com> writes:
>
>> as we do not want to see dashes ('moved-old'), I think I'l go with
>> "movedfrom" and "movedto".
>
> OK.  They would be color.diff.movedFrom and color.diff.movedTo in
> the doc, and "movedfrom" and "movedto" in the code (as the caller
> already downcased them for you to strcmp()).
>
>> When going through the second pass and actually emitting colored lines
>> we only find matches in the hash map if the current line AND the previous line
>> match as we lookup by hash code, i.e. if we have a moved line, but the
>> previous line
>> changed we do not find it in the hashmap and we don't color it, such
>> that the reviewer
>> can spot a permutation.
>
> Hmph.  Does this have impact on a line that was at the beginning or
> the end of a file that got moved to the beginning or the end of a
> file (four permutations, of 9 if you throw in "middle"), because
> some cases it does not have a "previous" line?

I spotted that problem as well. We need to adapt the algorithm a bit more:

  If the previous line is of the same kind (i.e. starting with + or -),
    then we have to take it into account,
  otherwise (i.e. previous line is different, such as header, hunk header,
  or different sign)
    then ignore previous line, as the previous line is already
    having a different color.

That works for the very first line of a file as well.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv3] diff.c: emit moved lines with a different color
  2016-09-07 18:02       ` Stefan Beller
@ 2016-09-07 21:21         ` Jacob Keller
  0 siblings, 0 replies; 7+ messages in thread
From: Jacob Keller @ 2016-09-07 21:21 UTC (permalink / raw)
  To: Stefan Beller
  Cc: Junio C Hamano, Stefan Beller, Git mailing list,
	Jakub Narębski

On Wed, Sep 7, 2016 at 11:02 AM, Stefan Beller <sbeller@google.com> wrote:
> On Wed, Sep 7, 2016 at 10:54 AM, Junio C Hamano <gitster@pobox.com> wrote:
>> Stefan Beller <sbeller@google.com> writes:
>>
>>> as we do not want to see dashes ('moved-old'), I think I'l go with
>>> "movedfrom" and "movedto".
>>
>> OK.  They would be color.diff.movedFrom and color.diff.movedTo in
>> the doc, and "movedfrom" and "movedto" in the code (as the caller
>> already downcased them for you to strcmp()).
>>
>>> When going through the second pass and actually emitting colored lines
>>> we only find matches in the hash map if the current line AND the previous line
>>> match as we lookup by hash code, i.e. if we have a moved line, but the
>>> previous line
>>> changed we do not find it in the hashmap and we don't color it, such
>>> that the reviewer
>>> can spot a permutation.
>>
>> Hmph.  Does this have impact on a line that was at the beginning or
>> the end of a file that got moved to the beginning or the end of a
>> file (four permutations, of 9 if you throw in "middle"), because
>> some cases it does not have a "previous" line?
>
> I spotted that problem as well. We need to adapt the algorithm a bit more:
>
>   If the previous line is of the same kind (i.e. starting with + or -),
>     then we have to take it into account,
>   otherwise (i.e. previous line is different, such as header, hunk header,
>   or different sign)
>     then ignore previous line, as the previous line is already
>     having a different color.
>
> That works for the very first line of a file as well.

That should work to resolve some issues, and ensure the first line of
a hunk works correctly.

Thanks,
Jake

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-09-07 21:21 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-09-04 23:42 [PATCHv3] diff.c: emit moved lines with a different color Stefan Beller
2016-09-06  1:17 ` Jacob Keller
2016-09-06  2:20   ` Stefan Beller
2016-09-06  6:58     ` Jacob Keller
2016-09-07 17:54     ` Junio C Hamano
2016-09-07 18:02       ` Stefan Beller
2016-09-07 21:21         ` Jacob Keller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).