[PATCH] contrib/diff-highlight: multibyte characters diff

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] contrib/diff-highlight: multibyte characters diff
@ 2014-02-11  9:09 Yoshihiro Sugi
  2014-02-11 19:30 ` Junio C Hamano
  2014-02-12 20:59 ` Jeff King
  0 siblings, 2 replies; 8+ messages in thread
From: Yoshihiro Sugi @ 2014-02-11  9:09 UTC (permalink / raw)
  To: git; +Cc: Yoshihiro Sugi

Signed-off-by: Yoshihiro Sugi <sugi1982@gmail.com>

diff-highlight split each hunks and compare them as byte sequences.
it causes problems when diff hunks include multibyte characters.
This change enable to work on such cases by decoding inputs and encoding output as utf8 string.
---
 contrib/diff-highlight/diff-highlight | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight
index c4404d4..49b4f53 100755
--- a/contrib/diff-highlight/diff-highlight
+++ b/contrib/diff-highlight/diff-highlight
@@ -2,6 +2,7 @@
 
 use warnings FATAL => 'all';
 use strict;
+use Encode qw(decode_utf8 encode_utf8);
 
 # Highlight by reversing foreground and background. You could do
 # other things like bold or underline if you prefer.
@@ -15,8 +16,9 @@ my @added;
 my $in_hunk;
 
 while (<>) {
+	$_ = decode_utf8($_);
 	if (!$in_hunk) {
-		print;
+		print encode_utf8($_);
 		$in_hunk = /^$COLOR*\@/;
 	}
 	elsif (/^$COLOR*-/) {
@@ -30,7 +32,7 @@ while (<>) {
 		@removed = ();
 		@added = ();
 
-		print;
+		print encode_utf8($_);
 		$in_hunk = /^$COLOR*[\@ ]/;
 	}
 
@@ -58,7 +60,8 @@ sub show_hunk {
 
 	# If one side is empty, then there is nothing to compare or highlight.
 	if (!@$a || !@$b) {
-		print @$a, @$b;
+		print encode_utf8($_) for @$a;
+		print encode_utf8($_) for @$b;
 		return;
 	}
 
@@ -67,17 +70,18 @@ sub show_hunk {
 	# stupid, and only handle multi-line hunks that remove and add the same
 	# number of lines.
 	if (@$a != @$b) {
-		print @$a, @$b;
+		print encode_utf8($_) for @$a;
+		print encode_utf8($_) for @$b;
 		return;
 	}
 
 	my @queue;
 	for (my $i = 0; $i < @$a; $i++) {
 		my ($rm, $add) = highlight_pair($a->[$i], $b->[$i]);
-		print $rm;
+		print encode_utf8($rm);
 		push @queue, $add;
 	}
-	print @queue;
+	print encode_utf8($_) for @queue;
 }
 
 sub highlight_pair {
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] contrib/diff-highlight: multibyte characters diff
  2014-02-11  9:09 [PATCH] contrib/diff-highlight: multibyte characters diff Yoshihiro Sugi
@ 2014-02-11 19:30 ` Junio C Hamano
  2014-02-12 20:59 ` Jeff King
  1 sibling, 0 replies; 8+ messages in thread
From: Junio C Hamano @ 2014-02-11 19:30 UTC (permalink / raw)
  To: Jeff King; +Cc: git, Yoshihiro Sugi

Yoshihiro Sugi <sugi1982@gmail.com> writes:

> Signed-off-by: Yoshihiro Sugi <sugi1982@gmail.com>
>
> diff-highlight split each hunks and compare them as byte sequences.
> it causes problems when diff hunks include multibyte characters.
> This change enable to work on such cases by decoding inputs and encoding output as utf8 string.
> ---
>  contrib/diff-highlight/diff-highlight | 16 ++++++++++------
>  1 file changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight
> index c4404d4..49b4f53 100755
> --- a/contrib/diff-highlight/diff-highlight
> +++ b/contrib/diff-highlight/diff-highlight
> @@ -2,6 +2,7 @@
>  
>  use warnings FATAL => 'all';
>  use strict;
> +use Encode qw(decode_utf8 encode_utf8);
>  
>  # Highlight by reversing foreground and background. You could do
>  # other things like bold or underline if you prefer.
> @@ -15,8 +16,9 @@ my @added;
>  my $in_hunk;
>  
>  while (<>) {
> +	$_ = decode_utf8($_);
>  	if (!$in_hunk) {
> -		print;
> +		print encode_utf8($_);
>  		$in_hunk = /^$COLOR*\@/;
>  	}
>  	elsif (/^$COLOR*-/) {
> @@ -30,7 +32,7 @@ while (<>) {
>  		@removed = ();
>  		@added = ();
>  
> -		print;
> +		print encode_utf8($_);
>  		$in_hunk = /^$COLOR*[\@ ]/;
>  	}
>  
> @@ -58,7 +60,8 @@ sub show_hunk {
>  
>  	# If one side is empty, then there is nothing to compare or highlight.
>  	if (!@$a || !@$b) {
> -		print @$a, @$b;
> +		print encode_utf8($_) for @$a;
> +		print encode_utf8($_) for @$b;
>  		return;
>  	}
>  
> @@ -67,17 +70,18 @@ sub show_hunk {
>  	# stupid, and only handle multi-line hunks that remove and add the same
>  	# number of lines.
>  	if (@$a != @$b) {
> -		print @$a, @$b;
> +		print encode_utf8($_) for @$a;
> +		print encode_utf8($_) for @$b;
>  		return;
>  	}
>  
>  	my @queue;
>  	for (my $i = 0; $i < @$a; $i++) {
>  		my ($rm, $add) = highlight_pair($a->[$i], $b->[$i]);
> -		print $rm;
> +		print encode_utf8($rm);
>  		push @queue, $add;
>  	}
> -	print @queue;
> +	print encode_utf8($_) for @queue;
>  }
>  
>  sub highlight_pair {

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] contrib/diff-highlight: multibyte characters diff
  2014-02-11  9:09 [PATCH] contrib/diff-highlight: multibyte characters diff Yoshihiro Sugi
  2014-02-11 19:30 ` Junio C Hamano
@ 2014-02-12 20:59 ` Jeff King
  2014-02-12 23:10   ` Thomas Adam
  2014-02-13 19:14   ` Yoshihiro Sugi
  1 sibling, 2 replies; 8+ messages in thread
From: Jeff King @ 2014-02-12 20:59 UTC (permalink / raw)
  To: Yoshihiro Sugi; +Cc: git

On Tue, Feb 11, 2014 at 06:09:10PM +0900, Yoshihiro Sugi wrote:

> diff-highlight split each hunks and compare them as byte sequences.
> it causes problems when diff hunks include multibyte characters.
> This change enable to work on such cases by decoding inputs and encoding output as utf8 string.

Thanks for looking at this. I didn't consider multibyte characters at
all when I wrote the original.

Sadly, applying your patch seems to cause diff-highlight to take about
1.5x as much CPU (I just tried it on the complete output of "git log
--all -p" in my git.git repository, which went from ~60s to ~90s). That
is not necessarily a deal-breaker, as it is more important to be
correct. But I wonder if there is any way we can optimize it. From my
understanding, it may not be the encoding/decoding itself, but rather
that the utf8-aware text routines are slower. More on that below.

>  # Highlight by reversing foreground and background. You could do
>  # other things like bold or underline if you prefer.
> @@ -15,8 +16,9 @@ my @added;
>  my $in_hunk;
>  
>  while (<>) {
> +	$_ = decode_utf8($_);

What happens when the diff content is not utf8 at all? The default
failure mode for decode_utf8 seems to be to replace non-utf8 sequences
with a substitution character. Which means we would corrupt something
like iso8859-1, which works fine in the current code.

>  	if (!$in_hunk) {
> -		print;
> +		print encode_utf8($_);

And we have the opposite question here. We know that what we have in the
string is perl's internal format. But how do we know that what the
user's terminal wants is utf8?

I think in the most general case, we would need to auto-detect the
encoding for each string (since there is no guarantee that a file even
retains the same encoding through its history). And then probably punt on
highlighting when comparing two lines in different encodings, and if
they are in the same encoding, treat them as a sequence of code points
rather than octets when comparing.

That's probably way too slow and complicated to worry about. And
frankly, I am OK with making the tool work _best_ with UTF-8, as long as
it can fail gracefully for other encodings (i.e., if you can turn off
the utf8 magic when you have another encoding, and we will pass it
through blindly as octets).

Would it be enough to do something like the patch below? The specific
things I am shooting for are:

  - the only part of the code that cares about the sequence is the
    split/highlight bit, and most lines do not get highlighted at all.
    Therefore it tries to lazily do the decode step, which gets back our
    lost performance (I measured ~12% slowdown).

  - we try the utf8 decode and if it fails, fall back to treating it
    like a sequence of bytes. That doesn't help people with other
    multibyte encodings, but at least lets single-byte encodings besides
    utf8 continue to work.

I constructed a very simple test of:

  git init &&
  printf 'cont\xc3\xa9nt\n' >file &&
  git add file &&
  printf 'cont\xc3\xb6nt\n' >file &&
  git diff

and it seemed to work. Can you confirm that it does the right thing on
your more complicated cases?

-Peff

---
diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight
index c4404d4..9447ba2 100755
--- a/contrib/diff-highlight/diff-highlight
+++ b/contrib/diff-highlight/diff-highlight
@@ -2,6 +2,7 @@

 use warnings FATAL => 'all';
 use strict;
+use Encode qw(decode_utf8 encode_utf8);

 # Highlight by reversing foreground and background. You could do
 # other things like bold or underline if you prefer.
@@ -73,13 +74,23 @@ sub show_hunk {

 	my @queue;
 	for (my $i = 0; $i < @$a; $i++) {
-		my ($rm, $add) = highlight_pair($a->[$i], $b->[$i]);
-		print $rm;
-		push @queue, $add;
+		my ($a_dec, $encode_rm) = decode($a->[$i]);
+		my ($b_dec, $encode_add) = decode($b->[$i]);
+		my ($rm, $add) = highlight_pair($a_dec, $b_dec);
+		print $encode_rm->($rm);
+		push @queue, $encode_add->($add);
 	}
 	print @queue;
 }

+sub decode {
+	my $orig = shift;
+	my $decoded = eval { decode_utf8($orig, Encode::FB_CROAK) };
+	return defined $decoded ?
+	       ($decoded, sub { encode_utf8(shift) }) :
+	       ($orig, sub { shift });
+}
+
 sub highlight_pair {
 	my @a = split_line(shift);
 	my @b = split_line(shift);

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] contrib/diff-highlight: multibyte characters diff
  2014-02-12 20:59 ` Jeff King
@ 2014-02-12 23:10   ` Thomas Adam
  2014-02-12 23:27     ` Jeff King
  2014-02-13 19:14   ` Yoshihiro Sugi
  1 sibling, 1 reply; 8+ messages in thread
From: Thomas Adam @ 2014-02-12 23:10 UTC (permalink / raw)
  To: Jeff King; +Cc: Yoshihiro Sugi, git list

On 12 February 2014 20:59, Jeff King <peff@peff.net> wrote:
> +sub decode {
> +       my $orig = shift;
> +       my $decoded = eval { decode_utf8($orig, Encode::FB_CROAK) };
> +       return defined $decoded ?

I'd still advocate checking $@ here, rather than the defined $decoded check.

> +              ($decoded, sub { encode_utf8(shift) }) :
> +              ($orig, sub { shift });
> +}
> +

-- Thomas Adam

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] contrib/diff-highlight: multibyte characters diff
  2014-02-12 23:10   ` Thomas Adam
@ 2014-02-12 23:27     ` Jeff King
  2014-02-13  1:17       ` brian m. carlson
  0 siblings, 1 reply; 8+ messages in thread
From: Jeff King @ 2014-02-12 23:27 UTC (permalink / raw)
  To: Thomas Adam; +Cc: Yoshihiro Sugi, git list

On Wed, Feb 12, 2014 at 11:10:49PM +0000, Thomas Adam wrote:

> On 12 February 2014 20:59, Jeff King <peff@peff.net> wrote:
> > +sub decode {
> > +       my $orig = shift;
> > +       my $decoded = eval { decode_utf8($orig, Encode::FB_CROAK) };
> > +       return defined $decoded ?
> 
> I'd still advocate checking $@ here, rather than the defined $decoded check.

I don't mind changing it, but for my edification, what is the advantage?

-Peff

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] contrib/diff-highlight: multibyte characters diff
  2014-02-12 23:27     ` Jeff King
@ 2014-02-13  1:17       ` brian m. carlson
  2014-02-13  1:37         ` Jeff King
  0 siblings, 1 reply; 8+ messages in thread
From: brian m. carlson @ 2014-02-13  1:17 UTC (permalink / raw)
  To: Jeff King; +Cc: Thomas Adam, Yoshihiro Sugi, git list

[-- Attachment #1: Type: text/plain, Size: 1263 bytes --]

On Wed, Feb 12, 2014 at 06:27:40PM -0500, Jeff King wrote:
> On Wed, Feb 12, 2014 at 11:10:49PM +0000, Thomas Adam wrote:
> 
> > On 12 February 2014 20:59, Jeff King <peff@peff.net> wrote:
> > > +sub decode {
> > > +       my $orig = shift;
> > > +       my $decoded = eval { decode_utf8($orig, Encode::FB_CROAK) };
> > > +       return defined $decoded ?
> > 
> > I'd still advocate checking $@ here, rather than the defined $decoded check.
> 
> I don't mind changing it, but for my edification, what is the advantage?

The documentation for decode_utf8 isn't clear, but I don't know if it
can ever return undef.  What, for example, does it return if $orig is
not defined?  That's the benefit: it's immediately clear to the user
that you're interested in whether it threw an exception, rather than
whether it produced a given value.

That said, $DAYJOB is a Perl shop, and I would certainly not reject this
code in review, and depending on the situation, I might even write
something like this.  I personally think it's fine.

-- 
brian m. carlson / brian with sandals: Houston, Texas, US
+1 832 623 2791 | http://www.crustytoothpaste.net/~bmc | My opinion only
OpenPGP: RSA v4 4096b: 88AC E9B2 9196 305B A994 7552 F1BA 225C 0223 B187

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] contrib/diff-highlight: multibyte characters diff
  2014-02-13  1:17       ` brian m. carlson
@ 2014-02-13  1:37         ` Jeff King
  0 siblings, 0 replies; 8+ messages in thread
From: Jeff King @ 2014-02-13  1:37 UTC (permalink / raw)
  To: Thomas Adam, Yoshihiro Sugi, git list

On Thu, Feb 13, 2014 at 01:17:54AM +0000, brian m. carlson wrote:

> On Wed, Feb 12, 2014 at 06:27:40PM -0500, Jeff King wrote:
> > On Wed, Feb 12, 2014 at 11:10:49PM +0000, Thomas Adam wrote:
> > 
> > > On 12 February 2014 20:59, Jeff King <peff@peff.net> wrote:
> > > > +sub decode {
> > > > +       my $orig = shift;
> > > > +       my $decoded = eval { decode_utf8($orig, Encode::FB_CROAK) };
> > > > +       return defined $decoded ?
> > > 
> > > I'd still advocate checking $@ here, rather than the defined $decoded check.
> > 
> > I don't mind changing it, but for my edification, what is the advantage?
> 
> The documentation for decode_utf8 isn't clear, but I don't know if it
> can ever return undef.  What, for example, does it return if $orig is
> not defined?  That's the benefit: it's immediately clear to the user
> that you're interested in whether it threw an exception, rather than
> whether it produced a given value.

I'd argue that I am more interested in whether it returned a value. Let
us imagine for a moment that decode_utf8 could return undef without
throwing an exception. What should the function return in such a case?

I think the only sensible thing is the original (and to indicate that
the result was not converted).

-Peff

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] contrib/diff-highlight: multibyte characters diff
  2014-02-12 20:59 ` Jeff King
  2014-02-12 23:10   ` Thomas Adam
@ 2014-02-13 19:14   ` Yoshihiro Sugi
  1 sibling, 0 replies; 8+ messages in thread
From: Yoshihiro Sugi @ 2014-02-13 19:14 UTC (permalink / raw)
  To: git; +Cc: Yoshihiro Sugi

Thanks for reviewing.
as you wrote, diff content may not be utf8 at all. and we don't know that the user's terminal watns is utf8.
I think your trying utf8 decode and fall back approach is better than my patch, and do work well.

is using "$@" for catching error like the patch below?
According to perldoc Encode.pm, encode/decode with "FB_CROAK" may destroy original string. We should probabry use "LEAVE_SRC" on decode_utf8's second argument.

---
 contrib/diff-highlight/diff-highlight | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight
index c4404d4..0743851 100755
--- a/contrib/diff-highlight/diff-highlight
+++ b/contrib/diff-highlight/diff-highlight
@@ -2,6 +2,7 @@
 
 use warnings FATAL => 'all';
 use strict;
+use Encode qw(decode_utf8 encode_utf8);
 
 # Highlight by reversing foreground and background. You could do
 # other things like bold or underline if you prefer.
@@ -73,13 +74,23 @@ sub show_hunk {
 
 	my @queue;
 	for (my $i = 0; $i < @$a; $i++) {
-		my ($rm, $add) = highlight_pair($a->[$i], $b->[$i]);
-		print $rm;
-		push @queue, $add;
+		my ($a_dec, $encode_rm) = decode($a->[$i]);
+		my ($b_dec, $encode_add) = decode($b->[$i]);
+		my ($rm, $add) = highlight_pair($a_dec, $b_dec);
+		print $encode_rm->($rm);
+		push @queue, $encode_add->($add);
 	}
 	print @queue;
 }
 
+sub decode {
+	my $orig = shift;
+	my $decoded = eval { decode_utf8($orig, Encode::FB_CROAK | Encode::LEAVE_SRC) };
+	return $@ ?
+	       ($orig, sub { shift }) :
+	       ($decoded, sub { encode_utf8(shift) });
+}
+
 sub highlight_pair {
 	my @a = split_line(shift);
 	my @b = split_line(shift);
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2014-02-13 19:14 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-02-11  9:09 [PATCH] contrib/diff-highlight: multibyte characters diff Yoshihiro Sugi
2014-02-11 19:30 ` Junio C Hamano
2014-02-12 20:59 ` Jeff King
2014-02-12 23:10   ` Thomas Adam
2014-02-12 23:27     ` Jeff King
2014-02-13  1:17       ` brian m. carlson
2014-02-13  1:37         ` Jeff King
2014-02-13 19:14   ` Yoshihiro Sugi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).