git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: tboegi@web.de
To: peff@peff.net, j6t@kdbg.org, lars.schneider@autodesk.com,
	git@vger.kernel.org, gitster@pobox.com, patrick@luehne.de,
	larsxschneider@gmail.com
Cc: "Torsten Bögershausen" <tboegi@web.de>
Subject: [PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8
Date: Mon, 26 Feb 2018 18:27:06 +0100	[thread overview]
Message-ID: <20180226172706.7822-1-tboegi@web.de> (raw)
In-Reply-To: <20171218131249.GB4665@sigill.intra.peff.net>

From: Torsten Bögershausen <tboegi@web.de>

When a UTF-16 file is committed and later changed, `git diff` shows
"Binary files XX and YY differ".

When the user wants a diff in UTF-8, a textconv driver needs to be
assigned in .gitattributes and that driver must be configured.

A more user-friendly diff can be produced for UTF-16 if
- the user did not use `git diff --binary`
- the blob is identified as binary
- the blob has a UTF-16 BOM
- the blob can be converted into UTF-8

Enhance the diff machinery to auto-detect UTF-16 blobs and show them
as UTF-8, unless the user specifies `git diff --binary` which creates
a binary diff.

Signed-off-by: Torsten Bögershausen <tboegi@web.de>
---
 diff.c                   | 43 ++++++++++++++++++++-
 diffcore.h               |  3 ++
 t/t4066-diff-encoding.sh | 98 ++++++++++++++++++++++++++++++++++++++++++++++++
 utf8.h                   | 11 ++++++
 4 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100755 t/t4066-diff-encoding.sh

diff --git a/diff.c b/diff.c
index fb22b19f09..51831ee94d 100644
--- a/diff.c
+++ b/diff.c
@@ -3192,6 +3192,10 @@ static void builtin_diff(const char *name_a,
 			strbuf_reset(&header);
 		}
 
+		if (one && one->reencoded_from_utf16)
+			strbuf_addf(&header, "a is converted to UTF-8 from UTF-16\n");
+		if (two && two->reencoded_from_utf16)
+			strbuf_addf(&header, "b is converted to UTF-8 from UTF-16\n");
 		mf1.size = fill_textconv(textconv_one, one, &mf1.ptr);
 		mf2.size = fill_textconv(textconv_two, two, &mf2.ptr);
 
@@ -3611,8 +3615,25 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 			s->size = size;
 			s->should_free = 1;
 		}
-	}
-	else {
+		if (!s->binary && buffer_is_binary(s->data, s->size) &&
+		    buffer_has_utf16_bom(s->data, s->size)) {
+			int outsz = 0;
+			char *outbuf;
+			outbuf = reencode_string_len(s->data, (int)s->size,
+						     "UTF-8", "UTF-16", &outsz);
+			if (outbuf) {
+				if (s->should_free)
+					free(s->data);
+				if (s->should_munmap)
+					munmap(s->data, s->size);
+				s->should_munmap = 0;
+				s->data = outbuf;
+				s->size = outsz;
+				s->reencoded_from_utf16 = 1;
+				s->should_free = 1;
+			}
+		}
+	} else {
 		enum object_type type;
 		if (size_only || (flags & CHECK_BINARY)) {
 			type = sha1_object_info(s->oid.hash, &s->size);
@@ -3629,6 +3650,19 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 		s->data = read_sha1_file(s->oid.hash, &type, &s->size);
 		if (!s->data)
 			die("unable to read %s", oid_to_hex(&s->oid));
+		if (!s->binary && buffer_is_binary(s->data, s->size) &&
+		    buffer_has_utf16_bom(s->data, s->size)) {
+			int outsz = 0;
+			char *buf;
+			buf = reencode_string_len(s->data, (int)s->size,
+						  "UTF-8", "UTF-16", &outsz);
+			if (buf) {
+				free(s->data);
+				s->data = buf;
+				s->size = outsz;
+				s->reencoded_from_utf16 = 1;
+			}
+		}
 		s->should_free = 1;
 	}
 	return 0;
@@ -5695,6 +5729,10 @@ static int diff_filespec_is_identical(struct diff_filespec *one,
 
 static int diff_filespec_check_stat_unmatch(struct diff_filepair *p)
 {
+	if (p->binary) {
+		p->one->binary = 1;
+		p->two->binary = 1;
+	}
 	if (p->done_skip_stat_unmatch)
 		return p->skip_stat_unmatch_result;
 
@@ -5735,6 +5773,7 @@ static void diffcore_skip_stat_unmatch(struct diff_options *diffopt)
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
 
+		p->binary = diffopt->flags.binary;
 		if (diff_filespec_check_stat_unmatch(p))
 			diff_q(&outq, p);
 		else {
diff --git a/diffcore.h b/diffcore.h
index a30da161da..3cd97bb93b 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -47,6 +47,8 @@ struct diff_filespec {
 	unsigned has_more_entries : 1; /* only appear in combined diff */
 	/* data should be considered "binary"; -1 means "don't know yet" */
 	signed int is_binary : 2;
+	unsigned binary : 1;
+	unsigned reencoded_from_utf16 : 1;
 	struct userdiff_driver *driver;
 };
 
@@ -72,6 +74,7 @@ struct diff_filepair {
 	unsigned is_unmerged : 1;
 	unsigned done_skip_stat_unmatch : 1;
 	unsigned skip_stat_unmatch_result : 1;
+	unsigned binary : 1;
 };
 #define DIFF_PAIR_UNMERGED(p) ((p)->is_unmerged)
 
diff --git a/t/t4066-diff-encoding.sh b/t/t4066-diff-encoding.sh
new file mode 100755
index 0000000000..9bb3c70ada
--- /dev/null
+++ b/t/t4066-diff-encoding.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+
+test_description='git diff with encoding attribute'
+
+. ./test-lib.sh
+
+printf '\303\244rger\n\303\266se\n\303\274bel\n' |
+	iconv -f UTF-8 -t UTF-16 >UTF-16
+printf '\303\266se\n\303\274bel\n\303\245st\n' |
+	iconv -f UTF-8 -t UTF-16 >file2
+
+test_expect_success 'setup' '
+	cp UTF-16 file &&
+	git add file &&
+	git commit -m "add file in UTF-16" &&
+	test_tick &&
+	echo "file encoding=UTF-16" >.gitattributes
+'
+
+test_expect_success 'diff against local change' '
+	cp file2 file &&
+	test_tick &&
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09..e98d27a 100644
+	a is converted to UTF-8 from UTF-16
+	b is converted to UTF-8 from UTF-16
+	--- a/file
+	+++ b/file
+	@@ -1,3 +1,3 @@
+	-ärger
+	 öse
+	 übel
+	+åst
+EOF
+	git diff file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'diff --binary against local change' '
+	cp file2 file &&
+	test_tick &&
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09b0aad19fb22566956d1a39cb4e2a3b420..e98d27acfb90cfcfc84fcc5173baa4aa7828290f 100644
+	GIT binary patch
+	literal 28
+	ecmezW?;ArgLn;Fo!ykquAe{qbJq3!C0BHb{ln3Pi
+
+	literal 32
+	icmezW?+HT@Lpnn$kmO?c#!w7oaWVX1NCMJ1Ko$VA_z0~4
+
+EOF
+	git diff --binary file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'commit local change' '
+	git add file &&
+	git commit -m "add file V2 in UTF-16" &&
+	test_tick
+'
+
+test_expect_success 'diff HEAD against HEAD^' '
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09..e98d27a 100644
+	a is converted to UTF-8 from UTF-16
+	b is converted to UTF-8 from UTF-16
+	--- a/file
+	+++ b/file
+	@@ -1,3 +1,3 @@
+	-ärger
+	 öse
+	 übel
+	+åst
+EOF
+	git diff HEAD^ HEAD -- file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'diff --binary HEAD against HEAD^' '
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09b0aad19fb22566956d1a39cb4e2a3b420..e98d27acfb90cfcfc84fcc5173baa4aa7828290f 100644
+	GIT binary patch
+	literal 28
+	ecmezW?;ArgLn;Fo!ykquAe{qbJq3!C0BHb{ln3Pi
+
+	literal 32
+	icmezW?+HT@Lpnn$kmO?c#!w7oaWVX1NCMJ1Ko$VA_z0~4
+	
+EOF
+	git diff --binary HEAD^ HEAD -- file >actual &&
+	test_cmp expect actual
+'
+
+test_done
diff --git a/utf8.h b/utf8.h
index 6bbcf31a83..a2184d0300 100644
--- a/utf8.h
+++ b/utf8.h
@@ -16,6 +16,17 @@ int utf8_fprintf(FILE *, const char *, ...);
 extern const char utf8_bom[];
 extern int skip_utf8_bom(char **, size_t);
 
+/* Return 1 if buf (len bytes) begins with a UTF-16 BOM, otherwise 0. */
+static inline int buffer_has_utf16_bom(const void *buf, size_t len)
+{
+	const unsigned char *text = buf; /* keep const; no cast needed */
+	if (!text || len < 2)
+		return 0; /* NULL or too short to hold the 2-byte BOM */
+	/* Accept either byte order of the BOM: FF FE or FE FF. */
+	return (text[0] == 0xff && text[1] == 0xfe) ||
+	       (text[0] == 0xfe && text[1] == 0xff);
+}
+
 void strbuf_add_wrapped_text(struct strbuf *buf,
 		const char *text, int indent, int indent2, int width);
 void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len,
-- 
2.16.1.194.gb2e45c695d


  parent reply	other threads:[~2018-02-26 17:27 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-12-11 15:50 [PATCH v1] convert: add support for 'encoding' attribute lars.schneider
2017-12-11 18:39 ` Eric Sunshine
2017-12-11 23:47   ` Lars Schneider
2017-12-11 23:58     ` Eric Sunshine
2017-12-12 10:58       ` Lars Schneider
2017-12-11 20:47 ` Johannes Sixt
2017-12-11 23:42   ` Lars Schneider
2017-12-12  0:59     ` Junio C Hamano
2017-12-12  7:15       ` Johannes Sixt
2017-12-12 10:55         ` Lars Schneider
2017-12-12 19:31           ` Junio C Hamano
2017-12-13 17:57             ` Lars Schneider
2017-12-13 18:11               ` Junio C Hamano
2017-12-13 23:02                 ` Lars Schneider
2017-12-14 23:01                   ` Junio C Hamano
2017-12-12  7:09     ` Johannes Sixt
2017-12-18 10:13   ` Torsten Bögershausen
2017-12-18 13:12     ` Jeff King
2017-12-23  8:08       ` Torsten Bögershausen
2017-12-29 13:28       ` [PATCH/RFC 0/2] git diff --UTF-8 tboegi
2017-12-29 13:28       ` [PATCH/RFC 1/2] convert_to_git(): checksafe becomes an integer tboegi
2017-12-29 13:28       ` [PATCH/RFC 2/2] git diff: Allow to reencode into UTF-8 tboegi
2018-02-26 17:27       ` tboegi [this message]
2018-02-26 18:43         ` [PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8 Peter Krefting
2018-02-27 22:39         ` Jeff King
2017-12-18 18:02     ` [PATCH v1] convert: add support for 'encoding' attribute Junio C Hamano
2017-12-18 21:55     ` Johannes Sixt
2017-12-15  9:58 ` Jeff King
2017-12-18 10:54   ` Lars Schneider
2017-12-18 12:59     ` Jeff King
2017-12-17 17:14 ` Torsten Bögershausen
2017-12-28 16:14   ` Lars Schneider
2017-12-29 12:59     ` Torsten Bögershausen
2017-12-29 13:56       ` Lars Schneider
2018-01-03 19:15       ` Junio C Hamano
2018-01-03 20:45         ` Lars Schneider

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180226172706.7822-1-tboegi@web.de \
    --to=tboegi@web.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=j6t@kdbg.org \
    --cc=lars.schneider@autodesk.com \
    --cc=larsxschneider@gmail.com \
    --cc=patrick@luehne.de \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).