All of lore.kernel.org
 help / color / mirror / Atom feed
From: tboegi@web.de
To: peff@peff.net, j6t@kdbg.org, lars.schneider@autodesk.com,
	git@vger.kernel.org, gitster@pobox.com, patrick@luehne.de,
	larsxschneider@gmail.com
Cc: "Torsten Bögershausen" <tboegi@web.de>
Subject: [PATCH/RFC 2/2] git diff: Allow to reencode into UTF-8
Date: Fri, 29 Dec 2017 14:28:29 +0100	[thread overview]
Message-ID: <20171229132829.17637-1-tboegi@web.de> (raw)
In-Reply-To: <20171218131249.GB4665@sigill.intra.peff.net>

From: Torsten Bögershausen <tboegi@web.de>

When blobs are encoded in UTF-16, `git diff` will treat them as binary.
Make it possible to show a user readable diff encoded in UTF-8.
This allows running git diff and feeding the output into a web server.

Improve Git to look at the "encoding" attribute and to reencode the
content into UTF-8 before running the diff itself.

Signed-off-by: Torsten Bögershausen <tboegi@web.de>
---
 Documentation/diff-options.txt  |  4 ++
 Documentation/gitattributes.txt |  9 +++++
 convert.c                       | 40 +++++++++++++++++++
 convert.h                       |  2 +
 diff.c                          | 38 ++++++++++++++++--
 diff.h                          |  1 +
 diffcore.h                      |  3 ++
 t/t4066-diff-encoding.sh        | 86 +++++++++++++++++++++++++++++++++++++++++
 8 files changed, 180 insertions(+), 3 deletions(-)
 create mode 100755 t/t4066-diff-encoding.sh

diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
index 9d1586b956..bf2f115f11 100644
--- a/Documentation/diff-options.txt
+++ b/Documentation/diff-options.txt
@@ -629,6 +629,10 @@ endif::git-format-patch[]
 	linkgit:git-log[1], but not for linkgit:git-format-patch[1] or
 	diff plumbing commands.
 
+--UTF-8::
+	Git converts the content into UTF-8 before running the diff when the
+	"encoding" attribute is defined. See linkgit:gitattributes[5].
+
 --ignore-submodules[=<when>]::
 	Ignore changes to submodules in the diff generation. <when> can be
 	either "none", "untracked", "dirty" or "all", which is the default.
diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index 30687de81a..753a7c39b7 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -881,6 +881,15 @@ advantages to choosing this method:
 3. Caching. Textconv caching can speed up repeated diffs, such as those
    you might trigger by running `git log -p`.
 
+Running diff on UTF-16 encoded files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Git can convert UTF-16 encoded files into UTF-8 before they are fed
+into the diff machinery: `diff --UTF-8 file.xxx`.
+
+------------------------
+file.xxx encoding=UTF-16
+------------------------
 
 Marking files as binary
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/convert.c b/convert.c
index 5efcc3b73b..45577ce504 100644
--- a/convert.c
+++ b/convert.c
@@ -7,6 +7,7 @@
 #include "sigchain.h"
 #include "pkt-line.h"
 #include "sub-process.h"
+#include "utf8.h"
 
 /*
  * convert.c - convert a file when checking it out and checking it in.
@@ -734,6 +735,34 @@ static struct convert_driver {
 	int required;
 } *user_convert, **user_convert_tail;
 
+const char *get_encoding_attr(const char *path)
+{
+	static struct attr_check *check;
+	if (!check)
+		check = attr_check_initl("encoding", NULL);
+	if (!git_check_attr(path, check)) {
+		struct attr_check_item *ccheck = check->items;
+		const char *value;
+		value = ccheck->value;
+		if (ATTR_UNSET(value))
+			return NULL;
+		return value;
+	}
+	return NULL;
+}
+
+static int reencode_into_strbuf(const char *path, const char *src, size_t len,
+				struct strbuf *dst, const char *encoding)
+{
+	int outsz = 0;
+	char *buf;
+	buf = reencode_string_len(src, (int)len, "UTF-8", encoding, &outsz);
+	if (!buf)
+		return 0;
+	strbuf_attach(dst, buf, outsz, outsz);
+	return SAFE_CRLF_REENCODE;
+}
+
 static int apply_filter(const char *path, const char *src, size_t len,
 			int fd, struct strbuf *dst, struct convert_driver *drv,
 			const unsigned int wanted_capability,
@@ -1136,6 +1165,17 @@ int convert_to_git(const struct index_state *istate,
 
 	convert_attrs(&ca, path);
 
+	if (checksafe & SAFE_CRLF_REENCODE) {
+		const char *encoding = get_encoding_attr(path);
+		if (encoding) {
+			ret |= reencode_into_strbuf(path, src, len, dst,
+						    encoding);
+			if (ret && dst) {
+				src = dst->buf;
+				len = dst->len;
+			}
+		}
+	}
 	ret |= apply_filter(path, src, len, -1, dst, ca.drv, CAP_CLEAN, NULL);
 	if (!ret && ca.drv && ca.drv->required)
 		die("%s: clean filter '%s' failed", path, ca.drv->name);
diff --git a/convert.h b/convert.h
index 532af00423..0b093715c9 100644
--- a/convert.h
+++ b/convert.h
@@ -13,6 +13,7 @@ struct index_state;
 #define SAFE_CRLF_WARN        (1<<1)
 #define SAFE_CRLF_RENORMALIZE (1<<2)
 #define SAFE_CRLF_KEEP_CRLF   (1<<3)
+#define SAFE_CRLF_REENCODE    (1<<4)
 
 extern int safe_crlf;
 
@@ -60,6 +61,7 @@ extern const char *get_cached_convert_stats_ascii(const struct index_state *ista
 						  const char *path);
 extern const char *get_wt_convert_stats_ascii(const char *path);
 extern const char *get_convert_attr_ascii(const char *path);
+extern const char *get_encoding_attr(const char *path);
 
 /* returns 1 if *dst was used */
 extern int convert_to_git(const struct index_state *istate,
diff --git a/diff.c b/diff.c
index 5e3aaea6e0..07480a465c 100644
--- a/diff.c
+++ b/diff.c
@@ -3191,6 +3191,12 @@ static void builtin_diff(const char *name_a,
 					 header.buf, header.len, 0);
 			strbuf_reset(&header);
 		}
+		if (one && one->reencoded_to_utf8)
+		  strbuf_addf(&header, "a is converted to UTF-8 from %s\n",
+			      get_encoding_attr(one->path));
+		if (two && two->reencoded_to_utf8)
+		  strbuf_addf(&header, "b is converted to UTF-8 from %s\n",
+			      get_encoding_attr(two->path));
 
 		mf1.size = fill_textconv(textconv_one, one, &mf1.ptr);
 		mf2.size = fill_textconv(textconv_two, two, &mf2.ptr);
@@ -3520,6 +3526,7 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 {
 	int size_only = flags & CHECK_SIZE_ONLY;
 	int err = 0;
+	int ret = 0;
 	/*
 	 * demote FAIL to WARN to allow inspecting the situation
 	 * instead of refusing.
@@ -3527,7 +3534,8 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 	int checksafe = (safe_crlf == SAFE_CRLF_FAIL
 				    ? SAFE_CRLF_WARN
 				    : safe_crlf);
-
+	if (s->reencode_to_utf8)
+		checksafe |= SAFE_CRLF_REENCODE;
 	if (!DIFF_FILE_VALID(s))
 		die("internal error: asking to populate invalid file.");
 	if (S_ISDIR(s->mode))
@@ -3603,17 +3611,22 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 		/*
 		 * Convert from working tree format to canonical git format
 		 */
-		if (convert_to_git(&the_index, s->path, s->data, s->size, &buf, checksafe)) {
+		ret = convert_to_git(&the_index, s->path, s->data, s->size, &buf, checksafe);
+
+		if (ret) {
 			size_t size = 0;
 			munmap(s->data, s->size);
 			s->should_munmap = 0;
 			s->data = strbuf_detach(&buf, &size);
 			s->size = size;
 			s->should_free = 1;
+			if (ret & SAFE_CRLF_REENCODE)
+				 s->reencoded_to_utf8 = 1;
 		}
 	}
 	else {
 		enum object_type type;
+		const char *encoding = NULL;
 		if (size_only || (flags & CHECK_BINARY)) {
 			type = sha1_object_info(s->oid.hash, &s->size);
 			if (type < 0)
@@ -3629,6 +3642,20 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 		s->data = read_sha1_file(s->oid.hash, &type, &s->size);
 		if (!s->data)
 			die("unable to read %s", oid_to_hex(&s->oid));
+		if (s->reencode_to_utf8)
+			encoding = get_encoding_attr(s->path);
+		if (encoding) {
+			int outsz = 0;
+			char *buf;
+			buf = reencode_string_len(s->data, (int)s->size,
+						  "UTF-8", encoding, &outsz);
+			if (buf) {
+				free(s->data);
+				s->data = buf;
+				s->size = outsz;
+				s->reencoded_to_utf8 = 1;
+			}
+		}
 		s->should_free = 1;
 	}
 	return 0;
@@ -4627,7 +4654,9 @@ int diff_opt_parse(struct diff_options *options,
 		enable_patch_output(&options->output_format);
 		options->flags.binary = 1;
 	}
-	else if (!strcmp(arg, "--full-index"))
+	else if (!strcmp(arg, "--UTF-8")) {
+		options->flags.reencode_to_utf8 = 1;
+	} else if (!strcmp(arg, "--full-index"))
 		options->flags.full_index = 1;
 	else if (!strcmp(arg, "-a") || !strcmp(arg, "--text"))
 		options->flags.text = 1;
@@ -5695,6 +5724,8 @@ static int diff_filespec_is_identical(struct diff_filespec *one,
 
 static int diff_filespec_check_stat_unmatch(struct diff_filepair *p)
 {
+	p->one->reencode_to_utf8 = p->reencode_to_utf8;
+	p->two->reencode_to_utf8 = p->reencode_to_utf8;
 	if (p->done_skip_stat_unmatch)
 		return p->skip_stat_unmatch_result;
 
@@ -5735,6 +5766,7 @@ static void diffcore_skip_stat_unmatch(struct diff_options *diffopt)
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
 
+		p->reencode_to_utf8 = diffopt->flags.reencode_to_utf8;
 		if (diff_filespec_check_stat_unmatch(p))
 			diff_q(&outq, p);
 		else {
diff --git a/diff.h b/diff.h
index 7cf276f077..d2137bab58 100644
--- a/diff.h
+++ b/diff.h
@@ -65,6 +65,7 @@ struct diff_flags {
 	unsigned recursive:1;
 	unsigned tree_in_recursive:1;
 	unsigned binary:1;
+	unsigned reencode_to_utf8:1;
 	unsigned text:1;
 	unsigned full_index:1;
 	unsigned silent_on_remove:1;
diff --git a/diffcore.h b/diffcore.h
index a30da161da..2e84730778 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -47,6 +47,8 @@ struct diff_filespec {
 	unsigned has_more_entries : 1; /* only appear in combined diff */
 	/* data should be considered "binary"; -1 means "don't know yet" */
 	signed int is_binary : 2;
+	unsigned reencode_to_utf8 : 1;
+	unsigned reencoded_to_utf8 : 1;
 	struct userdiff_driver *driver;
 };
 
@@ -72,6 +74,7 @@ struct diff_filepair {
 	unsigned is_unmerged : 1;
 	unsigned done_skip_stat_unmatch : 1;
 	unsigned skip_stat_unmatch_result : 1;
+	unsigned reencode_to_utf8 : 1;
 };
 #define DIFF_PAIR_UNMERGED(p) ((p)->is_unmerged)
 
diff --git a/t/t4066-diff-encoding.sh b/t/t4066-diff-encoding.sh
new file mode 100755
index 0000000000..9b89253877
--- /dev/null
+++ b/t/t4066-diff-encoding.sh
@@ -0,0 +1,86 @@
+#!/bin/sh
+
+test_description='git diff with encoding attribute'
+
+. ./test-lib.sh
+
+printf '\303\244rger\n\303\266se\n\303\274bel\n' |
+	iconv -f UTF-8 -t UTF-16 >UTF-16
+printf '\303\266se\n\303\274bel\n\303\245gren\n' |
+	iconv -f UTF-8 -t UTF-16 >file2
+
+test_expect_success 'setup' '
+	cp UTF-16 file &&
+	git add file &&
+	git commit -m "add file in UTF-16" &&
+	test_tick &&
+	echo "file encoding=UTF-16" >.gitattributes
+'
+
+test_expect_success 'diff --UTF-8 against local change' '
+	cp file2 file &&
+	test_tick &&
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09..06d06e4 100644
+	a is converted to UTF-8 from UTF-16
+	b is converted to UTF-8 from UTF-16
+	--- a/file
+	+++ b/file
+	@@ -1,3 +1,3 @@
+	-ärger
+	 öse
+	 übel
+	+ågren
+EOF
+	git diff --UTF-8 file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'diff against local change' '
+	cp file2 file &&
+	test_tick &&
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09..06d06e4 100644
+	Binary files a/file and b/file differ
+EOF
+	git diff file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'commit local change' '
+	git add file &&
+	git commit -m "add file V2 in UTF-16" &&
+	test_tick
+'
+
+test_expect_success 'diff --UTF-8  HEAD against HEAD^' '
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09..06d06e4 100644
+	a is converted to UTF-8 from UTF-16
+	b is converted to UTF-8 from UTF-16
+	--- a/file
+	+++ b/file
+	@@ -1,3 +1,3 @@
+	-ärger
+	 öse
+	 übel
+	+ågren
+EOF
+	git diff --UTF-8 HEAD^ HEAD -- file >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'diff HEAD against HEAD^' '
+	cat >expect <<-\EOF &&
+	diff --git a/file b/file
+	index 26acf09..06d06e4 100644
+	Binary files a/file and b/file differ
+EOF
+	git diff HEAD^ HEAD -- file >actual &&
+	test_cmp expect actual
+'
+
+test_done
-- 
2.15.1.271.g1a4e40aa5d


  parent reply	other threads:[~2017-12-29 13:28 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-12-11 15:50 [PATCH v1] convert: add support for 'encoding' attribute lars.schneider
2017-12-11 18:39 ` Eric Sunshine
2017-12-11 23:47   ` Lars Schneider
2017-12-11 23:58     ` Eric Sunshine
2017-12-12 10:58       ` Lars Schneider
2017-12-11 20:47 ` Johannes Sixt
2017-12-11 23:42   ` Lars Schneider
2017-12-12  0:59     ` Junio C Hamano
2017-12-12  7:15       ` Johannes Sixt
2017-12-12 10:55         ` Lars Schneider
2017-12-12 19:31           ` Junio C Hamano
2017-12-13 17:57             ` Lars Schneider
2017-12-13 18:11               ` Junio C Hamano
2017-12-13 23:02                 ` Lars Schneider
2017-12-14 23:01                   ` Junio C Hamano
2017-12-12  7:09     ` Johannes Sixt
2017-12-18 10:13   ` Torsten Bögershausen
2017-12-18 13:12     ` Jeff King
2017-12-23  8:08       ` Torsten Bögershausen
2017-12-29 13:28       ` [PATCH/RFC 0/2] git diff --UTF-8 tboegi
2017-12-29 13:28       ` [PATCH/RFC 1/2] convert_to_git(): checksafe becomes an integer tboegi
2017-12-29 13:28       ` tboegi [this message]
2018-02-26 17:27       ` [PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8 tboegi
2018-02-26 18:43         ` Peter Krefting
2018-02-27 22:39         ` Jeff King
2017-12-18 18:02     ` [PATCH v1] convert: add support for 'encoding' attribute Junio C Hamano
2017-12-18 21:55     ` Johannes Sixt
2017-12-15  9:58 ` Jeff King
2017-12-18 10:54   ` Lars Schneider
2017-12-18 12:59     ` Jeff King
2017-12-17 17:14 ` Torsten Bögershausen
2017-12-28 16:14   ` Lars Schneider
2017-12-29 12:59     ` Torsten Bögershausen
2017-12-29 13:56       ` Lars Schneider
2018-01-03 19:15       ` Junio C Hamano
2018-01-03 20:45         ` Lars Schneider

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20171229132829.17637-1-tboegi@web.de \
    --to=tboegi@web.de \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=j6t@kdbg.org \
    --cc=lars.schneider@autodesk.com \
    --cc=larsxschneider@gmail.com \
    --cc=patrick@luehne.de \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.