From: tboegi@web.de
To: peff@peff.net, j6t@kdbg.org, lars.schneider@autodesk.com,
git@vger.kernel.org, gitster@pobox.com, patrick@luehne.de,
larsxschneider@gmail.com
Cc: "Torsten Bögershausen" <tboegi@web.de>
Subject: [PATCH/RFC 2/2] git diff: Allow to reencode into UTF-8
Date: Fri, 29 Dec 2017 14:28:29 +0100 [thread overview]
Message-ID: <20171229132829.17637-1-tboegi@web.de> (raw)
In-Reply-To: <20171218131249.GB4665@sigill.intra.peff.net>
From: Torsten Bögershausen <tboegi@web.de>
When blobs are encoded in UTF-16, `git diff` will treat them as binary.
Make it possible to show a user readable diff encoded in UTF-8.
This allows running git diff and feeding the output into a web server.
Improve Git to look at the "encoding" attribute and to reencode the
content into UTF-8 before running the diff itself.
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
---
Documentation/diff-options.txt | 4 ++
Documentation/gitattributes.txt | 9 +++++
convert.c | 40 +++++++++++++++++++
convert.h | 2 +
diff.c | 38 ++++++++++++++++--
diff.h | 1 +
diffcore.h | 3 ++
t/t4066-diff-encoding.sh | 86 +++++++++++++++++++++++++++++++++++++++++
8 files changed, 180 insertions(+), 3 deletions(-)
create mode 100755 t/t4066-diff-encoding.sh
diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
index 9d1586b956..bf2f115f11 100644
--- a/Documentation/diff-options.txt
+++ b/Documentation/diff-options.txt
@@ -629,6 +629,10 @@ endif::git-format-patch[]
linkgit:git-log[1], but not for linkgit:git-format-patch[1] or
diff plumbing commands.
+--UTF-8::
+ Git converts the content into UTF-8 before running the diff when the
+ "encoding" attribute is defined. See linkgit:gitattributes[5].
+
--ignore-submodules[=<when>]::
Ignore changes to submodules in the diff generation. <when> can be
either "none", "untracked", "dirty" or "all", which is the default.
diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index 30687de81a..753a7c39b7 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -881,6 +881,15 @@ advantages to choosing this method:
3. Caching. Textconv caching can speed up repeated diffs, such as those
you might trigger by running `git log -p`.
+Running diff on UTF-16 encoded files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Git can convert UTF-16 encoded files into UTF-8 before they are fed
+into the diff machinery: `diff --UTF-8 file.xxx`.
+
+------------------------
+file.xxx encoding=UTF-16
+------------------------
Marking files as binary
^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/convert.c b/convert.c
index 5efcc3b73b..45577ce504 100644
--- a/convert.c
+++ b/convert.c
@@ -7,6 +7,7 @@
#include "sigchain.h"
#include "pkt-line.h"
#include "sub-process.h"
+#include "utf8.h"
/*
* convert.c - convert a file when checking it out and checking it in.
@@ -734,6 +735,34 @@ static struct convert_driver {
int required;
} *user_convert, **user_convert_tail;
+const char *get_encoding_attr(const char *path)
+{
+ static struct attr_check *check;
+ if (!check)
+ check = attr_check_initl("encoding", NULL);
+ if (!git_check_attr(path, check)) {
+ struct attr_check_item *ccheck = check->items;
+ const char *value;
+ value = ccheck->value;
+ if (ATTR_UNSET(value))
+ return NULL;
+ return value;
+ }
+ return NULL;
+}
+
+static int reencode_into_strbuf(const char *path, const char *src, size_t len,
+ struct strbuf *dst, const char *encoding)
+{
+ int outsz = 0;
+ char *buf;
+ buf = reencode_string_len(src, (int)len, "UTF-8", encoding, &outsz);
+ if (!buf)
+ return 0;
+ strbuf_attach(dst, buf, outsz, outsz);
+ return SAFE_CRLF_REENCODE;
+}
+
static int apply_filter(const char *path, const char *src, size_t len,
int fd, struct strbuf *dst, struct convert_driver *drv,
const unsigned int wanted_capability,
@@ -1136,6 +1165,17 @@ int convert_to_git(const struct index_state *istate,
convert_attrs(&ca, path);
+ if (checksafe & SAFE_CRLF_REENCODE) {
+ const char *encoding = get_encoding_attr(path);
+ if (encoding) {
+ ret |= reencode_into_strbuf(path, src, len, dst,
+ encoding);
+ if (ret && dst) {
+ src = dst->buf;
+ len = dst->len;
+ }
+ }
+ }
ret |= apply_filter(path, src, len, -1, dst, ca.drv, CAP_CLEAN, NULL);
if (!ret && ca.drv && ca.drv->required)
die("%s: clean filter '%s' failed", path, ca.drv->name);
diff --git a/convert.h b/convert.h
index 532af00423..0b093715c9 100644
--- a/convert.h
+++ b/convert.h
@@ -13,6 +13,7 @@ struct index_state;
#define SAFE_CRLF_WARN (1<<1)
#define SAFE_CRLF_RENORMALIZE (1<<2)
#define SAFE_CRLF_KEEP_CRLF (1<<3)
+#define SAFE_CRLF_REENCODE (1<<4)
extern int safe_crlf;
@@ -60,6 +61,7 @@ extern const char *get_cached_convert_stats_ascii(const struct index_state *ista
const char *path);
extern const char *get_wt_convert_stats_ascii(const char *path);
extern const char *get_convert_attr_ascii(const char *path);
+extern const char *get_encoding_attr(const char *path);
/* returns 1 if *dst was used */
extern int convert_to_git(const struct index_state *istate,
diff --git a/diff.c b/diff.c
index 5e3aaea6e0..07480a465c 100644
--- a/diff.c
+++ b/diff.c
@@ -3191,6 +3191,12 @@ static void builtin_diff(const char *name_a,
header.buf, header.len, 0);
strbuf_reset(&header);
}
+ if (one && one->reencoded_to_utf8)
+ strbuf_addf(&header, "a is converted to UTF-8 from %s\n",
+ get_encoding_attr(one->path));
+ if (two && two->reencoded_to_utf8)
+ strbuf_addf(&header, "b is converted to UTF-8 from %s\n",
+ get_encoding_attr(two->path));
mf1.size = fill_textconv(textconv_one, one, &mf1.ptr);
mf2.size = fill_textconv(textconv_two, two, &mf2.ptr);
@@ -3520,6 +3526,7 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
{
int size_only = flags & CHECK_SIZE_ONLY;
int err = 0;
+ int ret = 0;
/*
* demote FAIL to WARN to allow inspecting the situation
* instead of refusing.
@@ -3527,7 +3534,8 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
int checksafe = (safe_crlf == SAFE_CRLF_FAIL
? SAFE_CRLF_WARN
: safe_crlf);
-
+ if (s->reencode_to_utf8)
+ checksafe |= SAFE_CRLF_REENCODE;
if (!DIFF_FILE_VALID(s))
die("internal error: asking to populate invalid file.");
if (S_ISDIR(s->mode))
@@ -3603,17 +3611,22 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
/*
* Convert from working tree format to canonical git format
*/
- if (convert_to_git(&the_index, s->path, s->data, s->size, &buf, checksafe)) {
+ ret = convert_to_git(&the_index, s->path, s->data, s->size, &buf, checksafe);
+
+ if (ret) {
size_t size = 0;
munmap(s->data, s->size);
s->should_munmap = 0;
s->data = strbuf_detach(&buf, &size);
s->size = size;
s->should_free = 1;
+ if (ret & SAFE_CRLF_REENCODE)
+ s->reencoded_to_utf8 = 1;
}
}
else {
enum object_type type;
+ const char *encoding = NULL;
if (size_only || (flags & CHECK_BINARY)) {
type = sha1_object_info(s->oid.hash, &s->size);
if (type < 0)
@@ -3629,6 +3642,20 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
s->data = read_sha1_file(s->oid.hash, &type, &s->size);
if (!s->data)
die("unable to read %s", oid_to_hex(&s->oid));
+ if (s->reencode_to_utf8)
+ encoding = get_encoding_attr(s->path);
+ if (encoding) {
+ int outsz = 0;
+ char *buf;
+ buf = reencode_string_len(s->data, (int)s->size,
+ "UTF-8", encoding, &outsz);
+ if (buf) {
+ free(s->data);
+ s->data = buf;
+ s->size = outsz;
+ s->reencoded_to_utf8 = 1;
+ }
+ }
s->should_free = 1;
}
return 0;
@@ -4627,7 +4654,9 @@ int diff_opt_parse(struct diff_options *options,
enable_patch_output(&options->output_format);
options->flags.binary = 1;
}
- else if (!strcmp(arg, "--full-index"))
+ else if (!strcmp(arg, "--UTF-8")) {
+ options->flags.reencode_to_utf8 = 1;
+ } else if (!strcmp(arg, "--full-index"))
options->flags.full_index = 1;
else if (!strcmp(arg, "-a") || !strcmp(arg, "--text"))
options->flags.text = 1;
@@ -5695,6 +5724,8 @@ static int diff_filespec_is_identical(struct diff_filespec *one,
static int diff_filespec_check_stat_unmatch(struct diff_filepair *p)
{
+ p->one->reencode_to_utf8 = p->reencode_to_utf8;
+ p->two->reencode_to_utf8 = p->reencode_to_utf8;
if (p->done_skip_stat_unmatch)
return p->skip_stat_unmatch_result;
@@ -5735,6 +5766,7 @@ static void diffcore_skip_stat_unmatch(struct diff_options *diffopt)
for (i = 0; i < q->nr; i++) {
struct diff_filepair *p = q->queue[i];
+ p->reencode_to_utf8 = diffopt->flags.reencode_to_utf8;
if (diff_filespec_check_stat_unmatch(p))
diff_q(&outq, p);
else {
diff --git a/diff.h b/diff.h
index 7cf276f077..d2137bab58 100644
--- a/diff.h
+++ b/diff.h
@@ -65,6 +65,7 @@ struct diff_flags {
unsigned recursive:1;
unsigned tree_in_recursive:1;
unsigned binary:1;
+ unsigned reencode_to_utf8:1;
unsigned text:1;
unsigned full_index:1;
unsigned silent_on_remove:1;
diff --git a/diffcore.h b/diffcore.h
index a30da161da..2e84730778 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -47,6 +47,8 @@ struct diff_filespec {
unsigned has_more_entries : 1; /* only appear in combined diff */
/* data should be considered "binary"; -1 means "don't know yet" */
signed int is_binary : 2;
+ unsigned reencode_to_utf8 : 1;
+ unsigned reencoded_to_utf8 : 1;
struct userdiff_driver *driver;
};
@@ -72,6 +74,7 @@ struct diff_filepair {
unsigned is_unmerged : 1;
unsigned done_skip_stat_unmatch : 1;
unsigned skip_stat_unmatch_result : 1;
+ unsigned reencode_to_utf8 : 1;
};
#define DIFF_PAIR_UNMERGED(p) ((p)->is_unmerged)
diff --git a/t/t4066-diff-encoding.sh b/t/t4066-diff-encoding.sh
new file mode 100755
index 0000000000..9b89253877
--- /dev/null
+++ b/t/t4066-diff-encoding.sh
@@ -0,0 +1,86 @@
+#!/bin/sh
+
+test_description='git diff with encoding attribute'
+
+. ./test-lib.sh
+
+printf '\303\244rger\n\303\266se\n\303\274bel\n' |
+ iconv -f UTF-8 -t UTF-16 >UTF-16
+printf '\303\266se\n\303\274bel\n\303\245gren\n' |
+ iconv -f UTF-8 -t UTF-16 >file2
+
+test_expect_success 'setup' '
+ cp UTF-16 file &&
+ git add file &&
+ git commit -m "add file in UTF-16" &&
+ test_tick &&
+ echo "file encoding=UTF-16" >.gitattributes
+'
+
+test_expect_success 'diff --UTF-8 against local change' '
+ cp file2 file &&
+ test_tick &&
+ cat >expect <<-\EOF &&
+ diff --git a/file b/file
+ index 26acf09..06d06e4 100644
+ a is converted to UTF-8 from UTF-16
+ b is converted to UTF-8 from UTF-16
+ --- a/file
+ +++ b/file
+ @@ -1,3 +1,3 @@
+ -ärger
+ öse
+ übel
+ +ågren
+EOF
+ git diff --UTF-8 file >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'diff against local change' '
+ cp file2 file &&
+ test_tick &&
+ cat >expect <<-\EOF &&
+ diff --git a/file b/file
+ index 26acf09..06d06e4 100644
+ Binary files a/file and b/file differ
+EOF
+ git diff file >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'commit local change' '
+ git add file &&
+ git commit -m "add file V2 in UTF-16" &&
+ test_tick
+'
+
+test_expect_success 'diff --UTF-8 HEAD against HEAD^' '
+ cat >expect <<-\EOF &&
+ diff --git a/file b/file
+ index 26acf09..06d06e4 100644
+ a is converted to UTF-8 from UTF-16
+ b is converted to UTF-8 from UTF-16
+ --- a/file
+ +++ b/file
+ @@ -1,3 +1,3 @@
+ -ärger
+ öse
+ übel
+ +ågren
+EOF
+ git diff --UTF-8 HEAD^ HEAD -- file >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'diff HEAD against HEAD^' '
+ cat >expect <<-\EOF &&
+ diff --git a/file b/file
+ index 26acf09..06d06e4 100644
+ Binary files a/file and b/file differ
+EOF
+ git diff HEAD^ HEAD -- file >actual &&
+ test_cmp expect actual
+'
+
+test_done
--
2.15.1.271.g1a4e40aa5d
next prev parent reply other threads:[~2017-12-29 13:28 UTC|newest]
Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-12-11 15:50 [PATCH v1] convert: add support for 'encoding' attribute lars.schneider
2017-12-11 18:39 ` Eric Sunshine
2017-12-11 23:47 ` Lars Schneider
2017-12-11 23:58 ` Eric Sunshine
2017-12-12 10:58 ` Lars Schneider
2017-12-11 20:47 ` Johannes Sixt
2017-12-11 23:42 ` Lars Schneider
2017-12-12 0:59 ` Junio C Hamano
2017-12-12 7:15 ` Johannes Sixt
2017-12-12 10:55 ` Lars Schneider
2017-12-12 19:31 ` Junio C Hamano
2017-12-13 17:57 ` Lars Schneider
2017-12-13 18:11 ` Junio C Hamano
2017-12-13 23:02 ` Lars Schneider
2017-12-14 23:01 ` Junio C Hamano
2017-12-12 7:09 ` Johannes Sixt
2017-12-18 10:13 ` Torsten Bögershausen
2017-12-18 13:12 ` Jeff King
2017-12-23 8:08 ` Torsten Bögershausen
2017-12-29 13:28 ` [PATCH/RFC 0/2] git diff --UTF-8 tboegi
2017-12-29 13:28 ` [PATCH/RFC 1/2] convert_to_git(): checksafe becomes an integer tboegi
2017-12-29 13:28 ` tboegi [this message]
2018-02-26 17:27 ` [PATCH/RFC 1/1] Auto diff of UTF-16 files in UTF-8 tboegi
2018-02-26 18:43 ` Peter Krefting
2018-02-27 22:39 ` Jeff King
2017-12-18 18:02 ` [PATCH v1] convert: add support for 'encoding' attribute Junio C Hamano
2017-12-18 21:55 ` Johannes Sixt
2017-12-15 9:58 ` Jeff King
2017-12-18 10:54 ` Lars Schneider
2017-12-18 12:59 ` Jeff King
2017-12-17 17:14 ` Torsten Bögershausen
2017-12-28 16:14 ` Lars Schneider
2017-12-29 12:59 ` Torsten Bögershausen
2017-12-29 13:56 ` Lars Schneider
2018-01-03 19:15 ` Junio C Hamano
2018-01-03 20:45 ` Lars Schneider
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20171229132829.17637-1-tboegi@web.de \
--to=tboegi@web.de \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=j6t@kdbg.org \
--cc=lars.schneider@autodesk.com \
--cc=larsxschneider@gmail.com \
--cc=patrick@luehne.de \
--cc=peff@peff.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).