From: Junio C Hamano <gitster@pobox.com>
To: Tsugikazu Shibata <tshibata@ab.jp.nec.com>
Cc: git@vger.kernel.org
Subject: [PATCH 1/2] utf8_width(): allow non NUL-terminated input
Date: Wed, 02 Jan 2008 01:49:58 -0800 [thread overview]
Message-ID: <7v4pdwzhdl.fsf@gitster.siamese.dyndns.org> (raw)
In-Reply-To: 7v1w904x29.fsf@gitster.siamese.dyndns.org
The original interface assumed that the input string was always
terminated with NUL, but that wasn't too useful.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
* This is a preparatory step for the next one that follows what
I outlined in my response. As we are in pre-1.5.4 freeze, I
won't be applying this to 'master' but this and the one that
implements the sane truncation might be 'next' material.
utf8.c | 68 ++++++++++++++++++++++++++++++++++++++-------------------------
utf8.h | 2 +-
2 files changed, 42 insertions(+), 28 deletions(-)
diff --git a/utf8.c b/utf8.c
index 9efcdb9..5d641fa 100644
--- a/utf8.c
+++ b/utf8.c
@@ -157,57 +157,71 @@ static int git_wcwidth(ucs_char_t ch)
* pointed to by the variable start. The pointer is updated to point at
* the next character. If it was not valid UTF-8, the pointer is set to NULL.
*/
-int utf8_width(const char **start)
+int utf8_width(const char **start, size_t *remainder_p)
{
unsigned char *s = (unsigned char *)*start;
ucs_char_t ch;
+ size_t remainder, incr;
- if (*s < 0x80) {
+ /*
+ * A caller that assumes NUL terminated text can choose
+ * not to bother with the remainder length.
+ */
+ remainder = (remainder_p ? *remainder_p : 999);
+
+ if (remainder < 1) {
+ goto invalid;
+ } else if (*s < 0x80) {
/* 0xxxxxxx */
ch = *s;
- *start += 1;
+ incr = 1;
} else if ((s[0] & 0xe0) == 0xc0) {
/* 110XXXXx 10xxxxxx */
- if ((s[1] & 0xc0) != 0x80 ||
- /* overlong? */
- (s[0] & 0xfe) == 0xc0)
+ if (remainder < 2 ||
+ (s[1] & 0xc0) != 0x80 ||
+ (s[0] & 0xfe) == 0xc0)
goto invalid;
ch = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
- *start += 2;
+ incr = 2;
} else if ((s[0] & 0xf0) == 0xe0) {
/* 1110XXXX 10Xxxxxx 10xxxxxx */
- if ((s[1] & 0xc0) != 0x80 ||
- (s[2] & 0xc0) != 0x80 ||
- /* overlong? */
- (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||
- /* surrogate? */
- (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||
- /* U+FFFE or U+FFFF? */
- (s[0] == 0xef && s[1] == 0xbf &&
- (s[2] & 0xfe) == 0xbe))
+ if (remainder < 3 ||
+ (s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ /* overlong? */
+ (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||
+ /* surrogate? */
+ (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||
+ /* U+FFFE or U+FFFF? */
+ (s[0] == 0xef && s[1] == 0xbf &&
+ (s[2] & 0xfe) == 0xbe))
goto invalid;
ch = ((s[0] & 0x0f) << 12) |
((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
- *start += 3;
+ incr = 3;
} else if ((s[0] & 0xf8) == 0xf0) {
/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
- if ((s[1] & 0xc0) != 0x80 ||
- (s[2] & 0xc0) != 0x80 ||
- (s[3] & 0xc0) != 0x80 ||
- /* overlong? */
- (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||
- /* > U+10FFFF? */
- (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4)
+ if (remainder < 4 ||
+ (s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ /* overlong? */
+ (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||
+ /* > U+10FFFF? */
+ (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4)
goto invalid;
ch = ((s[0] & 0x07) << 18) | ((s[1] & 0x3f) << 12) |
((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
- *start += 4;
+ incr = 4;
} else {
invalid:
*start = NULL;
return 0;
}
+ *start += incr;
+ if (remainder_p)
+ *remainder_p = remainder - incr;
return git_wcwidth(ch);
}
@@ -218,7 +232,7 @@ int is_utf8(const char *text)
text++;
continue;
}
- utf8_width(&text);
+ utf8_width(&text, NULL);
if (!text)
return 0;
}
@@ -278,7 +292,7 @@ int print_wrapped_text(const char *text, int indent, int indent2, int width)
continue;
}
if (assume_utf8)
- w += utf8_width(&text);
+ w += utf8_width(&text, NULL);
else {
w++;
text++;
diff --git a/utf8.h b/utf8.h
index 15db6f1..4295a19 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1,7 +1,7 @@
#ifndef GIT_UTF8_H
#define GIT_UTF8_H
-int utf8_width(const char **start);
+int utf8_width(const char **start, size_t *remain);
int is_utf8(const char *text);
int is_encoding_utf8(const char *name);
--
1.5.4.rc2.13.g690bd
next prev parent reply other threads:[~2008-01-02 9:50 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-01-01 23:20 [RFC/PATCH] fix "git diff" to create wrong UTF-8 text Tsugikazu Shibata
2008-01-02 5:26 ` Junio C Hamano
2008-01-02 9:49 ` Junio C Hamano [this message]
2008-01-02 9:50 ` [PATCH 2/2] diff: do not chomp hunk-header in the middle of a character Junio C Hamano
2008-01-02 9:50 ` [PATCH 3/2] attribute "coding": specify blob encoding Junio C Hamano
2008-01-03 21:23 ` しらいしななこ
2008-01-03 21:54 ` Junio C Hamano
2008-01-04 16:16 ` Tsugikazu Shibata
2008-01-04 20:53 ` Junio C Hamano
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=7v4pdwzhdl.fsf@gitster.siamese.dyndns.org \
--to=gitster@pobox.com \
--cc=git@vger.kernel.org \
--cc=tshibata@ab.jp.nec.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).