From: Stephane Chazelas <stephane.chazelas@gmail.com>
To: dash@vger.kernel.org
Subject: Re: [BUG] ${#var} returns length in bytes, not characters
Date: Wed, 3 Jun 2015 12:45:19 +0100 [thread overview]
Message-ID: <20150603114519.GB4767@chaz.gmail.com> (raw)
In-Reply-To: <556EE51D.8080100@inlv.org>
[-- Attachment #1: Type: text/plain, Size: 765 bytes --]
2015-06-03 13:29:33 +0200, Martijn Dekker:
> POSIX:
> http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_02
> > ${#parameter}
> > String Length. The length in characters of the value of parameter
> > shall be substituted. [...]
>
> dash does not expand the length in characters; it expands the length in
> bytes instead. That is invalid for locales that include multi-byte
> characters, such as the now ubiquitous UTF-8 set.
[...]
See also:
http://thread.gmane.org/gmane.comp.standards.posix.austin.general/9972/focus=10040
For a few UTF-8 related variations in behaviour between shells
(including this one), though many of them are about "unspecified
behaviour".
The script mentioned there is also attached here.
--
Stephane
[-- Attachment #2: u8-tests --]
[-- Type: text/plain, Size: 4137 bytes --]
# Fixtures: the euro sign U+20AC in UTF-8 is the three bytes \342 \202 \254
# (octal), produced here as one complete character ...
euro=$(printf '\342\202\254')
# ... and as three separate bytes, none of which is a valid UTF-8 character
# on its own. A shell may fail to keep such a byte through command
# substitution; if_accept_invalid checks $o254 for that.
o342=$(printf '\342')
o202=$(printf '\202')
o254=$(printf '\254')
# Name of the shell under test. The runner loop at the end of the script
# passes it as $1; it is empty when the script is started without arguments.
test_shell=$1
# Running count of tests, incremented by testing().
n=0
# Per-shell setup, selected by the name given in $1.
case $test_shell in
# zsh: switch to sh emulation.
zsh) emulate sh;;
# zsh5: only turn on the shwordsplit option.
zsh5) setopt shwordsplit;;
# lksh / mksh: turn on the utf8-mode option.
[lm]ksh) set -o utf8-mode;;
esac
# Define `testing EXPECTED DESCRIPTION` according to the run mode.
if [ -z "$test_shell" ]; then
  # Listing mode (no shell name given): nothing is executed; each call
  # prints the test number, its description and the expected value.
  testing() {
    n=$(($n + 1))
    printf '%2d: %s (expected: %d)\n' "$n" "$2" "$1"
  }
else
  # Per-shell mode: the result row starts with the right-aligned shell name.
  printf '%8s:' "$test_shell"

  # Run the most recently defined test() and print its exit status as one
  # right-aligned cell of the row.
  testing() {
    n=$(($n + 1))
    test
    ret=$?
    # Status $na is shown as "-".
    if [ "$ret" -eq "$na" ]; then
      ret=-
    fi
    printf ' %2s' "$ret"
    # When the result equals the expected value ($1), append the bytes
    # \342\203\235 (UTF-8 for U+20DD, combining enclosing circle).
    [ "$ret" = "$1" ] && printf '\342\203\235'
  }
fi
# Exit status that testing() displays as "-" instead of a number.
na=99

# Guard for tests that need the lone byte \254: returns 0 when $o254 is
# non-empty, otherwise exits the calling (sub)shell with status $na.
if_accept_invalid() {
  if [ "$o254" ]; then
    return 0
  fi
  exit "$na"
}
# Guard for tests that exercise printf: returns 0 when the output of
# `type printf` contains the word "builtin", otherwise exits the calling
# (sub)shell with status $na. Error output of `type` is discarded.
if_printf_builtin() {
  case $(type printf 2> /dev/null) in
    (*builtin*) return 0 ;;
  esac
  exit "$na"
}
################################################################
# Probe: ${#var} on a single euro sign (three bytes in UTF-8).
# The subshell exits with that length; expected 1.
test() (
exit "${#euro}"
)
testing 1 '${#utf8-character}'
################################################################
# Probe: ${#var} on the lone byte \254, which is not a valid character by
# itself. Exits with $na first if $o254 is empty; expected length 1.
test() (
if_accept_invalid
exit "${#o254}"
)
testing 1 '${#single byte, invalid char}'
################################################################
# Probe: ${#var} on a truncated multibyte character, i.e. only the first
# two bytes (\342\202) of the three-byte euro sequence.
# Exits with $na first if $o254 is empty; expected length 2.
test() (
  if_accept_invalid
  t=$o342$o202
  exit "${#t}"
)
# The label carries the '#' of the ${#...} expansion being probed, in line
# with the labels of the two preceding length tests.
testing 2 '${#truncated character, 2 bytes}'
################################################################
# Probe: does a case pattern built from the lone byte \254 (quoted, between
# two * wildcards) match inside the euro sign, whose last byte is \254?
# Exits with $na first if $o254 is empty; expected status 0 (match).
test() (
if_accept_invalid
case $euro in
*"$o254"*) true;;
*) false
esac
)
testing 0 '$char contains byte component'
################################################################
# Probe: ${t##pattern} with t = "+" euro "-" and a pattern ending in the
# lone byte \254 (the last byte of the euro sign). Status 0 is expected,
# i.e. the largest matching prefix ends at that byte and only "-" remains.
# Exits with $na first if $o254 is empty.
test() (
if_accept_invalid
t=+$euro-
[ "${t##*"$o254"}" = - ]
)
testing 0 '${##} matching with byte components of $mbchar'
################################################################
# Probe: ${t##pattern} on a string that contains an invalid byte before a
# valid character: t = "+" \254 euro "-", pattern = * followed by the euro.
# Status 0 is expected, i.e. only "-" remains after stripping.
# Exits with $na first if $o254 is empty.
test() (
if_accept_invalid
t=+$o254$euro-
[ "${t##*"$euro"}" = - ]
)
testing 0 '${##} works in invalid strings'
################################################################
# Probe: field splitting with IFS set to the lone byte \254 applied to
# "+" euro "+" (unquoted $t, so it is split on purpose).
# Exits with the number of resulting fields; expected 2, i.e. the string is
# split at the last byte of the euro sign.
# Exits with $na first if $o254 is empty.
test() (
if_accept_invalid
IFS=$o254
t=+$euro+
set -- $t
exit "$#"
)
testing 2 'byte component found in $mbchar by IFS'
################################################################
# Probe: field splitting with IFS set to the first two bytes of the euro
# sign (\342 and \202) applied to "+" euro "+" (unquoted $t on purpose).
# Exits with the number of resulting fields; expected 3, i.e. each of the
# two bytes acts as a separator inside the euro sign.
# Exits with $na first if $o254 is empty.
test() (
if_accept_invalid
IFS=$o342$o202
t=+$euro+
set -- $t
exit "$#"
)
testing 3 'byte component found in $mbchar by IFS'
################################################################
# Probe: field splitting where both IFS and the separator inside the data
# are the same lone byte \254: t = "+" \254 "+" (unquoted $t on purpose).
# Exits with the number of resulting fields; expected 2.
# Exits with $na first if $o254 is empty.
test() (
if_accept_invalid
IFS=$o254
t=+$o254+
set -- $t
exit "$#"
)
testing 2 'IFS works with bytes'
################################################################
# Probe: field splitting with IFS set to a complete multibyte character
# (the euro sign) applied to "+" euro "+" (unquoted $t on purpose).
# Exits with the number of resulting fields; expected 2.
test() (
IFS=$euro
t=+$euro+
set -- $t
exit "$#"
)
testing 2 'IFS works with mbchars'
################################################################
# Probe: field splitting with IFS set to the euro sign when the data also
# contains stray bytes before it: t = "+" \254 \342 euro "+".
# Exits with the number of resulting fields; expected 2, i.e. the string is
# split only at the complete euro sign.
# Exits with $na first if $o254 is empty.
test() (
if_accept_invalid
IFS=$euro
t=+$o254$o342$euro+
set -- $t
exit "$#"
)
testing 2 'IFS works with mbchars in invalid strings'
################################################################
# Probe: joining the positional parameters "/" and "/" through "$*" when
# IFS starts with the lone byte \342 (IFS = \342 \202).
# Status 0 is expected, i.e. the parameters are joined with the single byte
# \342 and not with both bytes of IFS.
# Exits with $na first if $o254 is empty.
test() (
if_accept_invalid
IFS=$o342$o202
set / /
[ "$*" = "/$o342/" ]
)
testing 0 'Joining $* with byte forming invalid char'
################################################################
# Probe: joining the positional parameters "/" and "/" through "$*" when
# IFS is the euro sign. Status 0 is expected, i.e. the parameters are joined
# with the complete three-byte character.
test() (
IFS=$euro
set / /
[ "$*" = "/$euro/" ]
)
testing 0 'Joining $* with mbchar'
################################################################
# Probe: how printf pads a multibyte character to a field width of 4.
# Exits with $na unless `type printf` reports a builtin. The output is
# counted in bytes with wc -c, and tr keeps only the digits of that count.
# Expected 6 bytes: the three-byte euro sign plus three padding bytes.
test() (
if_printf_builtin
exit "$(printf '%4s' "$euro" | wc -c | tr -cd 0-9)"
)
testing 6 'byte length of 4-padded euro'
################################################################
# Probe: how printf pads a truncated multibyte character (the two bytes
# \342 \202) to a field width of 4.
# Exits with $na if $o254 is empty or if `type printf` does not report a
# builtin. The output is counted in bytes with wc -c, and tr keeps only the
# digits of that count. Expected 4 bytes: the two bytes plus two of padding.
test() (
if_accept_invalid
if_printf_builtin
exit "$(printf '%4s' "$o342$o202" | wc -c | tr -cd 0-9)"
)
testing 4 'byte length of 4-padded truncated mbchar'
################################################################
# Probe: does the single-character pattern ? match the whole euro sign in a
# case statement? Expected status 0 (match).
test() (
case $euro in
?) true;;
(*) false
esac
)
testing 0 '? matches mbchar'
################################################################
# Probe: does the two-character pattern ?? match a truncated multibyte
# character made of the two bytes \342 \202? Expected status 0 (match).
# Exits with $na first if $o254 is empty.
test() (
if_accept_invalid
case $o342$o202 in
??) true;;
(*) false
esac
)
testing 0 '?? matches 2-byte truncated mbchar'
################################################################
# End the current output: the result row in per-shell mode, or a blank line
# after the test listing when no shell name was given.
echo
# Only when started without a shell name: print a header row, then re-run
# this script once per shell to produce one result row each.
if [ -z "$test_shell" ]; then
# Header row: the word "tests" in the name column, then one right-aligned
# column per test, numbered 1..n (n was counted by the testing calls above).
printf '%8s:' tests
i=1; while [ "$i" -le "$n" ]; do
printf ' %2d' "$i"
i=$(($i + 1))
done
printf '\n\n'
# Each shell runs this same file ($0) with its own name as $1; that name
# becomes test_shell in the child and selects the per-shell mode.
for test_shell in dash zsh yash bash lksh mksh ksh93 zsh5 posh; do
"$test_shell" "$0" "$test_shell"
done
fi
next prev parent reply other threads:[~2015-06-03 11:50 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-06-03 11:29 [BUG] ${#var} returns length in bytes, not characters Martijn Dekker
2015-06-03 11:45 ` Stephane Chazelas [this message]
2015-06-08 5:25 ` Herbert Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20150603114519.GB4767@chaz.gmail.com \
--to=stephane.chazelas@gmail.com \
--cc=dash@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.