Re: [BUG] ${#var} returns length in bytes, not characters

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Stephane Chazelas <stephane.chazelas@gmail.com>
To: dash@vger.kernel.org
Subject: Re: [BUG] ${#var} returns length in bytes, not characters
Date: Wed, 3 Jun 2015 12:45:19 +0100	[thread overview]
Message-ID: <20150603114519.GB4767@chaz.gmail.com> (raw)
In-Reply-To: <556EE51D.8080100@inlv.org>

[-- Attachment #1: Type: text/plain, Size: 765 bytes --]

2015-06-03 13:29:33 +0200, Martijn Dekker:
> POSIX:
> http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_02
> > ${#parameter}
> > String Length. The length in characters of the value of parameter
> > shall be substituted. [...]
> 
> dash does not expand the length in characters; it expands the length in
> bytes instead. That is invalid for locales that include multi-byte
> characters, such as the now ubiquitous UTF-8 set.
[...]

See also:

http://thread.gmane.org/gmane.comp.standards.posix.austin.general/9972/focus=10040

For a few UTF-8 related variations in behaviour between shells
(including this one), though many of them are about "unspecified
behaviour".

The script mentioned there is also attached here.

-- 
Stephane

[-- Attachment #2: u8-tests --]
[-- Type: text/plain, Size: 4137 bytes --]

# Byte fixtures, each produced by printf from octal escapes.
# euro holds the three bytes \342\202\254 (the UTF-8 encoding of U+20AC,
# the euro sign).
euro=$(printf '\342\202\254')
# o342, o202 and o254 hold those same three bytes one per variable; the
# tests combine them to build truncated or otherwise invalid sequences.
o342=$(printf '\342')
o202=$(printf '\202')
o254=$(printf '\254')

# Counter of tests seen so far; testing() increments it on every call.
n=0

# Name of the shell under test, taken from the first argument.  It is empty
# when the script is started without arguments.
test_shell=$1

# Per-shell setup, selected by the name given in $1.
case "$test_shell" in
  (zsh)
    emulate sh
    ;;
  (zsh5)
    setopt shwordsplit
    ;;
  (lksh|mksh)
    set -o utf8-mode
    ;;
esac

# testing EXPECTED LABEL
#
# Two definitions, chosen once depending on whether a shell name was given:
#  - no shell name: only list the test (number, label, expected status);
#  - shell name given: print the name as a row header now, then for each
#    call run the current test() and print its exit status.  A status equal
#    to $na is shown as "-", and a status equal to EXPECTED is followed by
#    the bytes \342\203\235.
# Both variants increment the global counter n.
if [ -z "$test_shell" ]; then
  testing() {
    n=$(($n + 1))
    printf '%2d: %s (expected: %d)\n' "$n" "$2" "$1"
  }
else
  printf '%8s:' "$test_shell"
  testing() {
    n=$(($n + 1))
    test
    ret=$?
    case $ret in
      "$na") ret=- ;;
    esac
    printf ' %2s' "$ret"
    [ "$ret" = "$1" ] && printf '\342\203\235'
  }
fi

# Exit status that means "not applicable"; testing() prints it as "-".
na=99

# Guard for tests that need a lone \254 byte in a variable: returns
# normally when o254 is non-empty, otherwise exits the calling (sub)shell
# with status $na.
if_accept_invalid() {
  if [ "$o254" ]; then
    return 0
  fi
  exit "$na"
}
# Guard for tests that exercise printf: returns normally when the output of
# `type printf` contains the word "builtin", otherwise exits the calling
# (sub)shell with status $na.  Errors from `type` are discarded.
if_printf_builtin() {
  case "$(type printf 2> /dev/null)" in
    (*builtin*)
      return 0
      ;;
  esac
  exit "$na"
}

################################################################
# Probe: the exit status is ${#euro}.  euro holds three bytes that form a
# single UTF-8 character, and the expected status 1 corresponds to counting
# it as one character.
test() (
  exit "${#euro}"
)
testing 1 '${#utf8-character}'
################################################################
# Probe: the exit status is ${#o254}, the length of the lone byte \254
# (not a complete UTF-8 sequence by itself); expected 1.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  exit "${#o254}"
)
testing 1 '${#single byte, invalid char}'
################################################################
# Probe: the exit status is ${#t}, where t is \342\202, i.e. the first two
# of the three bytes stored in euro (a truncated character); expected 2.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
# The label carries the '#' of the ${#...} expansion being probed, like the
# labels of the two preceding length tests.
test() (
  if_accept_invalid
  t=$o342$o202
  exit "${#t}"
)
testing 2 '${#truncated character, 2 bytes}'
################################################################
# Probe: does the case pattern *<\254>* match the value of euro, i.e. is
# the last byte of the character found by pattern matching?  Status 0
# (match) is the expected result.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  case $euro in
    *"$o254"*) true;;
    *) false
  esac
)
testing 0 '$char contains byte component'
################################################################
# Probe: with t = "+", euro, "-", remove the longest prefix matching
# *<\254> and check that only "-" is left.  Expected status 0, meaning the
# byte \254 inside euro was matched.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  t=+$euro-
  [ "${t##*"$o254"}" = - ]
)
testing 0 '${##} matching with byte components of $mbchar'
################################################################
# Probe: t = "+", a stray \254 byte, euro, "-".  Remove the longest prefix
# matching *<euro> and check that only "-" is left; expected status 0.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  t=+$o254$euro-
  [ "${t##*"$euro"}" = - ]
)
testing 0 '${##} works in invalid strings'
################################################################
# Probe: IFS is the single byte \254 and t = "+", euro, "+".  The unquoted
# $t is field-split and the exit status is the number of resulting fields.
# Expected 2, i.e. a split at the \254 byte that ends euro.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  IFS=$o254
  t=+$euro+
  set -- $t
  exit "$#"
)
testing 2 'byte component found in $mbchar by IFS'
################################################################
# Probe: IFS is the two bytes \342\202 (the first two bytes of euro) and
# t = "+", euro, "+".  The unquoted $t is field-split and the exit status
# is the number of resulting fields.  Expected 3, which corresponds to a
# split at each of the two bytes ("+", an empty field, and \254 "+").
# if_accept_invalid exits the subshell with $na first if o254 is empty.
# The label says "two byte components" so that it differs from the label of
# the single-byte IFS test that precedes this one.
test() (
  if_accept_invalid
  IFS=$o342$o202
  t=+$euro+
  set -- $t
  exit "$#"
)
testing 3 'two byte components found in $mbchar by IFS'
################################################################
# Probe: IFS is the single byte \254 and t = "+", \254, "+" (no complete
# multi-byte character involved).  The exit status is the number of fields
# after splitting the unquoted $t; expected 2.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  IFS=$o254
  t=+$o254+
  set -- $t
  exit "$#"
)
testing 2 'IFS works with bytes'
################################################################
# Probe: IFS is the whole euro character and t = "+", euro, "+".  The exit
# status is the number of fields after splitting the unquoted $t;
# expected 2.
test() (
  IFS=$euro
  t=+$euro+
  set -- $t
  exit "$#"
)
testing 2 'IFS works with mbchars'
################################################################
# Probe: IFS is the whole euro character and t = "+", a stray \254, a
# stray \342, a complete euro, "+".  The exit status is the number of
# fields after splitting the unquoted $t; expected 2, i.e. a single split
# at the complete euro only.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  IFS=$euro
  t=+$o254$o342$euro+
  set -- $t
  exit "$#"
)
testing 2 'IFS works with mbchars in invalid strings'
################################################################
# Probe: IFS is the two bytes \342\202 and the positional parameters are
# "/" and "/".  "$*" joins them with the first character of IFS; status 0
# is expected when the result is "/", the single byte \342, "/".
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  IFS=$o342$o202
  set / /
  [ "$*" = "/$o342/" ]
)
testing 0 'Joining $* with byte forming invalid char'
################################################################
# Probe: IFS is the whole euro character and the positional parameters are
# "/" and "/".  Status 0 is expected when "$*" joins them with the complete
# character, giving "/", euro, "/".
test() (
  IFS=$euro
  set / /
  [ "$*" = "/$euro/" ]
)
testing 0 'Joining $* with mbchar'
################################################################
# Probe: the exit status is the number of bytes printf emits for euro
# formatted with %4s.  wc -c counts the bytes and tr -cd 0-9 keeps only the
# digits of its output.  Expected 6: three padding bytes plus the three
# bytes of euro, i.e. the width applied per character.
# if_printf_builtin exits the subshell with $na first unless `type printf`
# reports a builtin.
test() (
  if_printf_builtin
  exit "$(printf '%4s' "$euro" | wc -c | tr -cd 0-9)"
)
testing 6 'byte length of 4-padded euro'
################################################################
# Probe: the exit status is the number of bytes printf emits for the two
# bytes \342\202 (a truncated character) formatted with %4s.  Expected 4:
# two padding bytes plus the two data bytes.
# The subshell exits with $na first if o254 is empty (if_accept_invalid) or
# if `type printf` does not report a builtin (if_printf_builtin).
test() (
  if_accept_invalid
  if_printf_builtin
  exit "$(printf '%4s' "$o342$o202" | wc -c | tr -cd 0-9)"
)
testing 4 'byte length of 4-padded truncated mbchar'
################################################################
# Probe: does the single-character pattern ? match the whole value of
# euro?  Status 0 (match) is the expected result.
test() (
  case $euro in
    ?) true;;
    (*) false
  esac
)
testing 0 '? matches mbchar'
################################################################
# Probe: does the two-character pattern ?? match the two bytes \342\202
# (a truncated character)?  Status 0 (match) is the expected result.
# if_accept_invalid exits the subshell with $na first if o254 is empty.
test() (
  if_accept_invalid
  case $o342$o202 in
    ??) true;;
    (*) false
  esac
)
testing 0 '?? matches 2-byte truncated mbchar'
################################################################


# End the current output line.  When a shell name was given, this
# terminates the row of results that testing() printed without a newline.
echo
# Driver mode (no shell name given): print a header row with the test
# numbers 1..n, then re-run this script once under each listed shell.  The
# shell's name is passed as $1 so that the child prints its own result row.
if [ -z "$test_shell" ]; then
  printf '%8s:' tests
  i=1; while [ "$i" -le "$n" ]; do
    printf ' %2d' "$i"
    i=$(($i + 1))
  done
  printf '\n\n'
  # Each name is used both as the command to execute and as the argument
  # that selects the per-shell setup in the child.
  # NOTE(review): there is no check that a listed shell is installed.
  for test_shell in dash zsh yash bash lksh mksh ksh93 zsh5 posh; do
    "$test_shell" "$0" "$test_shell"
  done
fi

next prev parent reply	other threads:[~2015-06-03 11:50 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-06-03 11:29 [BUG] ${#var} returns length in bytes, not characters Martijn Dekker
2015-06-03 11:45 ` Stephane Chazelas [this message]
2015-06-08  5:25 ` Herbert Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150603114519.GB4767@chaz.gmail.com \
    --to=stephane.chazelas@gmail.com \
    --cc=dash@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.