All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jan Kara <jack@suse.cz>
To: Andrew Gabbasov <andrew_gabbasov@mentor.com>
Cc: Jan Kara <jack@suse.com>, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v2 7/7] udf: Merge linux specific translation into CS0 conversion function
Date: Mon, 4 Jan 2016 14:25:00 +0100	[thread overview]
Message-ID: <20160104132500.GC13014@quack.suse.cz> (raw)
In-Reply-To: <1450974338-22762-8-git-send-email-andrew_gabbasov@mentor.com>

On Thu 24-12-15 10:25:38, Andrew Gabbasov wrote:
> Current implementation of udf_translate_to_linux function does not
> support multi-bytes characters at all: it counts bytes while calculating
> extension length, when inserting CRC inside the name it doesn't
> take into account inter-character boundaries and can break into
> the middle of the character.
> 
> The most efficient way to properly support multi-bytes characters is
> merging of translation operations directly into conversion function.
> This can help to avoid extra passes along the string or parsing
> the multi-bytes character back into unicode to find out it's length.
> 
> Signed-off-by: Andrew Gabbasov <andrew_gabbasov@mentor.com>

Looks mostly good. A few comments below.

> ---
>  fs/udf/unicode.c | 260 ++++++++++++++++++++++++++++++-------------------------
>  1 file changed, 141 insertions(+), 119 deletions(-)
> 
> diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
> index f1cdeac..1dc967d 100644
> --- a/fs/udf/unicode.c
> +++ b/fs/udf/unicode.c
> @@ -28,9 +28,6 @@
>  
>  #include "udf_sb.h"
>  
> -static int udf_translate_to_linux(uint8_t *, int, const uint8_t *, int,
> -				  const uint8_t *, int);
> -
>  static int udf_uni2char_utf8(wchar_t uni,
>  			     unsigned char *out,
>  			     int boundlen)
> @@ -114,13 +111,32 @@ static int udf_char2uni_utf8(const unsigned char *in,
>  	return u_len;
>  }
>  
> +#define ILLEGAL_CHAR_MARK	'_'
> +#define EXT_MARK		'.'
> +#define CRC_MARK		'#'
> +#define EXT_SIZE		5
> +/* Number of chars we need to store generated CRC to make filename unique */
> +#define CRC_LEN			5
> +
>  static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
>  			     const uint8_t *ocu, int ocu_len,
> -			     int (*conv_f)(wchar_t, unsigned char *, int))
> +			     int (*conv_f)(wchar_t, unsigned char *, int),
> +			     int translate)
>  {
> +	uint32_t c;
>  	uint8_t cmp_id;
>  	int i, len;
> -	int str_o_len = 0;
> +	int u_ch;
> +	int firstDots = 0, needsCRC = 0, illChar;
> +	int ext_i_len, ext_max_len;
> +	int str_o_len = 0;	/* Length of resulting output */
> +	int ext_o_len = 0;	/* Extension output length */
> +	int ext_crc_len = 0;	/* Extension output length if used with CRC */
> +	int i_ext = -1;		/* Extension position in input buffer */
> +	int o_crc = 0;		/* Rightmost possible output pos for CRC+ext */
> +	unsigned short valueCRC;
> +	uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
> +	uint8_t crc[CRC_LEN];
>  
>  	if (str_max_len <= 0)
>  		return 0;
> @@ -133,22 +149,134 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
>  	cmp_id = ocu[0];
>  	if (cmp_id != 8 && cmp_id != 16) {
>  		memset(str_o, 0, str_max_len);
> -		pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu);
> +		pr_err("unknown compression code (%d)\n", cmp_id);
>  		return -EINVAL;
>  	}
> +	u_ch = cmp_id >> 3;
> +
> +	ocu++;
> +	ocu_len--;
> +

Can we just check whether ocu_len is a multiple of u_ch here and bail out
if not? That would save us some rounding below and also fix possible access
beyond the end of the string in case fs is corrupted.

> +	if (translate) {
> +		/* Look for extension */
> +		for (i = (ocu_len & ~(u_ch - 1)) - u_ch, ext_i_len = 0;
> +		     (i >= 0) && (ext_i_len < EXT_SIZE);
> +		     i -= u_ch, ext_i_len++) {
> +

Just remove the empty line above please.

> +			c = ocu[i];
> +			if (u_ch > 1)
> +				c = (c << 8) | ocu[i + 1];
> +
> +			if (c == EXT_MARK) {
> +				if (ext_i_len)
> +					i_ext = i;
> +				break;
> +			}
> +		}
> +		if (i_ext >= 0) {
> +			/* Convert extension */
> +			ext_max_len = min_t(int, sizeof(ext), str_max_len);
> +			ext[ext_o_len++] = EXT_MARK;
> +			illChar = 0;
> +			for (i = i_ext + u_ch; i < ocu_len;) {
> +

Remove the empty line here please. Also how about using i += u_ch in the
for() above instead of i++ in the two lines below?

> +				c = ocu[i++];
> +				if (u_ch > 1)
> +					c = (c << 8) | ocu[i++];
> +
> +				if (c == '/' || c == 0) {
> +					if (illChar)
> +						continue;
> +					illChar = 1;
> +					needsCRC = 1;
> +					c = ILLEGAL_CHAR_MARK;
> +				} else {
> +					illChar = 0;
> +				}
> +
> +				len = conv_f(c, &ext[ext_o_len],
> +					     ext_max_len - ext_o_len);
> +				/* Valid character? */
> +				if (len >= 0) {
> +					ext_o_len += len;
> +				} else {
> +					ext[ext_o_len++] = '?';
> +					needsCRC = 1;
> +				}
> +				if ((ext_o_len + CRC_LEN) < str_max_len)
> +					ext_crc_len = ext_o_len;
> +			}
> +		}
> +	}
> +
> +	illChar = 0;
> +	for (i = 0; i < ocu_len;) {
> +

Again, remove the empty line, i += u_ch.

> +		if (str_o_len >= str_max_len) {
> +			needsCRC = 1;
> +			break;
> +		}
> +
> +		if (translate && (i == i_ext)) {
> +			if (str_o_len > (str_max_len - ext_o_len))
> +				needsCRC = 1;
> +			break;
> +		}
>  
> -	for (i = 1; (i < ocu_len) && (str_o_len < str_max_len);) {
>  		/* Expand OSTA compressed Unicode to Unicode */
> -		uint32_t c = ocu[i++];
> -		if (cmp_id == 16)
> +		c = ocu[i++];
> +		if (u_ch > 1)
>  			c = (c << 8) | ocu[i++];
>  
> +		if (translate) {
> +			if ((c == '.') && (firstDots >= 0))
> +				firstDots++;
> +			else
> +				firstDots = -1;
> +
> +			if (c == '/' || c == 0) {
> +				if (illChar)
> +					continue;
> +				illChar = 1;
> +				needsCRC = 1;
> +				c = ILLEGAL_CHAR_MARK;
> +			} else {
> +				illChar = 0;
> +			}
> +		}
> +
>  		len = conv_f(c, &str_o[str_o_len], str_max_len - str_o_len);
>  		/* Valid character? */
> -		if (len >= 0)
> +		if (len >= 0) {
>  			str_o_len += len;
> -		else
> +		} else {
>  			str_o[str_o_len++] = '?';
> +			needsCRC = 1;
> +		}

Umm, can you try factoring out body of this loop into a helper function and
using it both during extension parsing and full name parsing? Also I think
that checking whether the name is '.' or '..' could be moved just into the
if (translate) below where you could just directly check it like:
	if (str_o_len <= 2 && str_o[0] == '.' &&
	    (str_o_len == 1 || str_o[1] == '.'))

> +		if (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))
> +			o_crc = str_o_len;
> +	}
> +
> +	if (translate) {
> +		if ((firstDots == 1) || (firstDots == 2))
> +			needsCRC = 1;
> +		if (needsCRC) {
> +			str_o_len = o_crc;
> +			valueCRC = crc_itu_t(0, ocu, ocu_len);
> +			crc[0] = CRC_MARK;
> +			crc[1] = hex_asc_upper_hi(valueCRC >> 8);
> +			crc[2] = hex_asc_upper_lo(valueCRC >> 8);
> +			crc[3] = hex_asc_upper_hi(valueCRC);
> +			crc[4] = hex_asc_upper_lo(valueCRC);
> +			len = min_t(int, CRC_LEN, str_max_len - str_o_len);
> +			memcpy(&str_o[str_o_len], crc, len);
> +			str_o_len += len;
> +			ext_o_len = ext_crc_len;
> +		}
> +		if (ext_o_len > 0) {
> +			memcpy(&str_o[str_o_len], ext, ext_o_len);
> +			str_o_len += ext_o_len;
> +		}
>  	}
>  
>  	return str_o_len;

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

  reply	other threads:[~2016-01-04 13:24 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-12-24 16:25 [PATCH v2 0/7] udf: rework name conversions to fix multi-bytes characters support Andrew Gabbasov
2015-12-24 16:25 ` [PATCH v2 1/7] udf: Prevent buffer overrun with multi-byte characters Andrew Gabbasov
2015-12-24 16:25 ` [PATCH v2 2/7] udf: Check output buffer length when converting name to CS0 Andrew Gabbasov
2016-01-04 17:19   ` Jan Kara
2015-12-24 16:25 ` [PATCH v2 3/7] udf: Parameterize output length in udf_put_filename Andrew Gabbasov
2015-12-24 16:25 ` [PATCH v2 4/7] udf: Join functions for UTF8 and NLS conversions Andrew Gabbasov
2015-12-24 16:25 ` [PATCH v2 5/7] udf: Adjust UDF_NAME_LEN to better reflect actual restrictions Andrew Gabbasov
2015-12-24 16:25 ` [PATCH v2 6/7] udf: Remove struct ustr as non-needed intermediate storage Andrew Gabbasov
2016-01-04 12:32   ` Jan Kara
2016-01-11 13:31     ` Andrew Gabbasov
2016-01-12 13:39       ` Jan Kara
2015-12-24 16:25 ` [PATCH v2 7/7] udf: Merge linux specific translation into CS0 conversion function Andrew Gabbasov
2016-01-04 13:25   ` Jan Kara [this message]
2016-01-11 13:31     ` Andrew Gabbasov
2016-01-04 13:30 ` [PATCH v2 0/7] udf: rework name conversions to fix multi-bytes characters support Jan Kara

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160104132500.GC13014@quack.suse.cz \
    --to=jack@suse.cz \
    --cc=andrew_gabbasov@mentor.com \
    --cc=jack@suse.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.