[RFC PATCH 18/20] cifs: Do not use broken utf8 NLS table for iocharset=utf8 mount option

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Pali Rohár" <pali@kernel.org>
To: linux-fsdevel@vger.kernel.org,
	linux-ntfs-dev@lists.sourceforge.net, linux-cifs@vger.kernel.org,
	jfs-discussion@lists.sourceforge.net,
	linux-kernel@vger.kernel.org,
	"Alexander Viro" <viro@zeniv.linux.org.uk>,
	"Jan Kara" <jack@suse.cz>,
	"OGAWA Hirofumi" <hirofumi@mail.parknet.co.jp>,
	"Theodore Y . Ts'o" <tytso@mit.edu>,
	"Luis de Bethencourt" <luisbg@kernel.org>,
	"Salah Triki" <salah.triki@gmail.com>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"Dave Kleikamp" <shaggy@kernel.org>,
	"Anton Altaparmakov" <anton@tuxera.com>,
	"Pavel Machek" <pavel@ucw.cz>, "Marek Behún" <marek.behun@nic.cz>,
	"Christoph Hellwig" <hch@infradead.org>
Subject: [RFC PATCH 18/20] cifs: Do not use broken utf8 NLS table for iocharset=utf8 mount option
Date: Sun,  8 Aug 2021 18:24:51 +0200	[thread overview]
Message-ID: <20210808162453.1653-19-pali@kernel.org> (raw)
In-Reply-To: <20210808162453.1653-1-pali@kernel.org>

NLS table for utf8 is broken and cannot be fixed.

So instead of broken utf8 nls functions char2uni() and uni2char() use
functions utf8s_to_utf16s() and utf16s_to_utf8s() which implements correct
conversion between UTF-16 and UTF-8.

When iochatset=utf8 is used then set ctx->iocharset to NULL and use it for
distinguish between the fact if NLS table or native UTF-8 functions should
be used.

Signed-off-by: Pali Rohár <pali@kernel.org>
---
 fs/cifs/cifs_unicode.c | 128 +++++++++++++++++++++++++++--------------
 fs/cifs/cifs_unicode.h |   2 +-
 fs/cifs/cifsfs.c       |   2 +
 fs/cifs/connect.c      |   8 ++-
 fs/cifs/dir.c          |  28 +++++++--
 fs/cifs/winucase.c     |  14 +++--
 6 files changed, 124 insertions(+), 58 deletions(-)

diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 9bd03a231032..b0f7f78da7c2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -131,20 +131,17 @@ cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
 		  convert_sfu_char(src_char, target))
 		return len;
 
-	/* if character not one of seven in special remap set */
-	len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
-	if (len <= 0)
-		goto surrogate_pair;
-
-	return len;
+	if (cp) {
+		/* if character not one of seven in special remap set */
+		len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
+		if (len <= 0)
+			goto unknown;
+	} else {
+		len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
+		if (len <= 0)
+			goto unknown;
+	}
 
-surrogate_pair:
-	/* convert SURROGATE_PAIR and IVS */
-	if (strcmp(cp->charset, "utf8"))
-		goto unknown;
-	len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
-	if (len <= 0)
-		goto unknown;
 	return len;
 
 unknown:
@@ -240,6 +237,37 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
 	return outlen;
 }
 
+static int cifs_utf8s_to_utf16s(const char *s, int inlen, __le16 *pwcs)
+{
+	__le16 *op;
+	int size;
+	unicode_t u;
+
+	op = pwcs;
+	while (inlen > 0 && *s) {
+		if (*s & 0x80) {
+			size = utf8_to_utf32(s, inlen, &u);
+			if (size <= 0) {
+				u = 0x003f; /* A question mark */
+				size = 1;
+			}
+			s += size;
+			inlen -= size;
+			if (u >= 0x10000) {
+				u -= 0x10000;
+				*op++ = __cpu_to_le16(0xd800 | ((u >> 10) & 0x03ff));
+				*op++ = __cpu_to_le16(0xdc00 | (u & 0x03ff));
+			} else {
+				*op++ = __cpu_to_le16(u);
+			}
+		} else {
+			*op++ = __cpu_to_le16(*s++);
+			inlen--;
+		}
+	}
+	return op - pwcs;
+}
+
 /*
  * NAME:	cifs_strtoUTF16()
  *
@@ -255,24 +283,14 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
 	wchar_t wchar_to; /* needed to quiet sparse */
 
 	/* special case for utf8 to handle no plane0 chars */
-	if (!strcmp(codepage->charset, "utf8")) {
+	if (!codepage) {
 		/*
 		 * convert utf8 -> utf16, we assume we have enough space
 		 * as caller should have assumed conversion does not overflow
-		 * in destination len is length in wchar_t units (16bits)
-		 */
-		i  = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
-				       (wchar_t *) to, len);
-
-		/* if success terminate and exit */
-		if (i >= 0)
-			goto success;
-		/*
-		 * if fails fall back to UCS encoding as this
-		 * function should not return negative values
-		 * currently can fail only if source contains
-		 * invalid encoded characters
+		 * in destination len is length in __le16 units
 		 */
+		i  = cifs_utf8s_to_utf16s(from, len, to);
+		goto success;
 	}
 
 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
@@ -508,25 +526,29 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
 		 * as they use backslash as separator.
 		 */
 		if (dst_char == 0) {
-			charlen = cp->char2uni(source + i, srclen - i, &tmp);
-			dst_char = cpu_to_le16(tmp);
-
-			/*
-			 * if no match, use question mark, which at least in
-			 * some cases serves as wild card
-			 */
-			if (charlen > 0)
-				goto ctoUTF16;
-
-			/* convert SURROGATE_PAIR */
-			if (strcmp(cp->charset, "utf8") || !wchar_to)
-				goto unknown;
-			if (*(source + i) & 0x80) {
-				charlen = utf8_to_utf32(source + i, 6, &u);
-				if (charlen < 0)
+			if (cp) {
+				charlen = cp->char2uni(source + i, srclen - i, &tmp);
+				dst_char = cpu_to_le16(tmp);
+
+				/*
+				 * if no match, use question mark, which at least in
+				 * some cases serves as wild card
+				 */
+				if (charlen > 0)
+					goto ctoUTF16;
+				else
 					goto unknown;
-			} else
+			}
+
+			/* UTF-8 to UTF-16 conversion */
+
+			if (!wchar_to)
 				goto unknown;
+
+			charlen = utf8_to_utf32(source + i, 6, &u);
+			if (charlen < 0)
+				goto unknown;
+
 			ret  = utf8s_to_utf16s(source + i, charlen,
 					       UTF16_LITTLE_ENDIAN,
 					       wchar_to, 6);
@@ -595,8 +617,26 @@ cifs_local_to_utf16_bytes(const char *from, int len,
 {
 	int charlen;
 	int i;
+	int outlen;
+	unicode_t u_to;
 	wchar_t wchar_to;
 
+	if (!codepage) {
+		outlen = 0;
+		for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
+			charlen = utf8_to_utf32(from, len, &u_to);
+			/* Failed conversion defaults to a question mark */
+			if (charlen < 1) {
+				charlen = 1;
+				outlen += 2;
+			} else if (u_to <= 0xFFFF)
+				outlen += 2;
+			else
+				outlen += 4;
+		}
+		return outlen;
+	}
+
 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
 		charlen = codepage->char2uni(from, len, &wchar_to);
 		/* Failed conversion defaults to a question mark */
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 80b3d845419f..b9a3290faaf7 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -106,7 +106,7 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
 				     int remap);
 #endif
 
-wchar_t cifs_toupper(wchar_t in);
+unicode_t cifs_toupper(unicode_t in);
 
 /*
  * UniStrcat:  Concatenate the second string to the first
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 64b71c4e2a9d..9941bb6f2aad 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -569,6 +569,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 					   cifs_sb->ctx->dir_mode);
 	if (cifs_sb->ctx->iocharset)
 		seq_printf(s, ",iocharset=%s", cifs_sb->ctx->iocharset);
+	else
+		seq_puts(s, ",iocharset=utf8");
 	if (tcon->seal)
 		seq_puts(s, ",seal");
 	else if (tcon->ses->server->ignore_signature)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3781eee9360a..d560fb7a9aed 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2338,7 +2338,11 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
 	    old->ctx->dir_mode != new->ctx->dir_mode)
 		return 0;
 
-	if (strcmp(old->local_nls->charset, new->local_nls->charset))
+	if (old->local_nls && !new->local_nls)
+		return 0;
+	if (!old->local_nls && new->local_nls)
+		return 0;
+	if (old->local_nls && new->local_nls && strcmp(old->local_nls->charset, new->local_nls->charset))
 		return 0;
 
 	if (old->ctx->acregmax != new->ctx->acregmax)
@@ -2800,7 +2804,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
 	if (ctx->iocharset == NULL) {
 		/* load_nls_default cannot return null */
 		cifs_sb->local_nls = load_nls_default();
-	} else {
+	} else if (strcmp(ctx->iocharset, "utf8") != 0) {
 		cifs_sb->local_nls = load_nls(ctx->iocharset);
 		if (cifs_sb->local_nls == NULL) {
 			cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n",
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 79402ca0ddfa..fa09fb5d3641 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -789,16 +789,22 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
+	unicode_t u;
 	wchar_t c;
 	int i, charlen;
 
 	hash = init_name_hash(dentry);
 	for (i = 0; i < q->len; i += charlen) {
-		charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+		if (codepage) {
+			charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+			if (likely(charlen > 0))
+				u = c;
+		} else
+			charlen = utf8_to_utf32(&q->name[i], q->len - i, &u);
 		/* error out if we can't convert the character */
 		if (unlikely(charlen < 0))
 			return charlen;
-		hash = partial_name_hash(cifs_toupper(c), hash);
+		hash = partial_name_hash(cifs_toupper(u), hash);
 	}
 	q->hash = end_name_hash(hash);
 
@@ -809,6 +815,7 @@ static int cifs_ci_compare(const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
+	unicode_t u1, u2;
 	wchar_t c1, c2;
 	int i, l1, l2;
 
@@ -822,9 +829,18 @@ static int cifs_ci_compare(const struct dentry *dentry,
 		return 1;
 
 	for (i = 0; i < len; i += l1) {
-		/* Convert characters in both strings to UTF-16. */
-		l1 = codepage->char2uni(&str[i], len - i, &c1);
-		l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+		/* Convert characters in both strings to UTF-32. */
+		if (codepage) {
+			l1 = codepage->char2uni(&str[i], len - i, &c1);
+			l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+			if (likely(l1 > 0))
+				u1 = c1;
+			if (likely(l2 > 0))
+				u2 = c2;
+		} else {
+			l1 = utf8_to_utf32(&str[i], len - i, &u1);
+			l2 = utf8_to_utf32(&name->name[i], name->len - i, &u2);
+		}
 
 		/*
 		 * If we can't convert either character, just declare it to
@@ -845,7 +861,7 @@ static int cifs_ci_compare(const struct dentry *dentry,
 			return 1;
 
 		/* Now compare uppercase versions of these characters */
-		if (cifs_toupper(c1) != cifs_toupper(c2))
+		if (cifs_toupper(u1) != cifs_toupper(u2))
 			return 1;
 	}
 
diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c
index 59b6c577aa0a..fce38de59e13 100644
--- a/fs/cifs/winucase.c
+++ b/fs/cifs/winucase.c
@@ -18,7 +18,7 @@
 
 #include <linux/nls.h>
 
-wchar_t cifs_toupper(wchar_t in);  /* quiet sparse */
+unicode_t cifs_toupper(unicode_t in);  /* quiet sparse */
 
 static const wchar_t t2_00[256] = {
 	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
@@ -616,20 +616,24 @@ static const wchar_t *const toplevel[256] = {
 };
 
 /**
- * cifs_toupper - convert a wchar_t from lower to uppercase
+ * cifs_toupper - convert a unicode_t from lower to uppercase
  * @in: character to convert from lower to uppercase
  *
- * This function consults the static tables above to convert a wchar_t from
+ * This function consults the static tables above to convert a unicode_t from
  * lower to uppercase. In the event that there is no mapping, the original
  * "in" character is returned.
  */
-wchar_t
-cifs_toupper(wchar_t in)
+unicode_t
+cifs_toupper(unicode_t in)
 {
 	unsigned char idx;
 	const wchar_t *tbl;
 	wchar_t out;
 
+	/* cifs_toupper table has only defines for plane-0 */
+	if (in > 0xffff)
+		return in;
+
 	/* grab upper byte */
 	idx = (in & 0xff00) >> 8;
 
-- 
2.20.1

next prev parent reply	other threads:[~2021-08-08 16:25 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-08 16:24 [RFC PATCH 00/20] fs: Remove usage of broken nls_utf8 and drop it Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 01/20] fat: Fix iocharset=utf8 mount option Pali Rohár
2021-08-15  3:42   ` OGAWA Hirofumi
2021-08-15  9:42     ` Pali Rohár
2021-08-15 11:23       ` OGAWA Hirofumi
2021-08-23  3:51   ` Kari Argillander
2021-08-08 16:24 ` [RFC PATCH 02/20] hfsplus: Add iocharset= mount option as alias for nls= Pali Rohár
2021-08-09 17:51   ` Viacheslav Dubeyko
2021-08-09 20:49   ` Kari Argillander
2021-08-09 21:25     ` Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 03/20] udf: Fix iocharset=utf8 mount option Pali Rohár
2021-08-12 14:17   ` Jan Kara
2021-08-12 15:51     ` Pali Rohár
2021-08-13 13:48       ` Jan Kara
2021-08-19  8:34         ` Pali Rohár
2021-08-19 10:41           ` Jan Kara
2021-08-08 16:24 ` [RFC PATCH 04/20] isofs: joliet: " Pali Rohár
2021-08-12 14:18   ` Jan Kara
2021-08-08 16:24 ` [RFC PATCH 05/20] ntfs: Undeprecate iocharset= " Pali Rohár
2021-08-09 20:52   ` Kari Argillander
2021-08-19  1:21   ` Kari Argillander
2021-08-19  8:12     ` Pali Rohár
2021-08-19 10:23       ` Kari Argillander
2021-08-19 22:04         ` Pali Rohár
2021-08-19 23:18           ` Kari Argillander
2021-08-08 16:24 ` [RFC PATCH 06/20] ntfs: Fix error processing when load_nls() fails Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 07/20] befs: Fix printing iocharset= mount option Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 08/20] befs: Rename enum value Opt_charset to Opt_iocharset to match " Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 09/20] befs: Fix error processing when load_nls() fails Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 10/20] befs: Allow to use native UTF-8 mode Pali Rohár
2021-08-08 19:20   ` kernel test robot
2021-08-08 16:24 ` [RFC PATCH 11/20] hfs: Explicitly set hsb->nls_disk when hsb->nls_io is set Pali Rohár
2021-08-09 17:31   ` Viacheslav Dubeyko
2021-08-09 17:37     ` Matthew Wilcox
2021-08-09 17:47       ` Pali Rohár
2021-08-09 20:43         ` Steve French
2021-08-09 18:00       ` Viacheslav Dubeyko
2021-08-08 16:24 ` [RFC PATCH 12/20] hfs: Do not use broken utf8 NLS table for iocharset=utf8 mount option Pali Rohár
2021-08-09 17:49   ` Viacheslav Dubeyko
2022-09-25 12:06     ` Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 13/20] hfsplus: " Pali Rohár
2021-08-09 17:42   ` Viacheslav Dubeyko
2022-09-25 12:12     ` Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 14/20] jfs: Remove custom iso8859-1 implementation Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 15/20] jfs: Fix buffer overflow in jfs_strfromUCS_le() function Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 16/20] jfs: Do not use broken utf8 NLS table for iocharset=utf8 mount option Pali Rohár
2021-08-09 22:51   ` kernel test robot
2021-08-08 16:24 ` [RFC PATCH 17/20] ntfs: " Pali Rohár
2021-08-08 17:53   ` kernel test robot
2021-08-10  0:34   ` kernel test robot
2021-08-08 16:24 ` Pali Rohár [this message]
2021-08-08 16:24 ` [RFC PATCH 19/20] cifs: Remove usage of load_nls_default() calls Pali Rohár
2021-08-08 16:24 ` [RFC PATCH 20/20] nls: Drop broken nls_utf8 module Pali Rohár
2021-09-03 21:26 ` [RFC PATCH 00/20] fs: Remove usage of broken nls_utf8 and drop it Kari Argillander
2021-09-03 21:37   ` Pali Rohár
2021-09-03 22:06     ` Kari Argillander

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:9bd03a23103 dfblob:b0f7f78da7c dfblob:80b3d845419
dfblob:b9a3290faaf dfblob:64b71c4e2a9 dfblob:9941bb6f2aa
dfblob:3781eee9360 dfblob:d560fb7a9ae dfblob:79402ca0ddf
dfblob:fa09fb5d364 dfblob:59b6c577aa0 dfblob:fce38de59e1 )
 OR (
bs:"[RFC PATCH 18/20] cifs: Do not use broken utf8 NLS table for iocharset=utf8 mount option" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210808162453.1653-19-pali@kernel.org \
    --to=pali@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=anton@tuxera.com \
    --cc=hch@infradead.org \
    --cc=hirofumi@mail.parknet.co.jp \
    --cc=jack@suse.cz \
    --cc=jfs-discussion@lists.sourceforge.net \
    --cc=linux-cifs@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-ntfs-dev@lists.sourceforge.net \
    --cc=luisbg@kernel.org \
    --cc=marek.behun@nic.cz \
    --cc=pavel@ucw.cz \
    --cc=salah.triki@gmail.com \
    --cc=shaggy@kernel.org \
    --cc=tytso@mit.edu \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.