From: Gabriel Krisman Bertazi <krisman@collabora.com>
To: tytso@mit.edu
Cc: linux-fsdevel@vger.kernel.org, kernel@collabora.com,
linux-ext4@vger.kernel.org,
Gabriel Krisman Bertazi <krisman@collabora.co.uk>
Subject: [PATCH v4 17/23] nls: utf8: Integrate utf8 normalization code with utf8 charset
Date: Thu, 6 Dec 2018 18:08:57 -0500 [thread overview]
Message-ID: <20181206230903.30011-18-krisman@collabora.com> (raw)
In-Reply-To: <20181206230903.30011-1-krisman@collabora.com>
From: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
This patch integrates the utf8n patches with the NLS utf8 charset by
implementing the nls_ops operations and nls_charset table. Normalization
is done with NFKD, and casefolding uses the NFKD+CF algorithm implemented
by Olaf Weber and SGI. The high-level comparison functions, strncmp and
strncasecmp, are implemented on top of the same utf8 code.
UTF-8 with normalization is exposed as an option on top of the existing
utf8 charset and is disabled by default, to avoid changing the behavior of
existing nls_utf8 users. To enable normalization, the specific
normalization type must be set at load_table() time.
Changes since RFC v2:
- Integrate with NLS
- Merge utf8n with nls_utf8.
Changes since RFC v1:
- Change error return code from EIO to EINVAL. (Olaf Weber)
- Fix issues with strncmp/strcmp. (Olaf Weber)
- Remove stack buffer in normalization/casefold. (Olaf Weber)
- Include length parameter for second string on comparison functions.
- Change length type to size_t.
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
---
fs/nls/nls_utf8-core.c | 269 ++++++++++++++++++++++++++++++++++++++---
fs/nls/nls_utf8-norm.c | 6 +
fs/nls/utf8n.h | 1 +
include/linux/nls.h | 8 ++
4 files changed, 270 insertions(+), 14 deletions(-)
diff --git a/fs/nls/nls_utf8-core.c b/fs/nls/nls_utf8-core.c
index fe1ac5efaa37..1b7320bd9c34 100644
--- a/fs/nls/nls_utf8-core.c
+++ b/fs/nls/nls_utf8-core.c
@@ -6,10 +6,15 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
#include <linux/nls.h>
#include <linux/errno.h>
+#include "utf8n.h"
+
static unsigned char identity[256];
+static struct nls_charset utf8_info;
static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
@@ -50,22 +55,257 @@ static unsigned char charset_toupper(const struct nls_table *table,
return identity[c];
}
-static const struct nls_ops charset_ops = {
- .lowercase = charset_toupper,
- .uppercase = charset_tolower,
- .uni2char = uni2char,
- .char2uni = char2uni,
-};
+#ifdef CONFIG_NLS_UTF8_NORMALIZATION
+
+/*
+ * Validate @str (@len bytes) by asking utf8nlen() for its length under the
+ * NFKD data of the table's Unicode version.
+ *
+ * Returns 0 when utf8nlen() succeeds and -1 when it reports an error
+ * (a negative length).
+ */
+static int utf8_validate(const struct nls_table *charset,
+			 const unsigned char *str, size_t len)
+{
+	const struct utf8data *nfkd = utf8nfkdi(charset->version);
+
+	return utf8nlen(nfkd, str, len) < 0 ? -1 : 0;
+}
+
+/*
+ * Compare @str1 and @str2 through NFKD cursors, one output byte at a time.
+ *
+ * Returns 0 when both cursors yield the same byte stream, 1 when they
+ * differ.  If either string cannot be walked, returns -EINVAL in strict
+ * mode; otherwise the raw bytes are compared and the result is 0 only for
+ * an exact binary match.
+ */
+static int utf8_strncmp(const struct nls_table *charset,
+			const unsigned char *str1, size_t len1,
+			const unsigned char *str2, size_t len2)
+{
+	const struct utf8data *nfkd = utf8nfkdi(charset->version);
+	struct utf8cursor lhs, rhs;
+
+	if (utf8ncursor(&lhs, nfkd, str1, len1) < 0 ||
+	    utf8ncursor(&rhs, nfkd, str2, len2) < 0)
+		goto invalid_seq;
+
+	for (;;) {
+		int a = utf8byte(&lhs);
+		int b = utf8byte(&rhs);
+
+		/* A negative byte from either cursor means a bad sequence. */
+		if (a < 0 || b < 0)
+			goto invalid_seq;
+		if (a != b)
+			return 1;
+		/* Both streams ended on the same terminating zero. */
+		if (!a)
+			return 0;
+	}
+
+invalid_seq:
+	if (IS_STRICT_MODE(charset))
+		return -EINVAL;
+
+	/* Fall back to an exact comparison of the raw bytes. */
+	if (len1 != len2)
+		return 1;
+
+	return memcmp(str1, str2, len1) != 0;
+}
+
+/*
+ * Case-insensitive comparison of @str1 and @str2: both are walked through
+ * NFKD+CF (casefold) cursors and the produced bytes are compared.
+ *
+ * Returns 0 when the casefolded byte streams are identical, 1 when they
+ * differ.  If either string cannot be walked, returns -EINVAL in strict
+ * mode; otherwise the raw bytes are compared and the result is 0 only for
+ * an exact binary match.
+ */
+static int utf8_strncasecmp(const struct nls_table *charset,
+			    const unsigned char *str1, size_t len1,
+			    const unsigned char *str2, size_t len2)
+{
+	const struct utf8data *nfkdcf = utf8nfkdicf(charset->version);
+	struct utf8cursor lhs, rhs;
+
+	if (utf8ncursor(&lhs, nfkdcf, str1, len1) < 0 ||
+	    utf8ncursor(&rhs, nfkdcf, str2, len2) < 0)
+		goto invalid_seq;
+
+	for (;;) {
+		int a = utf8byte(&lhs);
+		int b = utf8byte(&rhs);
+
+		/* A negative byte from either cursor means a bad sequence. */
+		if (a < 0 || b < 0)
+			goto invalid_seq;
+		if (a != b)
+			return 1;
+		/* Both streams ended on the same terminating zero. */
+		if (!a)
+			return 0;
+	}
+
+invalid_seq:
+	if (IS_STRICT_MODE(charset))
+		return -EINVAL;
+
+	/* Fall back to an exact comparison of the raw bytes. */
+	if (len1 != len2)
+		return 1;
+
+	return memcmp(str1, str2, len1) != 0;
+}
+
+/*
+ * Casefold @str (@len bytes) into @dest (@dlen bytes) using the NFKD+CF
+ * data of the table's Unicode version.
+ *
+ * Returns the number of bytes stored in @dest, not counting the
+ * terminating zero byte, on success.  If @str cannot be decoded or the
+ * result does not fit in @dlen bytes, returns -EINVAL in strict mode;
+ * otherwise @str is copied to @dest unmodified and @len is returned, or
+ * -EINVAL if even the raw copy would not fit in @dest.
+ */
+static int utf8_casefold_nfkdcf(const struct nls_table *charset,
+				const unsigned char *str, size_t len,
+				unsigned char *dest, size_t dlen)
+{
+	const struct utf8data *data = utf8nfkdicf(charset->version);
+	struct utf8cursor cur;
+	size_t nlen;
+
+	if (utf8ncursor(&cur, data, str, len) < 0)
+		goto invalid_seq;
+
+	for (nlen = 0; nlen < dlen; nlen++) {
+		/*
+		 * Keep the result in an int: utf8byte() reports errors as a
+		 * negative value, which cannot be seen any more once it has
+		 * been narrowed to an unsigned char.
+		 */
+		int c = utf8byte(&cur);
+
+		if (c < 0)
+			break;
+
+		dest[nlen] = c;
+		if (!c)
+			return nlen;
+	}
+
+invalid_seq:
+	if (IS_STRICT_MODE(charset))
+		return -EINVAL;
+
+	/* Treat the sequence as a binary blob, but never overrun @dest. */
+	if (len > dlen)
+		return -EINVAL;
+
+	memcpy(dest, str, len);
+	return len;
+}
+
+/*
+ * Normalize @str (@len bytes) into @dest (@dlen bytes) using the NFKD data
+ * of the table's Unicode version.
+ *
+ * Returns the number of bytes stored in @dest, not counting the
+ * terminating zero byte, on success.  If @str cannot be decoded or the
+ * result does not fit in @dlen bytes, returns -EINVAL in strict mode;
+ * otherwise @str is copied to @dest unmodified and @len is returned, or
+ * -EINVAL if even the raw copy would not fit in @dest.
+ */
+static int utf8_normalize_nfkd(const struct nls_table *charset,
+			       const unsigned char *str,
+			       size_t len, unsigned char *dest, size_t dlen)
+{
+	const struct utf8data *data = utf8nfkdi(charset->version);
+	struct utf8cursor cur;
+	size_t nlen;
+
+	if (utf8ncursor(&cur, data, str, len) < 0)
+		goto invalid_seq;
-static struct nls_charset nls_charset;
-static struct nls_table table = {
- .charset = &nls_charset,
- .ops = &charset_ops,
+	for (nlen = 0; nlen < dlen; nlen++) {
+		/*
+		 * Keep the result in an int: utf8byte() reports errors as a
+		 * negative value, which cannot be seen any more once it has
+		 * been narrowed to an unsigned char.
+		 */
+		int c = utf8byte(&cur);
+
+		if (c < 0)
+			break;
+
+		dest[nlen] = c;
+		if (!c)
+			return nlen;
+	}
+
+invalid_seq:
+	if (IS_STRICT_MODE(charset))
+		return -EINVAL;
+
+	/* Treat the sequence as a binary blob, but never overrun @dest. */
+	if (len > dlen)
+		return -EINVAL;
+
+	memcpy(dest, str, len);
+	return len;
+}
+
+/*
+ * Parse a "major.minor.revision" string into its three components.
+ *
+ * @version: NUL-terminated version string, e.g. "11.0.0".
+ * @maj, @min, @rev: filled in on success only.
+ *
+ * Returns 0 on success, or -EINVAL if @version does not fit the local
+ * buffer, does not match the expected pattern, or contains a negative
+ * component.
+ */
+static int utf8_parse_version(const char *version, unsigned int *maj,
+			      unsigned int *min, unsigned int *rev)
+{
+	substring_t args[3];
+	char version_string[12];
+	int major, minor, revision;
+	const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+
+	/*
+	 * strscpy() always NUL-terminates and reports truncation; a string
+	 * that does not fit the buffer cannot be a valid version.
+	 */
+	if (strscpy(version_string, version, sizeof(version_string)) < 0)
+		return -EINVAL;
+
+	if (match_token(version_string, token, args) != 1)
+		return -EINVAL;
+
+	/* match_int() stores through an int pointer, so parse into ints. */
+	if (match_int(&args[0], &major) || match_int(&args[1], &minor) ||
+	    match_int(&args[2], &revision))
+		return -EINVAL;
+
+	/* "%d" also matches a leading '-'; versions are never negative. */
+	if (major < 0 || minor < 0 || revision < 0)
+		return -EINVAL;
+
+	*maj = major;
+	*min = minor;
+	*rev = revision;
+
+	return 0;
+}
+#endif
+
+struct utf8_table { /* One table created by utf8_load_table(), bundled with the ops it points at. */
+ struct nls_table tbl; /* keep first: utf8_cleanup_tables() passes the tbl pointer to kfree() */
+ struct nls_ops ops; /* filled in by utf8_set_ops(); tbl.ops is set to point here */
};
-static struct nls_charset nls_charset = {
+/*
+ * Fill in the operations of @utbl and point its nls_table at them.
+ *
+ * The character conversion hooks are always set.  With
+ * CONFIG_NLS_UTF8_NORMALIZATION, validation is always available, while
+ * the normalization and casefold hooks are only installed when the table
+ * was requested with the matching NFKD / NFKD+CF type, so they stay NULL
+ * for plain utf8 users.
+ *
+ * NOTE(review): the type checks presumably read utbl->tbl.flags, so this
+ * must run after the flags have been stored in the table - confirm
+ * against the IS_*_TYPE_* helpers in nls.h.
+ */
+static void utf8_set_ops(struct utf8_table *utbl)
+{
+	/* Each hook gets the helper that matches its name. */
+	utbl->ops.lowercase = charset_tolower;
+	utbl->ops.uppercase = charset_toupper;
+	utbl->ops.uni2char = uni2char;
+	utbl->ops.char2uni = char2uni;
+
+#ifdef CONFIG_NLS_UTF8_NORMALIZATION
+	utbl->ops.validate = utf8_validate;
+
+	if (IS_NORMALIZATION_TYPE_UTF8_NFKD(&utbl->tbl)) {
+		utbl->ops.normalize = utf8_normalize_nfkd;
+		utbl->ops.strncmp = utf8_strncmp;
+	}
+
+	if (IS_CASEFOLD_TYPE_UTF8_NFKDCF(&utbl->tbl)) {
+		utbl->ops.casefold = utf8_casefold_nfkdcf;
+		utbl->ops.strncasecmp = utf8_strncasecmp;
+	}
+#endif
+
+	utbl->tbl.ops = &utbl->ops;
+}
+
+/*
+ * Allocate a utf8 table for the requested Unicode version and flags.
+ *
+ * @version: "major.minor.revision" string, or NULL to use the value of
+ *           utf8version_latest().  Only looked at when
+ *           CONFIG_NLS_UTF8_NORMALIZATION is set; otherwise the table
+ *           version is 0.
+ * @flags:   stored in the table before its operations are chosen.
+ *
+ * Returns the new table, ERR_PTR(-EINVAL) for a malformed or unsupported
+ * version, or ERR_PTR(-ENOMEM) if the allocation fails.  The table is
+ * linked into utf8_info.tables and freed by utf8_cleanup_tables().
+ *
+ * NOTE(review): the list insertion takes no lock here - confirm that
+ * callers serialize calls to load_table().
+ */
+static struct nls_table *utf8_load_table(const char *version, unsigned int flags)
+{
+	struct utf8_table *utbl;
+	unsigned int nls_version;
+
+#ifdef CONFIG_NLS_UTF8_NORMALIZATION
+	if (version) {
+		unsigned int maj, min, rev;
+
+		if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+			return ERR_PTR(-EINVAL);
+
+		/*
+		 * utf8version_is_supported() takes u8 arguments; reject
+		 * larger components instead of letting them be truncated.
+		 */
+		if (maj > 0xff || min > 0xff || rev > 0xff)
+			return ERR_PTR(-EINVAL);
+
+		if (!utf8version_is_supported(maj, min, rev))
+			return ERR_PTR(-EINVAL);
+
+		nls_version = UNICODE_AGE(maj, min, rev);
+	} else {
+		nls_version = utf8version_latest();
+		printk(KERN_WARNING "UTF-8 version not specified. "
+		       "Assuming latest supported version (%u.%u.%u).\n",
+		       (nls_version >> 16) & 0xff, (nls_version >> 8) & 0xff,
+		       (nls_version & 0xff));
+	}
+#else
+	nls_version = 0;
+#endif
+
+	utbl = kzalloc(sizeof(*utbl), GFP_KERNEL);
+	if (!utbl)
+		return ERR_PTR(-ENOMEM);
+
+	utbl->tbl.charset = &utf8_info;
+	utbl->tbl.version = nls_version;
+	utbl->tbl.flags = flags;
+	/* Version and flags are in place before the ops are selected. */
+	utf8_set_ops(utbl);
+
+	/* Push the new table onto the head of the charset's table list. */
+	utbl->tbl.next = utf8_info.tables;
+	utf8_info.tables = &utbl->tbl;
+
+	return &utbl->tbl;
+}
+
+/*
+ * Free every table linked on utf8_info.tables and leave the list empty.
+ *
+ * Each node is the tbl member of a struct utf8_table, so handing the
+ * nls_table pointer to kfree() releases the whole allocation only while
+ * tbl remains that struct's first member.
+ */
+static void utf8_cleanup_tables(void)
+{
+	struct nls_table *cur = utf8_info.tables;
+
+	while (cur) {
+		/* Read the link before the node is freed. */
+		struct nls_table *next = cur->next;
+
+		kfree(cur);
+		cur = next;
+	}
+
+	utf8_info.tables = NULL;
+}
+
+static struct nls_charset utf8_info = { /* utf8 charset descriptor passed to register_nls()/unregister_nls() */
.charset = "utf8",
- .tables = &table,
+ .load_table = utf8_load_table, /* tables are allocated per request instead of one static table */
};
static int __init init_nls_utf8(void)
@@ -74,12 +314,13 @@ static int __init init_nls_utf8(void)
for (i=0; i<256; i++)
identity[i] = i;
- return register_nls(&nls_charset);
+ return register_nls(&utf8_info);
}
static void __exit exit_nls_utf8(void)
{
- unregister_nls(&nls_charset);
+ unregister_nls(&utf8_info);
+ utf8_cleanup_tables(); /* free every table that utf8_load_table() linked on utf8_info.tables */
}
module_init(init_nls_utf8)
diff --git a/fs/nls/nls_utf8-norm.c b/fs/nls/nls_utf8-norm.c
index 64c3cc74a2ca..abee8b376a87 100644
--- a/fs/nls/nls_utf8-norm.c
+++ b/fs/nls/nls_utf8-norm.c
@@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
}
EXPORT_SYMBOL(utf8version_is_supported);
+/*
+ * Return utf8vers, which utf8_load_table() uses as the "latest supported
+ * version" when the caller does not name one.  The low three bytes hold
+ * major, minor and revision, as decoded by that caller.
+ */
+int utf8version_latest(void)
+{
+	return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
+
/*
* UTF-8 valid ranges.
*
diff --git a/fs/nls/utf8n.h b/fs/nls/utf8n.h
index f60827663503..b4697f9bfbab 100644
--- a/fs/nls/utf8n.h
+++ b/fs/nls/utf8n.h
@@ -32,6 +32,7 @@
/* Highest unicode version supported by the data tables. */
extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+extern int utf8version_latest(void);
/*
* Look for the correct const struct utf8data for a unicode version.
diff --git a/include/linux/nls.h b/include/linux/nls.h
index aab60d4858ee..aee5cbfc07c6 100644
--- a/include/linux/nls.h
+++ b/include/linux/nls.h
@@ -186,6 +186,14 @@ NLS_CASEFOLD_FUNCS(ALL, TOUPPER, NLS_CASEFOLD_TYPE_TOUPPER)
NLS_CASEFOLD_FUNCS(ASCII, TOUPPER, NLS_ASCII_CASEFOLD_TOUPPER)
NLS_CASEFOLD_FUNCS(ASCII, TOLOWER, NLS_ASCII_CASEFOLD_TOLOWER)
+/* UTF-8 */
+
+#define NLS_UTF8_NORMALIZATION_TYPE_NFKD NLS_NORMALIZATION_TYPE(1) /* utf8 normalization type 1: NFKD */
+#define NLS_UTF8_CASEFOLD_TYPE_NFKDCF NLS_CASEFOLD_TYPE(1) /* utf8 casefold type 1: NFKD + casefold */
+
+NLS_NORMALIZATION_FUNCS(UTF8, NFKD, NLS_UTF8_NORMALIZATION_TYPE_NFKD) /* presumably generates IS_NORMALIZATION_TYPE_UTF8_NFKD() - confirm against the macro definition */
+NLS_CASEFOLD_FUNCS(UTF8, NFKDCF, NLS_UTF8_CASEFOLD_TYPE_NFKDCF) /* presumably generates IS_CASEFOLD_TYPE_UTF8_NFKDCF() - confirm against the macro definition */
+
/* nls_base.c */
extern int __register_nls(struct nls_charset *, struct module *);
extern int unregister_nls(struct nls_charset *);
--
2.20.0.rc2
next prev parent reply other threads:[~2018-12-06 23:10 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-12-06 23:08 [PATCH v4 00/23] Ext4 Encoding and Case-insensitive support Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 01/23] nls: Wrap uni2char/char2uni callers Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 02/23] nls: Wrap charset field access Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 03/23] nls: Wrap charset hooks in ops structure Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 04/23] nls: Split default charset from NLS core Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 05/23] nls: Split struct nls_charset from struct nls_table Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 06/23] nls: Add support for multiple versions of an encoding Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 07/23] nls: Implement NLS_STRICT_MODE flag Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 08/23] nls: Let charsets define the behavior of tolower/toupper Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 09/23] nls: Add new interface for string comparisons Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 10/23] nls: Add optional normalization and casefold hooks Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 11/23] nls: ascii: Support validation and normalization operations Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 12/23] nls: utf8: Add unicode character database files Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 13/23] scripts: add trie generator for UTF-8 Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 14/23] nls: utf8: Move nls-utf8{,-core}.c Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 15/23] nls: utf8: Introduce code for UTF-8 normalization Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 16/23] nls: utf8n: reduce the size of utf8data[] Gabriel Krisman Bertazi
2018-12-06 23:08 ` Gabriel Krisman Bertazi [this message]
2018-12-06 23:08 ` [PATCH v4 18/23] nls: utf8: Introduce test module for normalized utf8 implementation Gabriel Krisman Bertazi
2018-12-06 23:08 ` [PATCH v4 19/23] ext4: Reserve superblock fields for encoding information Gabriel Krisman Bertazi
2018-12-06 23:09 ` [PATCH v4 20/23] ext4: Include encoding information in the superblock Gabriel Krisman Bertazi
2018-12-06 23:09 ` [PATCH v4 21/23] ext4: Support encoding-aware file name lookups Gabriel Krisman Bertazi
2018-12-06 23:09 ` [PATCH v4 22/23] ext4: Implement EXT4_CASEFOLD_FL flag Gabriel Krisman Bertazi
2018-12-06 23:09 ` [PATCH v4 23/23] docs: ext4.rst: Document encoding and case-insensitive Gabriel Krisman Bertazi
2018-12-07 18:41 ` [PATCH v4 00/23] Ext4 Encoding and Case-insensitive support Randy Dunlap
[not found] ` <20181208194128.GE20708@thunk.org>
2018-12-08 21:48 ` Linus Torvalds
2018-12-08 21:58 ` Linus Torvalds
2018-12-08 22:59 ` Linus Torvalds
2018-12-09 0:46 ` Andreas Dilger
[not found] ` <20181209050326.GA28659@mit.edu>
2018-12-09 17:41 ` Linus Torvalds
2018-12-09 20:10 ` Theodore Y. Ts'o
2018-12-09 20:54 ` Linus Torvalds
2018-12-10 0:08 ` Theodore Y. Ts'o
2018-12-10 19:35 ` Linus Torvalds
2018-12-09 20:53 ` Gabriel Krisman Bertazi
2018-12-09 21:05 ` Linus Torvalds
-- strict thread matches above, loose matches on Subject: below --
2018-12-06 22:04 Gabriel Krisman Bertazi
2018-12-06 22:04 ` [PATCH v4 17/23] nls: utf8: Integrate utf8 normalization code with utf8 charset Gabriel Krisman Bertazi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20181206230903.30011-18-krisman@collabora.com \
--to=krisman@collabora.com \
--cc=kernel@collabora.com \
--cc=krisman@collabora.co.uk \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).