Add support for SMS national language identifiers

All of lore.kernel.org
 help / color / mirror / Atom feed

* Add support for SMS national language identifiers
@ 2009-09-04 15:27 Aki Niemi
  2009-09-04 16:21 ` Denis Kenzior
  0 siblings, 1 reply; 3+ messages in thread
From: Aki Niemi @ 2009-09-04 15:27 UTC (permalink / raw)
  To: ofono

[-- Attachment #1: Type: text/plain, Size: 350 bytes --]

Hi All,

Here is a set of patches to add support for decoding SMS messages that
have been encoded using national language tables instead of the default
GSM 7-bit tables. I was planning to push a couple of these patches
directly, but the changes turned out a bit more extensive than I
originally thought. Please take a look and comment.

Cheers,
Aki

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Add-support-for-national-language-variants.patch --]
[-- Type: text/x-patch, Size: 28085 bytes --]

From 935d2358e40a6eec4b68f4b1a367022f3b821c88 Mon Sep 17 00:00:00 2001
From: Aki Niemi <aki.niemi@nokia.com>
Date: Fri, 4 Sep 2009 17:34:34 +0300
Subject: [PATCH] Add support for national language variants

Add API for supporting character conversion using national language
variants. Also, add conversion tables for Turkish, Spanish and
Portuguese, and fix the default table (GSM 0x09 maps to U+00C7, not
U+00E7). The lookup algorithms were changed to support multiple tables.
---
 src/util.c |  599 +++++++++++++++++++++++++++++++++++++++++++++++++-----------
 src/util.h |   11 +
 2 files changed, 507 insertions(+), 103 deletions(-)

diff --git a/src/util.c b/src/util.c
index 9136b64..6be43e1 100644
--- a/src/util.c
+++ b/src/util.c
@@ -26,6 +26,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <ctype.h>
+#include <stdlib.h>
 
 #include <glib.h>
 
@@ -58,27 +59,229 @@
 	attached.
 */
 
+struct codepoint {
+	const unsigned short from;
+	const unsigned short to;
+};
+
+#define KNOWN_VARIANTS 4
+
+#define TABLE_SIZE(t) \
+	(sizeof((t)) / sizeof(struct codepoint))
+
+
+
 /* GSM to Unicode extension table, for GSM sequences starting with 0x1B */
-static unsigned short gsm_extension[] =
-{
-	0x0A, 0x000C,		/* See NOTE 3 in 23.038 */
-	0x14, 0x005E,
-	0x1B, 0x0020,		/* See NOTE 1 in 23.038 */
-	0x28, 0x007B,
-	0x29, 0x007D,
-	0x2F, 0x005C,
-	0x3C, 0x005B,
-	0x3D, 0x007E,
-	0x3E, 0x005D,
-	0x40, 0x007C,
-	0x65, 0x20AC
+static struct codepoint default_ext_gsm[] =
+{
+	{ 0x0A, 0x000C },		/* See NOTE 3 in 23.038 */
+	{ 0x14, 0x005E },
+	{ 0x1B, 0x0020 },		/* See NOTE 1 in 23.038 */
+	{ 0x28, 0x007B },
+	{ 0x29, 0x007D },
+	{ 0x2F, 0x005C },
+	{ 0x3C, 0x005B },
+	{ 0x3D, 0x007E },
+	{ 0x3E, 0x005D },
+	{ 0x40, 0x007C },
+	{ 0x65, 0x20AC }
+};
+
+static struct codepoint default_ext_unicode[] =
+{
+	{ 0x000C, 0x1B0A },
+	{ 0x005B, 0x1B3C },
+	{ 0x005C, 0x1B2F },
+	{ 0x005D, 0x1B3E },
+	{ 0x005E, 0x1B14 },
+	{ 0x007B, 0x1B28 },
+	{ 0x007C, 0x1B40 },
+	{ 0x007D, 0x1B29 },
+	{ 0x007E, 0x1B3D },
+	{ 0x20AC, 0x1B65 }
+};
+
+/* Appendix A.2.1. in 3GPP TS23.038, V.8.2.0 */
+static struct codepoint turkish_ext_gsm[] =
+{
+	{ 0x0A, 0x000C },		/* See NOTE 3 */
+	{ 0x14, 0x005E },
+	{ 0x1B, 0x0020 },		/* See NOTE 1 */
+	{ 0x28, 0x007B },
+	{ 0x29, 0x007D },
+	{ 0x2F, 0x005C },
+	{ 0x3C, 0x005B },
+	{ 0x3D, 0x007E },
+	{ 0x3E, 0x005D },
+	{ 0x40, 0x007C },
+	{ 0x47, 0x011E },
+	{ 0x49, 0x0130 },
+	{ 0x53, 0x015E },
+	{ 0x63, 0x00E7 },
+	{ 0x65, 0x20AC },
+	{ 0x67, 0x011F },
+	{ 0x69, 0x0131 },
+	{ 0x73, 0x015F }
+};
+
+static struct codepoint turkish_ext_unicode[] =
+{
+	{ 0x000C, 0x1B0A },
+	{ 0x005B, 0x1B3C },
+	{ 0x005C, 0x1B2F },
+	{ 0x005D, 0x1B3E },
+	{ 0x005E, 0x1B14 },
+	{ 0x007B, 0x1B28 },
+	{ 0x007C, 0x1B40 },
+	{ 0x007D, 0x1B29 },
+	{ 0x007E, 0x1B3D },
+	{ 0x00E7, 0x1B63 },
+	{ 0x011E, 0x1B47 },
+	{ 0x011F, 0x1B67 },
+	{ 0x0130, 0x1B49 },
+	{ 0x0131, 0x1B69 },
+	{ 0x015E, 0x1B53 },
+	{ 0x015F, 0x1B73 },
+	{ 0x20AC, 0x1B65 }
+};
+
+/* Appendix A.2.2. in 3GPP TS23.038 V.8.2.0 */
+static struct codepoint spanish_ext_gsm[] =
+{
+	{ 0x09, 0x00E7 },
+	{ 0x0A, 0x000C },		/* See NOTE 3 */
+	{ 0x14, 0x005E },
+	{ 0x1B, 0x0020 },		/* See NOTE 1 */
+	{ 0x28, 0x007B },
+	{ 0x29, 0x007D },
+	{ 0x2F, 0x005C },
+	{ 0x3C, 0x005B },
+	{ 0x3D, 0x007E },
+	{ 0x3E, 0x005D },
+	{ 0x40, 0x007C },
+	{ 0x41, 0x00C1 },
+	{ 0x49, 0x00CD },
+	{ 0x4F, 0x00D3 },
+	{ 0x55, 0x00DA },
+	{ 0x61, 0x00E1 },
+	{ 0x65, 0x20AC },
+	{ 0x69, 0x00ED },
+	{ 0x6F, 0x00F3 },
+	{ 0x75, 0x00FA }
+};
+
+static struct codepoint spanish_ext_unicode[] =
+{
+	{ 0x000C, 0x1B0A },
+	{ 0x005B, 0x1B3C },
+	{ 0x005C, 0x1B2F },
+	{ 0x005D, 0x1B3E },
+	{ 0x005E, 0x1B14 },
+	{ 0x007B, 0x1B28 },
+	{ 0x007C, 0x1B40 },
+	{ 0x007D, 0x1B29 },
+	{ 0x007E, 0x1B3D },
+	{ 0x00C1, 0x1B41 },
+	{ 0x00CD, 0x1B49 },
+	{ 0x00D3, 0x1B4F },
+	{ 0x00DA, 0x1B55 },
+	{ 0x00E1, 0x1B61 },
+	{ 0x00E7, 0x1B09 },
+	{ 0x00ED, 0x1B69 },
+	{ 0x00F3, 0x1B6F },
+	{ 0x00FA, 0x1B75 },
+	{ 0x20AC, 0x1B65 }
+};
+
+/* Appendix A.2.3. in 3GPP TS23.038 V.8.2.0 */
+static struct codepoint portuguese_ext_gsm[] =
+{
+	{ 0x05, 0x00EA },
+	{ 0x09, 0x00E7 },
+	{ 0x0A, 0x000C },		/* See NOTE 3 */
+	{ 0x0B, 0x00D4 },
+	{ 0x0C, 0x00F4 },
+	{ 0x0E, 0x00C1 },
+	{ 0x0F, 0x00E1 },
+	{ 0x12, 0x03A6 },
+	{ 0x13, 0x0393 },
+	{ 0x14, 0x005E },
+	{ 0x15, 0x03A9 },
+	{ 0x16, 0x03A0 },
+	{ 0x17, 0x03A8 },
+	{ 0x18, 0x03A3 },
+	{ 0x19, 0x0398 },
+	{ 0x1B, 0x0020 },		/* See NOTE 1 */
+	{ 0x1F, 0x00CA },
+	{ 0x28, 0x007B },
+	{ 0x29, 0x007D },
+	{ 0x2F, 0x005C },
+	{ 0x3C, 0x005B },
+	{ 0x3D, 0x007E },
+	{ 0x3E, 0x005D },
+	{ 0x40, 0x007C },
+	{ 0x41, 0x00C0 },
+	{ 0x49, 0x00CD },
+	{ 0x4F, 0x00D3 },
+	{ 0x55, 0x00DA },
+	{ 0x5B, 0x00C3 },
+	{ 0x5C, 0x00D5 },
+	{ 0x61, 0x00C2 },
+	{ 0x65, 0x20AC },
+	{ 0x69, 0x00ED },
+	{ 0x6F, 0x00F3 },
+	{ 0x75, 0x00FA },
+	{ 0x7B, 0x00E3 },
+	{ 0x7C, 0x00F5 },
+	{ 0x7F, 0x00E2 }
+};
+
+static struct codepoint portuguese_ext_unicode[] =
+{
+	{ 0x000C, 0x1B0A },
+	{ 0x005B, 0x1B3C },
+	{ 0x005C, 0x1B2F },
+	{ 0x005D, 0x1B3E },
+	{ 0x005E, 0x1B14 },
+	{ 0x007B, 0x1B28 },
+	{ 0x007C, 0x1B40 },
+	{ 0x007D, 0x1B29 },
+	{ 0x007E, 0x1B3D },
+	{ 0x00C0, 0x1B41 },
+	{ 0x00C1, 0x1B0E },
+	{ 0x00C2, 0x1B61 },
+	{ 0x00C3, 0x1B5B },
+	{ 0x00CA, 0x1B1F },
+	{ 0x00CD, 0x1B49 },
+	{ 0x00D3, 0x1B4F },
+	{ 0x00D4, 0x1B0B },
+	{ 0x00D5, 0x1B5C },
+	{ 0x00DA, 0x1B55 },
+	{ 0x00E1, 0x1B0F },
+	{ 0x00E2, 0x1B7F },
+	{ 0x00E3, 0x1B7B },
+	{ 0x00E7, 0x1B09 },
+	{ 0x00EA, 0x1B05 },
+	{ 0x00ED, 0x1B69 },
+	{ 0x00F3, 0x1B6F },
+	{ 0x00F4, 0x1B0C },
+	{ 0x00F5, 0x1B7C },
+	{ 0x00FA, 0x1B75 },
+	{ 0x0393, 0x1B13 },
+	{ 0x0398, 0x1B19 },
+	{ 0x03A0, 0x1B16 },
+	{ 0x03A3, 0x1B18 },
+	{ 0x03A6, 0x1B12 },
+	{ 0x03A8, 0x1B17 },
+	{ 0x03A9, 0x1B15 },
+	{ 0x20AC, 0x1B65 }
 };
 
 /* Used for conversion of GSM to Unicode */
-static unsigned short gsm_table[] =
+static const unsigned short default_gsm[] =
 {
 	0x0040, 0x00A3, 0x0024, 0x00A5, 0x00E8, 0x00E9, 0x00F9, 0x00EC, /* 0x07 */
-	0x00F2, 0x00E7, 0x000A, 0x00D8, 0x00F8, 0x000D, 0x00C5, 0x00E5, /* 0x0F */
+	0x00F2, 0x00C7, 0x000A, 0x00D8, 0x00F8, 0x000D, 0x00C5, 0x00E5, /* 0x0F */
 	0x0394, 0x005F, 0x03A6, 0x0393, 0x039B, 0x03A9, 0x03A0, 0x03A8, /* 0x17 */
 	0x03A3, 0x0398, 0x039E, 0x00A0, 0x00C6, 0x00E6, 0x00DF, 0x00C9, /* 0x1F */
 	0x0020, 0x0021, 0x0022, 0x0023, 0x00A4, 0x0025, 0x0026, 0x0027, /* 0x27 */
@@ -95,80 +298,259 @@ static unsigned short gsm_table[] =
 	0x0078, 0x0079, 0x007A, 0x00E4, 0x00F6, 0x00F1, 0x00FC, 0x00E0  /* 0x7F */
 };
 
-#define GUND 0xFFFF
+static struct codepoint default_unicode[] =
+{
+	{ 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
+	{ 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
+	{ 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
+	{ 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
+	{ 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
+	{ 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
+	{ 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
+	{ 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
+	{ 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
+	{ 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
+	{ 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
+	{ 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
+	{ 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
+	{ 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
+	{ 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
+	{ 0x005A, 0x5A }, { 0x005F, 0x11 }, { 0x0061, 0x61 }, { 0x0062, 0x62 },
+	{ 0x0063, 0x63 }, { 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 },
+	{ 0x0067, 0x67 }, { 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A },
+	{ 0x006B, 0x6B }, { 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E },
+	{ 0x006F, 0x6F }, { 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 },
+	{ 0x0073, 0x73 }, { 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 },
+	{ 0x0077, 0x77 }, { 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A },
+	{ 0x00A0, 0x20 }, { 0x00A1, 0x40 }, { 0x00A3, 0x01 }, { 0x00A4, 0x24 },
+	{ 0x00A5, 0x03 }, { 0x00A7, 0x5F }, { 0x00BF, 0x60 }, { 0x00C4, 0x5B },
+	{ 0x00C5, 0x0E }, { 0x00C6, 0x1C }, { 0x00C7, 0x09 }, { 0x00C9, 0x1F },
+	{ 0x00D1, 0x5D }, { 0x00D6, 0x5C }, { 0x00D8, 0x0B }, { 0x00DC, 0x5E },
+	{ 0x00DF, 0x1E }, { 0x00E0, 0x7F }, { 0x00E4, 0x7B }, { 0x00E5, 0x0F },
+	{ 0x00E6, 0x1D }, { 0x00E8, 0x04 }, { 0x00E9, 0x05 }, { 0x00EC, 0x07 },
+	{ 0x00F1, 0x7D }, { 0x00F2, 0x08 }, { 0x00F6, 0x7C }, { 0x00F8, 0x0C },
+	{ 0x00F9, 0x06 }, { 0x00FC, 0x7E }, { 0x0393, 0x13 }, { 0x0394, 0x10 },
+	{ 0x0398, 0x19 }, { 0x039B, 0x14 }, { 0x039E, 0x1A }, { 0x03A0, 0x16 },
+	{ 0x03A3, 0x18 }, { 0x03A6, 0x12 }, { 0x03A8, 0x17 }, { 0x03A9, 0x15 }
+};
+
+/* Appendix A.3.1 in 3GPP TS23.038 */
+static const unsigned short turkish_gsm[] =
+{
+	0x0040, 0x00A3, 0x0024, 0x00A5, 0x20AC, 0x00E9, 0x00F9, 0x0131, /* 0x07 */
+	0x00F2, 0x00C7, 0x000A, 0x011E, 0x011F, 0x000D, 0x00C5, 0x00E5, /* 0x0F */
+	0x0394, 0x005F, 0x03A6, 0x0393, 0x039B, 0x03A9, 0x03A0, 0x03A8, /* 0x17 */
+	0x03A3, 0x0398, 0x039E, 0x00A0, 0x015E, 0x015F, 0x00DF, 0x00C9, /* 0x1F */
+	0x0020, 0x0021, 0x0022, 0x0023, 0x00A4, 0x0025, 0x0026, 0x0027, /* 0x27 */
+	0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, /* 0x2F */
+	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0x37 */
+	0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, /* 0x3F */
+	0x0130, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0x47 */
+	0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, /* 0x4F */
+	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 0x57 */
+	0x0058, 0x0059, 0x005A, 0x00C4, 0x00D6, 0x00D1, 0x00DC, 0x00A7, /* 0x5F */
+	0x00E7, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x67 */
+	0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, /* 0x6F */
+	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 0x77 */
+	0x0078, 0x0079, 0x007A, 0x00E4, 0x00F6, 0x00F1, 0x00FC, 0x00E0  /* 0x7F */
+};
 
-/* 3GPP 27.005 Annex A */
-static unsigned short unicode_256_table[] =
-{
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0x07 */
-	GUND, GUND, 0x0A, GUND, 0x1B0A, 0x0D, GUND, GUND, /* 0x0F */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0x17 */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0x1F */
-	0x20, 0x21, 0x22, 0x23, 0x02, 0x25, 0x26, 0x27, /* 0x27 */
-	0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, /* 0x2F */
-	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x37 */
-	0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, /* 0x3F */
-	0x00, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x47 */
-	0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, /* 0x4F */
-	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x57 */
-	0x58, 0x59, 0x5A, 0x1B3C, 0x1B2F, 0x1B3E, 0x1B14, 0x11, /* 0x5F */
-	GUND, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x67 */
-	0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, /* 0x6F */
-	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x77 */
-	0x78, 0x79, 0x7A, 0x1B28, 0x1B40, 0x1B29, 0x1B3D, GUND, /* 0x7F */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0x87 */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0x8F */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0x97 */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0x9F */
-	GUND, 0x40, GUND, 0x01, 0x24, 0x03, GUND, 0x5f, /* 0xA7 */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0xAF */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, GUND, /* 0xB7 */
-	GUND, GUND, GUND, GUND, GUND, GUND, GUND, 0x60, /* 0xBF */
-	0x41, 0x41, 0x41, 0x41, 0x5B, 0x0E, 0x1C, 0x09, /* 0xC7 */
-	0x45, 0x1F, 0x45, 0x45, 0x49, 0x49, 0x49, 0x49, /* 0xCF */
-	GUND, 0x5D, 0x4F, 0x4F, 0x4F, 0x4F, 0x5C, GUND, /* 0xD7 */
-	0x0B, 0x55, 0x55, 0x55, 0x5E, 0x59, GUND, 0x1E, /* 0xDF */
-	0x7F, 0x61, 0x61, 0x61, 0x7B, 0x0F, 0x1D, 0x09, /* 0xE7 */
-	0x04, 0x05, 0x65, 0x65, 0x07, 0x69, 0x69, 0x69, /* 0xEF */
-	GUND, 0x7D, 0x08, 0x6F, 0x6F, 0x6F, 0x7C, GUND, /* 0xF7 */
-	0x0C, 0x06, 0x75, 0x75, 0x7E, 0x79, GUND, 0x79  /* 0xFF */
+static struct codepoint turkish_unicode[] =
+{
+	{ 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
+	{ 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
+	{ 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
+	{ 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
+	{ 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
+	{ 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
+	{ 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
+	{ 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
+	{ 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
+	{ 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
+	{ 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
+	{ 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
+	{ 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
+	{ 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
+	{ 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
+	{ 0x005A, 0x5A }, { 0x005F, 0x11 }, { 0x0061, 0x61 }, { 0x0062, 0x62 },
+	{ 0x0063, 0x63 }, { 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 },
+	{ 0x0067, 0x67 }, { 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A },
+	{ 0x006B, 0x6B }, { 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E },
+	{ 0x006F, 0x6F }, { 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 },
+	{ 0x0073, 0x73 }, { 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 },
+	{ 0x0077, 0x77 }, { 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A },
+	{ 0x00A0, 0x20 }, { 0x00A3, 0x01 }, { 0x00A4, 0x24 }, { 0x00A5, 0x03 },
+	{ 0x00A7, 0x5F }, { 0x00C4, 0x5B }, { 0x00C5, 0x0E }, { 0x00C7, 0x09 },
+	{ 0x00C9, 0x1F }, { 0x00D1, 0x5D }, { 0x00D6, 0x5C }, { 0x00DC, 0x5E },
+	{ 0x00DF, 0x1E }, { 0x00E0, 0x7F }, { 0x00E4, 0x7B }, { 0x00E5, 0x0F },
+	{ 0x00E7, 0x60 }, { 0x00E9, 0x05 }, { 0x00F1, 0x7D }, { 0x00F2, 0x08 },
+	{ 0x00F6, 0x7C }, { 0x00F9, 0x06 }, { 0x00FC, 0x7E }, { 0x011E, 0x0B },
+	{ 0x011F, 0x0C }, { 0x0130, 0x40 }, { 0x0131, 0x07 }, { 0x015E, 0x1C },
+	{ 0x015F, 0x1D }, { 0x0393, 0x13 }, { 0x0394, 0x10 }, { 0x0398, 0x19 },
+	{ 0x039B, 0x14 }, { 0x039E, 0x1A }, { 0x03A0, 0x16 }, { 0x03A3, 0x18 },
+	{ 0x03A6, 0x12 }, { 0x03A8, 0x17 }, { 0x03A9, 0x15 }, { 0x20AC, 0x04 }
 };
 
-/* Starts at 0x0390 */
-static unsigned short greek_unicode_offset = 0x0390;
+/* Appendix A.3.2 in 3GPP TS23.038 */
+static const unsigned short portuguese_gsm[] =
+{
+	0x0040, 0x00A3, 0x0024, 0x00A5, 0x00EA, 0x00E9, 0x00FA, 0x00ED, /* 0x07 */
+	0x00F3, 0x00E7, 0x000A, 0x00D4, 0x00F4, 0x000D, 0x00C1, 0x00E1, /* 0x0F */
+	0x0394, 0x005F, 0x00AA, 0x00C7, 0x00C0, 0x221E, 0x005E, 0x005C, /* 0x17 */
+	0x20ac, 0x00D3, 0x007C, 0x00A0, 0x00C2, 0x00E2, 0x00CA, 0x00C9, /* 0x1F */
+	0x0020, 0x0021, 0x0022, 0x0023, 0x00BA, 0x0025, 0x0026, 0x0027, /* 0x27 */
+	0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, /* 0x2F */
+	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0x37 */
+	0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, /* 0x3F */
+	0x00A1, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0x47 */
+	0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, /* 0x4F */
+	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 0x57 */
+	0x0058, 0x0059, 0x005A, 0x00C3, 0x00D5, 0x00DA, 0x00DC, 0x00A7, /* 0x5F */
+	0x007E, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x67 */
+	0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, /* 0x6F */
+	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 0x77 */
+	0x0078, 0x0079, 0x007A, 0x00E3, 0x00F5, 0x0060, 0x00FC, 0x00E0  /* 0x7F */
+};
 
-static unsigned short greek_unicode_table[] =
+static struct codepoint portuguese_unicode[] =
 {
-	GUND, GUND, GUND, 0x13, 0x10, GUND, GUND, GUND, /* 0x07 */
-	0x19, GUND, GUND, 0x14, GUND, GUND, 0x1A, GUND, /* 0x0F */
-	0x16, GUND, GUND, 0x18, GUND, GUND, 0x12, GUND, /* 0x17 */
-	0x17, 0x15, GUND, GUND, GUND, GUND, GUND, GUND, /* 0x1F */
+	{ 0x000A, 0x0A }, { 0x000D, 0x0D }, { 0x0020, 0x20 }, { 0x0021, 0x21 },
+	{ 0x0022, 0x22 }, { 0x0023, 0x23 }, { 0x0024, 0x02 }, { 0x0025, 0x25 },
+	{ 0x0026, 0x26 }, { 0x0027, 0x27 }, { 0x0028, 0x28 }, { 0x0029, 0x29 },
+	{ 0x002A, 0x2A }, { 0x002B, 0x2B }, { 0x002C, 0x2C }, { 0x002D, 0x2D },
+	{ 0x002E, 0x2E }, { 0x002F, 0x2F }, { 0x0030, 0x30 }, { 0x0031, 0x31 },
+	{ 0x0032, 0x32 }, { 0x0033, 0x33 }, { 0x0034, 0x34 }, { 0x0035, 0x35 },
+	{ 0x0036, 0x36 }, { 0x0037, 0x37 }, { 0x0038, 0x38 }, { 0x0039, 0x39 },
+	{ 0x003A, 0x3A }, { 0x003B, 0x3B }, { 0x003C, 0x3C }, { 0x003D, 0x3D },
+	{ 0x003E, 0x3E }, { 0x003F, 0x3F }, { 0x0040, 0x00 }, { 0x0041, 0x41 },
+	{ 0x0042, 0x42 }, { 0x0043, 0x43 }, { 0x0044, 0x44 }, { 0x0045, 0x45 },
+	{ 0x0046, 0x46 }, { 0x0047, 0x47 }, { 0x0048, 0x48 }, { 0x0049, 0x49 },
+	{ 0x004A, 0x4A }, { 0x004B, 0x4B }, { 0x004C, 0x4C }, { 0x004D, 0x4D },
+	{ 0x004E, 0x4E }, { 0x004F, 0x4F }, { 0x0050, 0x50 }, { 0x0051, 0x51 },
+	{ 0x0052, 0x52 }, { 0x0053, 0x53 }, { 0x0054, 0x54 }, { 0x0055, 0x55 },
+	{ 0x0056, 0x56 }, { 0x0057, 0x57 }, { 0x0058, 0x58 }, { 0x0059, 0x59 },
+	{ 0x005A, 0x5A }, { 0x005C, 0x17 }, { 0x005E, 0x16 }, { 0x005F, 0x11 },
+	{ 0x0060, 0x7D }, { 0x0061, 0x61 }, { 0x0062, 0x62 }, { 0x0063, 0x63 },
+	{ 0x0064, 0x64 }, { 0x0065, 0x65 }, { 0x0066, 0x66 }, { 0x0067, 0x67 },
+	{ 0x0068, 0x68 }, { 0x0069, 0x69 }, { 0x006A, 0x6A }, { 0x006B, 0x6B },
+	{ 0x006C, 0x6C }, { 0x006D, 0x6D }, { 0x006E, 0x6E }, { 0x006F, 0x6F },
+	{ 0x0070, 0x70 }, { 0x0071, 0x71 }, { 0x0072, 0x72 }, { 0x0073, 0x73 },
+	{ 0x0074, 0x74 }, { 0x0075, 0x75 }, { 0x0076, 0x76 }, { 0x0077, 0x77 },
+	{ 0x0078, 0x78 }, { 0x0079, 0x79 }, { 0x007A, 0x7A }, { 0x007C, 0x1A },
+	{ 0x007E, 0x60 }, { 0x00A0, 0x20 }, { 0x00A3, 0x01 }, { 0x00A5, 0x03 },
+	{ 0x00A7, 0x5F }, { 0x00AA, 0x12 }, { 0x00BA, 0x24 }, { 0x00C0, 0x14 },
+	{ 0x00C1, 0x0E }, { 0x00C2, 0x1C }, { 0x00C3, 0x5B }, { 0x00C7, 0x13 },
+	{ 0x00C9, 0x1F }, { 0x00CA, 0x1E }, { 0x00CD, 0x40 }, { 0x00D3, 0x19 },
+	{ 0x00D4, 0x0B }, { 0x00D5, 0x5C }, { 0x00DA, 0x5D }, { 0x00DC, 0x5E },
+	{ 0x00E0, 0x7F }, { 0x00E1, 0x0F }, { 0x00E2, 0x1D }, { 0x00E3, 0x7B },
+	{ 0x00E7, 0x09 }, { 0x00E9, 0x05 }, { 0x00EA, 0x04 }, { 0x00ED, 0x07 },
+	{ 0x00F3, 0x08 }, { 0x00F4, 0x0C }, { 0x00F5, 0x7C }, { 0x00FA, 0x06 },
+	{ 0x00FC, 0x7E }, { 0x0394, 0x10 }, { 0x20AC, 0x18 }, { 0x221E, 0x15 }
+};
+
+struct single_shift_table {
+	struct codepoint *table;
+	unsigned int len;
+};
+
+static struct single_shift_table gsm_single_shift[] =
+{
+	{ default_ext_gsm, TABLE_SIZE(default_ext_gsm) },
+	{ turkish_ext_gsm, TABLE_SIZE(turkish_ext_gsm) },
+	{ spanish_ext_gsm, TABLE_SIZE(spanish_ext_gsm) },
+	{ portuguese_ext_gsm, TABLE_SIZE(portuguese_ext_gsm) }
+};
+
+static struct single_shift_table unicode_single_shift[] =
+{
+	{ default_ext_unicode, TABLE_SIZE(default_ext_unicode) },
+	{ turkish_ext_unicode, TABLE_SIZE(turkish_ext_unicode) },
+	{ spanish_ext_unicode, TABLE_SIZE(spanish_ext_unicode) },
+	{ portuguese_ext_unicode, TABLE_SIZE(portuguese_ext_unicode) }
+};
+
+static const unsigned short *gsm_locking_shift[] =
+{
+	default_gsm,
+	turkish_gsm,
+	default_gsm,
+	portuguese_gsm
+};
+
+static struct codepoint *unicode_locking_shift[] =
+{
+	default_unicode,
+	turkish_unicode,
+	default_unicode,
+	portuguese_unicode
 };
 
 #define UTF8_LENGTH(c) \
 	((c) < 0x80 ? 1 : \
 	 ((c) < 0x800 ? 2 : 3))
 
-static unsigned short gsm_extension_table_lookup(unsigned char k)
+#define GUND 0xFFFF
+
+static int compare_codepoints(const void *a, const void *b)
 {
-	static unsigned int ext_table_len =
-		(sizeof(gsm_extension) / sizeof(unsigned short)) >> 1;
-	unsigned int i;
-	unsigned short *t;
+	const struct codepoint *ca = (const struct codepoint *)a;
+	const struct codepoint *cb = (const struct codepoint *)b;
 
-	for (i = 0, t = gsm_extension; i < ext_table_len; i++) {
-		if (t[0] == k)
-			return t[1];
-		t += 2;
-	}
+	return (ca->from > cb->from) - (ca->from < cb->from);
+}
+
+static unsigned short codepoint_lookup(struct codepoint *key,
+					struct codepoint *table,
+					unsigned int len)
+{
+	struct codepoint *result = NULL;
+
+	result = bsearch(key, table, len, sizeof(struct codepoint),
+				compare_codepoints);
+
+	return result ? result->to : GUND;
+}
 
-	return 0;
+static unsigned short gsm_locking_shift_lookup(unsigned char k, int lang)
+{
+	/* If the language is not defined in 3GPP TS 23.038,
+	 * implementations are instructed to ignore it */
+	int variant = lang < KNOWN_VARIANTS ? lang : 0;
+
+	return gsm_locking_shift[variant][k];
+}
+
+static unsigned short gsm_single_shift_lookup(unsigned char k, int lang)
+{
+	struct codepoint key = { k, 0 };
+	int variant = lang < KNOWN_VARIANTS ? lang : 0;
+
+	return codepoint_lookup(&key, gsm_single_shift[variant].table,
+				gsm_single_shift[variant].len);
+}
+
+static unsigned short unicode_locking_shift_lookup(unsigned short k, int lang)
+{
+	struct codepoint key = { k, 0 };
+	int variant = lang < KNOWN_VARIANTS ? lang : 0;
+
+	return codepoint_lookup(&key, unicode_locking_shift[variant], 128);
+}
+
+static unsigned short unicode_single_shift_lookup(unsigned short k, int lang)
+{
+	struct codepoint key = { k, 0 };
+	int variant = lang < KNOWN_VARIANTS ? lang : 0;
+
+	return codepoint_lookup(&key, unicode_single_shift[variant].table,
+				unicode_single_shift[variant].len);
 }
 
 /*!
- * Converts text coded using GSM codec into UTF8 encoded text.  If len
- * is less than 0, and terminator character is given, the length is
- * computed automatically.
+ * Converts text coded using GSM codec into UTF8 encoded text, using
+ * the given language identifiers for single shift and locking shift
+ * tables.  If len is less than 0, and terminator character is given,
+ * the length is computed automatically.
  *
  * Returns newly-allocated UTF8 encoded string or NULL if the conversion
  * could not be performed.  Returns the number of bytes read from the
@@ -177,9 +559,11 @@ static unsigned short gsm_extension_table_lookup(unsigned char k)
  * encoded string in items_written (if not NULL) not including the terminal
  * '\0' character.  The caller is reponsible for freeing the returned value.
  */
-char *convert_gsm_to_utf8(const unsigned char *text, long len,
-				long *items_read, long *items_written,
-				unsigned char terminator)
+char *convert_gsm_to_utf8_with_lang(const unsigned char *text, long len,
+					long *items_read, long *items_written,
+					unsigned char terminator,
+					unsigned int locking_lang,
+					unsigned int single_lang)
 {
 	char *res = NULL;
 	char *out;
@@ -209,12 +593,12 @@ char *convert_gsm_to_utf8(const unsigned char *text, long len,
 			if (i >= len)
 				goto err_out;
 
-			c = gsm_extension_table_lookup(text[i]);
+			c = gsm_single_shift_lookup(text[i], single_lang);
 
-			if (c == 0)
+			if (c == GUND)
 				goto err_out;
 		} else {
-			c = gsm_table[text[i]];
+			c = gsm_locking_shift_lookup(text[i], locking_lang);
 		}
 
 		res_length += UTF8_LENGTH(c);
@@ -232,9 +616,9 @@ char *convert_gsm_to_utf8(const unsigned char *text, long len,
 		unsigned short c;
 
 		if (text[i] == 0x1b)
-			c = gsm_extension_table_lookup(text[++i]);
+			c = gsm_single_shift_lookup(text[++i], single_lang);
 		else
-			c = gsm_table[text[i]];
+			c = gsm_locking_shift_lookup(text[i], locking_lang);
 
 		out += g_unichar_to_utf8(c, out);
 
@@ -253,22 +637,13 @@ err_out:
 	return res;
 }
 
-static unsigned short unicode_to_gsm(unsigned short c)
+char *convert_gsm_to_utf8(const unsigned char *text, long len,
+				long *items_read, long *items_written,
+				unsigned char terminator)
 {
-	static int greek_unicode_size = sizeof(greek_unicode_table) /
-					sizeof(unsigned short);
-	unsigned short converted = GUND;
-
-	if (c == 0x20AC)
-		converted = 0x1B65;
-	else if (c < 256)
-		converted = unicode_256_table[c];
-	else if ((c >= greek_unicode_offset) &&
-			(c < (greek_unicode_offset + greek_unicode_size))) {
-		converted = greek_unicode_table[c-greek_unicode_offset];
-	}
-
-	return converted;
+	return convert_gsm_to_utf8_with_lang(text, len, items_read,
+						items_written,
+						terminator, 0, 0);
 }
 
 /*!
@@ -281,9 +656,11 @@ static unsigned short unicode_to_gsm(unsigned short c)
  * the actual number of bytes read.  If items_written is not NULL, contains
  * the number of bytes written.
  */
-unsigned char *convert_utf8_to_gsm(const char *text, long len,
+unsigned char *convert_utf8_to_gsm_with_lang(const char *text, long len,
 					long *items_read, long *items_written,
-					unsigned char terminator)
+					unsigned char terminator,
+					unsigned int locking_lang,
+					unsigned int single_lang)
 {
 	long nchars = 0;
 	const char *in;
@@ -306,7 +683,10 @@ unsigned char *convert_utf8_to_gsm(const char *text, long len,
 		if (c > 0xffff)
 			goto err_out;
 
-		converted = unicode_to_gsm(c);
+		converted = unicode_locking_shift_lookup(c, locking_lang);
+
+		if (converted == GUND)
+			converted = unicode_single_shift_lookup(c, single_lang);
 
 		if (converted == GUND)
 			goto err_out;
@@ -332,7 +712,11 @@ unsigned char *convert_utf8_to_gsm(const char *text, long len,
 
 		gunichar c = g_utf8_get_char(in);
 
-		converted = unicode_to_gsm(c);
+		converted = unicode_locking_shift_lookup(c, locking_lang);
+
+		if (converted == GUND)
+			converted = unicode_single_shift_lookup(c, single_lang);
+
 		if (converted & 0x1b00) {
 			*out = 0x1b;
 			++out;
@@ -357,6 +741,15 @@ err_out:
 	return res;
 }
 
+unsigned char *convert_utf8_to_gsm(const char *text, long len,
+					long *items_read, long *items_written,
+					unsigned char terminator)
+{
+	return convert_utf8_to_gsm_with_lang(text, len, items_read,
+						items_written,
+						terminator, 0, 0);
+}
+
 /*!
  * Decodes the hex encoded data and converts to a byte array.  If terminator
  * is not 0, the terminator character is appended to the end of the result.
@@ -779,14 +1172,14 @@ char *sim_string_to_utf8(const unsigned char *buffer, int length)
 			if (i >= length)
 				return NULL;
 
-			c = gsm_extension_table_lookup(buffer[i++]);
+			c = gsm_single_shift_lookup(buffer[i++], 0);
 
 			if (c == 0)
 				return NULL;
 
 			j += 2;
 		} else {
-			c = gsm_table[buffer[i++]];
+			c = gsm_locking_shift_lookup(buffer[i++], 0);
 			j += 1;
 		}
 
@@ -816,9 +1209,9 @@ char *sim_string_to_utf8(const unsigned char *buffer, int length)
 			c = (buffer[i++] & 0x7f) + ucs2_offset;
 		else if (buffer[i] == 0x1b) {
 			++i;
-			c = gsm_extension_table_lookup(buffer[i++]);
+			c = gsm_single_shift_lookup(buffer[i++], 0);
 		} else
-			c = gsm_table[buffer[i++]];
+			c = gsm_locking_shift_lookup(buffer[i++], 0);
 
 		out += g_unichar_to_utf8(c, out);
 	}
diff --git a/src/util.h b/src/util.h
index 46bb3ba..1d2e01f 100644
--- a/src/util.h
+++ b/src/util.h
@@ -21,9 +21,20 @@
 
 char *convert_gsm_to_utf8(const unsigned char *text, long len, long *items_read,
 				long *items_written, unsigned char terminator);
+
+char *convert_gsm_to_utf8_with_lang(const unsigned char *text, long len, long *items_read,
+				long *items_written, unsigned char terminator,
+				unsigned int locking_shift_lang,
+				unsigned int single_shift_lang);
+
 unsigned char *convert_utf8_to_gsm(const char *text, long len, long *items_read,
 				long *items_written, unsigned char terminator);
 
+unsigned char *convert_utf8_to_gsm_with_lang(const char *text, long len, long *items_read,
+				long *items_written, unsigned char terminator,
+				unsigned int locking_shift_lang,
+				unsigned int single_shifth_lang);
+
 unsigned char *decode_hex_own_buf(const char *in, long len, long *items_written,
 					unsigned char terminator,
 					unsigned char *buf);
-- 
1.6.0.4


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #3: 0002-Add-unit-test-for-Turkish-variant.patch --]
[-- Type: text/x-patch, Size: 5320 bytes --]

From ac26628926f012a929974f67958ac1e19c388c5e Mon Sep 17 00:00:00 2001
From: Aki Niemi <aki.niemi@nokia.com>
Date: Fri, 4 Sep 2009 17:35:43 +0300
Subject: [PATCH] Add unit test for Turkish variant

Also fixes an error in the default table unit test.
---
 unit/test-util.c |  217 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 216 insertions(+), 1 deletions(-)

diff --git a/unit/test-util.c b/unit/test-util.c
index cc96442..7b9a372 100644
--- a/unit/test-util.c
+++ b/unit/test-util.c
@@ -45,7 +45,7 @@ unsigned short gsm_to_unicode_map[] =
 0x06,	0x00F9,
 0x07,	0x00EC,
 0x08,	0x00F2,
-0x09,	0x00E7,
+0x09,	0x00C7,
 0x0A,	0x000A,
 0x0B,	0x00D8,
 0x0C,	0x00F8,
@@ -176,6 +176,159 @@ unsigned short gsm_to_unicode_map[] =
 0x7F,	0x00E0,
 };
 
+unsigned short gsm_turkish_to_unicode_map[] =
+{
+0x00, 0x0040,
+0x01, 0x00A3,
+0x02, 0x0024,
+0x03, 0x00A5,
+0x04, 0x20AC,
+0x05, 0x00E9,
+0x06, 0x00F9,
+0x07, 0x0131,
+0x08, 0x00F2,
+0x09, 0x00C7,
+0x0A, 0x000A,
+0x0B, 0x011E,
+0x0C, 0x011F,
+0x0D, 0x000D,
+0x0E, 0x00C5,
+0x0F, 0x00E5,
+0x10, 0x0394,
+0x11, 0x005F,
+0x12, 0x03A6,
+0x13, 0x0393,
+0x14, 0x039B,
+0x15, 0x03A9,
+0x16, 0x03A0,
+0x17, 0x03A8,
+0x18, 0x03A3,
+0x19, 0x0398,
+0x1A, 0x039E,
+/* We're not including some of the single shift codes in this map,
+ * because the Turkish variant isn't symmetric, i.e., the same
+ * character is present in both the locking shift table and the
+ * single shift table */
+0x1B0A, 0x000C,
+0x1B14, 0x005E,
+0x1B28, 0x007B,
+0x1B29, 0x007D,
+0x1B2F, 0x005C,
+0x1B3C, 0x005B,
+0x1B3D, 0x007E,
+0x1B3E, 0x005D,
+0x1B40, 0x007C,
+/*0x1B47, 0x011E,*/
+/*0x1B49, 0x0130,*/
+/*0x1B53, 0x015E,*/
+/*0x1B63, 0x00E7,*/
+/*0x1B65, 0x20AC,*/
+/*0x1B67, 0x011F,*/
+/*0x1B69, 0x0131,*/
+/*0x1B73, 0x015F,*/
+0x1C, 0x015E,
+0x1D, 0x015F,
+0x1E, 0x00DF,
+0x1F, 0x00C9,
+0x20, 0x0020,
+0x21, 0x0021,
+0x22, 0x0022,
+0x23, 0x0023,
+0x24, 0x00A4,
+0x25, 0x0025,
+0x26, 0x0026,
+0x27, 0x0027,
+0x28, 0x0028,
+0x29, 0x0029,
+0x2A, 0x002A,
+0x2B, 0x002B,
+0x2C, 0x002C,
+0x2D, 0x002D,
+0x2E, 0x002E,
+0x2F, 0x002F,
+0x30, 0x0030,
+0x31, 0x0031,
+0x32, 0x0032,
+0x33, 0x0033,
+0x34, 0x0034,
+0x35, 0x0035,
+0x36, 0x0036,
+0x37, 0x0037,
+0x38, 0x0038,
+0x39, 0x0039,
+0x40, 0x0130,
+0x3A, 0x003A,
+0x3B, 0x003B,
+0x3C, 0x003C,
+0x3D, 0x003D,
+0x3E, 0x003E,
+0x3F, 0x003F,
+0x40, 0x0130,
+0x41, 0x0041,
+0x42, 0x0042,
+0x43, 0x0043,
+0x44, 0x0044,
+0x45, 0x0045,
+0x46, 0x0046,
+0x47, 0x0047,
+0x48, 0x0048,
+0x49, 0x0049,
+0x4A, 0x004A,
+0x4B, 0x004B,
+0x4C, 0x004C,
+0x4D, 0x004D,
+0x4E, 0x004E,
+0x4F, 0x004F,
+0x50, 0x0050,
+0x51, 0x0051,
+0x52, 0x0052,
+0x53, 0x0053,
+0x54, 0x0054,
+0x55, 0x0055,
+0x56, 0x0056,
+0x57, 0x0057,
+0x58, 0x0058,
+0x59, 0x0059,
+0x5A, 0x005A,
+0x5B, 0x00C4,
+0x5C, 0x00D6,
+0x5D, 0x00D1,
+0x5E, 0x00DC,
+0x5F, 0x00A7,
+0x60, 0x00E7,
+0x61, 0x0061,
+0x62, 0x0062,
+0x63, 0x0063,
+0x64, 0x0064,
+0x65, 0x0065,
+0x66, 0x0066,
+0x67, 0x0067,
+0x68, 0x0068,
+0x69, 0x0069,
+0x6A, 0x006A,
+0x6B, 0x006B,
+0x6C, 0x006C,
+0x6D, 0x006D,
+0x6E, 0x006E,
+0x6F, 0x006F,
+0x70, 0x0070,
+0x71, 0x0071,
+0x72, 0x0072,
+0x73, 0x0073,
+0x74, 0x0074,
+0x75, 0x0075,
+0x76, 0x0076,
+0x77, 0x0077,
+0x78, 0x0078,
+0x79, 0x0079,
+0x7A, 0x007A,
+0x7B, 0x00E4,
+0x7C, 0x00F6,
+0x7D, 0x00F1,
+0x7E, 0x00FC,
+0x7F, 0x00E0
+};
+
 #define UTF8_LENGTH(c) \
 	((c) < 0x80 ? 1 : \
 	 ((c) < 0x800 ? 2 : 3))
@@ -268,6 +421,67 @@ static void test_valid()
 	}
 }
 
+/* Round-trip each entry of gsm_turkish_to_unicode_map: decode the GSM code
+ * with language 1 for both locking and single shift, check the code point,
+ * then encode the UTF-8 result back and compare with the original code. */
+static void test_valid_turkish()
+{
+	long nwritten, nread, size;
+	char *res;
+	int i;
+	gunichar *verify;
+	unsigned char *back;
+	unsigned char buf[2];
+	int map_size =
+		sizeof(gsm_turkish_to_unicode_map) / sizeof(unsigned short) / 2;
+
+	for (i = 0; i < map_size; i++) {
+		unsigned short c = gsm_turkish_to_unicode_map[i*2];
+		/* Extension codes are stored as 0x1Bxx; compare the whole high
+		 * byte so that stray bits cannot be mistaken for an escape */
+		gboolean escaped = (c & 0xff00) == 0x1b00;
+
+		if (escaped) {
+			buf[0] = 0x1b;
+			buf[1] = c & 0x7f;
+			size = 2;
+		} else {
+			size = 1;
+			buf[0] = c & 0x7f;
+		}
+
+		res = convert_gsm_to_utf8_with_lang(buf, size, &nread,
+							&nwritten, 0, 1, 1);
+		g_assert(res);
+
+		if (g_test_verbose())
+			g_print("size: %ld, nread:%ld, nwritten:%ld, %s\n",
+				size, nread, nwritten, res);
+
+		g_assert(nread == size);
+
+		verify = g_utf8_to_ucs4(res, -1, NULL, NULL, NULL);
+		g_assert(verify);
+		g_assert(verify[0] == gsm_turkish_to_unicode_map[i*2+1]);
+		g_assert(verify[1] == 0);
+		g_assert(nwritten == UTF8_LENGTH(verify[0]));
+
+		back = convert_utf8_to_gsm_with_lang(res, -1, &nread,
+							&nwritten, 0, 1, 1);
+		g_assert(back);
+		g_assert(nwritten == size);
+		if (escaped) {
+			g_assert(back[0] == 0x1b);
+			g_assert(back[1] == (c & 0x7f));
+		} else {
+			g_assert(back[0] == (c & 0x7f));
+		}
+		g_free(back);
+		g_free(verify);
+		g_free(res);
+	}
+}
+
 static const char hex_packed[] = "493A283D0795C3F33C88FE06C9CB6132885EC6D34"
 					"1EDF27C1E3E97E7207B3A0C0A5241E377BB1D"
 					"7693E72E";
@@ -693,6 +907,7 @@ int main(int argc, char **argv)
 
 	g_test_add_func("/testutil/Invalid Conversions", test_invalid);
 	g_test_add_func("/testutil/Valid Conversions", test_valid);
+	g_test_add_func("/testutil/Valid Turkish National Variant Conversions", test_valid_turkish);
 	g_test_add_func("/testutil/Decode Encode", test_decode_encode);
 	g_test_add_func("/testutil/Pack Size", test_pack_size);
 	g_test_add_func("/testutil/CBS CR Handling", test_cr_handling);
-- 
1.6.0.4


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #4: 0003-Use-SMS-national-language-identifier-when-decoding.patch --]
[-- Type: text/x-patch, Size: 4091 bytes --]

From fa1bf5bad838ec628cd9e48b43a4607f2e47373c Mon Sep 17 00:00:00 2001
From: Aki Niemi <aki.niemi@nokia.com>
Date: Fri, 4 Sep 2009 17:36:54 +0300
Subject: [PATCH] Use SMS national language identifier when decoding

Adds support for decoding SMSs encoded using national language single
shift and locking shift tables.
---
 src/smsutil.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/smsutil.h |    9 ++++++++
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/src/smsutil.c b/src/smsutil.c
index fcff9aa..2433419 100644
--- a/src/smsutil.c
+++ b/src/smsutil.c
@@ -1936,6 +1936,59 @@ gboolean sms_extract_concatenation(const struct sms *sms, guint16 *ref_num,
 	return TRUE;
 }
 
+/*
+ * Reads the national language identifiers from the user data header of
+ * an SMS.  Both outputs start as SMS_LANGUAGE_DEFAULT and take the one
+ * octet value of a single shift or locking shift IE when one is found.
+ * Returns FALSE if the header iterator cannot be set up, TRUE otherwise.
+ * NOTE(review): values are stored as received, without a range check;
+ * confirm that callers cope with identifiers they do not know.
+ */
+gboolean sms_extract_language_variant(const struct sms *sms, int *locking,
+					int *single)
+{
+	struct sms_udh_iter iter;
+	enum sms_iei iei;
+	guint8 variant;
+
+	*locking = SMS_LANGUAGE_DEFAULT;
+	*single = SMS_LANGUAGE_DEFAULT;
+
+	/* A malformed header must be disregarded as a whole:
+	 * If the length of the User Data Header is such that there
+	 * are too few or too many octets in the final Information
+	 * Element then the whole User Data Header shall be ignored.
+	 */
+	if (!sms_udh_iter_init(sms, &iter))
+		return FALSE;
+
+	/* No early exit from the loop, so a later IE of the same type
+	 * overrides an earlier one, as the specification requires:
+	 * In the event that IEs determined as not repeatable are
+	 * duplicated, the last occurrence of the IE shall be used.
+	 */
+	for (; (iei = sms_udh_iter_get_ie_type(&iter)) != SMS_IEI_INVALID;
+			sms_udh_iter_next(&iter)) {
+		int *out;
+
+		if (iei == SMS_IEI_NATIONAL_LANGUAGE_SINGLE_SHIFT)
+			out = single;
+		else if (iei == SMS_IEI_NATIONAL_LANGUAGE_LOCKING_SHIFT)
+			out = locking;
+		else
+			continue;
+
+		/* Only an IE of exactly one octet is accepted */
+		if (sms_udh_iter_get_ie_length(&iter) != 1)
+			continue;
+
+		sms_udh_iter_get_ie_data(&iter, &variant);
+		*out = variant;
+	}
+
+	return TRUE;
+}
+
 /*!
  * Decodes a list of SMSes that contain a datagram.  The list must be
  * sorted in order of the sequence number.  This function assumes that
@@ -2063,6 +2116,8 @@ char *sms_decode_text(GSList *sms_list)
 		if (charset == SMS_CHARSET_7BIT) {
 			unsigned char buf[160];
 			long written;
+			int locking_shift = 0;
+			int single_shift = 0;
 			int max_chars = sms_text_capacity_gsm(udl, taken);
 
 			unpack_7bit_own_buf(ud + taken, udl_in_bytes - taken,
@@ -2073,8 +2128,12 @@ char *sms_decode_text(GSList *sms_list)
 			if (buf[written-1] == 0x1b)
 				written = written - 1;
 
-			converted = convert_gsm_to_utf8(buf, written,
-							NULL, NULL, 0);
+			sms_extract_language_variant(sms, &locking_shift, &single_shift);
+
+			converted = convert_gsm_to_utf8_with_lang(buf, written,
+								NULL, NULL, 0,
+								locking_shift,
+								single_shift);
 		} else {
 			const gchar *from = (const gchar *)(ud + taken);
 			/* According to the spec: A UCS2 character shall not be
diff --git a/src/smsutil.h b/src/smsutil.h
index 95d0c78..d23772b 100644
--- a/src/smsutil.h
+++ b/src/smsutil.h
@@ -153,6 +153,13 @@ enum sms_charset {
 	SMS_CHARSET_UCS2 = 2,
 };
 
+enum sms_language {			/* national language identifiers */
+	SMS_LANGUAGE_DEFAULT = 0,	/* initial value of both shift outputs */
+	SMS_LANGUAGE_TURKISH = 1,
+	SMS_LANGUAGE_SPANISH = 2,
+	SMS_LANGUAGE_PORTUGUESE = 3
+};
+
 enum sms_mwi_type {
 	SMS_MWI_TYPE_VOICE = 0,
 	SMS_MWI_TYPE_FAX = 1,
@@ -450,6 +457,8 @@ gboolean sms_extract_app_port(const struct sms *sms, int *dst, int *src,
 				gboolean *is_8bit);
 gboolean sms_extract_concatenation(const struct sms *sms, guint16 *ref_num,
 					guint8 *max_msgs, guint8 *seq_num);
+gboolean sms_extract_language_variant(const struct sms *sms, int *locking,
+					int *single);
 
 unsigned char *sms_decode_datagram(GSList *sms_list, long *out_len);
 char *sms_decode_text(GSList *sms_list);
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: Add support for SMS national language identifiers
  2009-09-04 15:27 Add support for SMS national language identifiers Aki Niemi
@ 2009-09-04 16:21 ` Denis Kenzior
  2009-09-07 11:49   ` Aki Niemi
  0 siblings, 1 reply; 3+ messages in thread
From: Denis Kenzior @ 2009-09-04 16:21 UTC (permalink / raw)
  To: ofono

[-- Attachment #1: Type: text/plain, Size: 2075 bytes --]

Hi Aki,

> Hi All,
>
> Here is a set of patches to add support for decoding SMSs that have
> been encoded using national language tables instead of the default GSM
> 7bit tables. I was planning to push a couple of these patches
> directly, but the changes turned out a bit more extensive than I
> originally thought. Please take a look and comment.

Just a couple of minor nitpicks:
+struct single_shift_table {
+	struct codepoint *table;
+	unsigned int len;
+};
+
+static struct single_shift_table gsm_single_shift[] =
+{
+	{ default_ext_gsm, TABLE_SIZE(default_ext_gsm) },
+	{ turkish_ext_gsm, TABLE_SIZE(turkish_ext_gsm) },
+	{ spanish_ext_gsm, TABLE_SIZE(spanish_ext_gsm) },
+	{ portuguese_ext_gsm, TABLE_SIZE(portuguese_ext_gsm) }
+};
+
+static struct single_shift_table unicode_single_shift[] =
+{
+	{ default_ext_unicode, TABLE_SIZE(default_ext_unicode) },
+	{ turkish_ext_unicode, TABLE_SIZE(turkish_ext_unicode) },
+	{ spanish_ext_unicode, TABLE_SIZE(spanish_ext_unicode) },
+	{ portuguese_ext_unicode, TABLE_SIZE(portuguese_ext_unicode) }
+};
+
+static const unsigned short *gsm_locking_shift[] =
+{
+	default_gsm,
+	turkish_gsm,
+	default_gsm,
+	portuguese_gsm
+};
+
+static struct codepoint *unicode_locking_shift[] =
+{
+	default_unicode,
+	turkish_unicode,
+	default_unicode,
+	portuguese_unicode

Can we put all of these into a single table?

+char *convert_gsm_to_utf8_with_lang(const unsigned char *text, long len,
+					long *items_read, long *items_written,
+					unsigned char terminator,
+					unsigned int locking_lang,
+					unsigned int single_lang)

locking_lang & single_lang should be unsigned char or an enum.

+gboolean sms_extract_language_variant(const struct sms *sms, int *locking,
+					int *single);

locking & single should use guint8.

+enum sms_language {
+	SMS_LANGUAGE_DEFAULT = 0,
+	SMS_LANGUAGE_TURKISH = 1,
+	SMS_LANGUAGE_SPANISH = 2,
+	SMS_LANGUAGE_PORTUGUESE = 3
+};
+

This part isn't used anywhere, should we just keep this out for now?

Regards,
-Denis

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: Add support for SMS national language identifiers
  2009-09-04 16:21 ` Denis Kenzior
@ 2009-09-07 11:49   ` Aki Niemi
  0 siblings, 0 replies; 3+ messages in thread
From: Aki Niemi @ 2009-09-07 11:49 UTC (permalink / raw)
  To: ofono

[-- Attachment #1: Type: text/plain, Size: 2649 bytes --]

Hi Denis,

2009/9/4 Denis Kenzior <denkenz@gmail.com>:
> Just a couple of minor nitpicks:
> +struct single_shift_table {
> +       struct codepoint *table;
> +       unsigned int len;
> +};
> +
> +static struct single_shift_table gsm_single_shift[] =
> +{
> +       { default_ext_gsm, TABLE_SIZE(default_ext_gsm) },
> +       { turkish_ext_gsm, TABLE_SIZE(turkish_ext_gsm) },
> +       { spanish_ext_gsm, TABLE_SIZE(spanish_ext_gsm) },
> +       { portuguese_ext_gsm, TABLE_SIZE(portuguese_ext_gsm) }
> +};
> +
> +static struct single_shift_table unicode_single_shift[] =
> +{
> +       { default_ext_unicode, TABLE_SIZE(default_ext_unicode) },
> +       { turkish_ext_unicode, TABLE_SIZE(turkish_ext_unicode) },
> +       { spanish_ext_unicode, TABLE_SIZE(spanish_ext_unicode) },
> +       { portuguese_ext_unicode, TABLE_SIZE(portuguese_ext_unicode) }
> +};
> +
> +static const unsigned short *gsm_locking_shift[] =
> +{
> +       default_gsm,
> +       turkish_gsm,
> +       default_gsm,
> +       portuguese_gsm
> +};
> +
> +static struct codepoint *unicode_locking_shift[] =
> +{
> +       default_unicode,
> +       turkish_unicode,
> +       default_unicode,
> +       portuguese_unicode
>
> Can we put all of these into a single table?

Can you give an example of what you mean? I tried doing something like
that, but the initializer turned a bit hairy, so I reckoned in the end
there's not much to gain by doing that.

> +char *convert_gsm_to_utf8_with_lang(const unsigned char *text, long len,
> +                                       long *items_read, long *items_written,
> +                                       unsigned char terminator,
> +                                       unsigned int locking_lang,
> +                                       unsigned int single_lang)
>
> locking_lang & single_lang should be unsigned char or an enum.

Sure.

> +gboolean sms_extract_language_variant(const struct sms *sms, int *locking,
> +                                       int *single);
>
> locking & single should use guint8.

Indeed.

> +enum sms_language {
> +       SMS_LANGUAGE_DEFAULT = 0,
> +       SMS_LANGUAGE_TURKISH = 1,
> +       SMS_LANGUAGE_SPANISH = 2,
> +       SMS_LANGUAGE_PORTUGUESE = 3
> +};
> +
>
> This part isn't used anywhere, should we just keep this out for now?

We can leave it out. It'll come in handy if/when we want to add
support on the encoding path.

Cheers,
Aki

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2009-09-07 11:49 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-09-04 15:27 Add support for SMS national language identifiers Aki Niemi
2009-09-04 16:21 ` Denis Kenzior
2009-09-07 11:49   ` Aki Niemi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.