[PATCH]console:UTF-8 mode compatibility fixes

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH]console:UTF-8 mode compatibility fixes
@ 2006-02-17 23:33 Adam Tla/lka
  2006-02-18 10:59 ` Andrew Morton
                   ` (3 more replies)
  0 siblings, 4 replies; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-17 23:33 UTC (permalink / raw)
  To: linux-kernel; +Cc: torvalds


This patch applies to 2.6.15.3 kernel sources to drivers/char/vt.c file.
It should work with other versions too.

Changed console behaviour so that in UTF-8 mode vt100 alternate character
sequences work as described in the terminfo/termcap linux terminal definition.
Programs can use vt100 control sequences - smacs, rmacs and acsc characters -
in UTF-8 mode in the same way as in normal mode, so one definition is always
valid - the current behaviour prevents these sequences from working in UTF-8 mode.

Added reporting of malformed UTF-8 sequences as replacement glyphs.
I think that a terminal should always display something rather than ignoring
this kind of data as it does now. It also sticks to the Unicode standard,
which says that every wrong byte should be reported. It is more human readable
too in the case of Latin subsets including ASCII chars.

Signed-off-by: Adam Tla/lka <atlka@pg.gda.pl>

---

--- drivers/char/vt_orig.c	2006-02-13 11:33:54.000000000 +0100
+++ drivers/char/vt.c	2006-02-17 23:05:50.000000000 +0100
@@ -63,6 +63,12 @@
  *
  * Removed console_lock, enabled interrupts across all console operations
  * 13 March 2001, Andrew Morton
+ *
+ * Fixed UTF-8 mode so alternate charset modes always work without need
+ * of different linux terminal definition for normal and UTF-8 modes
+ * preserving backward US-ASCII and VT100 semigraphics compatibility,
+ * malformed UTF sequences represented as sequences of replacement glyphs
+ * by Adam Tla/lka <atlka@pg.gda.pl>, Feb 2006
  */
 
 #include <linux/module.h>
@@ -1991,17 +1997,23 @@
 		/* Do no translation at all in control states */
 		if (vc->vc_state != ESnormal) {
 			tc = c;
-		} else if (vc->vc_utf) {
+		} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
 		    /* Combine UTF-8 into Unicode */
-		    /* Incomplete characters silently ignored */
+		    /* Malformed sequence represented as replacement glyphs */
+rescan_last_byte:
 		    if(c > 0x7f) {
-			if (vc->vc_utf_count > 0 && (c & 0xc0) == 0x80) {
-				vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
-				vc->vc_utf_count--;
-				if (vc->vc_utf_count == 0)
-				    tc = c = vc->vc_utf_char;
-				else continue;
+			if (vc->vc_utf_count) {
+			       if ((c & 0xc0) == 0x80) {
+				       vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
+       				       if (--vc->vc_utf_count) {
+					       vc->vc_npar++;
+				   	       continue;
+       				       }
+				       tc = c = vc->vc_utf_char;
+			       } else
+				       goto insert_replacement_glyph;
 			} else {
+				vc->vc_npar = 0;
 				if ((c & 0xe0) == 0xc0) {
 				    vc->vc_utf_count = 1;
 				    vc->vc_utf_char = (c & 0x1f);
@@ -2018,12 +2030,13 @@
 				    vc->vc_utf_count = 5;
 				    vc->vc_utf_char = (c & 0x01);
 				} else
-				    vc->vc_utf_count = 0;
+	    			    goto insert_replacement_glyph;
 				continue;
 			      }
 		    } else {
+		      if (vc->vc_utf_count)
+	  		      goto insert_replacement_glyph;
 		      tc = c;
-		      vc->vc_utf_count = 0;
 		    }
 		} else {	/* no utf */
 		  tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
@@ -2040,8 +2053,8 @@
                  * direct-to-font zone in UTF-8 mode.
                  */
                 ok = tc && (c >= 32 ||
-			    (!vc->vc_utf && !(((vc->vc_disp_ctrl ? CTRL_ALWAYS
-						: CTRL_ACTION) >> c) & 1)))
+			    !(vc->vc_disp_ctrl ? (CTRL_ALWAYS >> c) & 1 :
+				  vc->vc_utf || ((CTRL_ACTION >> c) & 1)))
 			&& (c != 127 || vc->vc_disp_ctrl)
 			&& (c != 128+27);
 
@@ -2051,6 +2064,7 @@
 			if ( tc == -4 ) {
                                 /* If we got -4 (not found) then see if we have
                                    defined a replacement character (U+FFFD) */
+insert_replacement_glyph:
                                 tc = conv_uni_to_pc(vc, 0xfffd);
 
 				/* One reason for the -4 can be that we just
@@ -2062,9 +2076,19 @@
                                 /* Bad hash table -- hope for the best */
                                 tc = c;
                         }
-			if (tc & ~charmask)
+			if (tc & ~charmask) {
+				/*  no replacement glyph */
+				if (vc->vc_utf_count) {
+					vc->vc_utf_count = 0;
+					if (vc->vc_npar) {
+						c = orig;
+						goto rescan_last_byte;
+					}
+				}
                                 continue; /* Conversion failed */
+			}
 
+repeat_replacement_glyph:
 			if (vc->vc_need_wrap || vc->vc_decim)
 				FLUSH
 			if (vc->vc_need_wrap) {
@@ -2088,6 +2112,15 @@
 				vc->vc_x++;
 				draw_to = (vc->vc_pos += 2);
 			}
+			if (vc->vc_utf_count) {
+				if (vc->vc_npar) {
+					vc->vc_npar--;
+					goto repeat_replacement_glyph;
+				}
+				vc->vc_utf_count = 0;
+				c = orig;
+				goto rescan_last_byte;
+			}
 			continue;
 		}
 		FLUSH

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-17 23:33 [PATCH]console:UTF-8 mode compatibility fixes Adam Tla/lka
@ 2006-02-18 10:59 ` Andrew Morton
  2006-02-18 16:01   ` Adam Tlałka
  2006-02-18 14:17 ` Alexander E. Patrakov
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 21+ messages in thread
From: Andrew Morton @ 2006-02-18 10:59 UTC (permalink / raw)
  To: Adam Tla/lka; +Cc: linux-kernel, torvalds

Adam Tla/lka <atlka@pg.gda.pl> wrote:
>
> 
> This patch applies to 2.6.15.3 kernel sources to drivers/char/vt.c file.
> It should work with other versions too.
> 
> Changed console behaviour so in UTF-8 mode vt100 alternate character
> sequences work as described in terminfo/termcap linux terminal definition.
> Programs can use vt100 control seqences - smacs, rmacs and acsc  characters
> in UTF-8 mode in the same way as in normal mode so one definition is always
> valid - current behaviour make these seqences not working in UTF-8 mode.
> 
> Added reporting malformed UTF-8 seqences as replacement glyphs.
> I think that terminal should always display something rather then ignoring
> these kind of data as it does now. Also it sticks to Unicode standards
> saying that every wrong byte should be reported. It is more human readable
> too in case of Latin subsets including ASCII chars.
> 
> ...
>
> -		} else if (vc->vc_utf) {
> +		} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
>  		    /* Combine UTF-8 into Unicode */
> -		    /* Incomplete characters silently ignored */
> +		    /* Malformed sequence represented as replacement glyphs */
> +rescan_last_byte:
>  		    if(c > 0x7f) {
>
> ...
>
> +					if (vc->vc_npar) {
> +						c = orig;
> +						goto rescan_last_byte;
> +					}
>
> ...
>
> +				}
> +				vc->vc_utf_count = 0;
> +				c = orig;
> +				goto rescan_last_byte;
> +			}
>  			continue;
>  		}

I spent some time trying to work out why this cannot cause an infinite loop
and gave up.  Can you explain?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-18 10:59 ` Andrew Morton
@ 2006-02-18 16:01   ` Adam Tlałka
  2006-02-19  4:24     ` Alexander E. Patrakov
  0 siblings, 1 reply; 21+ messages in thread
From: Adam Tlałka @ 2006-02-18 16:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, torvalds

Użytkownik Andrew Morton napisał:

> Adam Tla/lka <atlka@pg.gda.pl> wrote:
> 
>>
>>This patch applies to 2.6.15.3 kernel sources to drivers/char/vt.c file.
>>It should work with other versions too.
>>
>>Changed console behaviour so in UTF-8 mode vt100 alternate character
>>sequences work as described in terminfo/termcap linux terminal definition.
>>Programs can use vt100 control seqences - smacs, rmacs and acsc  characters
>>in UTF-8 mode in the same way as in normal mode so one definition is always
>>valid - current behaviour make these seqences not working in UTF-8 mode.
>>
>>Added reporting malformed UTF-8 seqences as replacement glyphs.
>>I think that terminal should always display something rather then ignoring
>>these kind of data as it does now. Also it sticks to Unicode standards
>>saying that every wrong byte should be reported. It is more human readable
>>too in case of Latin subsets including ASCII chars.
>>
>>...
>>
>>-		} else if (vc->vc_utf) {
>>+		} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
>> 		    /* Combine UTF-8 into Unicode */
>>-		    /* Incomplete characters silently ignored */
>>+		    /* Malformed sequence represented as replacement glyphs */
>>+rescan_last_byte:
>> 		    if(c > 0x7f) {
>>
>>...
>>
>>+					if (vc->vc_npar) {
>>+						c = orig;
>>+						goto rescan_last_byte;
>>+					}
>>
>>...
>>
>>+				}
>>+				vc->vc_utf_count = 0;
>>+				c = orig;
>>+				goto rescan_last_byte;
>>+			}
>> 			continue;
>> 		}
> 
> 
> I spent some time trying to work out why this cannot cause an infinite loop
> and gave up.  Can you explain?

1. this code is executed only if vc_utf_count != 0,
which means an incomplete UTF-8 sequence, because in the case of a proper UTF-8
sequence or normal mode vc_utf_count == 0 at these places in the code.

2. vc_npar is not used while completing a UTF-8 sequence, so I used it as a
counter of scanned sequence continuation bytes; it is set to 0 when the
beginning of a UTF-8 sequence is detected and vc_utf_count is set to the
number of continuation bytes

3. when the replacement glyph can't be displayed, the bad sequence is ignored
as previously, so vc_utf_count and vc_npar are zeroed in the case of a malformed
UTF-8 sequence and there is no loop - anyway, the replacement glyph
should always be defined IMHO, or I must change this code, because
it does not seem correct to use c as tc as a last resort: in
this case c is the byte value which made the scanned sequence malformed, so it
is not valuable for us. Maybe the better way is to use the "?" char as a last
resort instead of the c value. What do you think?
Maybe I should remember all bytes of the UTF-8 sequence to use their
values as a last resort char in the case of a malformed sequence and 0xfffd
not being defined?

Regards
-- 
Adam Tlałka       mailto:atlka@pg.gda.pl    ^v^ ^v^ ^v^
System  & Network Administration Group       - - - ~~~~~~
Computer Center,  Gdańsk University of Technology, Poland
PGP public key:   finger atlka@sunrise.pg.gda.pl

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-18 16:01   ` Adam Tlałka
@ 2006-02-19  4:24     ` Alexander E. Patrakov
  2006-02-19 12:45       ` Adam Tla/lka
  2006-02-19 16:16       ` Adam Tla/lka
  0 siblings, 2 replies; 21+ messages in thread
From: Alexander E. Patrakov @ 2006-02-19  4:24 UTC (permalink / raw)
  To: Adam Tlałka; +Cc: linux-kernel, torvalds

Adam Tlałka wrote:

> Maybe I should remember all bytes of the UTF-sequence to use their 
> values as a last resort char in case of malformed sequence and 0xfffd
> not defined?

Please don't do that. Display question marks instead in the case when 
0xfffd is not defined.

-- 
Alexander E. Patrakov

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-19  4:24     ` Alexander E. Patrakov
@ 2006-02-19 12:45       ` Adam Tla/lka
  2006-02-19 16:16       ` Adam Tla/lka
  1 sibling, 0 replies; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-19 12:45 UTC (permalink / raw)
  To: Alexander E. Patrakov; +Cc: linux-kernel, torvalds

On Sun, Feb 19, 2006 at 09:24:26AM +0500, Alexander E. Patrakov wrote:
> Adam Tlałka wrote:
> 
> >Maybe I should remember all bytes of the UTF-sequence to use their 
> >values as a last resort char in case of malformed sequence and 0xfffd
> >not defined?
> 
> Please don't do that. Display question marks instead in the case when 
> 0xfffd is not defined.

But this is the normal console behaviour in the case of a non-existent replacement
glyph for some char. I just did not change it. Anyway, if you have no '?' glyph
at the '?' char code position you get some different char on the screen.
The replacement glyph should always be defined, as I said before,
so this part of the code could easily be removed, but I do not really know
if this assumption is true now.

Regards 
-- 
Adam Tlałka      mailto:atlka@pg.gda.pl    ^v^ ^v^ ^v^
System  & Network Administration Group           ~~~~~~
Computer Center,  Gdańsk University of Technology, Poland
PGP public key:   finger atlka@sunrise.pg.gda.pl

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-19  4:24     ` Alexander E. Patrakov
  2006-02-19 12:45       ` Adam Tla/lka
@ 2006-02-19 16:16       ` Adam Tla/lka
  2006-02-19 17:07         ` Alexander E. Patrakov
  1 sibling, 1 reply; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-19 16:16 UTC (permalink / raw)
  To: Alexander E. Patrakov; +Cc: linux-kernel, torvalds

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=utf-8, Size: 1393 bytes --]

On Sun, Feb 19, 2006 at 09:24:26AM +0500, Alexander E. Patrakov wrote:
> Adam Tlałka wrote:
> 
> >Maybe I should remember all bytes of the UTF-sequence to use their 
> >values as a last resort char in case of malformed sequence and 0xfffd
> >not defined?
> 
> Please don't do that. Display question marks instead in the case when 
> 0xfffd is not defined.

Look at the original code. If conv_uni_to_pc fails and there is no replacement
char (after a clear_unimap for example) and we are using US-ASCII, we should
rather see something than sequences of '?' chars.
Maybe I could change this to:

if (tc == -4) {
	if (c < 128)
		tc = c;
	else
		tc = '?';
}

What about that?

Remembering the original bytes would be needed if we could then store
them in such a way that a paste from the screen gives us the same sequence as
was in the input. With the current console design this is impossible in the case
of correct UTF-8 sequences containing undisplayable glyphs or of malformed
sequences. So I can remove that part of the patch and it will wait until
this functionality is implemented - not so easy, but it can
be done IMHO and is worth it to obtain properly working selection
and copying in UTF-8 mode.

Regards
-- 
Adam Tlałka      mailto:atlka@pg.gda.pl    ^v^ ^v^ ^v^
System  & Network Administration Group           ~~~~~~
Computer Center,  Gdańsk University of Technology, Poland
PGP public key:   finger atlka@sunrise.pg.gda.pl

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-19 16:16       ` Adam Tla/lka
@ 2006-02-19 17:07         ` Alexander E. Patrakov
  0 siblings, 0 replies; 21+ messages in thread
From: Alexander E. Patrakov @ 2006-02-19 17:07 UTC (permalink / raw)
  To: Adam Tla/lka; +Cc: linux-kernel, torvalds

Adam Tla/lka wrote:
> On Sun, Feb 19, 2006 at 09:24:26AM +0500, Alexander E. Patrakov wrote:
>   
>> Adam Tlałka wrote:
>>
>>     
>>> Maybe I should remember all bytes of the UTF-sequence to use their 
>>> values as a last resort char in case of malformed sequence and 0xfffd
>>> not defined?
>>>       
>> Please don't do that. Display question marks instead in the case when 
>> 0xfffd is not defined.
>>     
>
> Look at the original code. If conv_uni_to_pc fails and there is no replacement
> char (after a clear_unimap for example) and we using US-ASCII we rather
> should see something then sequences of '?' chars.
> Maybe I could change this to:
>
> if (tc == -4) {
> 	if (c < 128)
> 		tc = c;
> 	else
> 		tc = '?';
> }
>
> What about that?
>   
I'd let someone else judge, but that is clearly a broken case that just 
has to be declared broken. <joke>could you please also adapt to a font 
that has all glyphs looking as smileys?</joke> But it's only three extra 
lines of code, so let's accept that "c<128" check.
> Remembering of original bytes is needed if we could then remember
> them in a way so paste from screen gives us the same sequence as it was
> in input.
This doesn't match the behaviour of X.
>  With current console design it is impossible is case
> of correct UTF-8 sequences containing undisplayable glyphs or malformed
> sequences.
I agree that, in some cases, it makes sense to copy and paste 
undisplayable glyphs. However, IMHO, this should not be allowed for 
malformed sequences.

-- 
Alexander E. Patrakov


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-17 23:33 [PATCH]console:UTF-8 mode compatibility fixes Adam Tla/lka
  2006-02-18 10:59 ` Andrew Morton
@ 2006-02-18 14:17 ` Alexander E. Patrakov
  2006-02-19  1:53   ` Thomas Dickey
  2006-02-19  5:42   ` Alexander E. Patrakov
       [not found] ` <43F72A1E.1090707@ums.usu.ru>
  2006-02-18 22:35 ` Adam Tla/lka
  3 siblings, 2 replies; 21+ messages in thread
From: Alexander E. Patrakov @ 2006-02-18 14:17 UTC (permalink / raw)
  To: Adam Tla/lka; +Cc: torvalds, bug-ncurses, LKML

[-- Attachment #1: Type: text/plain, Size: 6068 bytes --]

[sorry for repost, the first attempt got blocked due to html attachment, 
now I gzipped it to circumvent the filter]

Adam Tla/lka wrote:
> This patch applies to 2.6.15.3 kernel sources to drivers/char/vt.c file.
> It should work with other versions too.
> 
> Changed console behaviour so in UTF-8 mode vt100 alternate character
> sequences work as described in terminfo/termcap linux terminal definition.
> Programs can use vt100 control seqences - smacs, rmacs and acsc  characters
> in UTF-8 mode in the same way as in normal mode so one definition is always
> valid - current behaviour make these seqences not working in UTF-8 mode.

Doesn't work here with linux-2.6.16-rc3-mm1, ncurses-5.5. BTW has this
been discussed with Thomas Dickey (ncurses maintainer)?

> Added reporting malformed UTF-8 seqences as replacement glyphs.

Works.

> I think that terminal should always display something rather then ignoring
> these kind of data as it does now. Also it sticks to Unicode standards
> saying that every wrong byte should be reported. It is more human readable
> too in case of Latin subsets including ASCII chars.

Another feature request / bug report (spotted while viewing in Lynx a
page containing English text and a few Chinese characters, artificial
testcase attached):

If ncurses attempt to add some Chinese character to the Linux text
screen, Linux (correctly) prints this replacement character and advances
the cursor by one position. Ncurses think that the cursor has moved two
positions forward. The effect is that when you view the testcase in Lynx
(compiled --with-screen=ncursesw) on Linux console and press PageDown,
the fourth line contains "Thek" instead of "The" in the end.

This disagreement has to be solved somehow.

UTF-8 input issues (fixed by a patch originally from
http://chris.heathens.co.nz/linux/utf8.html) are also outstanding.

> Signed-off-by: Adam Tla/lka <atlka@pg.gda.pl>

Not adding my own signed-off-by line here because the acs part doesn't work.

> --- drivers/char/vt_orig.c	2006-02-13 11:33:54.000000000 +0100
> +++ drivers/char/vt.c	2006-02-17 23:05:50.000000000 +0100
> @@ -63,6 +63,12 @@
>   *
>   * Removed console_lock, enabled interrupts across all console operations
>   * 13 March 2001, Andrew Morton
> + *
> + * Fixed UTF-8 mode so alternate charset modes always work without need
> + * of different linux terminal definition for normal and UTF-8 modes
> + * preserving backward US-ASCII and VT100 semigraphics compatibility,
> + * malformed UTF sequences represented as sequences of replacement glyphs
> + * by Adam Tla/lka <atlka@pg.gda.pl>, Feb 2006
>   */
>  
>  #include <linux/module.h>
> @@ -1991,17 +1997,23 @@
>  		/* Do no translation at all in control states */
>  		if (vc->vc_state != ESnormal) {
>  			tc = c;
> -		} else if (vc->vc_utf) {
> +		} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
>  		    /* Combine UTF-8 into Unicode */
> -		    /* Incomplete characters silently ignored */
> +		    /* Malformed sequence represented as replacement glyphs */
> +rescan_last_byte:
>  		    if(c > 0x7f) {
> -			if (vc->vc_utf_count > 0 && (c & 0xc0) == 0x80) {
> -				vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
> -				vc->vc_utf_count--;
> -				if (vc->vc_utf_count == 0)
> -				    tc = c = vc->vc_utf_char;
> -				else continue;
> +			if (vc->vc_utf_count) {
> +			       if ((c & 0xc0) == 0x80) {
> +				       vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
> +       				       if (--vc->vc_utf_count) {
> +					       vc->vc_npar++;
> +				   	       continue;
> +       				       }
> +				       tc = c = vc->vc_utf_char;
> +			       } else
> +				       goto insert_replacement_glyph;
>  			} else {
> +				vc->vc_npar = 0;
>  				if ((c & 0xe0) == 0xc0) {
>  				    vc->vc_utf_count = 1;
>  				    vc->vc_utf_char = (c & 0x1f);
> @@ -2018,12 +2030,13 @@
>  				    vc->vc_utf_count = 5;
>  				    vc->vc_utf_char = (c & 0x01);
>  				} else
> -				    vc->vc_utf_count = 0;
> +	    			    goto insert_replacement_glyph;
>  				continue;
>  			      }
>  		    } else {
> +		      if (vc->vc_utf_count)
> +	  		      goto insert_replacement_glyph;
>  		      tc = c;
> -		      vc->vc_utf_count = 0;
>  		    }
>  		} else {	/* no utf */
>  		  tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
> @@ -2040,8 +2053,8 @@
>                   * direct-to-font zone in UTF-8 mode.
>                   */
>                  ok = tc && (c >= 32 ||
> -			    (!vc->vc_utf && !(((vc->vc_disp_ctrl ? CTRL_ALWAYS
> -						: CTRL_ACTION) >> c) & 1)))
> +			    !(vc->vc_disp_ctrl ? (CTRL_ALWAYS >> c) & 1 :
> +				  vc->vc_utf || ((CTRL_ACTION >> c) & 1)))
>  			&& (c != 127 || vc->vc_disp_ctrl)
>  			&& (c != 128+27);
>  
> @@ -2051,6 +2064,7 @@
>  			if ( tc == -4 ) {
>                                  /* If we got -4 (not found) then see if we have
>                                     defined a replacement character (U+FFFD) */
> +insert_replacement_glyph:
>                                  tc = conv_uni_to_pc(vc, 0xfffd);
>  
>  				/* One reason for the -4 can be that we just
> @@ -2062,9 +2076,19 @@
>                                  /* Bad hash table -- hope for the best */
>                                  tc = c;
>                          }
> -			if (tc & ~charmask)
> +			if (tc & ~charmask) {
> +				/*  no replacement glyph */
> +				if (vc->vc_utf_count) {
> +					vc->vc_utf_count = 0;
> +					if (vc->vc_npar) {
> +						c = orig;
> +						goto rescan_last_byte;
> +					}
> +				}
>                                  continue; /* Conversion failed */
> +			}
>  
> +repeat_replacement_glyph:
>  			if (vc->vc_need_wrap || vc->vc_decim)
>  				FLUSH
>  			if (vc->vc_need_wrap) {
> @@ -2088,6 +2112,15 @@
>  				vc->vc_x++;
>  				draw_to = (vc->vc_pos += 2);
>  			}
> +			if (vc->vc_utf_count) {
> +				if (vc->vc_npar) {
> +					vc->vc_npar--;
> +					goto repeat_replacement_glyph;
> +				}
> +				vc->vc_utf_count = 0;
> +				c = orig;
> +				goto rescan_last_byte;
> +			}
>  			continue;
>  		}
>  		FLUSH

-- 
Alexander E. Patrakov


[-- Attachment #2: test.html.gz --]
[-- Type: application/gzip, Size: 238 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-18 14:17 ` Alexander E. Patrakov
@ 2006-02-19  1:53   ` Thomas Dickey
  2006-02-19  4:33     ` Alexander E. Patrakov
  2006-02-19 11:47     ` Adam Tla/lka
  2006-02-19  5:42   ` Alexander E. Patrakov
  1 sibling, 2 replies; 21+ messages in thread
From: Thomas Dickey @ 2006-02-19  1:53 UTC (permalink / raw)
  To: Alexander E. Patrakov; +Cc: Adam Tla/lka, torvalds, bug-ncurses, LKML

On Sat, 18 Feb 2006, Alexander E. Patrakov wrote:

> [sorry for repost, the first attempt got blocked due to html attachment, now 
> I gzipped it to circumvent the filter]
>
> Adam Tla/lka wrote:
>> This patch applies to 2.6.15.3 kernel sources to drivers/char/vt.c file.
>> It should work with other versions too.
>> 
>> Changed console behaviour so in UTF-8 mode vt100 alternate character
>> sequences work as described in terminfo/termcap linux terminal definition.
>> Programs can use vt100 control seqences - smacs, rmacs and acsc  characters
>> in UTF-8 mode in the same way as in normal mode so one definition is always
>> valid - current behaviour make these seqences not working in UTF-8 mode.

I expect some discussion from the people who _vehemently_ refused to allow
the Linux console to have anything that resembled a mode.

More to the point: since it's been in this form for several years, it 
doesn't do much good to developers, because there are already workarounds 
in ncurses to accommodate this, and even if you fixed it today, it would
be needed in ncurses for a few more years.

For example (man ncurses):

        NCURSES_NO_UTF8_ACS
             During initialization, the  ncurses  library  checks  for  special
             cases  where  VT100  line-drawing (and the corresponding alternate
             character set capabilities) described in the terminfo are known to
             be  missing.   Specifically,  when  running in a UTF-8 locale, the
             Linux console emulator and the GNU screen  program  ignore  these.
             Ncurses checks the TERM environment variable for these.  For other
             special cases, you should set this  environment  variable.   Doing
             this  tells  ncurses to use Unicode values which correspond to the
             VT100 line-drawing glyphs.   That  works  for  the  special  cases
             cited, and is likely to work for terminal emulators.

             When  setting this variable, you should set it to a nonzero value.
             Setting it to zero (or to a nonnumber) disables the special  check
             for Linux and screen.

is the most recent refinement (from a year ago) to a workaround which 
first appeared in ncurses 5.4 (originally from December 2002).

> Doesn't work here with linux-2.6.16-rc3-mm1, ncurses-5.5. BTW has this
> been discussed with Thomas Dickey (ncurses maintainer)?

no (my seeing it via google doesn't count).

> Another feature request / bug report (spotted while viewing in Lynx a
> page containing English text and a few Chinese characters, artificial
> testcase attached):
>
> If ncurses attempt to add some Chinese character to the Linux text
> screen, Linux (correctly) prints this replacement character and advances
> the cursor by one position. Ncurses think that the cursor has moved two
> positions forward. The effect is that when you view the testcase in Lynx
> (compiled --with-screen=ncursesw) on Linux console and press PageDown,
> the fourth line contains "Thek" instead of "The" in the end.
>
> This disagreement has to be solved somehow.

yes.  ncurses has no better information for this than the result from
wcwidth().  Shall we add another kludge to accommodate Linux console?
(Are there other terminal emulators with this specific problem?)

-- 
Thomas E. Dickey
http://invisible-island.net
ftp://invisible-island.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-19  1:53   ` Thomas Dickey
@ 2006-02-19  4:33     ` Alexander E. Patrakov
  2006-02-19 11:47     ` Adam Tla/lka
  1 sibling, 0 replies; 21+ messages in thread
From: Alexander E. Patrakov @ 2006-02-19  4:33 UTC (permalink / raw)
  To: Thomas Dickey; +Cc: Adam Tla/lka, torvalds, bug-ncurses, LKML

Thomas Dickey wrote:

> On Sat, 18 Feb 2006, Alexander E. Patrakov wrote:
>
>> If ncurses attempt to add some Chinese character to the Linux text
>> screen, Linux (correctly) prints this replacement character and advances
>> the cursor by one position. Ncurses think that the cursor has moved two
>> positions forward. The effect is that when you view the testcase in Lynx
>> (compiled --with-screen=ncursesw) on Linux console and press PageDown,
>> the fourth line contains "Thek" instead of "The" in the end.
>>
>> This disagreement has to be solved somehow.
>
>
> yes.  ncurses has no better information for this than the result from
> wcwidth().  Shall we add another kludge to accommodate Linux console?

Maybe yes, since putting wcwidth() into the kernel is a bigger kludge, 
because linux kernel will never draw CJK, and because after glibc 
update, kernel and glibc might disagree upon wcwidth of some characters.

So on linux console, ncurses should draw two 0xfffd characters when a 
character with wcwidth > 1 is requested.

> (Are there other terminal emulators with this specific problem?)

Not sure. I will test putty a bit later. Anyway, an environment variable 
for this kludge may be a good idea.

-- 
Alexander E. Patrakov

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-19  1:53   ` Thomas Dickey
  2006-02-19  4:33     ` Alexander E. Patrakov
@ 2006-02-19 11:47     ` Adam Tla/lka
  2006-02-20  1:20       ` Thomas Dickey
  1 sibling, 1 reply; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-19 11:47 UTC (permalink / raw)
  To: Thomas Dickey; +Cc: Alexander E. Patrakov, torvalds, bug-ncurses, LKML

On Sat, Feb 18, 2006 at 08:53:38PM -0500, Thomas Dickey wrote:
> More to the point: since it's been in this form for several years, it 
> doesn't do much good to developers, because there are already workarounds 
> in ncurses to accommodate this, and even if you fixed it today, it would
> be needed in ncurses for a few more years.
> 
> For example (man ncurses):
> 
>        NCURSES_NO_UTF8_ACS
> ...
>             for Linux and screen.
> 
> is the most recent refinement (from a year ago) to a workaround which 
> first appeared in ncurses 5.4 (originally from December 2002).

OK, but my fix is for all programs, not only curses ones. Anyway, from my point of
view programs should be written so that they are easy to use and work correctly
without special user intervention and knowledge. So the ncurses hack is not
the desired way IMHO. Generally speaking, products should be designed with the
customers' comfort and not the developers'/producers' comfort in mind. But this is
just a wish, of course, in the current world, and off topic.

> >This disagreement has to be solved somehow.
> 
> yes.  ncurses has no better information for this than the result from
> wcwidth().  Shall we add another kludge to accommodate Linux console?
> (Are there other terminal emulators with this specific problem?)

If ncurses knows that this is a two-column-wide character, then for compatibility
and correct handling the kernel console driver should know it too
and send two replacement glyphs or, better, one replacement glyph and one space,
or there should be an n-column replacement glyph implemented
(maybe as a replacement glyph and n-1 additional spaces) as, for example, the putty
xterm-color terminal emulator does.

For correct selecting and pasting from console in UTF-8 mode without
data corruption in case of non displayed glyphs and malformed sequences
console screen contents should be memorized as UCS-2 wide chars plus additional
attributes and color information.

Also there should be some specification - maybe an RFC - of how to correctly handle
malformed UTF-8 sequences so that no information is corrupted during
display/cut/paste/edit operations. If the user edits, then it should be changed,
but if there is no action there should be no change.
So automatic recoding or not displaying something leads to a state where I could
have an improper file name and just can't correct it from the command line because
I can't input the improper sequence from the keyboard - just like in MS Windows -
or I must use some additional software or a graphical tool.

Additionally, a replacement glyph means that a valid glyph can't be displayed
by a device, which doesn't mean that the particular UTF-8 sequence is incorrect.
Of course an improper byte can't be displayed either, because we don't know
what it really means. The result is the same but the reasons are different.
So the results shown on the vt screen hide the real reasons, which is bad IMHO.
But only the replacement glyph is defined in the standard and there is no bad
byte indicator, which is really needed ;-(.

Regards
-- 
Adam Tlałka      mailto:atlka@pg.gda.pl    ^v^ ^v^ ^v^
System  & Network Administration Group           ~~~~~~
Computer Center,  Gdańsk University of Technology, Poland
PGP public key:   finger atlka@sunrise.pg.gda.pl

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-19 11:47     ` Adam Tla/lka
@ 2006-02-20  1:20       ` Thomas Dickey
  2006-03-07 15:05         ` Adam Tlałka
  0 siblings, 1 reply; 21+ messages in thread
From: Thomas Dickey @ 2006-02-20  1:20 UTC (permalink / raw)
  To: Adam Tla/lka; +Cc: Alexander E. Patrakov, torvalds, Ncurses Mailing List, LKML

On Sun, 19 Feb 2006, Adam Tla/lka wrote:

> On Sat, Feb 18, 2006 at 08:53:38PM -0500, Thomas Dickey wrote: OK but my 
> fix is for all not only curses programs. Anyway from my point of view 
> programs should be written in a way so they are easy to use and work 
> correctly without user special intervention and knowledge. So ncurses 
> hack is not

of course (which is why I have xterm doing the same thing).

> a desired way IMHO. Generally saying products should be designed with 
> customers comfort and not developers/producers comfort in mind. But this 
> is just a wish of course in current world and off topic.

;-)

-- 
Thomas E. Dickey
http://invisible-island.net
ftp://invisible-island.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-20  1:20       ` Thomas Dickey
@ 2006-03-07 15:05         ` Adam Tlałka
  0 siblings, 0 replies; 21+ messages in thread
From: Adam Tlałka @ 2006-03-07 15:05 UTC (permalink / raw)
  To: Thomas Dickey
  Cc: Alexander E. Patrakov, torvalds, Ncurses Mailing List, LKML,
	Andrew Morton

Thomas Dickey wrote:

> On Sun, 19 Feb 2006, Adam Tla/lka wrote:
>
>> OK but my fix is for all not only curses programs. Anyway from my
>> point of view programs should be written in a way so they are easy to
>> use and work correctly without user special intervention and
>> knowledge. So ncurses hack is not
>
>
> of course (which is why I have xterm doing the same thing).

Yes, I tested VT100 graphics in UTF8 mode with various terminal
emulator programs - xterm, gnome-terminal, konsole, mlterm, rxvt - and
they all work the same way, accepting ^N as smacs and VT100 graphics
chars. So this is the reason why the Linux console should not break this
compatibility in UTF8 mode either.

There are some programs which use acsc even if they could do it with UTF8
sequences - w3m for example. Without support for
acsc in UTF8 mode, using w3m on the Linux console gives not so good visual
results. Second reason ;-).
There are other programs of course - aptitude for example - working the
same way as w3m.

Using smacs=\e[11m which means smacs == smpch is some kind of a hack and
not the proper solution. Terminal description should describe sequences
interpreted by a device and in case of smacs == smpch sequences like ^N
and ^O do not exist in terminal description but they are interpreted as
before. So the description is not complete. The acsc string should be
set so it sticks to VT100 map in Linux console sources too:

acsc=++\,\,--..00``aaffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz{{||}}~~

Now adding a proper UCS2 to font mapping to the console font in use makes it
work correctly in both normal and UTF8 modes. Other solutions like
modifying acsc to obtain some different chars for nonexistent glyphs are
only partially correct because we have only a half-functioning terminal -
in normal mode we see some chars but in UTF8 mode there are only
replacement glyphs. So only the compatible definition and then correcting
the UCS2 to glyph maps included in console fonts solves the problem IMHO.
Also if we have different definitions on different systems and VT100
compatibility is lost we have problems too.

After some discussion I've sent the final patch

2006-02-27 13:06 [PATCH]console compatibility fixes to UTF-8 mode

and now I am waiting for some reaction.
I am using this patch with my work machines now.

Regards

-- 
Adam Tlałka       mailto:atlka@pg.gda.pl    ^v^ ^v^ ^v^
System  & Network Administration Group       - - - ~~~~~~
Computer Center,  Gdańsk University of Technology, Poland
PGP public key:   finger atlka@sunrise.pg.gda.pl

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-18 14:17 ` Alexander E. Patrakov
  2006-02-19  1:53   ` Thomas Dickey
@ 2006-02-19  5:42   ` Alexander E. Patrakov
  2006-02-19 10:15     ` Adam Tla/lka
  1 sibling, 1 reply; 21+ messages in thread
From: Alexander E. Patrakov @ 2006-02-19  5:42 UTC (permalink / raw)
  To: Adam Tla/lka; +Cc: torvalds, bug-ncurses, LKML

I wrote:

> Adam Tla/lka wrote:

>> Changed console behaviour so in UTF-8 mode vt100 alternate character
>> sequences work as described in terminfo/termcap linux terminal 
>> definition.
>> Programs can use vt100 control seqences - smacs, rmacs and acsc  
>> characters
>> in UTF-8 mode in the same way as in normal mode so one definition is 
>> always
>> valid - current behaviour make these seqences not working in UTF-8 mode.
> 
> 
> Doesn't work here with linux-2.6.16-rc3-mm1, ncurses-5.5.

Sorry, that my non-true statement was due to the less-than-perfect 
description of the patch. After patching, this produces a horizontal line:

echo -e '\x0eqqqq\x0f'

So please correct the description and the first (comment) hunk of the 
patch, so that it doesn't mention "smacs" and similar words with meaning 
that may vary, and so that it mentions the exact control codes.

-- 
Alexander E. Patrakov

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-19  5:42   ` Alexander E. Patrakov
@ 2006-02-19 10:15     ` Adam Tla/lka
  2006-02-19 23:19       ` [PATCH]console:UTF-8 mode compatibility fixes - new version Adam Tla/lka
  0 siblings, 1 reply; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-19 10:15 UTC (permalink / raw)
  To: Alexander E. Patrakov; +Cc: torvalds, bug-ncurses, LKML

On Sun, Feb 19, 2006 at 10:42:38AM +0500, Alexander E. Patrakov wrote:
> Sorry, that my non-true statement was due to the less-than-perfect 
> description of the patch. After patching, this produces a horizontal line:
> 
> echo -e '\x0eqqqq\x0f'
> 
> So please correct the description and the first (comment) hunk of the 
> patch, so that it doesn't mention "smacs" and similar words with meaning 
> that may vary, and so that it mentions the exact control codes.
> 

Ok, you are right.
Here is the corrected description and the new patch:

Fixed UTF-8 mode so alternate charset modes always work according
to control sequences interpreted in do_con_trol function:
smacs = '\x0e', rmacs = '\x0f' if vt100 translation map for alternate
charset is active which means enacs = '\e)0' which preserves
backward US-ASCII and VT100 semigraphics compatibility,
malformed UTF sequences represented as sequences of replacement glyphs
or original codes if replacement glyph is undefined
Signed-off-by: Adam Tla/lka <atlka@pg.gda.pl>

--- drivers/char/vt_orig.c	2006-02-13 11:33:54.000000000 +0100
+++ drivers/char/vt.c	2006-02-19 10:59:27.000000000 +0100
@@ -63,6 +63,15 @@
  *
  * Removed console_lock, enabled interrupts across all console operations
  * 13 March 2001, Andrew Morton
+ *
+ * Fixed UTF-8 mode so alternate charset modes always work according
+ * to control sequences interpreted in do_con_trol function:
+ * smacs = '\x0e', rmacs = '\x0f' if vt100 translation map for alternate
+ * charset is active which means enacs = '\e)0'
+ * preserving backward US-ASCII and VT100 semigraphics compatibility,
+ * malformed UTF sequences represented as sequences of replacement glyphs
+ * or original codes if replacement glyph is undefined
+ * by Adam Tla/lka <atlka@pg.gda.pl>, Feb 2006
  */
 
 #include <linux/module.h>
@@ -1991,17 +2000,26 @@ static int do_con_write(struct tty_struc
 		/* Do no translation at all in control states */
 		if (vc->vc_state != ESnormal) {
 			tc = c;
-		} else if (vc->vc_utf) {
+		} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
 		    /* Combine UTF-8 into Unicode */
-		    /* Incomplete characters silently ignored */
+		    /* Malformed sequence represented as replacement glyphs */
+rescan_last_byte:
 		    if(c > 0x7f) {
-			if (vc->vc_utf_count > 0 && (c & 0xc0) == 0x80) {
-				vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
-				vc->vc_utf_count--;
-				if (vc->vc_utf_count == 0)
-				    tc = c = vc->vc_utf_char;
-				else continue;
+			if (vc->vc_utf_count) {
+			       if ((c & 0xc0) == 0x80) {
+				       vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
+       				       if (--vc->vc_utf_count) {
+					       vc->vc_par[vc->vc_utf_count] = c;
+					       vc->vc_npar++;
+				   	       continue;
+       				       }
+				       tc = c = vc->vc_utf_char;
+			       } else {
+				       c = vc->vc_par[vc->vc_utf_count + vc->vc_npar];
+				       goto insert_replacement_glyph;
+			       }
 			} else {
+				vc->vc_npar = 0;
 				if ((c & 0xe0) == 0xc0) {
 				    vc->vc_utf_count = 1;
 				    vc->vc_utf_char = (c & 0x1f);
@@ -2018,12 +2036,16 @@ static int do_con_write(struct tty_struc
 				    vc->vc_utf_count = 5;
 				    vc->vc_utf_char = (c & 0x01);
 				} else
-				    vc->vc_utf_count = 0;
+	    			    goto insert_replacement_glyph;
+				vc->vc_par[vc->vc_utf_count] = c;
 				continue;
 			      }
 		    } else {
+		      if (vc->vc_utf_count) {
+			      c = vc->vc_par[vc->vc_utf_count + vc->vc_npar];
+	  		      goto insert_replacement_glyph;
+		      }
 		      tc = c;
-		      vc->vc_utf_count = 0;
 		    }
 		} else {	/* no utf */
 		  tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
@@ -2040,8 +2062,8 @@ static int do_con_write(struct tty_struc
                  * direct-to-font zone in UTF-8 mode.
                  */
                 ok = tc && (c >= 32 ||
-			    (!vc->vc_utf && !(((vc->vc_disp_ctrl ? CTRL_ALWAYS
-						: CTRL_ACTION) >> c) & 1)))
+			    !(vc->vc_disp_ctrl ? (CTRL_ALWAYS >> c) & 1 :
+				  vc->vc_utf || ((CTRL_ACTION >> c) & 1)))
 			&& (c != 127 || vc->vc_disp_ctrl)
 			&& (c != 128+27);
 
@@ -2051,6 +2073,7 @@ static int do_con_write(struct tty_struc
 			if ( tc == -4 ) {
                                 /* If we got -4 (not found) then see if we have
                                    defined a replacement character (U+FFFD) */
+insert_replacement_glyph:
                                 tc = conv_uni_to_pc(vc, 0xfffd);
 
 				/* One reason for the -4 can be that we just
@@ -2063,7 +2086,7 @@ static int do_con_write(struct tty_struc
                                 tc = c;
                         }
 			if (tc & ~charmask)
-                                continue; /* Conversion failed */
+				goto check_malformed_sequence;
 
 			if (vc->vc_need_wrap || vc->vc_decim)
 				FLUSH
@@ -2088,6 +2111,16 @@ static int do_con_write(struct tty_struc
 				vc->vc_x++;
 				draw_to = (vc->vc_pos += 2);
 			}
+check_malformed_sequence:
+			if (vc->vc_utf_count) {
+				if (vc->vc_npar) {
+					c = vc->vc_par[vc->vc_utf_count + --vc->vc_npar];
+					goto insert_replacement_glyph;
+				}
+				vc->vc_utf_count = 0;
+				c = orig;
+				goto rescan_last_byte;
+			}
 			continue;
 		}
 		FLUSH

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH]console:UTF-8 mode compatibility fixes - new version
  2006-02-19 10:15     ` Adam Tla/lka
@ 2006-02-19 23:19       ` Adam Tla/lka
  2006-02-20  8:14         ` [PATCH]console:UTF-8 mode compatibility fixes - new version #1 Adam Tla/lka
  0 siblings, 1 reply; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-19 23:19 UTC (permalink / raw)
  To: Alexander E. Patrakov; +Cc: torvalds, LKML


Corrected and optimized version of the first patch - continuation
bytes and the first byte of the scanned UTF-8 sequence are not memorized because
with the current console design this information is unusable.

Description:

Fixed UTF-8 mode so alternate charset modes always work according
to control sequences interpreted in do_con_trol function:
smacs = '\x0e', rmacs = '\x0f' if vt100 translation map for alternate
charset is active which means enacs = '\e)0'
preserving backward US-ASCII and VT100 semigraphics compatibility.

Malformed UTF sequences are represented as sequences of replacement glyphs,
original codes or '?' as a last resort if replacement glyph is undefined
which is a bad console state but something should be displayed
if UCS-2 code represents displayable char.

Signed-off-by: Adam Tla/lka <atlka@pg.gda.pl>


--- drivers/char/vt_orig.c	2006-02-13 11:33:54.000000000 +0100
+++ drivers/char/vt.c	2006-02-19 23:50:00.000000000 +0100
@@ -63,6 +63,15 @@
  *
  * Removed console_lock, enabled interrupts across all console operations
  * 13 March 2001, Andrew Morton
+ *
+ * Fixed UTF-8 mode so alternate charset modes always work according
+ * to control sequences interpreted in do_con_trol function:
+ * smacs = '\x0e', rmacs = '\x0f' if vt100 translation map for alternate
+ * charset is active which means enacs = '\e)0'
+ * preserving backward US-ASCII and VT100 semigraphics compatibility,
+ * malformed UTF sequences represented as sequences of replacement glyphs,
+ * original codes or '?' as a last resort if replacement glyph is undefined
+ * by Adam Tla/lka <atlka@pg.gda.pl>, Feb 2006
  */
 
 #include <linux/module.h>
@@ -1991,17 +2000,23 @@ static int do_con_write(struct tty_struc
 		/* Do no translation at all in control states */
 		if (vc->vc_state != ESnormal) {
 			tc = c;
-		} else if (vc->vc_utf) {
+		} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
 		    /* Combine UTF-8 into Unicode */
-		    /* Incomplete characters silently ignored */
+		    /* Malformed sequence represented as replacement glyphs */
+rescan_last_byte:
 		    if(c > 0x7f) {
-			if (vc->vc_utf_count > 0 && (c & 0xc0) == 0x80) {
-				vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
-				vc->vc_utf_count--;
-				if (vc->vc_utf_count == 0)
-				    tc = c = vc->vc_utf_char;
-				else continue;
+			if (vc->vc_utf_count) {
+			       if ((c & 0xc0) == 0x80) {
+				       vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
+       				       if (--vc->vc_utf_count) {
+					       vc->vc_npar++;
+				   	       continue;
+       				       }
+				       tc = c = vc->vc_utf_char;
+			       } else
+				       goto insert_replacement_glyph;
 			} else {
+				vc->vc_npar = 0;
 				if ((c & 0xe0) == 0xc0) {
 				    vc->vc_utf_count = 1;
 				    vc->vc_utf_char = (c & 0x1f);
@@ -2018,12 +2033,13 @@ static int do_con_write(struct tty_struc
 				    vc->vc_utf_count = 5;
 				    vc->vc_utf_char = (c & 0x01);
 				} else
-				    vc->vc_utf_count = 0;
+	    			    goto insert_replacement_glyph;
 				continue;
 			      }
 		    } else {
+		      if (vc->vc_utf_count)
+	  		      goto insert_replacement_glyph;
 		      tc = c;
-		      vc->vc_utf_count = 0;
 		    }
 		} else {	/* no utf */
 		  tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
@@ -2040,31 +2056,41 @@ static int do_con_write(struct tty_struc
                  * direct-to-font zone in UTF-8 mode.
                  */
                 ok = tc && (c >= 32 ||
-			    (!vc->vc_utf && !(((vc->vc_disp_ctrl ? CTRL_ALWAYS
-						: CTRL_ACTION) >> c) & 1)))
+			    !(vc->vc_disp_ctrl ? (CTRL_ALWAYS >> c) & 1 :
+				  vc->vc_utf || ((CTRL_ACTION >> c) & 1)))
 			&& (c != 127 || vc->vc_disp_ctrl)
 			&& (c != 128+27);
 
 		if (vc->vc_state == ESnormal && ok) {
 			/* Now try to find out how to display it */
 			tc = conv_uni_to_pc(vc, tc);
-			if ( tc == -4 ) {
+			if ( tc & ~charmask ) {
+				if ( tc == -4 ) {
                                 /* If we got -4 (not found) then see if we have
                                    defined a replacement character (U+FFFD) */
-                                tc = conv_uni_to_pc(vc, 0xfffd);
+insert_replacement_glyph:
+					tc = conv_uni_to_pc(vc, 0xfffd);
 
 				/* One reason for the -4 can be that we just
 				   did a clear_unimap();
 				   try at least to show something. */
-				if (tc == -4)
-				     tc = c;
-                        } else if ( tc == -3 ) {
+					if (tc & ~charmask) {
+						if (c & ~charmask)
+							tc = '?';
+						else
+							tc = c;
+					}
+                     	  	 } else if ( tc == -3 ) {
                                 /* Bad hash table -- hope for the best */
-                                tc = c;
+					if (c & ~charmask)
+						tc = '?';
+					else
+						tc = c;
+				 } else
+                                	continue; /* Conversion failed */
                         }
-			if (tc & ~charmask)
-                                continue; /* Conversion failed */
 
+repeat_replacement_glyph:	
 			if (vc->vc_need_wrap || vc->vc_decim)
 				FLUSH
 			if (vc->vc_need_wrap) {
@@ -2088,6 +2114,15 @@ static int do_con_write(struct tty_struc
 				vc->vc_x++;
 				draw_to = (vc->vc_pos += 2);
 			}
+			if (vc->vc_utf_count) {
+				if (vc->vc_npar) {
+					vc->vc_npar--;
+					goto repeat_replacement_glyph;
+				}
+				vc->vc_utf_count = 0;
+				c = orig;
+				goto rescan_last_byte;
+			}
 			continue;
 		}
 		FLUSH

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes - new version #1
  2006-02-19 23:19       ` [PATCH]console:UTF-8 mode compatibility fixes - new version Adam Tla/lka
@ 2006-02-20  8:14         ` Adam Tla/lka
  0 siblings, 0 replies; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-20  8:14 UTC (permalink / raw)
  To: Alexander E. Patrakov; +Cc: torvalds, LKML

I hope this is the last version applying to the current console design.
What do you think?

Description:

Fixed UTF-8 mode so alternate charset modes always work according
to control sequences interpreted in do_con_trol function:
smacs = '\x0e', rmacs = '\x0f' if vt100 translation map for alternate
charset is active which means enacs = '\e)0'
preserving backward US-ASCII and VT100 semigraphics compatibility.

Malformed UTF sequences are represented as sequences of replacement glyphs,
original codes or '?' as a last resort if replacement glyph is undefined
and  UCS-2 code does not fit in 0-127 code range.

Signed-off-by: Adam Tla/lka <atlka@pg.gda.pl>

--- drivers/char/vt_orig.c	2006-02-13 11:33:54.000000000 +0100
+++ drivers/char/vt.c	2006-02-20 08:53:59.000000000 +0100
@@ -63,6 +63,16 @@
  *
  * Removed console_lock, enabled interrupts across all console operations
  * 13 March 2001, Andrew Morton
+ *
+ * Fixed UTF-8 mode so alternate charset modes always work according
+ * to control sequences interpreted in do_con_trol function:
+ * smacs = '\x0e', rmacs = '\x0f' if vt100 translation map for alternate
+ * charset is active which means enacs = '\e)0'
+ * preserving backward US-ASCII and VT100 semigraphics compatibility,
+ * malformed UTF sequences represented as sequences of replacement glyphs,
+ * original codes or '?' as a last resort if replacement glyph is undefined
+ * and  UCS-2 code does not fit in 0-127 code range
+ * by Adam Tla/lka <atlka@pg.gda.pl>, Feb 2006
  */
 
 #include <linux/module.h>
@@ -1991,17 +2001,23 @@ static int do_con_write(struct tty_struc
 		/* Do no translation at all in control states */
 		if (vc->vc_state != ESnormal) {
 			tc = c;
-		} else if (vc->vc_utf) {
+		} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
 		    /* Combine UTF-8 into Unicode */
-		    /* Incomplete characters silently ignored */
+		    /* Malformed sequence represented as replacement glyphs */
+rescan_last_byte:
 		    if(c > 0x7f) {
-			if (vc->vc_utf_count > 0 && (c & 0xc0) == 0x80) {
-				vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
-				vc->vc_utf_count--;
-				if (vc->vc_utf_count == 0)
-				    tc = c = vc->vc_utf_char;
-				else continue;
+			if (vc->vc_utf_count) {
+			       if ((c & 0xc0) == 0x80) {
+				       vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
+       				       if (--vc->vc_utf_count) {
+					       vc->vc_npar++;
+				   	       continue;
+       				       }
+				       tc = c = vc->vc_utf_char;
+			       } else
+				       goto insert_replacement_glyph;
 			} else {
+				vc->vc_npar = 0;
 				if ((c & 0xe0) == 0xc0) {
 				    vc->vc_utf_count = 1;
 				    vc->vc_utf_char = (c & 0x1f);
@@ -2018,12 +2034,13 @@ static int do_con_write(struct tty_struc
 				    vc->vc_utf_count = 5;
 				    vc->vc_utf_char = (c & 0x01);
 				} else
-				    vc->vc_utf_count = 0;
+	    			    goto insert_replacement_glyph;
 				continue;
 			      }
 		    } else {
+		      if (vc->vc_utf_count)
+	  		      goto insert_replacement_glyph;
 		      tc = c;
-		      vc->vc_utf_count = 0;
 		    }
 		} else {	/* no utf */
 		  tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
@@ -2040,31 +2057,41 @@ static int do_con_write(struct tty_struc
                  * direct-to-font zone in UTF-8 mode.
                  */
                 ok = tc && (c >= 32 ||
-			    (!vc->vc_utf && !(((vc->vc_disp_ctrl ? CTRL_ALWAYS
-						: CTRL_ACTION) >> c) & 1)))
+			    !(vc->vc_disp_ctrl ? (CTRL_ALWAYS >> c) & 1 :
+				  vc->vc_utf || ((CTRL_ACTION >> c) & 1)))
 			&& (c != 127 || vc->vc_disp_ctrl)
 			&& (c != 128+27);
 
 		if (vc->vc_state == ESnormal && ok) {
 			/* Now try to find out how to display it */
 			tc = conv_uni_to_pc(vc, tc);
-			if ( tc == -4 ) {
+			if ( tc & ~charmask ) {
+				if ( tc == -4 ) {
                                 /* If we got -4 (not found) then see if we have
                                    defined a replacement character (U+FFFD) */
-                                tc = conv_uni_to_pc(vc, 0xfffd);
+insert_replacement_glyph:
+					tc = conv_uni_to_pc(vc, 0xfffd);
 
 				/* One reason for the -4 can be that we just
 				   did a clear_unimap();
 				   try at least to show something. */
-				if (tc == -4)
-				     tc = c;
-                        } else if ( tc == -3 ) {
+					if (tc & ~charmask) {
+						if ( c & ~0x7f )
+							tc = '?';
+						else
+							tc = c;
+					}
+                     	  	 } else if ( tc == -3 ) {
                                 /* Bad hash table -- hope for the best */
-                                tc = c;
+					if ( c & ~0x7f )
+						tc = '?';
+					else
+						tc = c;
+				 } else
+                                	continue; /* Conversion failed */
                         }
-			if (tc & ~charmask)
-                                continue; /* Conversion failed */
 
+repeat_replacement_glyph:	
 			if (vc->vc_need_wrap || vc->vc_decim)
 				FLUSH
 			if (vc->vc_need_wrap) {
@@ -2088,6 +2115,15 @@ static int do_con_write(struct tty_struc
 				vc->vc_x++;
 				draw_to = (vc->vc_pos += 2);
 			}
+			if (vc->vc_utf_count) {
+				if (vc->vc_npar) {
+					vc->vc_npar--;
+					goto repeat_replacement_glyph;
+				}
+				vc->vc_utf_count = 0;
+				c = orig;
+				goto rescan_last_byte;
+			}
 			continue;
 		}
 		FLUSH

^ permalink raw reply	[flat|nested] 21+ messages in thread

[parent not found: <43F72A1E.1090707@ums.usu.ru>]

* Re: [PATCH]console:UTF-8 mode compatibility fixes
       [not found] ` <43F72A1E.1090707@ums.usu.ru>
@ 2006-02-18 14:37   ` Adam Tlałka
  2006-02-19  1:43     ` Thomas Dickey
  0 siblings, 1 reply; 21+ messages in thread
From: Adam Tlałka @ 2006-02-18 14:37 UTC (permalink / raw)
  To: Alexander E. Patrakov; +Cc: torvalds, bug-ncurses, LKML

Użytkownik Alexander E. Patrakov napisał:
> Adam Tla/lka wrote:
> 
>> This patch applies to 2.6.15.3 kernel sources to drivers/char/vt.c file.
>> It should work with other versions too.
>>
>> Changed console behaviour so in UTF-8 mode vt100 alternate character
>> sequences work as described in terminfo/termcap linux terminal 
>> definition.
>> Programs can use vt100 control seqences - smacs, rmacs and acsc  
>> characters
>> in UTF-8 mode in the same way as in normal mode so one definition is 
>> always
>> valid - current behaviour make these seqences not working in UTF-8 mode.
> 
> 
> Doesn't work here with linux-2.6.16-rc3-mm1, ncurses-5.5. BTW has this 
> been discussed with Thomas Dickey (ncurses maintainer)?
>

Hmm, I don't know how the current ncurses treats the terminfo definition in
UTF-8 mode and what linux terminal definition you are using on your test
system. There are different definitions floating around in different
distributions which differ slightly, especially when we talk about
acsc chars and enacs, smacs and rmacs sequences. That is also the wrong
approach, so there should be one proper definition of the linux console.
Maybe kernel developers should prepare the most compatible and
acceptable one.
I can post the one I am using today:

#       Reconstructed via infocmp from file: /etc/terminfo/l/linux
linux|linux console,
         am, bce, ccc, eo, mir, msgr, xenl, xon,
         colors#8, it#8, ncv#18, pairs#64,
 
acsc=++\,\,--..00__``aaffgbhhjjkkllmmnnooqqssttuuvvwwxxyyzz{{||}c~~,
         bel=^G, blink=\E[5m, bold=\E[1m, civis=\E[?25l\E[?1c,
         clear=\E[H\E[J, cnorm=\E[?25h\E[?0c, cr=^M,
         csr=\E[%i%p1%d;%p2%dr, cub1=^H, cud1=^J, cuf1=\E[C,
         cup=\E[%i%p1%d;%p2%dH, cuu1=\E[A, cvvis=\E[?25h\E[?8c,
         dch=\E[%p1%dP, dch1=\E[P, dim=\E[2m, dl=\E[%p1%dM,
         dl1=\E[M, ech=\E[%p1%dX, ed=\E[J, el=\E[K, el1=\E[1K,
         enacs=\E)0, flash=\E[?5h\E[?5l$<200/>, home=\E[H,
         hpa=\E[%i%p1%dG, ht=^I, hts=\EH, il=\E[%p1%dL, il1=\E[L,
         ind=^J,
 
initc=\E]P%p1%x%p2%{256}%*%{1000}%/%02x%p3%{256}%*%{1000}%/%02x%p4%{256}%*%{1000}%/%02x,
         invis=\E[8m, kb2=\E[G, kbs=\177, kcbt=\E[Z, kcub1=\E[D,
         kcud1=\E[B, kcuf1=\E[C, kcuu1=\E[A, kdch1=\E[3~,
         kend=\E[4~, kf1=\E[[A, kf10=\E[21~, kf11=\E[23~,
         kf12=\E[24~, kf13=\E[25~, kf14=\E[26~, kf15=\E[28~,
         kf16=\E[29~, kf17=\E[31~, kf18=\E[32~, kf19=\E[33~,
         kf2=\E[[B, kf20=\E[34~, kf3=\E[[C, kf4=\E[[D, kf5=\E[[E,
         kf6=\E[17~, kf7=\E[18~, kf8=\E[19~, kf9=\E[20~,
         khome=\E[1~, kich1=\E[2~, kmous=\E[M, knp=\E[6~, kpp=\E[5~,
         kspd=^Z, nel=^M^J, oc=\E]R, op=\E[39;49m, rc=\E8, rev=\E[7m,
         ri=\EM, rmacs=^O, rmam=\E[?7l, rmir=\E[4l, rmpch=\E[10m,
         rmso=\E[27m, rmul=\E[24m, rs1=\Ec\E]R, sc=\E7,
         setab=\E[4%p1%dm, setaf=\E[3%p1%dm,
 
sgr=\E[0;10%?%p1%t;7%;%?%p2%t;4%;%?%p3%t;7%;%?%p4%t;5%;%?%p5%t;2%;%?%p6%t;1%;%?%p7%t;8%;m%?%p9%t\016%e\017%;,
         sgr0=\E[0;10m, smacs=^N, smam=\E[?7h, smir=\E[4h,
         smpch=\E[11m, smso=\E[7m, smul=\E[4m, tbc=\E[3g,
         u6=\E[%i%d;%dR, u7=\E[6n, u8=\E[?6c, u9=\E[c,
         vpa=\E[%i%p1%dd,

If acsc mode is not working with this definition and my patch applied,
then there is something wrong in the curses code IMHO.
I am using the Ubuntu distro and with this patch and this definition
aptitude properly displays semigraphics in UTF-8 mode and Midnight
Commander also works almost correctly (almost because it is not properly
ported - it displays help using an iso-8859-2 help file without conversion
to UTF-8 so I can see replacement glyphs, as it should be, but in options
and menus it uses UTF-8 but incorrectly counts bytes instead of glyphs
so the screen does not look right if there is a multibyte UTF-8 sequence
used in a line; anyway it's MC's bad code, not the terminal's fault).

Regards
-- 
Adam Tlałka       mailto:atlka@pg.gda.pl    ^v^ ^v^ ^v^
System  & Network Administration Group       - - - ~~~~~~
Computer Center,  Gdańsk University of Technology, Poland
PGP public key:   finger atlka@sunrise.pg.gda.pl

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-18 14:37   ` [PATCH]console:UTF-8 mode compatibility fixes Adam Tlałka
@ 2006-02-19  1:43     ` Thomas Dickey
  2006-02-19 10:45       ` Adam Tla/lka
  0 siblings, 1 reply; 21+ messages in thread
From: Thomas Dickey @ 2006-02-19  1:43 UTC (permalink / raw)
  To: Adam Tlałka; +Cc: Alexander E. Patrakov, torvalds, bug-ncurses, LKML

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: TEXT/PLAIN; charset=UTF-8; format=flowed, Size: 1128 bytes --]

On Sat, 18 Feb 2006, Adam Tlałka wrote:

> Hmm, I don't know how current ncurses treat terminfo definition in UTF-8 mode 
> and what linux terminal definition you are using in you test system. There 
> are different definitions floating around in different distributions which

Since we're talking about ncurses, there are not "different definitions 
floating around".  Different distributions may of course have their 
favorite tweaks, but ncurses definition for Linux console doesn't change
that much over the past ten years.

> are slightly different especially as we talk about acsc chars and enacs, 
> smacs and rmacs sequences. Which is also wrong approach so there should be 
> one proper definition of the linux console. Maybe kernel developers should 
> prepare some most compatible and acceptable one.
> I can post the one I am using today:

...of course, this one isn't like any of the variations I've seen.
But you knew that.  (Aside from the acs/enacs/rmacs/smacs changes,
I'm curious what happened to ich/ich1).

-- 
Thomas E. Dickey
http://invisible-island.net
ftp://invisible-island.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-19  1:43     ` Thomas Dickey
@ 2006-02-19 10:45       ` Adam Tla/lka
  0 siblings, 0 replies; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-19 10:45 UTC (permalink / raw)
  To: Thomas Dickey; +Cc: Alexander E. Patrakov, torvalds, bug-ncurses, LKML

On Sat, Feb 18, 2006 at 08:43:56PM -0500, Thomas Dickey wrote:
> On Sat, 18 Feb 2006, Adam Tlałka wrote:
> >one proper definition of the linux console. Maybe kernel developers should 
> >prepare some most compatible and acceptable one.
> >I can post the one I am using today:
> 
> ...of course, this one isn't like any of the variations I've seen.
> But you knew that.  (Aside from the acs/enacs/rmacs/smacs changes,
> I'm curious what happened to ich/ich1).

They were removed because of a possible incompatibility warning.
Maybe they should stay - I posted only the linux terminal definition
currently used by me. As I said before there should be an official linux
terminal definition and description included with the kernel sources because
vt.c in the sources defines the console behaviour.

Anyway acs, enacs, smacs and rmacs sequences are defined here 
to stick to the controls interpretation used in vt.c 
do_con_trol function and to get desired behaviour.

Regards
-- 
Adam Tlałka      mailto:atlka@pg.gda.pl    ^v^ ^v^ ^v^
System  & Network Administration Group           ~~~~~~
Computer Center,  Gdańsk University of Technology, Poland
PGP public key:   finger atlka@sunrise.pg.gda.pl

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH]console:UTF-8 mode compatibility fixes
  2006-02-17 23:33 [PATCH]console:UTF-8 mode compatibility fixes Adam Tla/lka
                   ` (2 preceding siblings ...)
       [not found] ` <43F72A1E.1090707@ums.usu.ru>
@ 2006-02-18 22:35 ` Adam Tla/lka
  3 siblings, 0 replies; 21+ messages in thread
From: Adam Tla/lka @ 2006-02-18 22:35 UTC (permalink / raw)
  To: linux-kernel; +Cc: torvalds


This patch applies to 2.6.15.3 kernel sources to drivers/char/vt.c file.
It should work with other versions too.

Changed console behaviour so in UTF-8 mode vt100 alternate character
sequences work as described in terminfo/termcap linux terminal definition.
Programs can use vt100 control sequences - smacs, rmacs and acsc characters -
in UTF-8 mode in the same way as in normal mode so one definition is always
valid - current behaviour makes these sequences not work in UTF-8 mode.

Added reporting of malformed UTF-8 sequences as replacement glyphs.
I think that a terminal should always display something rather than ignoring
this kind of data as it does now. It also sticks to the Unicode standards
saying that every wrong byte should be reported. It is more human readable
too in the case of Latin subsets including ASCII chars.

It's the second version, with original codes used in the case when there is no
replacement glyph defined. So all scanned bytes are remembered. Anyway it could
be used in the future if we implement the screen buffer as UCS-2 plus attributes
so copying from it in UTF-8 mode will always work properly.

Signed-off-by: Adam Tla/lka <atlka@pg.gda.pl>

---

--- drivers/char/vt_orig.c	2006-02-13 11:33:54.000000000 +0100
+++ drivers/char/vt.c	2006-02-18 23:18:50.000000000 +0100
@@ -63,6 +63,13 @@
  *
  * Removed console_lock, enabled interrupts across all console operations
  * 13 March 2001, Andrew Morton
+ *
+ * Fixed UTF-8 mode so alternate charset modes always work without need
+ * of different linux terminal definition for normal and UTF-8 modes
+ * preserving backward US-ASCII and VT100 semigraphics compatibility,
+ * malformed UTF sequences represented as sequences of replacement glyphs
+ * or original codes if replacement glyph is undefined
+ * by Adam Tla/lka <atlka@pg.gda.pl>, Feb 2006
  */
 
 #include <linux/module.h>
@@ -1991,17 +1998,26 @@ static int do_con_write(struct tty_struc
 		/* Do no translation at all in control states */
 		if (vc->vc_state != ESnormal) {
 			tc = c;
-		} else if (vc->vc_utf) {
+		} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
 		    /* Combine UTF-8 into Unicode */
-		    /* Incomplete characters silently ignored */
+		    /* Malformed sequence represented as replacement glyphs */
+rescan_last_byte:
 		    if(c > 0x7f) {
-			if (vc->vc_utf_count > 0 && (c & 0xc0) == 0x80) {
-				vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
-				vc->vc_utf_count--;
-				if (vc->vc_utf_count == 0)
-				    tc = c = vc->vc_utf_char;
-				else continue;
+			if (vc->vc_utf_count) {
+			       if ((c & 0xc0) == 0x80) {
+				       vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
+       				       if (--vc->vc_utf_count) {
+					       vc->vc_par[vc->vc_utf_count] = c;
+					       vc->vc_npar++;
+				   	       continue;
+       				       }
+				       tc = c = vc->vc_utf_char;
+			       } else {
+				       c = vc->vc_par[vc->vc_utf_count + vc->vc_npar];
+				       goto insert_replacement_glyph;
+			       }
 			} else {
+				vc->vc_npar = 0;
 				if ((c & 0xe0) == 0xc0) {
 				    vc->vc_utf_count = 1;
 				    vc->vc_utf_char = (c & 0x1f);
@@ -2018,12 +2034,16 @@ static int do_con_write(struct tty_struc
 				    vc->vc_utf_count = 5;
 				    vc->vc_utf_char = (c & 0x01);
 				} else
-				    vc->vc_utf_count = 0;
+	    			    goto insert_replacement_glyph;
+				vc->vc_par[vc->vc_utf_count] = c;
 				continue;
 			      }
 		    } else {
+		      if (vc->vc_utf_count) {
+			      c = vc->vc_par[vc->vc_utf_count + vc->vc_npar];
+	  		      goto insert_replacement_glyph;
+		      }
 		      tc = c;
-		      vc->vc_utf_count = 0;
 		    }
 		} else {	/* no utf */
 		  tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
@@ -2040,8 +2060,8 @@ static int do_con_write(struct tty_struc
                  * direct-to-font zone in UTF-8 mode.
                  */
                 ok = tc && (c >= 32 ||
-			    (!vc->vc_utf && !(((vc->vc_disp_ctrl ? CTRL_ALWAYS
-						: CTRL_ACTION) >> c) & 1)))
+			    !(vc->vc_disp_ctrl ? (CTRL_ALWAYS >> c) & 1 :
+				  vc->vc_utf || ((CTRL_ACTION >> c) & 1)))
 			&& (c != 127 || vc->vc_disp_ctrl)
 			&& (c != 128+27);
 
@@ -2051,6 +2071,7 @@ static int do_con_write(struct tty_struc
 			if ( tc == -4 ) {
                                 /* If we got -4 (not found) then see if we have
                                    defined a replacement character (U+FFFD) */
+insert_replacement_glyph:
                                 tc = conv_uni_to_pc(vc, 0xfffd);
 
 				/* One reason for the -4 can be that we just
@@ -2063,7 +2084,7 @@ static int do_con_write(struct tty_struc
                                 tc = c;
                         }
 			if (tc & ~charmask)
-                                continue; /* Conversion failed */
+				goto check_malformed_sequence;
 
 			if (vc->vc_need_wrap || vc->vc_decim)
 				FLUSH
@@ -2088,6 +2109,16 @@ static int do_con_write(struct tty_struc
 				vc->vc_x++;
 				draw_to = (vc->vc_pos += 2);
 			}
+check_malformed_sequence:
+			if (vc->vc_utf_count) {
+				if (vc->vc_npar) {
+					c = vc->vc_par[vc->vc_utf_count + --vc->vc_npar];
+					goto insert_replacement_glyph;
+				}
+				vc->vc_utf_count = 0;
+				c = orig;
+				goto rescan_last_byte;
+			}
 			continue;
 		}
 		FLUSH

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2006-03-07 15:06 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-02-17 23:33 [PATCH]console:UTF-8 mode compatibility fixes Adam Tla/lka
2006-02-18 10:59 ` Andrew Morton
2006-02-18 16:01   ` Adam Tlałka
2006-02-19  4:24     ` Alexander E. Patrakov
2006-02-19 12:45       ` Adam Tla/lka
2006-02-19 16:16       ` Adam Tla/lka
2006-02-19 17:07         ` Alexander E. Patrakov
2006-02-18 14:17 ` Alexander E. Patrakov
2006-02-19  1:53   ` Thomas Dickey
2006-02-19  4:33     ` Alexander E. Patrakov
2006-02-19 11:47     ` Adam Tla/lka
2006-02-20  1:20       ` Thomas Dickey
2006-03-07 15:05         ` Adam Tlałka
2006-02-19  5:42   ` Alexander E. Patrakov
2006-02-19 10:15     ` Adam Tla/lka
2006-02-19 23:19       ` [PATCH]console:UTF-8 mode compatibility fixes - new version Adam Tla/lka
2006-02-20  8:14         ` [PATCH]console:UTF-8 mode compatibility fixes - new version #1 Adam Tla/lka
     [not found] ` <43F72A1E.1090707@ums.usu.ru>
2006-02-18 14:37   ` [PATCH]console:UTF-8 mode compatibility fixes Adam Tlałka
2006-02-19  1:43     ` Thomas Dickey
2006-02-19 10:45       ` Adam Tla/lka
2006-02-18 22:35 ` Adam Tla/lka

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox