From 28fbd2b34c21666ad5c83379539e72cef64054ef Mon Sep 17 00:00:00 2001 From: mattias Date: Mon, 30 Jan 2012 09:34:18 +0000 Subject: [PATCH] lazutils: UTF8FixBroken: check for ASCII as multi byte git-svn-id: trunk@35036 - --- components/lazutils/lazutf8.pas | 95 ++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/components/lazutils/lazutf8.pas b/components/lazutils/lazutf8.pas index 0110056666..f07ae7100c 100644 --- a/components/lazutils/lazutf8.pas +++ b/components/lazutils/lazutf8.pas @@ -187,32 +187,35 @@ begin // regular single byte character (#0 is a character, this is pascal ;) Result:=1; end - else if ((ord(p^) and %11100000) = %11000000) then begin - // could be 2 byte character - if (ord(p[1]) and %11000000) = %10000000 then - Result:=2 + else begin + // multi byte + if ((ord(p^) and %11100000) = %11000000) then begin + // could be 2 byte character + if (ord(p[1]) and %11000000) = %10000000 then + Result:=2 + else + Result:=1; + end + else if ((ord(p^) and %11110000) = %11100000) then begin + // could be 3 byte character + if ((ord(p[1]) and %11000000) = %10000000) + and ((ord(p[2]) and %11000000) = %10000000) then + Result:=3 + else + Result:=1; + end + else if ((ord(p^) and %11111000) = %11110000) then begin + // could be 4 byte character + if ((ord(p[1]) and %11000000) = %10000000) + and ((ord(p[2]) and %11000000) = %10000000) + and ((ord(p[3]) and %11000000) = %10000000) then + Result:=4 + else + Result:=1; + end else Result:=1; - end - else if ((ord(p^) and %11110000) = %11100000) then begin - // could be 3 byte character - if ((ord(p[1]) and %11000000) = %10000000) - and ((ord(p[2]) and %11000000) = %10000000) then - Result:=3 - else - Result:=1; - end - else if ((ord(p^) and %11111000) = %11110000) then begin - // could be 4 byte character - if ((ord(p[1]) and %11000000) = %10000000) - and ((ord(p[2]) and %11000000) = %10000000) - and ((ord(p[3]) and %11000000) = %10000000) then - Result:=4 - else - 
Result:=1; - end - else - Result:=1 + end; end else Result:=0; end; @@ -292,7 +295,6 @@ begin if (CharLen>1) and (Result<128) then begin // invalid character Result:=ord(p^); - CharLen:=1; end; end; end else begin @@ -463,6 +465,8 @@ end; { fix any broken UTF8 sequences with spaces } procedure UTF8FixBroken(P: PChar); +var + c: cardinal; begin if p=nil then exit; while p^<>#0 do begin @@ -476,27 +480,46 @@ begin inc(p); end else if ((ord(p^) and %11100000) = %11000000) then begin - // should be 2 byte character - if (ord(p[1]) and %11000000) = %10000000 then - inc(p,2) + // starts with %110 => should be 2 byte character + if ((ord(p[1]) and %11000000) = %10000000) then begin + c:=((ord(p^) and %00011111) shl 6); + //or (ord(p[1]) and %00111111); + if c<128 then + p^:=' ' + else + inc(p,2) + end else if p[1]<>#0 then p^:=' '; end else if ((ord(p^) and %11110000) = %11100000) then begin - // should be 3 byte character + // starts with %1110 => should be 3 byte character if ((ord(p[1]) and %11000000) = %10000000) - and ((ord(p[2]) and %11000000) = %10000000) then - inc(p,3) - else + and ((ord(p[2]) and %11000000) = %10000000) then begin + c:=((ord(p^) and %00011111) shl 12) + or ((ord(p[1]) and %00111111) shl 6); + //or (ord(p[2]) and %00111111); + if c<128 then + p^:=' ' + else + inc(p,3); + end else p^:=' '; end else if ((ord(p^) and %11111000) = %11110000) then begin - // should be 4 byte character + // starts with %11110 => should be 4 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) - and ((ord(p[3]) and %11000000) = %10000000) then - inc(p,4) - else + and ((ord(p[3]) and %11000000) = %10000000) then begin + c:=((ord(p^) and %00001111) shl 18) + or ((ord(p[1]) and %00111111) shl 12) + or ((ord(p[2]) and %00111111) shl 6); + //or (ord(p[3]) and %00111111); + if c<128 then + p^:=' ' + else + inc(p,4) + end else p^:=' '; end else begin