lazutils: UTF8FixBroken: check for ASCII encoded as multi-byte

git-svn-id: trunk@35036 -
This commit is contained in:
mattias 2012-01-30 09:34:18 +00:00
parent 5feee468d7
commit 28fbd2b34c

View File

@ -187,32 +187,35 @@ begin
// regular single byte character (#0 is a character, this is pascal ;)
Result:=1;
end
else if ((ord(p^) and %11100000) = %11000000) then begin
// could be 2 byte character
if (ord(p[1]) and %11000000) = %10000000 then
Result:=2
else begin
// multi byte
if ((ord(p^) and %11100000) = %11000000) then begin
// could be 2 byte character
if (ord(p[1]) and %11000000) = %10000000 then
Result:=2
else
Result:=1;
end
else if ((ord(p^) and %11110000) = %11100000) then begin
// could be 3 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
Result:=3
else
Result:=1;
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// could be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
Result:=4
else
Result:=1;
end
else
Result:=1;
end
else if ((ord(p^) and %11110000) = %11100000) then begin
// could be 3 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
Result:=3
else
Result:=1;
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// could be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
Result:=4
else
Result:=1;
end
else
Result:=1
end;
end else
Result:=0;
end;
@ -292,7 +295,6 @@ begin
if (CharLen>1) and (Result<128) then begin
// invalid character
Result:=ord(p^);
CharLen:=1;
end;
end;
end else begin
@ -463,6 +465,8 @@ end;
{ fix any broken UTF8 sequences with spaces }
procedure UTF8FixBroken(P: PChar);
var
c: cardinal;
begin
if p=nil then exit;
while p^<>#0 do begin
@ -476,27 +480,46 @@ begin
inc(p);
end
else if ((ord(p^) and %11100000) = %11000000) then begin
// should be 2 byte character
if (ord(p[1]) and %11000000) = %10000000 then
inc(p,2)
// starts with %110 => should be 2 byte character
if ((ord(p[1]) and %11000000) = %10000000) then begin
c:=((ord(p^) and %00011111) shl 6);
//or (ord(p[1]) and %00111111);
if c<128 then
p^:=' '
else
inc(p,2)
end
else if p[1]<>#0 then
p^:=' ';
end
else if ((ord(p^) and %11110000) = %11100000) then begin
// should be 3 byte character
// starts with %1110 => should be 3 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
inc(p,3)
else
and ((ord(p[2]) and %11000000) = %10000000) then begin
c:=((ord(p^) and %00011111) shl 12)
or ((ord(p[1]) and %00111111) shl 6);
//or (ord(p[2]) and %00111111);
if c<128 then
p^:=' '
else
inc(p,3);
end else
p^:=' ';
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// should be 4 byte character
// starts with %11110 => should be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
inc(p,4)
else
and ((ord(p[3]) and %11000000) = %10000000) then begin
c:=((ord(p^) and %00001111) shl 18)
or ((ord(p[1]) and %00111111) shl 12)
or ((ord(p[2]) and %00111111) shl 6);
//or (ord(p[3]) and %00111111);
if c<128 then
p^:=' '
else
inc(p,4)
end else
p^:=' ';
end
else begin