lazutils: UTF8CharacterToUnicode: check for intersecting ranges

git-svn-id: trunk@35038 -
This commit is contained in:
mattias 2012-01-30 10:46:07 +00:00
parent 26d2cf7352
commit 64e2cd6b98
2 changed files with 775 additions and 1015 deletions

View File

@ -239,6 +239,11 @@ begin
end; end;
function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal; function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
{ if p=nil then CharLen=0 otherwise CharLen>0
If there is an encoding error the Result is undefined.
Use UTF8FixBroken to fix UTF-8 encoding.
It does not check if the codepoint is defined in the Unicode tables.
}
begin begin
if p<>nil then begin if p<>nil then begin
if ord(p^)<%11000000 then begin if ord(p^)<%11000000 then begin
@ -246,56 +251,61 @@ begin
Result:=ord(p^); Result:=ord(p^);
CharLen:=1; CharLen:=1;
end end
else begin else if ((ord(p^) and %11100000) = %11000000) then begin
// multi byte // starts with %110 => could be double byte character
if ((ord(p^) and %11100000) = %11000000) then begin if (ord(p[1]) and %11000000) = %10000000 then begin
// starts with %110 => could be double byte character CharLen:=2;
if (ord(p[1]) and %11000000) = %10000000 then begin Result:=((ord(p^) and %00011111) shl 6)
Result:=((ord(p^) and %00011111) shl 6) or (ord(p[1]) and %00111111);
or (ord(p[1]) and %00111111); if Result<(1 shl 7) then begin
CharLen:=2; // wrongly encoded (overlong form), could be an XSS attack
end else begin Result:=0;
Result:=ord(p^);
CharLen:=1;
end; end;
end end else begin
else if ((ord(p^) and %11110000) = %11100000) then begin
// starts with %1110 => could be triple byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then begin
Result:=((ord(p^) and %00011111) shl 12)
or ((ord(p[1]) and %00111111) shl 6)
or (ord(p[2]) and %00111111);
CharLen:=3;
end else begin
Result:=ord(p^);
CharLen:=1;
end;
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// starts with %11110 => could be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then begin
Result:=((ord(p^) and %00001111) shl 18)
or ((ord(p[1]) and %00111111) shl 12)
or ((ord(p[2]) and %00111111) shl 6)
or (ord(p[3]) and %00111111);
CharLen:=4;
end else begin
Result:=ord(p^);
CharLen:=1;
end;
end
else begin
// invalid character
Result:=ord(p^); Result:=ord(p^);
CharLen:=1; CharLen:=1;
end; end;
if (CharLen>1) and (Result<128) then begin end
// invalid character else if ((ord(p^) and %11110000) = %11100000) then begin
// starts with %1110 => could be triple byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then begin
CharLen:=3;
Result:=((ord(p^) and %00011111) shl 12)
or ((ord(p[1]) and %00111111) shl 6)
or (ord(p[2]) and %00111111);
if Result<(1 shl 11) then begin
// wrongly encoded (overlong form), could be an XSS attack
Result:=0;
end;
end else begin
Result:=ord(p^); Result:=ord(p^);
CharLen:=1;
end; end;
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// starts with %11110 => could be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then begin
CharLen:=4;
Result:=((ord(p^) and %00001111) shl 18)
or ((ord(p[1]) and %00111111) shl 12)
or ((ord(p[2]) and %00111111) shl 6)
or (ord(p[3]) and %00111111);
if Result<(1 shl 16) then begin
// wrongly encoded (overlong form), could be an XSS attack
Result:=0;
end;
end else begin
Result:=ord(p^);
CharLen:=1;
end;
end
else begin
// invalid character
Result:=ord(p^);
CharLen:=1;
end; end;
end else begin end else begin
Result:=0; Result:=0;
@ -484,7 +494,7 @@ begin
if ((ord(p[1]) and %11000000) = %10000000) then begin if ((ord(p[1]) and %11000000) = %10000000) then begin
c:=((ord(p^) and %00011111) shl 6); c:=((ord(p^) and %00011111) shl 6);
//or (ord(p[1]) and %00111111); //or (ord(p[1]) and %00111111);
if c<128 then if c<(1 shl 7) then
p^:=' ' p^:=' '
else else
inc(p,2) inc(p,2)
@ -499,7 +509,7 @@ begin
c:=((ord(p^) and %00011111) shl 12) c:=((ord(p^) and %00011111) shl 12)
or ((ord(p[1]) and %00111111) shl 6); or ((ord(p[1]) and %00111111) shl 6);
//or (ord(p[2]) and %00111111); //or (ord(p[2]) and %00111111);
if c<128 then if c<(1 shl 11) then
p^:=' ' p^:=' '
else else
inc(p,3); inc(p,3);
@ -515,7 +525,7 @@ begin
or ((ord(p[1]) and %00111111) shl 12) or ((ord(p[1]) and %00111111) shl 12)
or ((ord(p[2]) and %00111111) shl 6); or ((ord(p[2]) and %00111111) shl 6);
//or (ord(p[3]) and %00111111); //or (ord(p[3]) and %00111111);
if c<128 then if c<(1 shl 16) then
p^:=' ' p^:=' '
else else
inc(p,4) inc(p,4)

File diff suppressed because it is too large Load Diff