mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-04-05 15:58:07 +02:00
lazutils: FindInvalidUTF8Codepoint: check if bigger than U+10FFFF
git-svn-id: trunk@65162 -
This commit is contained in:
parent
e828efa600
commit
c887c889e3
@ -2813,22 +2813,22 @@ function FindInvalidUTF8Codepoint(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolea
|
||||
// return -1 if ok
|
||||
var
|
||||
CharLen: Integer;
|
||||
c: Char;
|
||||
c: Byte;
|
||||
begin
|
||||
if (p<>nil) then begin
|
||||
Result:=0;
|
||||
while Result<Count do begin
|
||||
c:=p^;
|
||||
if ord(c)<%10000000 then begin
|
||||
c:=ord(p^);
|
||||
if c<%10000000 then begin
|
||||
// regular single byte ASCII character (#0 is a character, this is Pascal ;)
|
||||
CharLen:=1;
|
||||
end else if ord(c)<=%11000001 then begin
|
||||
end else if c<=%11000001 then begin
|
||||
// single byte character, between valid UTF-8 encodings
|
||||
// %11000000 and %11000001 map 2 byte to #0..#127, which is invalid and used for XSS attacks
|
||||
if StopOnNonUTF8 or (ord(c)>=192) then
|
||||
if StopOnNonUTF8 or (c>=192) then
|
||||
exit;
|
||||
CharLen:=1;
|
||||
end else if ord(c)<=%11011111 then begin
|
||||
end else if c<=%11011111 then begin
|
||||
// could be 2 byte character (%110xxxxx %10xxxxxx)
|
||||
if (Result<Count-1)
|
||||
and ((ord(p[1]) and %11000000) = %10000000) then
|
||||
@ -2836,25 +2836,29 @@ begin
|
||||
else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else if ord(c)<=%11101111 then begin
|
||||
else if c<=%11101111 then begin
|
||||
// could be 3 byte character (%1110xxxx %10xxxxxx %10xxxxxx)
|
||||
if (Result<Count-2)
|
||||
and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then begin
|
||||
if (ord(c)=%11100000) and (ord(p[1])<=%10011111) then
|
||||
if (c=%11100000) and (ord(p[1])<=%10011111) then
|
||||
exit; // XSS attack: 3 bytes are mapped to the 1 or 2 byte codes
|
||||
CharLen:=3;
|
||||
end else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else if ord(c)<=%11110111 then begin
|
||||
else if c<=%11110111 then begin
|
||||
// could be 4 byte character (%11110xxx %10xxxxxx %10xxxxxx %10xxxxxx)
|
||||
if (Result<Count-3)
|
||||
and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then begin
|
||||
if (ord(c)=%11110000) and (ord(p[1])<=%10001111) then
|
||||
if (c=%11110000) and (ord(p[1])<=%10001111) then
|
||||
exit; // XSS attack: 4 bytes are mapped to the 1-3 byte codes
|
||||
if (c>%11110100) then
|
||||
exit; // out of range U+10FFFF
|
||||
if (c=%11110100) and (ord(p[1])>%10001111) then
|
||||
exit; // out of range U+10FFFF
|
||||
CharLen:=4;
|
||||
end else
|
||||
exit; // missing following bytes
|
||||
|
@ -92,6 +92,8 @@ begin
|
||||
t(UnicodeToUTF8($10000),-1,'unicode($10000)');
|
||||
t(UnicodeToUTF8($10900),-1,'unicode($10900)');
|
||||
t(UnicodeToUTF8($10ffff),-1,'unicode($10ffff)');
|
||||
t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)');
|
||||
t(#$F4#$90#$80#$80,0,'unicode($110000)');
|
||||
t(#$c0#0,0,'invalid second byte of 2 byte');
|
||||
t(#$e0#0,0,'invalid second byte of 3 byte');
|
||||
t(#$e0#$80#0,0,'invalid third byte of 3 byte');
|
||||
@ -106,6 +108,7 @@ begin
|
||||
t(#$e0#$9f#$bf,0,'invalid: $7ff encoded as 3 byte');
|
||||
t(#$f0#$80#$80#$80,0,'invalid: 0 encoded as 4 byte');
|
||||
t(#$f0#$8f#$bf#$bf,0,'invalid: $ffff encoded as 4 byte');
|
||||
t(#$F7#$BF#$BF#$BF,0,'invalid 4 byte out of range');
|
||||
end;
|
||||
|
||||
procedure TTestLazUTF8.TestFindUnicodeToUTF8;
|
||||
|
Loading…
Reference in New Issue
Block a user