lazutils: FindInvalidUTF8Codepoint: check if bigger than U+10FFFF

git-svn-id: trunk@65162 -
This commit is contained in:
mattias 2021-06-01 21:02:23 +00:00
parent e828efa600
commit c887c889e3
2 changed files with 17 additions and 10 deletions

View File

@ -2813,22 +2813,22 @@ function FindInvalidUTF8Codepoint(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolea
// return -1 if ok
var
CharLen: Integer;
c: Char;
c: Byte;
begin
if (p<>nil) then begin
Result:=0;
while Result<Count do begin
c:=p^;
if ord(c)<%10000000 then begin
c:=ord(p^);
if c<%10000000 then begin
// regular single byte ASCII character (#0 is a character, this is Pascal ;)
CharLen:=1;
end else if ord(c)<=%11000001 then begin
end else if c<=%11000001 then begin
// single byte character, between valid UTF-8 encodings
// %11000000 and %11000001 would encode #0..#127 as 2 bytes (overlong form), which is invalid and used for XSS attacks
if StopOnNonUTF8 or (ord(c)>=192) then
if StopOnNonUTF8 or (c>=192) then
exit;
CharLen:=1;
end else if ord(c)<=%11011111 then begin
end else if c<=%11011111 then begin
// could be 2 byte character (%110xxxxx %10xxxxxx)
if (Result<Count-1)
and ((ord(p[1]) and %11000000) = %10000000) then
@ -2836,25 +2836,29 @@ begin
else
exit; // missing following bytes
end
else if ord(c)<=%11101111 then begin
else if c<=%11101111 then begin
// could be 3 byte character (%1110xxxx %10xxxxxx %10xxxxxx)
if (Result<Count-2)
and ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then begin
if (ord(c)=%11100000) and (ord(p[1])<=%10011111) then
if (c=%11100000) and (ord(p[1])<=%10011111) then
exit; // XSS attack: 3 bytes are mapped to the 1 or 2 byte codes
CharLen:=3;
end else
exit; // missing following bytes
end
else if ord(c)<=%11110111 then begin
else if c<=%11110111 then begin
// could be 4 byte character (%11110xxx %10xxxxxx %10xxxxxx %10xxxxxx)
if (Result<Count-3)
and ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then begin
if (ord(c)=%11110000) and (ord(p[1])<=%10001111) then
if (c=%11110000) and (ord(p[1])<=%10001111) then
exit; // XSS attack: 4 bytes are mapped to the 1-3 byte codes
if (c>%11110100) then
exit; // out of range U+10FFFF
if (c=%11110100) and (ord(p[1])>%10001111) then
exit; // out of range U+10FFFF
CharLen:=4;
end else
exit; // missing following bytes

View File

@ -92,6 +92,8 @@ begin
t(UnicodeToUTF8($10000),-1,'unicode($10000)');
t(UnicodeToUTF8($10900),-1,'unicode($10900)');
t(UnicodeToUTF8($10ffff),-1,'unicode($10ffff)');
t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)');
t(#$F4#$90#$80#$80,0,'unicode($110000)');
t(#$c0#0,0,'invalid second byte of 2 byte');
t(#$e0#0,0,'invalid second byte of 3 byte');
t(#$e0#$80#0,0,'invalid third byte of 3 byte');
@ -106,6 +108,7 @@ begin
t(#$e0#$9f#$bf,0,'invalid: $7ff encoded as 3 byte');
t(#$f0#$80#$80#$80,0,'invalid: 0 encoded as 4 byte');
t(#$f0#$8f#$bf#$bf,0,'invalid: $ffff encoded as 4 byte');
t(#$F7#$BF#$BF#$BF,0,'invalid 4 byte out of range');
end;
procedure TTestLazUTF8.TestFindUnicodeToUTF8;