lazutils: UTF8FixBroken: fixed acceptance of 4-byte sequences above U+10FFFF (out of range) and an endless loop on a 2-byte lead byte followed by #0; added tests

git-svn-id: trunk@65163 -
This commit is contained in:
mattias 2021-06-01 22:01:18 +00:00
parent c887c889e3
commit 6de8b92783
2 changed files with 62 additions and 9 deletions

View File

@ -853,33 +853,33 @@ end;
{ fix any broken UTF8 sequences with spaces }
procedure UTF8FixBroken(P: PChar);
var
b: byte;
c: cardinal;
begin
if p=nil then exit;
while p^<>#0 do begin
if ord(p^)<%10000000 then begin
b:=ord(p^);
if b<%10000000 then begin
// regular single byte character
inc(p);
end
else if ord(p^)<%11000000 then begin
else if b<%11000000 then begin
// invalid
p^:=' ';
inc(p);
end
else if ((ord(p^) and %11100000) = %11000000) then begin
else if (b and %11100000) = %11000000 then begin
// starts with %110 => should be 2 byte character
if ((ord(p[1]) and %11000000) = %10000000) then begin
c:=((ord(p^) and %00011111) shl 6);
//or (ord(p[1]) and %00111111);
if c<(1 shl 7) then
if b<%11000010 then
p^:=' ' // fix XSS attack
else
inc(p,2)
end
else if p[1]<>#0 then
else
p^:=' ';
end
else if ((ord(p^) and %11110000) = %11100000) then begin
else if (b and %11110000) = %11100000 then begin
// starts with %1110 => should be 3 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then begin
@ -893,7 +893,7 @@ begin
end else
p^:=' ';
end
else if ((ord(p^) and %11111000) = %11110000) then begin
else if (b and %11111000) = %11110000 then begin
// starts with %11110 => should be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
@ -904,6 +904,8 @@ begin
//or (ord(p[3]) and %00111111);
if c<(1 shl 16) then
p^:=' ' // fix XSS attack
else if (c>$10FFFF) then
p^:=' ' // out of range U+10FFFF
else
inc(p,4)
end else

View File

@ -30,6 +30,7 @@ type
procedure TestFindInvalidUTF8;
procedure TestFindUnicodeToUTF8;
procedure TestUTF8QuotedStr;
procedure TestUTF8FixBroken;
end;
implementation
@ -95,6 +96,7 @@ begin
t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)');
t(#$F4#$90#$80#$80,0,'unicode($110000)');
t(#$c0#0,0,'invalid second byte of 2 byte');
t(#$c2#0,0,'valid 2 byte');
t(#$e0#0,0,'invalid second byte of 3 byte');
t(#$e0#$80#0,0,'invalid third byte of 3 byte');
t(#$f0#0,0,'invalid second byte of 4 byte');
@ -160,6 +162,55 @@ begin
t('cABc','AB','ABcABABcAB');
end;
// Checks UTF8FixBroken: every byte of a broken UTF-8 sequence must be replaced
// by a space, valid sequences must stay untouched.
//
// The PChar routine only overwrites bytes in place and never inserts or removes
// any, so every expected string below has exactly the same byte length as its
// input (one space per replaced byte).
// NOTE(review): this assumes the "var S: string" overload called by the helper
// is length-preserving as well - confirm against LazUTF8.
procedure TTestLazUTF8.TestUTF8FixBroken;

  // Runs UTF8FixBroken on a copy of S and compares the dbgMemRange rendering of
  // the result with the rendering of Expected. The lengths are passed
  // explicitly so embedded #0 bytes take part in the comparison.
  procedure t(const S, Expected: string);
  var
    Actual: String;
  begin
    Actual:=S; // work on a copy, S is still needed for the failure message
    UTF8FixBroken(Actual);
    AssertEquals('S: '+dbgMemRange(PChar(S),length(S)),
      dbgMemRange(PChar(Expected),length(Expected)),
      dbgMemRange(PChar(Actual),length(Actual)));
  end;

begin
  // single bytes: ASCII stays, a lone continuation byte becomes a space
  t(#$0,#$0);
  t(#$1,#$1);
  t(#$7F,#$7F);
  t(#$80,' ');
  t(#$BF,' ');

  // 2 byte sequences
  // #$C0 and #$C1 are overlong lead bytes and always invalid
  t(#$C0#$0,' '#$0);
  t(#$C0#$7F,' '#$7F);
  t(#$C0#$80,'  ');              // 2 spaces
  t(#$C0#$CF,'  ');              // 2 spaces, #$CF is a lead byte without follower
  t(#$C1#$80,'  ');              // 2 spaces
  t(#$C2#$7F,' '#$7F);
  t(#$C2#$80,#$C2#$80);
  t(#$DF#$80,#$DF#$80);
  t(#$DF#$BF,#$DF#$BF);
  t(#$DF#$C0,'  ');              // 2 spaces
  t(#$DF#$70,' '#$70);

  // 3 byte sequences
  // #$E0 followed by #$80..#$9F is an overlong encoding
  t(#$E0#$80,'  ');              // 2 spaces, truncated sequence
  t(#$E0#$80#$80,'   ');         // 3 spaces
  t(#$E0#$9F#$BF,'   ');         // 3 spaces
  t(#$E0#$A0#$80,#$E0#$A0#$80);
  t(#$E0#$80#$70,'  '#$70);      // 2 spaces + #$70
  t(#$EF#$BF#$BF,#$EF#$BF#$BF);
  t(#$EF#$BF#$7F,'  '#$7F);      // 2 spaces + #$7F
  t(#$EF#$BF#$C0,'   ');         // 3 spaces
  t(#$EF#$7F#$80,' '#$7F' ');

  // 4 byte sequences
  // #$F0 followed by #$80..#$8F is an overlong encoding
  t(#$F0#$80#$80#$80,'    ');    // 4 spaces
  t(#$F0#$8F#$BF#$BF,'    ');    // 4 spaces
  t(#$F0#$9F#$BF#$BF,#$F0#$9F#$BF#$BF);
  t(#$F0#$9F#$BF#$CF,'    ');    // 4 spaces
  t(#$F0#$9F#$CF#$BF,'  '#$CF#$BF); // 2 spaces, then the valid pair #$CF#$BF
  t(#$F0#$CF#$BF#$BF,' '#$CF#$BF' ');
  t(#$F4#$8F#$BF#$BF,#$F4#$8F#$BF#$BF); // U+10FFFF, the highest valid code point
  t(#$F4#$90#$80#$80,'    ');    // 4 spaces, U+110000 is out of range
end;
initialization
AddToLazUtilsTestSuite(TTestLazUTF8);