mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-09-10 12:39:31 +02:00
lazutils: UTF8FixBroken: fixing out of range and endless loop, added tests
git-svn-id: trunk@65163 -
This commit is contained in:
parent
c887c889e3
commit
6de8b92783
@ -853,33 +853,33 @@ end;
|
||||
{ fix any broken UTF8 sequences with spaces }
|
||||
procedure UTF8FixBroken(P: PChar);
|
||||
var
|
||||
b: byte;
|
||||
c: cardinal;
|
||||
begin
|
||||
if p=nil then exit;
|
||||
while p^<>#0 do begin
|
||||
if ord(p^)<%10000000 then begin
|
||||
b:=ord(p^);
|
||||
if b<%10000000 then begin
|
||||
// regular single byte character
|
||||
inc(p);
|
||||
end
|
||||
else if ord(p^)<%11000000 then begin
|
||||
else if b<%11000000 then begin
|
||||
// invalid
|
||||
p^:=' ';
|
||||
inc(p);
|
||||
end
|
||||
else if ((ord(p^) and %11100000) = %11000000) then begin
|
||||
else if (b and %11100000) = %11000000 then begin
|
||||
// starts with %110 => should be 2 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000) then begin
|
||||
c:=((ord(p^) and %00011111) shl 6);
|
||||
//or (ord(p[1]) and %00111111);
|
||||
if c<(1 shl 7) then
|
||||
if b<%11000010 then
|
||||
p^:=' ' // fix XSS attack
|
||||
else
|
||||
inc(p,2)
|
||||
end
|
||||
else if p[1]<>#0 then
|
||||
else
|
||||
p^:=' ';
|
||||
end
|
||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
||||
else if (b and %11110000) = %11100000 then begin
|
||||
// starts with %1110 => should be 3 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then begin
|
||||
@ -893,7 +893,7 @@ begin
|
||||
end else
|
||||
p^:=' ';
|
||||
end
|
||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
||||
else if (b and %11111000) = %11110000 then begin
|
||||
// starts with %11110 => should be 4 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
@ -904,6 +904,8 @@ begin
|
||||
//or (ord(p[3]) and %00111111);
|
||||
if c<(1 shl 16) then
|
||||
p^:=' ' // fix XSS attack
|
||||
else if (c>$10FFFF) then
|
||||
p^:=' ' // out of range U+10FFFF
|
||||
else
|
||||
inc(p,4)
|
||||
end else
|
||||
|
@ -30,6 +30,7 @@ type
|
||||
procedure TestFindInvalidUTF8;
|
||||
procedure TestFindUnicodeToUTF8;
|
||||
procedure TestUTF8QuotedStr;
|
||||
procedure TestUTF8FixBroken;
|
||||
end;
|
||||
|
||||
implementation
|
||||
@ -95,6 +96,7 @@ begin
|
||||
t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)');
|
||||
t(#$F4#$90#$80#$80,0,'unicode($110000)');
|
||||
t(#$c0#0,0,'invalid second byte of 2 byte');
|
||||
t(#$c2#0,0,'valid 2 byte');
|
||||
t(#$e0#0,0,'invalid second byte of 3 byte');
|
||||
t(#$e0#$80#0,0,'invalid third byte of 3 byte');
|
||||
t(#$f0#0,0,'invalid second byte of 4 byte');
|
||||
@ -160,6 +162,55 @@ begin
|
||||
t('cABc','AB','ABcABABcAB');
|
||||
end;
|
||||
|
||||
procedure TTestLazUTF8.TestUTF8FixBroken;
{ Table-driven test for UTF8FixBroken.
  Each case feeds a short byte sequence and states the bytes expected
  afterwards: well-formed sequences must come back unchanged, and every byte
  that is not part of a well-formed sequence must come back as one space.
  A replaced byte is never removed, so Expected always has the same length
  as the input. }

  { Runs UTF8FixBroken on a copy of S and compares the result with Expected.
    Both sides go through dbgMemRange (presumably a printable dump of the
    given byte range - confirm in LazLoggerBase), so a failing case reports
    bytes instead of raw, possibly invalid, UTF-8. }
  procedure t(const S, Expected: string);
  var
    Actual: String;
  begin
    // Work on a copy: the test expects UTF8FixBroken to change its argument.
    Actual:=S;
    UTF8FixBroken(Actual);
    AssertEquals('S: '+dbgMemRange(PChar(S),length(S)),
      dbgMemRange(PChar(Expected),length(Expected)),
      dbgMemRange(PChar(Actual),length(Actual)));
  end;

begin
  // single bytes: $00..$7F are kept, a lone continuation byte is replaced
  t(#$0,#$0);
  t(#$1,#$1);
  t(#$7F,#$7F);
  t(#$80,' ');
  t(#$BF,' ');

  // 2 byte sequences: lead bytes $C0 and $C1 are always rejected (overlong);
  // $C2..$DF are kept only when followed by a continuation byte $80..$BF
  t(#$C0#$0,' '#$0);
  t(#$C0#$7F,' '#$7F);
  t(#$C0#$80,'  ');
  t(#$C0#$CF,'  ');
  t(#$C1#$80,'  ');
  t(#$C2#$7F,' '#$7F);
  t(#$C2#$80,#$C2#$80);
  t(#$DF#$80,#$DF#$80);
  t(#$DF#$BF,#$DF#$BF);
  t(#$DF#$C0,'  ');
  t(#$DF#$70,' '#$70);

  // 3 byte sequences: truncated, overlong (below U+0800) or with a
  // non-continuation byte inside are rejected
  t(#$E0#$80,'  ');
  t(#$E0#$80#$80,'   ');
  t(#$E0#$9F#$BF,'   ');
  t(#$E0#$A0#$80,#$E0#$A0#$80);
  t(#$E0#$80#$70,'  '#$70);
  t(#$EF#$BF#$BF,#$EF#$BF#$BF);
  t(#$EF#$BF#$7F,'  '#$7F);
  t(#$EF#$BF#$C0,'   ');
  t(#$EF#$7F#$80,' '#$7F' ');

  // 4 byte sequences: overlong (below U+10000), above U+10FFFF or with a
  // non-continuation byte inside are rejected; an embedded valid 2 byte
  // sequence ($CF $BF) survives the re-scan
  t(#$F0#$80#$80#$80,'    ');
  t(#$F0#$8F#$BF#$BF,'    ');
  t(#$F0#$9F#$BF#$BF,#$F0#$9F#$BF#$BF);
  t(#$F0#$9F#$BF#$CF,'    ');
  t(#$F0#$9F#$CF#$BF,'  '#$CF#$BF);
  t(#$F0#$CF#$BF#$BF,' '#$CF#$BF' ');
  t(#$F4#$8F#$BF#$BF,#$F4#$8F#$BF#$BF);
  t(#$F4#$90#$80#$80,'    ');
end;
|
||||
|
||||
initialization
|
||||
AddToLazUtilsTestSuite(TTestLazUTF8);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user