mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-09-16 10:19:25 +02:00
lazutils: UTF8FixBroken: fixing out of range and endless loop, added tests
git-svn-id: trunk@65163 -
This commit is contained in:
parent
c887c889e3
commit
6de8b92783
@ -853,33 +853,33 @@ end;
|
|||||||
{ fix any broken UTF8 sequences with spaces }
|
{ fix any broken UTF8 sequences with spaces }
|
||||||
procedure UTF8FixBroken(P: PChar);
|
procedure UTF8FixBroken(P: PChar);
|
||||||
var
|
var
|
||||||
|
b: byte;
|
||||||
c: cardinal;
|
c: cardinal;
|
||||||
begin
|
begin
|
||||||
if p=nil then exit;
|
if p=nil then exit;
|
||||||
while p^<>#0 do begin
|
while p^<>#0 do begin
|
||||||
if ord(p^)<%10000000 then begin
|
b:=ord(p^);
|
||||||
|
if b<%10000000 then begin
|
||||||
// regular single byte character
|
// regular single byte character
|
||||||
inc(p);
|
inc(p);
|
||||||
end
|
end
|
||||||
else if ord(p^)<%11000000 then begin
|
else if b<%11000000 then begin
|
||||||
// invalid
|
// invalid
|
||||||
p^:=' ';
|
p^:=' ';
|
||||||
inc(p);
|
inc(p);
|
||||||
end
|
end
|
||||||
else if ((ord(p^) and %11100000) = %11000000) then begin
|
else if (b and %11100000) = %11000000 then begin
|
||||||
// starts with %110 => should be 2 byte character
|
// starts with %110 => should be 2 byte character
|
||||||
if ((ord(p[1]) and %11000000) = %10000000) then begin
|
if ((ord(p[1]) and %11000000) = %10000000) then begin
|
||||||
c:=((ord(p^) and %00011111) shl 6);
|
if b<%11000010 then
|
||||||
//or (ord(p[1]) and %00111111);
|
|
||||||
if c<(1 shl 7) then
|
|
||||||
p^:=' ' // fix XSS attack
|
p^:=' ' // fix XSS attack
|
||||||
else
|
else
|
||||||
inc(p,2)
|
inc(p,2)
|
||||||
end
|
end
|
||||||
else if p[1]<>#0 then
|
else
|
||||||
p^:=' ';
|
p^:=' ';
|
||||||
end
|
end
|
||||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
else if (b and %11110000) = %11100000 then begin
|
||||||
// starts with %1110 => should be 3 byte character
|
// starts with %1110 => should be 3 byte character
|
||||||
if ((ord(p[1]) and %11000000) = %10000000)
|
if ((ord(p[1]) and %11000000) = %10000000)
|
||||||
and ((ord(p[2]) and %11000000) = %10000000) then begin
|
and ((ord(p[2]) and %11000000) = %10000000) then begin
|
||||||
@ -893,7 +893,7 @@ begin
|
|||||||
end else
|
end else
|
||||||
p^:=' ';
|
p^:=' ';
|
||||||
end
|
end
|
||||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
else if (b and %11111000) = %11110000 then begin
|
||||||
// starts with %11110 => should be 4 byte character
|
// starts with %11110 => should be 4 byte character
|
||||||
if ((ord(p[1]) and %11000000) = %10000000)
|
if ((ord(p[1]) and %11000000) = %10000000)
|
||||||
and ((ord(p[2]) and %11000000) = %10000000)
|
and ((ord(p[2]) and %11000000) = %10000000)
|
||||||
@ -904,6 +904,8 @@ begin
|
|||||||
//or (ord(p[3]) and %00111111);
|
//or (ord(p[3]) and %00111111);
|
||||||
if c<(1 shl 16) then
|
if c<(1 shl 16) then
|
||||||
p^:=' ' // fix XSS attack
|
p^:=' ' // fix XSS attack
|
||||||
|
else if (c>$10FFFF) then
|
||||||
|
p^:=' ' // out of range U+10FFFF
|
||||||
else
|
else
|
||||||
inc(p,4)
|
inc(p,4)
|
||||||
end else
|
end else
|
||||||
|
@ -30,6 +30,7 @@ type
|
|||||||
procedure TestFindInvalidUTF8;
|
procedure TestFindInvalidUTF8;
|
||||||
procedure TestFindUnicodeToUTF8;
|
procedure TestFindUnicodeToUTF8;
|
||||||
procedure TestUTF8QuotedStr;
|
procedure TestUTF8QuotedStr;
|
||||||
|
procedure TestUTF8FixBroken;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
implementation
|
implementation
|
||||||
@ -95,6 +96,7 @@ begin
|
|||||||
t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)');
|
t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)');
|
||||||
t(#$F4#$90#$80#$80,0,'unicode($110000)');
|
t(#$F4#$90#$80#$80,0,'unicode($110000)');
|
||||||
t(#$c0#0,0,'invalid second byte of 2 byte');
|
t(#$c0#0,0,'invalid second byte of 2 byte');
|
||||||
|
t(#$c2#0,0,'valid 2 byte');
|
||||||
t(#$e0#0,0,'invalid second byte of 3 byte');
|
t(#$e0#0,0,'invalid second byte of 3 byte');
|
||||||
t(#$e0#$80#0,0,'invalid third byte of 3 byte');
|
t(#$e0#$80#0,0,'invalid third byte of 3 byte');
|
||||||
t(#$f0#0,0,'invalid second byte of 4 byte');
|
t(#$f0#0,0,'invalid second byte of 4 byte');
|
||||||
@ -160,6 +162,55 @@ begin
|
|||||||
t('cABc','AB','ABcABABcAB');
|
t('cABc','AB','ABcABABcAB');
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
procedure TTestLazUTF8.TestUTF8FixBroken;
|
||||||
|
|
||||||
|
procedure t(const S, Expected: string);
|
||||||
|
var
|
||||||
|
Actual: String;
|
||||||
|
begin
|
||||||
|
Actual:=S;
|
||||||
|
UTF8FixBroken(Actual);
|
||||||
|
AssertEquals('S: '+dbgMemRange(PChar(S),length(S)),
|
||||||
|
dbgMemRange(PChar(Expected),length(Expected)),
|
||||||
|
dbgMemRange(PChar(Actual),length(Actual)));
|
||||||
|
end;
|
||||||
|
|
||||||
|
begin
|
||||||
|
t(#$0,#$0);
|
||||||
|
t(#$1,#$1);
|
||||||
|
t(#$7F,#$7F);
|
||||||
|
t(#$80,' ');
|
||||||
|
t(#$BF,' ');
|
||||||
|
t(#$C0#$0,' '#$0);
|
||||||
|
t(#$C0#$7F,' '#$7F);
|
||||||
|
t(#$C0#$80,' ');
|
||||||
|
t(#$C0#$CF,' ');
|
||||||
|
t(#$C1#$80,' ');
|
||||||
|
t(#$C2#$7F,' '#$7F);
|
||||||
|
t(#$C2#$80,#$C2#$80);
|
||||||
|
t(#$DF#$80,#$DF#$80);
|
||||||
|
t(#$DF#$BF,#$DF#$BF);
|
||||||
|
t(#$DF#$C0,' ');
|
||||||
|
t(#$DF#$70,' '#$70);
|
||||||
|
t(#$E0#$80,' ');
|
||||||
|
t(#$E0#$80#$80,' ');
|
||||||
|
t(#$E0#$9F#$BF,' ');
|
||||||
|
t(#$E0#$A0#$80,#$E0#$A0#$80);
|
||||||
|
t(#$E0#$80#$70,' '#$70);
|
||||||
|
t(#$EF#$BF#$BF,#$EF#$BF#$BF);
|
||||||
|
t(#$EF#$BF#$7F,' '#$7F);
|
||||||
|
t(#$EF#$BF#$C0,' ');
|
||||||
|
t(#$EF#$7F#$80,' '#$7F' ');
|
||||||
|
t(#$F0#$80#$80#$80,' ');
|
||||||
|
t(#$F0#$8F#$BF#$BF,' ');
|
||||||
|
t(#$F0#$9F#$BF#$BF,#$F0#$9F#$BF#$BF);
|
||||||
|
t(#$F0#$9F#$BF#$CF,' ');
|
||||||
|
t(#$F0#$9F#$CF#$BF,' '#$CF#$BF);
|
||||||
|
t(#$F0#$CF#$BF#$BF,' '#$CF#$BF' ');
|
||||||
|
t(#$F4#$8F#$BF#$BF,#$F4#$8F#$BF#$BF);
|
||||||
|
t(#$F4#$90#$80#$80,' ');
|
||||||
|
end;
|
||||||
|
|
||||||
initialization
|
initialization
|
||||||
AddToLazUtilsTestSuite(TTestLazUTF8);
|
AddToLazUtilsTestSuite(TTestLazUTF8);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user