From 6de8b927830928fb30f9b073489c867885702ba5 Mon Sep 17 00:00:00 2001 From: mattias Date: Tue, 1 Jun 2021 22:01:18 +0000 Subject: [PATCH] lazutils: UTF8FixBroken: fixing out of range and endless loop, added tests git-svn-id: trunk@65163 - --- components/lazutils/lazutf8.pas | 20 +++++++------ test/lazutils/testlazutf8.pas | 51 +++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/components/lazutils/lazutf8.pas b/components/lazutils/lazutf8.pas index 1fe44b0703..24413a2225 100644 --- a/components/lazutils/lazutf8.pas +++ b/components/lazutils/lazutf8.pas @@ -853,33 +853,33 @@ end; { fix any broken UTF8 sequences with spaces } procedure UTF8FixBroken(P: PChar); var + b: byte; c: cardinal; begin if p=nil then exit; while p^<>#0 do begin - if ord(p^)<%10000000 then begin + b:=ord(p^); + if b<%10000000 then begin // regular single byte character inc(p); end - else if ord(p^)<%11000000 then begin + else if b<%11000000 then begin // invalid p^:=' '; inc(p); end - else if ((ord(p^) and %11100000) = %11000000) then begin + else if (b and %11100000) = %11000000 then begin // starts with %110 => should be 2 byte character if ((ord(p[1]) and %11000000) = %10000000) then begin - c:=((ord(p^) and %00011111) shl 6); - //or (ord(p[1]) and %00111111); - if c<(1 shl 7) then + if b<%11000010 then p^:=' ' // fix XSS attack else inc(p,2) end - else if p[1]<>#0 then + else p^:=' '; end - else if ((ord(p^) and %11110000) = %11100000) then begin + else if (b and %11110000) = %11100000 then begin // starts with %1110 => should be 3 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) then begin @@ -893,7 +893,7 @@ begin end else p^:=' '; end - else if ((ord(p^) and %11111000) = %11110000) then begin + else if (b and %11111000) = %11110000 then begin // starts with %11110 => should be 4 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) @@ -904,6 +904,8 @@ begin //or (ord(p[3]) and %00111111); if c<(1 shl 16) then p^:=' ' // fix XSS attack + else if (c>$10FFFF) then + p^:=' ' // out of range U+10FFFF else inc(p,4) end else diff --git a/test/lazutils/testlazutf8.pas b/test/lazutils/testlazutf8.pas index c1d485aee4..d9920e46f7 100644 --- a/test/lazutils/testlazutf8.pas +++ b/test/lazutils/testlazutf8.pas @@ -30,6 +30,7 @@ type procedure TestFindInvalidUTF8; procedure TestFindUnicodeToUTF8; procedure TestUTF8QuotedStr; + procedure TestUTF8FixBroken; end; implementation @@ -95,6 +96,7 @@ begin t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)'); t(#$F4#$90#$80#$80,0,'unicode($110000)'); t(#$c0#0,0,'invalid second byte of 2 byte'); + t(#$c2#0,0,'valid 2 byte'); t(#$e0#0,0,'invalid second byte of 3 byte'); t(#$e0#$80#0,0,'invalid third byte of 3 byte'); t(#$f0#0,0,'invalid second byte of 4 byte'); @@ -160,6 +162,55 @@ begin t('cABc','AB','ABcABABcAB'); end; +procedure TTestLazUTF8.TestUTF8FixBroken; + + procedure t(const S, Expected: string); + var + Actual: String; + begin + Actual:=S; + UTF8FixBroken(Actual); + AssertEquals('S: '+dbgMemRange(PChar(S),length(S)), + dbgMemRange(PChar(Expected),length(Expected)), + dbgMemRange(PChar(Actual),length(Actual))); + end; + +begin + t(#$0,#$0); + t(#$1,#$1); + t(#$7F,#$7F); + t(#$80,' '); + t(#$BF,' '); + t(#$C0#$0,' '#$0); + t(#$C0#$7F,' '#$7F); + t(#$C0#$80,' '); + t(#$C0#$CF,' '); + t(#$C1#$80,' '); + t(#$C2#$7F,' '#$7F); + t(#$C2#$80,#$C2#$80); + t(#$DF#$80,#$DF#$80); + t(#$DF#$BF,#$DF#$BF); + t(#$DF#$C0,' '); + t(#$DF#$70,' '#$70); + t(#$E0#$80,' '); + t(#$E0#$80#$80,' '); + t(#$E0#$9F#$BF,' '); + t(#$E0#$A0#$80,#$E0#$A0#$80); + t(#$E0#$80#$70,' '#$70); + t(#$EF#$BF#$BF,#$EF#$BF#$BF); + t(#$EF#$BF#$7F,' '#$7F); + t(#$EF#$BF#$C0,' '); + t(#$EF#$7F#$80,' '#$7F' '); + t(#$F0#$80#$80#$80,' '); + t(#$F0#$8F#$BF#$BF,' '); + t(#$F0#$9F#$BF#$BF,#$F0#$9F#$BF#$BF); + t(#$F0#$9F#$BF#$CF,' '); + t(#$F0#$9F#$CF#$BF,' '#$CF#$BF); + t(#$F0#$CF#$BF#$BF,' '#$CF#$BF' '); + t(#$F4#$8F#$BF#$BF,#$F4#$8F#$BF#$BF); + t(#$F4#$90#$80#$80,' '); +end; + initialization AddToLazUtilsTestSuite(TTestLazUTF8);