lazutils: FindInvalidUTF8Character: check for wrongly mapped (overlong) codes, changed default of StopOnNonASCII to true to report gaps

git-svn-id: trunk@47172 -
2025-12-11 05:40:45 +01:00 · 2014-12-10 14:26:18 +00:00 · 2014-12-10 14:26:18 +00:00 · 814cf1a717
commit 814cf1a717
parent 98578ff53a
7 changed files with 84 additions and 29 deletions
--- a/components/aarre/src/aarrepkglist.pas
+++ b/components/aarre/src/aarrepkglist.pas
@ -206,7 +206,7 @@ var
 begin
  Result:=s;
  if Result='' then exit;
-  i:=FindInvalidUTF8Character(PChar(Result),length(Result),true);
+  i:=FindInvalidUTF8Character(PChar(Result),length(Result));
  if i<0 then exit;
  Result:=ISO_8859_1ToUTF8(Result);
 end;
--- a/components/lazutils/lazutf8.pas
+++ b/components/lazutils/lazutf8.pas
@ -95,7 +95,7 @@ function UTF8LowerString(const s: string): string;
 function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
 function UTF8UpperString(const s: string): string;
 function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
-                                  StopOnNonASCII: Boolean = false): PtrInt;
+                                  StopOnNonASCII: Boolean = true): PtrInt;
 function ValidUTF8String(const s: String): String;
 function Utf8StringOfChar(AUtf8Char: String; N: Integer): String;
 function Utf8AddChar(AUtf8Char: String; const S: String; N: Integer): String;
@ -696,7 +696,7 @@ end;
 procedure UTF8FixBroken(var S: string);
 begin
  if S='' then exit;
-  if FindInvalidUTF8Character(PChar(S),length(S),true)<0 then exit;
+  if FindInvalidUTF8Character(PChar(S),length(S))<0 then exit;
  UniqueString(S);
  UTF8FixBroken(PChar(S));
 end;
@ -2492,38 +2492,44 @@ begin
    Result:=0;
    while Result<Count do begin
      c:=p^;
-      if ord(c)<128 then begin
+      if ord(c)<%10000000 then begin
        // regular single byte ASCII character (#0 is a character, this is pascal ;)
        CharLen:=1;
-      end
-      else if ord(c)<%11000000 then begin
-        // regular single byte character
-        if StopOnNonASCII then
+      end else if ord(c)<=%11000001 then begin
+        // single byte character, between valid UTF-8 encodings
+        // %11000000 and %11000001 map 2 byte to #0..#127, which is invalid and used for XSS attacks
+        if StopOnNonASCII or (ord(c)>=192) then
          exit;
        CharLen:=1;
-      end
-      else if ((ord(c) and %11100000) = %11000000) then begin
-        // could be 2 byte character
-        if (Result<Count-1) and ((ord(p[1]) and %11000000) = %10000000) then
+      end else if ord(c)<=%11011111 then begin
+        // could be 2 byte character (%110xxxxx %10xxxxxx)
+        if (Result<Count-1)
+        and ((ord(p[1]) and %11000000) = %10000000) then
          CharLen:=2
        else
          exit; // missing following bytes
      end
-      else if ((ord(c) and %11110000) = %11100000) then begin
-        // could be 3 byte character
-        if (Result<Count-2) and ((ord(p[1]) and %11000000) = %10000000)
-        and ((ord(p[2]) and %11000000) = %10000000) then
-          CharLen:=3
-        else
+      else if ord(c)<=%11101111 then begin
+        // could be 3 byte character (%1110xxxx %10xxxxxx %10xxxxxx)
+        if (Result<Count-2)
+        and ((ord(p[1]) and %11000000) = %10000000)
+        and ((ord(p[2]) and %11000000) = %10000000) then begin
+          if (ord(c)=%11100000) and (ord(p[1])<=%10011111) then
+            exit; // XSS attack: 3 bytes are mapped to the 1 or 2 byte codes
+          CharLen:=3;
+        end else
          exit; // missing following bytes
      end
-      else if ((ord(c) and %11111000) = %11110000) then begin
-        // could be 4 byte character
-        if (Result<Count-3) and ((ord(p[1]) and %11000000) = %10000000)
+      else if ord(c)<=%11110111 then begin
+        // could be 4 byte character (%11110xxx %10xxxxxx %10xxxxxx %10xxxxxx)
+        if (Result<Count-3)
+        and ((ord(p[1]) and %11000000) = %10000000)
        and ((ord(p[2]) and %11000000) = %10000000)
-        and ((ord(p[3]) and %11000000) = %10000000) then
-          CharLen:=4
-        else
+        and ((ord(p[3]) and %11000000) = %10000000) then begin
+          if (ord(c)=%11110000) and (ord(p[1])<=%10001111) then
+            exit; // XSS attack: 4 bytes are mapped to the 1-3 byte codes
+          CharLen:=4;
+        end else
          exit; // missing following bytes
      end
      else begin
--- a/docs/xml/lazutils/lazutf8.xml
+++ b/docs/xml/lazutils/lazutf8.xml
@ -467,7 +467,7 @@ Returns 0 if not found.
      </element>
      <!-- function Visibility: default -->
      <element name="ValidUTF8String">
-        <short/>
+        <short>Replace invalid UTF8 and replace #0..#31 characters with '#0'..'#31'</short>
        <descr/>
        <errors/>
        <seealso/>
--- a/ide/debugmanager.pas
+++ b/ide/debugmanager.pas
@ -986,7 +986,7 @@ begin
    ExceptMsg := AExceptionText;
    // if AExceptionText is not a valid UTF8 string,
    // then assume it has the ansi encoding and convert it
-    if FindInvalidUTF8Character(pchar(ExceptMsg),length(ExceptMsg), False) > 0 then
+    if FindInvalidUTF8Character(pchar(ExceptMsg),length(ExceptMsg)) > 0 then
      ExceptMsg := AnsiToUtf8(ExceptMsg);
    msg := Format(lisProjectSRaisedExceptionClassSWithMessageSS,
                  [GetTitle, AExceptionClass, LineEnding, ExceptMsg]);
--- a/lcl/include/application.inc
+++ b/lcl/include/application.inc
@ -1544,7 +1544,7 @@ var
 begin
  if AppNoExceptionMessages in FFlags then exit;
  Msg := E.Message;
-  if FindInvalidUTF8Character(PChar(Msg), Length(Msg), False) > 0 then
+  if FindInvalidUTF8Character(PChar(Msg), Length(Msg)) > 0 then
    Msg := AnsiToUtf8(Msg);
  if (Msg <> '') and (Msg[length(Msg)] <> '.') then Msg := Msg + '.';
  if (not Terminated) and (Self <> nil) and (AppInitialized in FFlags) then
--- a/lcl/lclproc.pas
+++ b/lcl/lclproc.pas
@ -355,7 +355,7 @@ procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt
 function UTF8LowerCase(const s: String): String;
 function UTF8UpperCase(const s: String): String;
 function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
-                                  StopOnNonASCII: Boolean = false): PtrInt; inline;
+                                  StopOnNonASCII: Boolean = true): PtrInt; inline;
 function ValidUTF8String(const s: String): String; inline;

 procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
--- a/test/lazutils/testlazutf8.pas
+++ b/test/lazutils/testlazutf8.pas
@ -5,6 +5,7 @@
 Test specific with:
     ./runtests --format=plain --suite=TestUTF8Trim
     ./runtests --format=plain --suite=TestUTF8Pos
+     ./runtests --format=plain --suite=TestFindInvalidUTF8
 }
 unit TestLazUTF8;

@ -13,7 +14,7 @@ unit TestLazUTF8;
 interface

 uses
-  Classes, SysUtils, fpcunit, testglobals, LazUTF8;
+  Classes, SysUtils, fpcunit, testglobals, LazUTF8, LazLoggerBase;

 type

@ -24,6 +25,7 @@ type
  published
    procedure TestUTF8Trim;
    procedure TestUTF8Pos;
+    procedure TestFindInvalidUTF8;
  end;

 implementation
@ -54,6 +56,53 @@ begin
  AssertEquals('Check #0',2,UTF8Pos('bc'#0,'abc'#0'abc'));
 end;

+procedure TTestLazUTF8.TestFindInvalidUTF8;
+  // t: run FindInvalidUTF8Character on s with default arguments and compare with Expected
+  procedure t(const s: string; Expected: PtrInt; const Title: string);
+  var
+    Actual: PtrInt;
+  begin
+    Actual:=FindInvalidUTF8Character(PChar(s),length(s));
+    AssertEquals(Title+': '+dbgMemRange(Pointer(s),length(s)),Expected,Actual);
+  end;
+  // Expected is -1 when s is accepted, otherwise the 0-based index that is reported
+begin
+  t('',-1,'empty');
+  t('a',-1,'');
+  t('a'#0,-1,'a with #0');
+  t(#0'a',-1,'#0 with a');
+  t(#128,0,'gap value 128');
+  t(#191,0,'gap value 191');
+  // 1 byte UTF-8
+  t(UnicodeToUTF8(0),-1,'unicode(0)');
+  t(UnicodeToUTF8(1),-1,'unicode(1)');
+  t(UnicodeToUTF8(65),-1,'unicode(65)');
+  t(UnicodeToUTF8($7f),-1,'unicode($7f)');
+  // 2 bytes UTF-8
+  t(UnicodeToUTF8($80),-1,'unicode($80)');
+  t(UnicodeToUTF8($7ff),-1,'unicode($7ff)');
+  // 3 bytes UTF-8
+  t(UnicodeToUTF8($800),-1,'unicode($800)');
+  t(UnicodeToUTF8($ffff),-1,'unicode($ffff)');
+  // 4 bytes UTF-8
+  t(UnicodeToUTF8($10000),-1,'unicode($10000)');
+  t(UnicodeToUTF8($10ffff),-1,'unicode($10ffff)');
+  t(#$c2#0,0,'invalid second byte of 2 byte'); // #$c2 is a valid lead byte, only the second byte is wrong
+  t(#$e0#0,0,'invalid second byte of 3 byte');
+  t(#$e0#$80#0,0,'invalid third byte of 3 byte');
+  t(#$f0#0,0,'invalid second byte of 4 byte');
+  t(#$f0#$80#0,0,'invalid third byte of 4 byte');
+  t(#$f0#$80#$80#0,0,'invalid fourth byte of 4 byte');
+  t(#$c0#$80,0,'invalid: ascii encoded as 2 byte');
+  t(#$c0#$8f,0,'invalid: ascii encoded as 2 byte');
+  t(#$c1#$80,0,'invalid: ascii encoded as 2 byte');
+  t(#$c1#$bf,0,'invalid: ascii encoded as 2 byte');
+  t(#$e0#$80#$80,0,'invalid: 0 encoded as 3 byte');
+  t(#$e0#$9f#$bf,0,'invalid: $7ff encoded as 3 byte');
+  t(#$f0#$80#$80#$80,0,'invalid: 0 encoded as 4 byte');
+  t(#$f0#$8f#$bf#$bf,0,'invalid: $ffff encoded as 4 byte');
+end;
+
 initialization
  AddToLazUtilsTestSuite(TTestLazUTF8);