lazutils: FindInvalidUTF8Codepoint: check if bigger than U+10FFFF

git-svn-id: trunk@65162 -
2025-08-23 16:29:13 +02:00 · 2021-06-01 21:02:23 +00:00 · 2021-06-01 21:02:23 +00:00 · c887c889e3
commit c887c889e3
parent e828efa600
2 changed files with 17 additions and 10 deletions
--- a/components/lazutils/lazutf8.pas
+++ b/components/lazutils/lazutf8.pas
@@ -2813,22 +2813,22 @@ function FindInvalidUTF8Codepoint(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolea
 // return -1 if ok
 var
  CharLen: Integer;
-  c: Char;
+  c: Byte;
 begin
  if (p<>nil) then begin
    Result:=0;
    while Result<Count do begin
-      c:=p^;
+      c:=ord(p^);
-      if ord(c)<%10000000 then begin
+      if c<%10000000 then begin
        // regular single byte ASCII character (#0 is a character, this is Pascal ;)
        CharLen:=1;
-      end else if ord(c)<=%11000001 then begin
+      end else if c<=%11000001 then begin
        // single byte character, between valid UTF-8 encodings
        // %11000000 and %11000001 map 2 byte to #0..#128, which is invalid and used for XSS attacks
-        if StopOnNonUTF8 or (ord(c)>=192) then
+        if StopOnNonUTF8 or (c>=192) then
          exit;
        CharLen:=1;
-      end else if ord(c)<=%11011111 then begin
+      end else if c<=%11011111 then begin
        // could be 2 byte character (%110xxxxx %10xxxxxx)
        if (Result<Count-1)
        and ((ord(p[1]) and %11000000) = %10000000) then
@@ -2836,25 +2836,29 @@ begin
        else
          exit; // missing following bytes
      end
-      else if ord(c)<=%11101111 then begin
+      else if c<=%11101111 then begin
        // could be 3 byte character (%1110xxxx %10xxxxxx %10xxxxxx)
        if (Result<Count-2)
        and ((ord(p[1]) and %11000000) = %10000000)
        and ((ord(p[2]) and %11000000) = %10000000) then begin
-          if (ord(c)=%11100000) and (ord(p[1])<=%10011111) then
+          if (c=%11100000) and (ord(p[1])<=%10011111) then
            exit; // XSS attack: 3 bytes are mapped to the 1 or 2 byte codes
          CharLen:=3;
        end else
          exit; // missing following bytes
      end
-      else if ord(c)<=%11110111 then begin
+      else if c<=%11110111 then begin
        // could be 4 byte character (%11110xxx %10xxxxxx %10xxxxxx %10xxxxxx)
        if (Result<Count-3)
        and ((ord(p[1]) and %11000000) = %10000000)
        and ((ord(p[2]) and %11000000) = %10000000)
        and ((ord(p[3]) and %11000000) = %10000000) then begin
-          if (ord(c)=%11110000) and (ord(p[1])<=%10001111) then
+          if (c=%11110000) and (ord(p[1])<=%10001111) then
            exit; // XSS attack: 4 bytes are mapped to the 1-3 byte codes
+          if (c>%11110100) then
+            exit; // out of range U+10FFFF
+          if (c=%11110100) and (ord(p[1])>%10001111) then
+            exit; // out of range U+10FFFF
          CharLen:=4;
        end else
          exit; // missing following bytes
--- a/test/lazutils/testlazutf8.pas
+++ b/test/lazutils/testlazutf8.pas
@@ -92,6 +92,8 @@ begin
  t(UnicodeToUTF8($10000),-1,'unicode($10000)');
  t(UnicodeToUTF8($10900),-1,'unicode($10900)');
  t(UnicodeToUTF8($10ffff),-1,'unicode($10ffff)');
+  t(#$F4#$8F#$BF#$BF,-1,'unicode($10ffff)');
+  t(#$F4#$90#$80#$80,0,'unicode($110000)');
  t(#$c0#0,0,'invalid second byte of 2 byte');
  t(#$e0#0,0,'invalid second byte of 3 byte');
  t(#$e0#$80#0,0,'invalid third byte of 3 byte');
@@ -106,6 +108,7 @@ begin
  t(#$e0#$9f#$bf,0,'invalid: $7ff encoded as 3 byte');
  t(#$f0#$80#$80#$80,0,'invalid: 0 encoded as 4 byte');
  t(#$f0#$8f#$bf#$bf,0,'invalid: $ffff encoded as 4 byte');
+  t(#$F7#$BF#$BF#$BF,0,'invalid 4 byte out of range');
 end;
 procedure TTestLazUTF8.TestFindUnicodeToUTF8;