From 98ecfb1e4146927de0530a862643ddbafdc7a249 Mon Sep 17 00:00:00 2001
From: Rika Ichinose <rrunewalsh@gmail.com>
Date: Thu, 23 Jun 2022 20:51:04 +0300
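Subject: [PATCH] Simplify SanitiseXMLString.

Replace the hand-written backward UTF-8 decoder (the per-lead-byte case
statement that accumulated the code point in UTF8Char) with a short
backward scan over continuation bytes followed by a single call to
Utf8CodepointLen. The sequence is treated as invalid, and DoASCII is
set, when the scan reaches the start of the string or when
Utf8CodepointLen does not report exactly the scanned length. The
now-unused UTF8Char local is removed.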
---
 compiler/verbose.pas | 97 ++++----------------------------------------
 1 file changed, 9 insertions(+), 88 deletions(-)

diff --git a/compiler/verbose.pas b/compiler/verbose.pas
index fca4247791..06bd7b959e 100644
--- a/compiler/verbose.pas
+++ b/compiler/verbose.pas
@@ -1090,7 +1090,7 @@ implementation
 
     function SanitiseXMLString(const S: ansistring): ansistring;
       var
-        X, UTF8Len, UTF8Char, CurrentChar: Integer;
+        X, UTF8Len, CurrentChar: Integer;
         needs_quoting, in_quotes, add_end_quote: Boolean;
         DoASCII: Boolean;
 
@@ -1234,96 +1234,17 @@ implementation
                         end;
                     end;
 
-                  UTF8Char := CurrentChar and $3F; { The data bits of the continuation byte }
-                  UTF8Len := 1; { This variable actually holds 1 less than the length }
+                  UTF8Len := 1;
+                  repeat
+                    inc(UTF8Len);
+                    dec(X);
+                  until (X = 0) or (UTF8Len >= 4) or (ord(Result[X]) shr 6 <> 2);
 
-                  { By setting DoASCII to true, it marks the string as 'invalid UTF-8'
-                    automatically if it reaches the beginning of the string unexpectedly }
-                  DoASCII := True;
-
-                  Dec(X);
-                  while X > 0 do
+                  if (X = 0) or (Utf8CodepointLen(@Result[X], UTF8Len, False) <> UTF8Len) then
                     begin
-                      CurrentChar := Ord(Result[X]);
-
-                      case CurrentChar of
-                        { A standard character here is invalid UTF-8 }
-                        $00..$7F:
-                          Break;
-
-                        { Another continuation byte }
-                        $80..$BF:
-                          begin
-                            UTF8Char := UTF8Char or ((CurrentChar and $3F) shl (6 * UTF8Len));
-
-                            dec(X);
-                            Inc(UTF8Len);
-                            if UTF8Len >= 4 then
-                              { Sequence too long }
-                              Break;
-                          end;
-
-                        { Lead byte for 2-byte sequences }
-                        $C2..$DF:
-                          begin
-                            if UTF8Len <> 1 then Break;
-
-                            UTF8Char := UTF8Char or ((CurrentChar and $1F) shl 6);
-
-                            { Check to see if the code is in range and not part of an 'overlong' sequence }
-                            case UTF8Char of
-                              $0080..$07FF:
-                                DoASCII := False;
-                              else
-                                { Do nothing - DoASCII is already true }
-                            end;
-                            Break;
-                          end;
-
-                        { Lead byte for 3-byte sequences }
-                        $E0..$EF:
-                          begin
-                            if UTF8Len <> 2 then Break;
-
-                            UTF8Char := UTF8Char or ((CurrentChar and $0F) shl 12);
-
-                            { Check to see if the code is in range and not part of an 'overlong' sequence }
-                            case UTF8Char of
-                              $0800..$D7FF, $E000..$FFFF: { $D800..$DFFF is reserved and hence invalid }
-                                DoASCII := False;
-                              else
-                                { Do nothing - DoASCII is already true }
-                            end;
-                            Break;
-                          end;
-
-                        { Lead byte for 4-byte sequences }
-                        $F0..$F4:
-                          begin
-                            if UTF8Len <> 3 then Break;
-
-                            UTF8Char := UTF8Char or ((CurrentChar and $07) shl 18);
-
-                            { Check to see if the code is in range and not part of an 'overlong' sequence }
-                            case UTF8Char of
-                              $010000..$10FFFF:
-                                DoASCII := False;
-                              else
-                                { Do nothing - DoASCII is already true }
-                            end;
-                            Break;
-                          end;
-
-                        { Invalid character }
-                        else
-                          Break;
-                      end;
+                      DoASCII := True;
+                      break;
                     end;
-
-                  if DoASCII then
-                    Break;
-
-                  { If all is fine, we don't need to encode any more characters }
                 end;
 
               { Invalid UTF-8 bytes and lead bytes without continuation bytes }