From 242f0ac05629b99b96537c236f06f29621788df2 Mon Sep 17 00:00:00 2001 From: Bart <9132501-flyingsheep@users.noreply.gitlab.com> Date: Wed, 19 Jan 2022 15:38:43 +0100 Subject: [PATCH] LazUtf8: faster implementation Utf8EscapeControlChars based upon idea by Alexey Torgashin. Issue #39573. --- components/lazutils/lazutf8.pas | 45 ++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/components/lazutils/lazutf8.pas b/components/lazutils/lazutf8.pas index 3ead9ef89b..1cd1e32e98 100644 --- a/components/lazutils/lazutf8.pas +++ b/components/lazutils/lazutf8.pas @@ -2937,30 +2937,61 @@ const '[CAN]', '[EM]' , '[SUB]', '[ESC]', '[FS]' , '[GS]' , '[RS]' , '[US]'); var Ch: Char; - i: Integer; + i,ResLen: Integer; + SubLen: SizeInt; +const + MaxGrowFactor: array[TEscapeMode] of integer = (3, 4, 5, 5, 5); begin if FindInvalidUTF8Codepoint(PChar(S), Length(S)) <> -1 then begin UTF8FixBroken(S); end; Result := ''; + SetLength(Result, Length(S)*MaxGrowFactor[EscapeMode]); + ResLen := 0; //a byte < 127 cannot be part of a multi-byte codepoint, so this is safe for i := 1 to Length(S) do begin + Inc(ResLen); Ch := S[i]; if (Ch < #32) then begin case EscapeMode of - emPascal: Result := Result + PascalEscapeStrings[Ch]; - emHexPascal: Result := Result + HexEscapePascalStrings[Ch]; - emHexC: Result := Result + HexEscapeCStrings[Ch]; - emC: Result := Result + CEscapeStrings[Ch]; - emAsciiControlNames: Result := Result + AsciiControlStrings[Ch]; + emPascal: + begin + Move(PascalEscapeStrings[Ch][1], Result[ResLen], 3); + Inc(ResLen, 3-1); + end; + emHexPascal: + begin + Move(HexEscapePascalStrings[Ch][1], Result[ResLen], 4); + Inc(ResLen, 4-1); + end; + emHexC: + begin + Move(HexEscapeCStrings[Ch][1], Result[ResLen], 5); + Inc(ResLen, 5-1); + end; + emC: + begin + SubLen := Length(CEscapeStrings[Ch]); + Move(CEscapeStrings[Ch][1], Result[ResLen], SubLen); + Inc(ResLen, SubLen-1); + end; + emAsciiControlNames: + begin + SubLen := Length(AsciiControlStrings[Ch]); + Move(AsciiControlStrings[Ch][1], Result[ResLen], SubLen); + Inc(ResLen, SubLen-1); + end; end;//case end else - Result := Result + Ch; + begin + Result[ResLen] := Ch; + end; end; + SetLength(Result, ResLen); end; function UTF8StringOfChar(AUtf8Char: String; N: Integer): String;