From f8be53b0e6475b4703ba77e50d1736234ed6a44c Mon Sep 17 00:00:00 2001 From: juha Date: Wed, 13 Dec 2017 00:07:00 +0000 Subject: [PATCH] LazUtils: Change "Character" to "Codepoint" also in some parameter names in LazUTF8. Cleanup. git-svn-id: trunk@56708 - --- components/lazutils/lazutf8.pas | 106 ++++++++++++-------------------- 1 file changed, 38 insertions(+), 68 deletions(-) diff --git a/components/lazutils/lazutf8.pas b/components/lazutils/lazutf8.pas index 5258521fec..453276f109 100644 --- a/components/lazutils/lazutf8.pas +++ b/components/lazutils/lazutf8.pas @@ -86,7 +86,7 @@ function UTF8LengthFast(const s: string): PtrInt; inline; function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt; // Functions dealing with unicode number U+xxx. -function UTF8CodepointToUnicode(p: PChar; out CharLen: integer): Cardinal; +function UTF8CodepointToUnicode(p: PChar; out CodepointLen: integer): Cardinal; function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal; deprecated 'Use UTF8CodepointToUnicode instead.'; function UnicodeToUTF8(CodePoint: cardinal): string; // UTF32 to UTF8 function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer; // UTF32 to UTF8 @@ -96,13 +96,13 @@ function UTF8ToDoubleByteString(const s: string): string; function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt; function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt): SizeInt; -function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CharLen: Integer): Boolean; +function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CodepointLen: Integer): Boolean; function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean; // find the n-th UTF8 codepoint, ignoring BIDI function UTF8CodepointStart(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PChar; function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar; deprecated 'Use UTF8CodepointStart instead.'; // find the byte index of the n-th UTF8 codepoint, ignoring BIDI (byte len of substr) -function UTF8CodepointToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt; +function UTF8CodepointToByteIndex(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PtrInt; function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt; deprecated 'Use UTF8CodepointToByteIndex instead.'; procedure UTF8FixBroken(P: PChar); overload; procedure UTF8FixBroken(var S: string); overload; @@ -146,11 +146,10 @@ function UTF8StartsText(const ASubText, AText: string): Boolean; function UTF8EndsText(const ASubText, AText: string): Boolean; function UTF8ReverseString(p: PChar; const ByteCount: LongInt): string; function UTF8ReverseString(const AText: string): string; inline; -function UTF8RPosByReverse(const Substr, Source: string): PtrInt; deprecated 'Slow.'; function UTF8RPos(const Substr, Source: string): PtrInt; -function UTF8WrapText(S, BreakStr :string; BreakChars :TSysCharSet; MaxCol: integer): string; overload; -function UTF8WrapText(S :string; MaxCol :integer) :string; overload; +function UTF8WrapText(S, BreakStr: string; BreakChars: TSysCharSet; MaxCol: integer): string; overload; +function UTF8WrapText(S: string; MaxCol: integer): string; overload; type TEscapeMode = (emPascal, emHexPascal, emHexC, emC, emAsciiControlNames); @@ -169,8 +168,6 @@ type TUTF8TrimFlags = set of TUTF8TrimFlag; function UTF8Trim(const s: string; Flags: TUTF8TrimFlags = []): string; -procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings); - //compare functions function UTF8CompareStr(const S1, S2: string): PtrInt; inline; @@ -501,6 +498,7 @@ end; // Ported from: // http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html // The code uses CPU's native data size. In a 64-bit CPU it means 8 bytes at once. +// The UTF-8 data is assumed to be valid. function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt; const {$ifdef CPU32} @@ -550,9 +548,9 @@ begin Result := ByteCount - Result; end; -function UTF8CodepointToUnicode(p: PChar; out CharLen: integer): Cardinal; -{ if p=nil then CharLen=0 otherwise CharLen>0 - If there is an encoding error the Result is 0 and CharLen=1. +function UTF8CodepointToUnicode(p: PChar; out CodepointLen: integer): Cardinal; +{ if p=nil then CodepointLen=0 otherwise CodepointLen>0 + If there is an encoding error the Result is 0 and CodepointLen=1. Use UTF8FixBroken to fix UTF-8 encoding. It does not check if the codepoint is defined in the Unicode tables. } @@ -561,12 +559,12 @@ begin if ord(p^)<%11000000 then begin // regular single byte character (#0 is a normal char, this is pascal ;) Result:=ord(p^); - CharLen:=1; + CodepointLen:=1; end else if ((ord(p^) and %11100000) = %11000000) then begin // starts with %110 => could be double byte character if (ord(p[1]) and %11000000) = %10000000 then begin - CharLen:=2; + CodepointLen:=2; Result:=((ord(p^) and %00011111) shl 6) or (ord(p[1]) and %00111111); if Result<(1 shl 7) then begin // wrong encoded, could be an XSS attack @@ -574,14 +572,14 @@ begin end; end else begin Result:=ord(p^); - CharLen:=1; + CodepointLen:=1; end; end else if ((ord(p^) and %11110000) = %11100000) then begin // starts with %1110 => could be triple byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) then begin - CharLen:=3; + CodepointLen:=3; Result:=((ord(p^) and %00011111) shl 12) or ((ord(p[1]) and %00111111) shl 6) or (ord(p[2]) and %00111111); @@ -591,7 +589,7 @@ begin end; end else begin Result:=ord(p^); - CharLen:=1; + CodepointLen:=1; end; end else if ((ord(p^) and %11111000) = %11110000) then begin @@ -599,7 +597,7 @@ begin if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) and ((ord(p[3]) and %11000000) = %10000000) then begin - CharLen:=4; + CodepointLen:=4; Result:=((ord(p^) and %00001111) shl 18) or ((ord(p[1]) and %00111111) shl 12) or ((ord(p[2]) and %00111111) shl 6) @@ -610,17 +608,17 @@ begin end; end else begin Result:=ord(p^); - CharLen:=1; + CodepointLen:=1; end; end else begin // invalid character Result:=ord(p^); - CharLen:=1; + CodepointLen:=1; end; end else begin Result:=0; - CharLen:=0; + CodepointLen:=0; end; end; @@ -740,36 +738,36 @@ end; - Returns: True if the character pointed to by Curpos is part of a valid UTF8 codepoint (1 to 4 bytes), otherwise it returns False. } -function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CharLen: Integer): Boolean; +function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CodepointLen: Integer): Boolean; var SavedPos: PChar; begin Result := False; - CharLen := 0; + CodepointLen := 0; if (not (Assigned(AString) and Assigned(CurPos))) or (CurPos < AString) then Exit; SavedPos := CurPos; //Note: UTF8CodepointStrictSize will NOT "look" beyond the terminating #0 of a PChar, so this is safe with AnsiStrings - CharLen := UTF8CodepointStrictSize(CurPos); - if (CharLen > 0) then Exit(True); + CodepointLen := UTF8CodepointStrictSize(CurPos); + if (CodepointLen > 0) then Exit(True); if (CurPos > AString) then begin Dec(CurPos); //-1 //is it second byte of 2..4 byte codepoint? - CharLen := UTF8CodepointStrictSize(CurPos); - if (CharLen > 1) then Exit(True); + CodepointLen := UTF8CodepointStrictSize(CurPos); + if (CodepointLen > 1) then Exit(True); if (CurPos > AString) then begin Dec(CurPos); //-2 //is it third byte of 3..4 byte codepoint? - CharLen := UTF8CodepointStrictSize(CurPos); - if (CharLen > 2) then Exit(True); + CodepointLen := UTF8CodepointStrictSize(CurPos); + if (CodepointLen > 2) then Exit(True); if (CurPos > AString) then begin Dec(CurPos); //-3 //is it fouth byte of 4 byte codepoint? - CharLen := UTF8CodepointStrictSize(CurPos); - if (CharLen = 4) then Exit(True); + CodepointLen := UTF8CodepointStrictSize(CurPos); + if (CodepointLen = 4) then Exit(True); end; end; end; @@ -829,11 +827,11 @@ begin Result := UTF8CodepointStart(UTF8Str, Len, CharIndex); end; -function UTF8CodepointToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt; +function UTF8CodepointToByteIndex(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PtrInt; var p: PChar; begin - p := UTF8CodepointStart(UTF8Str, Len, CharIndex); + p := UTF8CodepointStart(UTF8Str, Len, CodepointIndex); if p = nil then Result := -1 else Result := p - UTF8Str; @@ -3093,22 +3091,6 @@ begin Result := UTF8ReverseString(PChar(AText), length(AText)); end; -function UTF8RPosByReverse(const Substr, Source: string): PtrInt; -var - RevSubstr, RevSource: string; - pRev: PtrInt; -begin - if (Pos(Substr, Source) = 0) then - Result := 0 - else - begin - RevSubstr := UTF8ReverseString(Substr); - RevSource := UTF8ReverseString(Source); - pRev := UTF8Pos(RevSubstr, RevSource); - Result := UTF8Length(Source) -pRev -UTF8Length(Substr) +2; - end; -end; - function UTF8RPos(const Substr, Source: string): PtrInt; var pRev: PtrInt; @@ -3117,16 +3099,14 @@ begin Result := UTF8Length(PChar(Source), pRev); // Length of the leading part. end; -function UTF8WrapText(S, BreakStr :string; BreakChars :TSysCharSet; MaxCol: integer): string; +function UTF8WrapText(S, BreakStr: string; BreakChars: TSysCharSet; MaxCol: integer): string; var - P :PChar; - CharLen :integer; - RightSpace : Integer = 0; - N :integer = 0; - i : Integer; - j : Integer; - Len :integer = 0; - ResultLen, RP :Integer; + P : PChar; + RightSpace : integer = 0; + N : integer = 0; + Len : integer = 0; + i, j : integer; + CharLen, ResultLen, RP : integer; begin Result := ''; if (S = '') or (MaxCol = 0) or (BreakStr = '') or (BreakChars = []) then Exit; @@ -3166,7 +3146,7 @@ begin end; end; -function UTF8WrapText(S :string; MaxCol: integer): string; +function UTF8WrapText(S: string; MaxCol: integer): string; begin Result := UTF8WrapText(S, LineEnding, [' ', '-', #9], MaxCol); end; @@ -3282,16 +3262,6 @@ begin end; end; -procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings); -var - i: Integer; -begin - AnsiList.Clear; - if UTF8List=nil then exit; - for i:=0 to UTF8List.Count-1 do - AnsiList.Add(UTF8ToSys(UTF8List[i])); -end; - {------------------------------------------------------------------------------ Name: UTF8CompareStr Params: S1, S2 - UTF8 encoded strings