{ ***************************************************************************** This file is part of LazUtils. See the file COPYING.modifiedLGPL.txt, included in this distribution, for details about the license. ***************************************************************************** This unit provides encoding agnostic unicode string manipulation functions and an enumerator. It works transparently with UTF-8 and UTF-16 encodings, thus allowing one source code to work between : 1. Lazarus with its UTF-8 solution. 2. Future FPC and Lazarus with Delphi compatible UTF-16 solution. 3. Delphi, where String = UnicodeString. } unit LazUnicode; {$IFDEF FPC} {$mode objfpc}{$H+}{$inline on} {$ENDIF} // For testing the UTF16 version. {$IF DEFINED(FPC) and DEFINED(UseUTF16)} {$ModeSwitch UnicodeStrings} // Sets also FPC_UNICODESTRINGS ... {$ENDIF} {$IF DEFINED(FPC_UNICODESTRINGS) or not DEFINED(FPC)} {$DEFINE ReallyUseUTF16} // FPC with UTF-16 or Delphi {$ENDIF} interface uses Classes, SysUtils {$IFDEF ReallyUseUTF16} ,character, LazUTF16 {$ENDIF} {$IFDEF FPC} ,LazUTF8 {$ENDIF} ; // Helper functions for codepoints. They change behavior depending on ModeSwitch. function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string; inline; function CodePointLength(const s: string): NativeInt; inline; function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt; inline; function CodePointSize(p: PChar): integer; inline; function IsCombining(const AChar: PChar): Boolean; inline; function UnicodeToWinCP(const s: string): AnsiString; function WinCPToUnicode(const s: AnsiString): string; type // Base class for CodePoint and Character enumerators. TUnicodeEnumeratorBase = class private fCurrent: String; fCurrentPos, fEndPos: PChar; fCurrentCodeUnitCount: Integer; public constructor Create(const A: String); property Current: String read fCurrent; property CurrentCodeUnitCount: Integer read fCurrentCodeUnitCount; end; { TCodePointEnumerator } // Traverse Unicode codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch. TCodePointEnumerator = class(TUnicodeEnumeratorBase) public function MoveNext: Boolean; end; { TUnicodeCharacterEnumerator } // Traverse Unicode (user perceived) characters, including accented characters // with combined codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch. TUnicodeCharacterEnumerator = class(TUnicodeEnumeratorBase) private fCurrentCodePointCount: Integer; public property CurrentCodePointCount: Integer read fCurrentCodePointCount; function MoveNext: Boolean; end; {$IFDEF FPC} // Enumerator for CodePoints is used for for-in loop now. operator Enumerator(A: String): TCodePointEnumerator; // This enumerator combines diacritical marks. Not enabled by default because // there are many more rules for combining codepoints. //operator Enumerator(A: String): TUnicodeCharacterEnumerator; {$ENDIF} implementation {$IFDEF ReallyUseUTF16} function UTF16IsCombining(const AChar: PWideChar): Boolean; var ch: WideChar; begin ch := AChar[0]; Result := // Combining Diacritical Marks (belongs to previos char) ( (ch >= #$300) and (ch <= #$36F) ) or // 0300-036F ( (ch >= #$610) and (ch <= #$61A) ) or // Arabic 0610..061A ( (ch >= #$64B) and (ch <= #$65F) ) or // Arabic 064B..065F ( ch = #$670) or // Arabic 0670 ( (ch >= #$6D6) and (ch <= #$6DC) ) or // Arabic 06D6..06DC ( (ch >= #$6DF) and (ch <= #$6E4) ) or // Arabic 06DF..06E4 ( (ch >= #$6E7) and (ch <= #$6E8) ) or // Arabic 06E7..06E8 ( (ch >= #$6EA) and (ch <= #$6ED) ) or // Arabic 06EA..06ED ( (ch >= #$8E4) and (ch <= #$8FE) ) or // Arabic 08E4..08FE ( (ch >= #$1DC0) and (ch <= #$1DFF) ) or // Combining Diacritical Marks Supplement 1DC0-1DFF ( (ch >= #$20D0) and (ch <= #$20FF) ) or // Combining Diacritical Marks for Symbols 20D0-20FF ( (ch >= #$FE20) and (ch <= #$FE2F) ); // Combining half Marks FE20-FE2F end; {$ELSE} function UTF8IsCombining(const AChar: PChar): Boolean; begin Result := ( (AChar[0] = #$CC) ) or // Combining Diacritical Marks (belongs to previos char) 0300-036F ( (AChar[0] = #$CD) and (AChar[1] in [#$80..#$AF]) ) or // Combining Diacritical Marks ( (AChar[0] = #$D8) and (AChar[1] in [#$90..#$9A]) ) or // Arabic 0610 (d890)..061A (d89a) ( (AChar[0] = #$D9) and (AChar[1] in [#$8b..#$9f, #$B0]) ) or // Arabic 064B (d98b)..065F (d99f) // 0670 (d9b0) ( (AChar[0] = #$DB) and (AChar[1] in [#$96..#$9C, #$9F..#$A4, #$A7..#$A8, #$AA..#$AD]) ) or // Arabic 06D6 (db96).. .. ..06ED (dbad) ( (AChar[0] = #$E0) and (AChar[1] = #$A3) and (AChar[2] in [#$A4..#$BE]) ) or // Arabic 08E4 (e0a3a4) ..08FE (e0a3be) ( (AChar[0] = #$E1) and (AChar[1] = #$B7) ) or // Combining Diacritical Marks Supplement 1DC0-1DFF (e1b780) ( (AChar[0] = #$E2) and (AChar[1] = #$83) and (AChar[2] in [#$90..#$FF]) ) or // Combining Diacritical Marks for Symbols 20D0-20FF ( (AChar[0] = #$EF) and (AChar[1] = #$B8) and (AChar[2] in [#$A0..#$AF]) ); // Combining half Marks FE20-FE2F end; {$ENDIF} //--- function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string; // Copy CharCount CodePoints from s, starting from StartCharIndex'th CodePoints. begin {$IFDEF ReallyUseUTF16} Result := UTF16Copy(s, StartCharIndex, CharCount); {$ELSE} Result := UTF8Copy(s, StartCharIndex, CharCount); {$ENDIF} end; function CodePointLength(const s: string): NativeInt; // Number of CodePoints in s. begin {$IFDEF ReallyUseUTF16} Result := UTF16Length(s); {$ELSE} Result := UTF8LengthFast(s); {$ENDIF} end; function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt; // Position of SearchForText in CodePoints. begin {$IFDEF ReallyUseUTF16} Result := UTF16Pos(SearchForText, SearchInText, StartPos); {$ELSE} Result := UTF8Pos(SearchForText, SearchInText, StartPos); {$ENDIF} end; function CodePointSize(p: PChar): integer; // Returns the number of CodeUnits in one CodePoint pointed by p. begin {$IFDEF ReallyUseUTF16} if TCharacter.IsHighSurrogate(p^) then Result := 2 else Result := 1 {$ELSE} Result := UTF8CharacterLengthFast(p); {$ENDIF} end; function IsCombining(const AChar: PChar): Boolean; // Note: there are many more rules for combining codepoints. // The diacritical marks hare are only a subset. begin {$IFDEF ReallyUseUTF16} Result := UTF16IsCombining(AChar); {$ELSE} Result := UTF8IsCombining(AChar); {$ENDIF} end; function UnicodeToWinCP(const s: string): AnsiString; // Convert s to Windows system codepage. The Unicode encoding of s depends on mode. begin {$IFDEF ReallyUseUTF16} {$IFDEF FPC} // ToDo: Don't convert through UTF-8. Result := UTF8ToWinCP(UTF16ToUTF8(s)); {$ELSE} Result := s; // s is UnicodeString in Delphi. Conversion may be lossy. {$ENDIF} {$ELSE} Result := UTF8ToWinCP(s); {$ENDIF} end; function WinCPToUnicode(const s: AnsiString): string; // Convert Windows system codepage s to Unicode (encoding depends on mode). begin {$IFDEF ReallyUseUTF16} {$IFDEF FPC} // ToDo: Don't convert through UTF-8. Result := UTF8ToUTF16(WinCPToUTF8(s)); {$ELSE} Result := s; // Result is UnicodeString in Delphi. {$ENDIF} {$ELSE} Result := WinCPToUTF8(s); {$ENDIF} end; { TUnicodeEnumeratorBase } constructor TUnicodeEnumeratorBase.Create(const A: String); begin fCurrentPos := PChar(A); // Note: if A='' then PChar(A) returns a pointer to a #0 string fEndPos := fCurrentPos + length(A); end; { TCodePointEnumerator } function TCodePointEnumerator.MoveNext: Boolean; begin if fCurrentPos < fEndPos then begin fCurrentCodeUnitCount := CodePointSize(fCurrentPos); SetLength(fCurrent, fCurrentCodeUnitCount); Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char)); inc(fCurrentPos, fCurrentCodeUnitCount); Result := true; end else Result := false; end; { TUnicodeCharacterEnumerator } function TUnicodeCharacterEnumerator.MoveNext: Boolean; var NextCP: PChar; NextCUCount: Integer; begin if fCurrentPos < fEndPos then begin fCurrentCodePointCount := 0; NextCP := fCurrentPos; repeat NextCUCount := CodePointSize(NextCP); // Prepare for combining diacritical marks. Inc(NextCP, NextCUCount); // Prepare for combining diacritical marks. Inc(fCurrentCodePointCount); until not IsCombining(NextCP); fCurrentCodeUnitCount := NextCP - fCurrentPos; // Pointer arithmetics. SetLength(fCurrent, fCurrentCodeUnitCount); Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char)); inc(fCurrentPos, fCurrentCodeUnitCount); Result := true; end else Result := false; end; //--- // Enumerator //--- {$IFDEF FPC} operator Enumerator(A: String): TCodePointEnumerator; begin Result := TCodePointEnumerator.Create(A); end; {$ENDIF} end.