From dc90c9173e0cb0f81ae2803b280e5f7bd9de9edd Mon Sep 17 00:00:00 2001 From: juha Date: Tue, 12 Jul 2016 09:55:21 +0000 Subject: [PATCH] LazUtils: New unit LazUnicode for encoding agnostic code. Supports Delphi. LazUTF16 is updated. git-svn-id: trunk@52670 - --- .gitattributes | 1 + components/lazutils/lazunicode.pas | 282 +++++++++++++++++++++++++++++ components/lazutils/lazutf16.pas | 133 +++++++++++++- 3 files changed, 408 insertions(+), 8 deletions(-) create mode 100644 components/lazutils/lazunicode.pas diff --git a/.gitattributes b/.gitattributes index cc7a845f11..ac6a0565a1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3015,6 +3015,7 @@ components/lazutils/lazloggerbase.pas svneol=native#text/pascal components/lazutils/lazloggerdummy.pas svneol=native#text/pascal components/lazutils/lazloggerprofiling.pas svneol=native#text/pascal components/lazutils/lazmethodlist.pas svneol=native#text/pascal +components/lazutils/lazunicode.pas svneol=native#text/plain components/lazutils/lazutf16.pas svneol=native#text/pascal components/lazutils/lazutf8.pas svneol=native#text/pascal components/lazutils/lazutf8classes.pas svneol=native#text/pascal diff --git a/components/lazutils/lazunicode.pas b/components/lazutils/lazunicode.pas new file mode 100644 index 0000000000..492dc72084 --- /dev/null +++ b/components/lazutils/lazunicode.pas @@ -0,0 +1,282 @@ +{ + ***************************************************************************** + This file is part of LazUtils. + + See the file COPYING.modifiedLGPL.txt, included in this distribution, + for details about the license. + ***************************************************************************** + + This unit provides encoding agnostic unicode string manipulation functions and + an enumerator. It works transparently with UTF-8 and UTF-16 encodings, + thus allowing one source code to work between : + 1. Lazarus with its UTF-8 solution. + 2. Future FPC and Lazarus with Delphi compatible UTF-16 solution. + 3. Delphi, where String = UnicodeString. +} +unit LazUnicode; + +{$IFDEF FPC} + {$mode objfpc}{$H+} + {$IF FPC_FULLVERSION < 30000} + {$ERROR needs at least FPC 3.0} + {$ENDIF} +{$ENDIF} + +// For testing the UTF16 version. +{$IF DEFINED(FPC) and DEFINED(UseUTF16)} + {$ModeSwitch UnicodeStrings} // Sets also FPC_UNICODESTRINGS ... + {$DEFINE FPC_UNICODESTRINGS} // but Lazarus editor hi-lighting doesn't know it. +{$ENDIF} + +{$IF DEFINED(FPC_UNICODESTRINGS) or not DEFINED(FPC)} + {$DEFINE ReallyUseUTF16} // FPC with UTF-16 or Delphi +{$ENDIF} + +interface + +uses + Classes, SysUtils + {$IFDEF ReallyUseUTF16} + ,character, LazUTF16 + {$ENDIF} + {$IFDEF FPC} + ,LazUTF8 + {$ENDIF} + ; + + // Helper functions for codepoints. They change behavior depending on ModeSwitch. + function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string; + function CodePointLength(const s: string): NativeInt; + function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt; + function CodeUnitCount(p: PChar): integer; + function IsCombining(const AChar: PChar): Boolean; + + function CodePointToWinCP(const s: string): AnsiString; + function WinCPToCodePoint(const s: AnsiString): string; + +type + // Base class for CodePoint and Character enumerators. + TUnicodeEnumeratorBase = class + private + fCurrent: String; + fCurrentPos, fEndPos: PChar; + fCurrentCodeUnitCount: Integer; + public + constructor Create(const A: String); + property Current: String read fCurrent; + property CurrentCodeUnitCount: Integer read fCurrentCodeUnitCount; + end; + + { TCodePointEnumerator } + + // Traverse Unicode codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch. + TCodePointEnumerator = class(TUnicodeEnumeratorBase) + public + function MoveNext: Boolean; + end; + + { TUnicodeCharacterEnumerator } + + // Traverse Unicode (user perceived) characters, including accented characters + // with combined codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch. + TUnicodeCharacterEnumerator = class(TUnicodeEnumeratorBase) + private + fCurrentCodePointCount: Integer; + public + property CurrentCodePointCount: Integer read fCurrentCodePointCount; + function MoveNext: Boolean; + end; + + {$IFDEF FPC} + // Enumerator for CodePoints is used for for-in loop now. + operator Enumerator(A: String): TCodePointEnumerator; + + // This enumerator combines diacritical marks. Not enabled by default because + // there are many more rules for combining codepoints. + //operator Enumerator(A: String): TUnicodeCharacterEnumerator; + {$ENDIF} + +implementation + +{$IFDEF ReallyUseUTF16} + +function UTF16IsCombining(const AChar: PWideChar): Boolean; +var + ch: WideChar; +begin + ch := AChar[0]; + Result := // Combining Diacritical Marks (belongs to previos char) + ( (ch >= #$300) and (ch <= #$36F) ) or // 0300-036F + ( (ch >= #$610) and (ch <= #$61A) ) or // Arabic 0610..061A + ( (ch >= #$64B) and (ch <= #$65F) ) or // Arabic 064B..065F + ( ch = #$670) or // Arabic 0670 + ( (ch >= #$6D6) and (ch <= #$6DC) ) or // Arabic 06D6..06DC + ( (ch >= #$6DF) and (ch <= #$6E4) ) or // Arabic 06DF..06E4 + ( (ch >= #$6E7) and (ch <= #$6E8) ) or // Arabic 06E7..06E8 + ( (ch >= #$6EA) and (ch <= #$6ED) ) or // Arabic 06EA..06ED + ( (ch >= #$8E4) and (ch <= #$8FE) ) or // Arabic 08E4..08FE + ( (ch >= #$1DC0) and (ch <= #$1DFF) ) or // Combining Diacritical Marks Supplement 1DC0-1DFF + ( (ch >= #$20D0) and (ch <= #$20FF) ) or // Combining Diacritical Marks for Symbols 20D0-20FF + ( (ch >= #$FE20) and (ch <= #$FE2F) ); // Combining half Marks FE20-FE2F +end; + +{$ELSE} + +function UTF8IsCombining(const AChar: PChar): Boolean; +begin + Result := + ( (AChar[0] = #$CC) ) or // Combining Diacritical Marks (belongs to previos char) 0300-036F + ( (AChar[0] = #$CD) and (AChar[1] in [#$80..#$AF]) ) or // Combining Diacritical Marks + ( (AChar[0] = #$D8) and (AChar[1] in [#$90..#$9A]) ) or // Arabic 0610 (d890)..061A (d89a) + ( (AChar[0] = #$D9) and (AChar[1] in [#$8b..#$9f, #$B0]) ) or // Arabic 064B (d98b)..065F (d99f) // 0670 (d9b0) + + ( (AChar[0] = #$DB) and (AChar[1] in [#$96..#$9C, #$9F..#$A4, #$A7..#$A8, #$AA..#$AD]) ) or // Arabic 06D6 (db96).. .. ..06ED (dbad) + ( (AChar[0] = #$E0) and (AChar[1] = #$A3) and (AChar[2] in [#$A4..#$BE]) ) or // Arabic 08E4 (e0a3a4) ..08FE (e0a3be) + + ( (AChar[0] = #$E1) and (AChar[1] = #$B7) ) or // Combining Diacritical Marks Supplement 1DC0-1DFF (e1b780) + ( (AChar[0] = #$E2) and (AChar[1] = #$83) and (AChar[2] in [#$90..#$FF]) ) or // Combining Diacritical Marks for Symbols 20D0-20FF + ( (AChar[0] = #$EF) and (AChar[1] = #$B8) and (AChar[2] in [#$A0..#$AF]) ); // Combining half Marks FE20-FE2F +end; + +{$ENDIF} + +//--- + +function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string; +begin + {$IFDEF ReallyUseUTF16} + Result := UTF16Copy(s, StartCharIndex, CharCount); + {$ELSE} + Result := UTF8Copy(s, StartCharIndex, CharCount); + {$ENDIF} +end; + +function CodePointLength(const s: string): NativeInt; +begin + {$IFDEF ReallyUseUTF16} + Result := UTF16Length(s); + {$ELSE} + Result := UTF8Length(s); + {$ENDIF} +end; + +function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt; +begin + {$IFDEF ReallyUseUTF16} + Result := UTF16Pos(SearchForText, SearchInText, StartPos); + {$ELSE} + Result := UTF8Pos(SearchForText, SearchInText, StartPos); + {$ENDIF} +end; + +function CodeUnitCount(p: PChar): integer; +begin + {$IFDEF ReallyUseUTF16} + if TCharacter.IsHighSurrogate(p^) then + Result := 2 + else + Result := 1 + {$ELSE} + Result := UTF8CharacterLength(p); + {$ENDIF} +end; + +function IsCombining(const AChar: PChar): Boolean; +// Note: there are many more rules for combining codepoints. +// The diacritical marks hare are only a subset. +begin + {$IFDEF ReallyUseUTF16} + Result := UTF16IsCombining(AChar); + {$ELSE} + Result := UTF8IsCombining(AChar); + {$ENDIF} +end; + +function CodePointToWinCP(const s: string): AnsiString; +begin + {$IFDEF ReallyUseUTF16} + {$IFDEF FPC} + // ToDo: Don't convert through UTF-8. + Result := UTF8ToWinCP(UTF16ToUTF8(s)); + {$ELSE} + Result := s; // s is UnicodeString in Delphi. Conversion may be lossy. + {$ENDIF} + {$ELSE} + Result := UTF8ToWinCP(s); + {$ENDIF} +end; + +function WinCPToCodePoint(const s: AnsiString): string; +begin + {$IFDEF ReallyUseUTF16} + {$IFDEF FPC} + // ToDo: Don't convert through UTF-8. + Result := UTF8ToUTF16(WinCPToUTF8(s)); + {$ELSE} + Result := s; // Result is UnicodeString in Delphi. + {$ENDIF} + {$ELSE} + Result := WinCPToUTF8(s); + {$ENDIF} +end; + +{ TUnicodeEnumeratorBase } + +constructor TUnicodeEnumeratorBase.Create(const A: String); +begin + fCurrentPos := PChar(A); // Note: if A='' then PChar(A) returns a pointer to a #0 string + fEndPos := fCurrentPos + length(A); +end; + +{ TCodePointEnumerator } + +function TCodePointEnumerator.MoveNext: Boolean; +begin + if fCurrentPos < fEndPos then + begin + fCurrentCodeUnitCount := CodeUnitCount(fCurrentPos); + SetLength(fCurrent, fCurrentCodeUnitCount); + Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char)); + inc(fCurrentPos, fCurrentCodeUnitCount); + Result := true; + end else + Result := false; +end; + +{ TUnicodeCharacterEnumerator } + +function TUnicodeCharacterEnumerator.MoveNext: Boolean; +var + NextCP: PChar; + NextCUCount: Integer; +begin + if fCurrentPos < fEndPos then + begin + fCurrentCodePointCount := 0; + NextCP := fCurrentPos; + repeat + NextCUCount := CodeUnitCount(NextCP); // Prepare for combining diacritical marks. + Inc(NextCP, NextCUCount); // Prepare for combining diacritical marks. + Inc(fCurrentCodePointCount); + until not IsCombining(NextCP); + fCurrentCodeUnitCount := NextCP - fCurrentPos; // Pointer arithmetics. + SetLength(fCurrent, fCurrentCodeUnitCount); + Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char)); + inc(fCurrentPos, fCurrentCodeUnitCount); + Result := true; + end else + Result := false; +end; + +//--- +// Enumerator +//--- +{$IFDEF FPC} +operator Enumerator(A: String): TCodePointEnumerator; +begin + Result := TCodePointEnumerator.Create(A); +end; +{$ENDIF} + +end. + diff --git a/components/lazutils/lazutf16.pas b/components/lazutils/lazutf16.pas index 3fc1e8d90a..a9aa56fb1f 100644 --- a/components/lazutils/lazutf16.pas +++ b/components/lazutils/lazutf16.pas @@ -17,24 +17,40 @@ } unit LazUTF16; -{$mode objfpc}{$H+} +{$IFDEF FPC} + {$mode objfpc}{$H+} +{$ENDIF} interface uses - Classes, SysUtils, lazutf8; + Classes, SysUtils +{$IFDEF FPC} + ,lazutf8 +{$ENDIF} + ; + +{$IFnDEF FPC} +type + PtrInt = NativeInt; +{$ENDIF} function UTF16CharacterLength(p: PWideChar): integer; -function UTF16Length(const s: widestring): PtrInt; -function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt; +function UTF16Length(const s: widestring): PtrInt; overload; +function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt; overload; +function UTF16Copy(const s: UnicodeString; StartCharIndex, CharCount: PtrInt): Unicodestring; +function UTF16CharStart(P: PWideChar; Len, CharIndex: PtrInt): PWideChar; +function UTF16Pos(const SearchForText, SearchInText: UnicodeString; StartPos: PtrInt = 1): PtrInt; function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal; function UnicodeToUTF16(u: cardinal): widestring; function IsUTF16CharValid(AChar, ANextChar: WideChar): Boolean; function IsUTF16StringValid(AWideStr: widestring): Boolean; -function Utf16StringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString; +function Utf16StringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString; function UnicodeLowercase(u: cardinal): cardinal; +{$IFDEF FPC} function UTF8LowerCaseViaTables(const s: string): string; +{$ENDIF} implementation @@ -70,6 +86,107 @@ begin end; end; +function UTF16Copy(const s: UnicodeString; StartCharIndex, CharCount: PtrInt): Unicodestring; +// returns substring +var + StartPos: PWideChar; + EndPos: PWideChar; + MaxBytes: PtrInt; +begin + StartPos:=UTF16CharStart(PWideChar(s),length(s),StartCharIndex-1); + if StartPos=nil then + Result:='' + else begin + MaxBytes:=PtrInt(PWideChar(s)+length(s)-StartPos); + EndPos:=UTF16CharStart(StartPos,MaxBytes,CharCount); + if EndPos=nil then + Result:=copy(s,StartPos-PWideChar(s)+1,MaxBytes) + else + Result:=copy(s,StartPos-PWideChar(s)+1,EndPos-StartPos); + end; +end; + +function UTF16CharStart(P: PWideChar; Len, CharIndex: PtrInt): PWideChar; +// Len is the length in words of P. +// CharIndex is the position of the desired UnicodeChar (starting at 0). +var + CharLen: LongInt; +begin + Result:=P; + if Result=nil then Exit; + while (CharIndex>0) and (Len>0) do + begin + CharLen:=UTF16CharacterLength(Result); + dec(Len,CharLen); + dec(CharIndex); + inc(Result,CharLen); + end; + if (CharIndex<>0) or (Len<0) then + Result:=nil; +end; + +function IndexOfWideChar(Const Buf: PWideChar; Len: PtrInt; b: WideChar): PtrInt; +begin + for Result:=0 to len-1 do + if buf[Result]=b then + Exit; + Result:=-1; +end; + +// Helper used by UTF16Pos +function UTF16PosP(SearchForText: PWideChar; SearchForTextLen: PtrInt; + SearchInText: PWideChar; SearchInTextLen: PtrInt): PWideChar; +// returns the position where SearchInText starts in SearchForText +// returns nil if not found +var + p: PtrInt; +begin + Result:=nil; + if (SearchForText=nil) or (SearchForTextLen=0) or (SearchInText=nil) then + exit; + while SearchInTextLen>0 do begin + p:=IndexOfWideChar(SearchInText,SearchInTextLen,SearchForText^); + if p<0 then exit; + inc(SearchInText, p); + dec(SearchInTextLen, p); + if SearchInTextLen0 then + Result:=UTF16Length(PWideChar(SearchInText),i-1)+1; + end + else if StartPos>1 then + begin + // skip + StartPosP:=UTF16CharStart(PWideChar(SearchInText),Length(SearchInText),StartPos-1); + if StartPosP=nil then exit; + // search + p:=UTF16PosP(PWideChar(SearchForText),length(SearchForText), + StartPosP,length(SearchInText)+PWideChar(SearchInText)-StartPosP); + // get UTF-8 position + if p=nil then exit; + Result:=StartPos+UTF16Length(StartPosP,p-StartPosP); + end; +end; + function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal; var w1: cardinal; @@ -138,7 +255,7 @@ begin end; end; -//Same as SysUtil.StringReplace but for WideStrings/UncodeStrings, since it's not available in fpc yet +//Same as SysUtil.StringReplace but for WideStrings/UnicodeStrings, since it's not available in fpc yet function Utf16StringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString; var Srch, OldP, RemS: WideString; // Srch and OldP can contain WideUpperCase versions of S,OldPattern @@ -897,8 +1014,7 @@ begin end; end; - - +{$IFDEF FPC} function UTF8LowercaseDynLength(const s: string): string; var Buf: shortstring; @@ -1014,6 +1130,7 @@ begin end; until false; end; +{$ENDIF} initialization InitUnicodeTables;