LazUtils: New unit LazUnicode for encoding agnostic code. Supports Delphi. LazUTF16 is updated.

git-svn-id: trunk@52670 -
2025-11-26 11:27:20 +01:00 · 2016-07-12 09:55:21 +00:00 · 2016-07-12 09:55:21 +00:00 · dc90c9173e
commit dc90c9173e
parent 503a8f7442
3 changed files with 408 additions and 8 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -3015,6 +3015,7 @@ components/lazutils/lazloggerbase.pas svneol=native#text/pascal
 components/lazutils/lazloggerdummy.pas svneol=native#text/pascal
 components/lazutils/lazloggerprofiling.pas svneol=native#text/pascal
 components/lazutils/lazmethodlist.pas svneol=native#text/pascal
 components/lazutils/lazunicode.pas svneol=native#text/plain
 components/lazutils/lazutf16.pas svneol=native#text/pascal
 components/lazutils/lazutf8.pas svneol=native#text/pascal
 components/lazutils/lazutf8classes.pas svneol=native#text/pascal
--- a/components/lazutils/lazunicode.pas
+++ b/components/lazutils/lazunicode.pas
@ -0,0 +1,282 @@
 {
 *****************************************************************************
  This file is part of LazUtils.
  See the file COPYING.modifiedLGPL.txt, included in this distribution,
  for details about the license.
 *****************************************************************************
  This unit provides encoding agnostic unicode string manipulation functions and
  an enumerator. It works transparently with UTF-8 and UTF-16 encodings,
  thus allowing one source code to work between :
  1. Lazarus with its UTF-8 solution.
  2. Future FPC and Lazarus with Delphi compatible UTF-16 solution.
  3. Delphi, where String = UnicodeString.
 }
 unit LazUnicode;
 {$IFDEF FPC}
 {$mode objfpc}{$H+}
 {$IF FPC_FULLVERSION < 30000}
  {$ERROR needs at least FPC 3.0}
 {$ENDIF}
 {$ENDIF}
 // For testing the UTF16 version.
 {$IF DEFINED(FPC) and DEFINED(UseUTF16)}
 {$ModeSwitch UnicodeStrings}   // Sets also FPC_UNICODESTRINGS ...
 {$DEFINE FPC_UNICODESTRINGS}   // but Lazarus editor hi-lighting doesn't know it.
 {$ENDIF}
 {$IF DEFINED(FPC_UNICODESTRINGS) or not DEFINED(FPC)}
 {$DEFINE ReallyUseUTF16}       // FPC with UTF-16 or Delphi
 {$ENDIF}
 interface
 uses
  Classes, SysUtils
  {$IFDEF ReallyUseUTF16}
   ,character, LazUTF16
  {$ENDIF}
  {$IFDEF FPC}
   ,LazUTF8
  {$ENDIF}
  ;
  // Helper functions for codepoints. They change behavior depending on ModeSwitch.
  function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string;
  function CodePointLength(const s: string): NativeInt;
  function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt;
  function CodeUnitCount(p: PChar): integer;
  function IsCombining(const AChar: PChar): Boolean;
  function CodePointToWinCP(const s: string): AnsiString;
  function WinCPToCodePoint(const s: AnsiString): string;
 type
  // Base class for CodePoint and Character enumerators.
  TUnicodeEnumeratorBase = class
  private
    fCurrent: String;
    fCurrentPos, fEndPos: PChar;
    fCurrentCodeUnitCount: Integer;
  public
    constructor Create(const A: String);
    property Current: String read fCurrent;
    property CurrentCodeUnitCount: Integer read fCurrentCodeUnitCount;
  end;
  { TCodePointEnumerator }
  // Traverse Unicode codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
  TCodePointEnumerator = class(TUnicodeEnumeratorBase)
  public
    function MoveNext: Boolean;
  end;
  { TUnicodeCharacterEnumerator }
  // Traverse Unicode (user perceived) characters, including accented characters
  //  with combined codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
  TUnicodeCharacterEnumerator = class(TUnicodeEnumeratorBase)
  private
    fCurrentCodePointCount: Integer;
  public
    property CurrentCodePointCount: Integer read fCurrentCodePointCount;
    function MoveNext: Boolean;
  end;
  {$IFDEF FPC}
  // Enumerator for CodePoints is used for for-in loop now.
  operator Enumerator(A: String): TCodePointEnumerator;
  // This enumerator combines diacritical marks. Not enabled by default because
  //  there are many more rules for combining codepoints.
  //operator Enumerator(A: String): TUnicodeCharacterEnumerator;
  {$ENDIF}
 implementation
 {$IFDEF ReallyUseUTF16}
 function UTF16IsCombining(const AChar: PWideChar): Boolean;
 var
  ch: WideChar;
 begin
  ch := AChar[0];
  Result :=           // Combining Diacritical Marks (belongs to previos char)
    ( (ch >= #$300) and (ch <= #$36F) ) or    // 0300-036F
    ( (ch >= #$610) and (ch <= #$61A) ) or    // Arabic 0610..061A
    ( (ch >= #$64B) and (ch <= #$65F) ) or    // Arabic 064B..065F
    (  ch = #$670) or                         // Arabic 0670
    ( (ch >= #$6D6) and (ch <= #$6DC) ) or    // Arabic 06D6..06DC
    ( (ch >= #$6DF) and (ch <= #$6E4) ) or    // Arabic 06DF..06E4
    ( (ch >= #$6E7) and (ch <= #$6E8) ) or    // Arabic 06E7..06E8
    ( (ch >= #$6EA) and (ch <= #$6ED) ) or    // Arabic 06EA..06ED
    ( (ch >= #$8E4) and (ch <= #$8FE) ) or    // Arabic 08E4..08FE
    ( (ch >= #$1DC0) and (ch <= #$1DFF) ) or  // Combining Diacritical Marks Supplement 1DC0-1DFF
    ( (ch >= #$20D0) and (ch <= #$20FF) ) or  // Combining Diacritical Marks for Symbols 20D0-20FF
    ( (ch >= #$FE20) and (ch <= #$FE2F) );    // Combining half Marks FE20-FE2F
 end;
 {$ELSE}
 function UTF8IsCombining(const AChar: PChar): Boolean;
 begin
  Result :=
   ( (AChar[0] = #$CC) ) or                   // Combining Diacritical Marks (belongs to previos char) 0300-036F
   ( (AChar[0] = #$CD) and (AChar[1] in [#$80..#$AF]) ) or                        // Combining Diacritical Marks
   ( (AChar[0] = #$D8) and (AChar[1] in [#$90..#$9A]) ) or                        // Arabic 0610 (d890)..061A (d89a)
   ( (AChar[0] = #$D9) and (AChar[1] in [#$8b..#$9f, #$B0]) ) or                  // Arabic 064B (d98b)..065F (d99f) // 0670 (d9b0)
   ( (AChar[0] = #$DB) and (AChar[1] in [#$96..#$9C, #$9F..#$A4, #$A7..#$A8, #$AA..#$AD]) ) or // Arabic 06D6 (db96)..  .. ..06ED (dbad)
   ( (AChar[0] = #$E0) and (AChar[1] = #$A3) and (AChar[2] in [#$A4..#$BE]) ) or  // Arabic 08E4 (e0a3a4) ..08FE (e0a3be)
   ( (AChar[0] = #$E1) and (AChar[1] = #$B7) ) or                                 // Combining Diacritical Marks Supplement 1DC0-1DFF (e1b780)
   ( (AChar[0] = #$E2) and (AChar[1] = #$83) and (AChar[2] in [#$90..#$FF]) ) or  // Combining Diacritical Marks for Symbols 20D0-20FF
   ( (AChar[0] = #$EF) and (AChar[1] = #$B8) and (AChar[2] in [#$A0..#$AF]) );    // Combining half Marks FE20-FE2F
 end;
 {$ENDIF}
 //---
 function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string;
 begin
  {$IFDEF ReallyUseUTF16}
  Result := UTF16Copy(s, StartCharIndex, CharCount);
  {$ELSE}
  Result := UTF8Copy(s, StartCharIndex, CharCount);
  {$ENDIF}
 end;
 function CodePointLength(const s: string): NativeInt;
 begin
  {$IFDEF ReallyUseUTF16}
  Result := UTF16Length(s);
  {$ELSE}
  Result := UTF8Length(s);
  {$ENDIF}
 end;
 function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt;
 begin
  {$IFDEF ReallyUseUTF16}
  Result := UTF16Pos(SearchForText, SearchInText, StartPos);
  {$ELSE}
  Result := UTF8Pos(SearchForText, SearchInText, StartPos);
  {$ENDIF}
 end;
 function CodeUnitCount(p: PChar): integer;
 begin
  {$IFDEF ReallyUseUTF16}
  if TCharacter.IsHighSurrogate(p^) then
    Result := 2
  else
    Result := 1
  {$ELSE}
  Result := UTF8CharacterLength(p);
  {$ENDIF}
 end;
 function IsCombining(const AChar: PChar): Boolean;
 // Note: there are many more rules for combining codepoints.
 //  The diacritical marks hare are only a subset.
 begin
  {$IFDEF ReallyUseUTF16}
  Result := UTF16IsCombining(AChar);
  {$ELSE}
  Result := UTF8IsCombining(AChar);
  {$ENDIF}
 end;
 function CodePointToWinCP(const s: string): AnsiString;
 begin
  {$IFDEF ReallyUseUTF16}
  {$IFDEF FPC}
   // ToDo: Don't convert through UTF-8.
   Result := UTF8ToWinCP(UTF16ToUTF8(s));
   {$ELSE}
   Result := s; // s is UnicodeString in Delphi. Conversion may be lossy.
   {$ENDIF}
  {$ELSE}
   Result := UTF8ToWinCP(s);
  {$ENDIF}
 end;
 function WinCPToCodePoint(const s: AnsiString): string;
 begin
  {$IFDEF ReallyUseUTF16}
  {$IFDEF FPC}
   // ToDo: Don't convert through UTF-8.
   Result := UTF8ToUTF16(WinCPToUTF8(s));
   {$ELSE}
   Result := s; // Result is UnicodeString in Delphi.
   {$ENDIF}
  {$ELSE}
   Result := WinCPToUTF8(s);
  {$ENDIF}
 end;
 { TUnicodeEnumeratorBase }
 constructor TUnicodeEnumeratorBase.Create(const A: String);
 begin
  fCurrentPos := PChar(A); // Note: if A='' then PChar(A) returns a pointer to a #0 string
  fEndPos := fCurrentPos + length(A);
 end;
 { TCodePointEnumerator }
 function TCodePointEnumerator.MoveNext: Boolean;
 begin
  if fCurrentPos < fEndPos then
  begin
    fCurrentCodeUnitCount := CodeUnitCount(fCurrentPos);
    SetLength(fCurrent, fCurrentCodeUnitCount);
    Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char));
    inc(fCurrentPos, fCurrentCodeUnitCount);
    Result := true;
  end else
    Result := false;
 end;
 { TUnicodeCharacterEnumerator }
 function TUnicodeCharacterEnumerator.MoveNext: Boolean;
 var
  NextCP: PChar;
  NextCUCount: Integer;
 begin
  if fCurrentPos < fEndPos then
  begin
    fCurrentCodePointCount := 0;
    NextCP := fCurrentPos;
    repeat
      NextCUCount := CodeUnitCount(NextCP); // Prepare for combining diacritical marks.
      Inc(NextCP, NextCUCount); // Prepare for combining diacritical marks.
      Inc(fCurrentCodePointCount);
    until not IsCombining(NextCP);
    fCurrentCodeUnitCount := NextCP - fCurrentPos;  // Pointer arithmetics.
    SetLength(fCurrent, fCurrentCodeUnitCount);
    Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char));
    inc(fCurrentPos, fCurrentCodeUnitCount);
    Result := true;
  end else
    Result := false;
 end;
 //---
 // Enumerator
 //---
 {$IFDEF FPC}
 operator Enumerator(A: String): TCodePointEnumerator;
 begin
  Result := TCodePointEnumerator.Create(A);
 end;
 {$ENDIF}
 end.
--- a/components/lazutils/lazutf16.pas
+++ b/components/lazutils/lazutf16.pas
@ -17,16 +17,30 @@
 }
 unit LazUTF16;
-{$mode objfpc}{$H+}
+{$IFDEF FPC}
 {$mode objfpc}{$H+}
 {$ENDIF}
 interface
 uses
-  Classes, SysUtils, lazutf8;
+  Classes, SysUtils
 {$IFDEF FPC}
  ,lazutf8
 {$ENDIF}
  ;
 {$IFnDEF FPC}
 type
  PtrInt = NativeInt;
 {$ENDIF}
 function UTF16CharacterLength(p: PWideChar): integer;
-function UTF16Length(const s: widestring): PtrInt;
+function UTF16Length(const s: widestring): PtrInt; overload;
-function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
+function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt; overload;
 function UTF16Copy(const s: UnicodeString; StartCharIndex, CharCount: PtrInt): Unicodestring;
 function UTF16CharStart(P: PWideChar; Len, CharIndex: PtrInt): PWideChar;
 function UTF16Pos(const SearchForText, SearchInText: UnicodeString; StartPos: PtrInt = 1): PtrInt;
 function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
 function UnicodeToUTF16(u: cardinal): widestring;
 function IsUTF16CharValid(AChar, ANextChar: WideChar): Boolean;
@ -34,7 +48,9 @@ function IsUTF16StringValid(AWideStr: widestring): Boolean;
 function Utf16StringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString;
 function UnicodeLowercase(u: cardinal): cardinal;
 {$IFDEF FPC}
 function UTF8LowerCaseViaTables(const s: string): string;
 {$ENDIF}
 implementation
@ -70,6 +86,107 @@ begin
  end;
 end;
 function UTF16Copy(const s: UnicodeString; StartCharIndex, CharCount: PtrInt): Unicodestring;
 // returns substring
 var
  StartPos: PWideChar;
  EndPos: PWideChar;
  MaxBytes: PtrInt;
 begin
  StartPos:=UTF16CharStart(PWideChar(s),length(s),StartCharIndex-1);
  if StartPos=nil then
    Result:=''
  else begin
    MaxBytes:=PtrInt(PWideChar(s)+length(s)-StartPos);
    EndPos:=UTF16CharStart(StartPos,MaxBytes,CharCount);
    if EndPos=nil then
      Result:=copy(s,StartPos-PWideChar(s)+1,MaxBytes)
    else
      Result:=copy(s,StartPos-PWideChar(s)+1,EndPos-StartPos);
  end;
 end;
 function UTF16CharStart(P: PWideChar; Len, CharIndex: PtrInt): PWideChar;
 // Len is the length in words of P.
 // CharIndex is the position of the desired UnicodeChar (starting at 0).
 var
  CharLen: LongInt;
 begin
  Result:=P;
  if Result=nil then Exit;
  while (CharIndex>0) and (Len>0) do
  begin
    CharLen:=UTF16CharacterLength(Result);
    dec(Len,CharLen);
    dec(CharIndex);
    inc(Result,CharLen);
  end;
  if (CharIndex<>0) or (Len<0) then
    Result:=nil;
 end;
 function IndexOfWideChar(Const Buf: PWideChar; Len: PtrInt; b: WideChar): PtrInt;
 begin
  for Result:=0 to len-1 do
    if buf[Result]=b then
      Exit;
  Result:=-1;
 end;
 // Helper used by UTF16Pos
 function UTF16PosP(SearchForText: PWideChar; SearchForTextLen: PtrInt;
  SearchInText: PWideChar; SearchInTextLen: PtrInt): PWideChar;
 // returns the position where SearchInText starts in SearchForText
 // returns nil if not found
 var
  p: PtrInt;
 begin
  Result:=nil;
  if (SearchForText=nil) or (SearchForTextLen=0) or (SearchInText=nil) then
    exit;
  while SearchInTextLen>0 do begin
    p:=IndexOfWideChar(SearchInText,SearchInTextLen,SearchForText^);
    if p<0 then exit;
    inc(SearchInText, p);
    dec(SearchInTextLen, p);
    if SearchInTextLen<SearchForTextLen then exit;
    if CompareMem(SearchInText,SearchForText,SearchForTextLen * 2) then
      exit(SearchInText);
    inc(SearchInText);
    dec(SearchInTextLen);
  end;
 end;
 function UTF16Pos(const SearchForText, SearchInText: UnicodeString; StartPos: PtrInt = 1): PtrInt;
 // returns the character index, where the SearchForText starts in SearchInText
 // an optional StartPos can be given (in UTF-16 codepoints, not in word)
 // returns 0 if not found
 var
  i: PtrInt;
  p: PWideChar;
  StartPosP: PWideChar;
 begin
  Result:=0;
  if StartPos=1 then
  begin
    i:=System.Pos(SearchForText,SearchInText);
    if i>0 then
      Result:=UTF16Length(PWideChar(SearchInText),i-1)+1;
  end
  else if StartPos>1 then
  begin
    // skip
    StartPosP:=UTF16CharStart(PWideChar(SearchInText),Length(SearchInText),StartPos-1);
    if StartPosP=nil then exit;
    // search
    p:=UTF16PosP(PWideChar(SearchForText),length(SearchForText),
                StartPosP,length(SearchInText)+PWideChar(SearchInText)-StartPosP);
    // get UTF-8 position
    if p=nil then exit;
    Result:=StartPos+UTF16Length(StartPosP,p-StartPosP);
  end;
 end;
 function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
 var
  w1: cardinal;
@ -138,7 +255,7 @@ begin
  end;
 end;
-//Same as SysUtil.StringReplace but for WideStrings/UncodeStrings, since it's not available in fpc yet
+//Same as SysUtil.StringReplace but for WideStrings/UnicodeStrings, since it's not available in fpc yet
 function Utf16StringReplace(const S, OldPattern, NewPattern: WideString;  Flags: TReplaceFlags): WideString;
 var
  Srch, OldP, RemS: WideString; // Srch and OldP can contain WideUpperCase versions of S,OldPattern
@ -897,8 +1014,7 @@ begin
  end;
 end;
-
+{$IFDEF FPC}
 function UTF8LowercaseDynLength(const s: string): string;
 var
  Buf: shortstring;
@ -1014,6 +1130,7 @@ begin
    end;
  until false;
 end;
 {$ENDIF}
 initialization
  InitUnicodeTables;