LazUtils: New unit LazUnicode for encoding agnostic code. Supports Delphi. LazUTF16 is updated.

git-svn-id: trunk@52670 -
2025-11-20 20:59:39 +01:00 · 2016-07-12 09:55:21 +00:00 · 2016-07-12 09:55:21 +00:00 · dc90c9173e
commit dc90c9173e
parent 503a8f7442
3 changed files with 408 additions and 8 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -3015,6 +3015,7 @@ components/lazutils/lazloggerbase.pas svneol=native#text/pascal
 components/lazutils/lazloggerdummy.pas svneol=native#text/pascal
 components/lazutils/lazloggerprofiling.pas svneol=native#text/pascal
 components/lazutils/lazmethodlist.pas svneol=native#text/pascal
+components/lazutils/lazunicode.pas svneol=native#text/plain
 components/lazutils/lazutf16.pas svneol=native#text/pascal
 components/lazutils/lazutf8.pas svneol=native#text/pascal
 components/lazutils/lazutf8classes.pas svneol=native#text/pascal
--- a/components/lazutils/lazunicode.pas
+++ b/components/lazutils/lazunicode.pas
@ -0,0 +1,282 @@
+{
+ *****************************************************************************
+  This file is part of LazUtils.
+
+  See the file COPYING.modifiedLGPL.txt, included in this distribution,
+  for details about the license.
+ *****************************************************************************
+
+  This unit provides encoding agnostic unicode string manipulation functions and
+  an enumerator. It works transparently with UTF-8 and UTF-16 encodings,
+  thus allowing one source code to work between :
+  1. Lazarus with its UTF-8 solution.
+  2. Future FPC and Lazarus with Delphi compatible UTF-16 solution.
+  3. Delphi, where String = UnicodeString.
+}
+unit LazUnicode;
+
+{$IFDEF FPC}
+ {$mode objfpc}{$H+}
+ {$IF FPC_FULLVERSION < 30000}
+  {$ERROR needs at least FPC 3.0}
+ {$ENDIF}
+{$ENDIF}
+
+// For testing the UTF16 version.
+{$IF DEFINED(FPC) and DEFINED(UseUTF16)}
+ {$ModeSwitch UnicodeStrings}   // Sets also FPC_UNICODESTRINGS ...
+ {$DEFINE FPC_UNICODESTRINGS}   // but Lazarus editor hi-lighting doesn't know it.
+{$ENDIF}
+
+{$IF DEFINED(FPC_UNICODESTRINGS) or not DEFINED(FPC)}
+ {$DEFINE ReallyUseUTF16}       // FPC with UTF-16 or Delphi
+{$ENDIF}
+
+interface
+
+uses
+  Classes, SysUtils
+  {$IFDEF ReallyUseUTF16}
+   ,character, LazUTF16
+  {$ENDIF}
+  {$IFDEF FPC}
+   ,LazUTF8
+  {$ENDIF}
+  ;
+
+  // Helper functions for codepoints. They change behavior depending on ModeSwitch.
+  function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string;
+  function CodePointLength(const s: string): NativeInt;
+  function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt;
+  function CodeUnitCount(p: PChar): integer;
+  function IsCombining(const AChar: PChar): Boolean;
+
+  function CodePointToWinCP(const s: string): AnsiString;
+  function WinCPToCodePoint(const s: AnsiString): string;
+
+type
+  // Base class for CodePoint and Character enumerators.
+  TUnicodeEnumeratorBase = class
+  private
+    fCurrent: String;
+    fCurrentPos, fEndPos: PChar;
+    fCurrentCodeUnitCount: Integer;
+  public
+    constructor Create(const A: String);
+    property Current: String read fCurrent;
+    property CurrentCodeUnitCount: Integer read fCurrentCodeUnitCount;
+  end;
+
+  { TCodePointEnumerator }
+
+  // Traverse Unicode codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
+  TCodePointEnumerator = class(TUnicodeEnumeratorBase)
+  public
+    function MoveNext: Boolean;
+  end;
+
+  { TUnicodeCharacterEnumerator }
+
+  // Traverse Unicode (user perceived) characters, including accented characters
+  //  with combined codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
+  TUnicodeCharacterEnumerator = class(TUnicodeEnumeratorBase)
+  private
+    fCurrentCodePointCount: Integer;
+  public
+    property CurrentCodePointCount: Integer read fCurrentCodePointCount;
+    function MoveNext: Boolean;
+  end;
+
+  {$IFDEF FPC}
+  // Enumerator for CodePoints is used for for-in loop now.
+  operator Enumerator(A: String): TCodePointEnumerator;
+
+  // This enumerator combines diacritical marks. Not enabled by default because
+  //  there are many more rules for combining codepoints.
+  //operator Enumerator(A: String): TUnicodeCharacterEnumerator;
+  {$ENDIF}
+
+implementation
+
+{$IFDEF ReallyUseUTF16}
+
+function UTF16IsCombining(const AChar: PWideChar): Boolean;
+var
+  ch: WideChar;
+begin
+  ch := AChar[0];
+  Result :=           // Combining Diacritical Marks (belongs to previos char)
+    ( (ch >= #$300) and (ch <= #$36F) ) or    // 0300-036F
+    ( (ch >= #$610) and (ch <= #$61A) ) or    // Arabic 0610..061A
+    ( (ch >= #$64B) and (ch <= #$65F) ) or    // Arabic 064B..065F
+    (  ch = #$670) or                         // Arabic 0670
+    ( (ch >= #$6D6) and (ch <= #$6DC) ) or    // Arabic 06D6..06DC
+    ( (ch >= #$6DF) and (ch <= #$6E4) ) or    // Arabic 06DF..06E4
+    ( (ch >= #$6E7) and (ch <= #$6E8) ) or    // Arabic 06E7..06E8
+    ( (ch >= #$6EA) and (ch <= #$6ED) ) or    // Arabic 06EA..06ED
+    ( (ch >= #$8E4) and (ch <= #$8FE) ) or    // Arabic 08E4..08FE
+    ( (ch >= #$1DC0) and (ch <= #$1DFF) ) or  // Combining Diacritical Marks Supplement 1DC0-1DFF
+    ( (ch >= #$20D0) and (ch <= #$20FF) ) or  // Combining Diacritical Marks for Symbols 20D0-20FF
+    ( (ch >= #$FE20) and (ch <= #$FE2F) );    // Combining half Marks FE20-FE2F
+end;
+
+{$ELSE}
+
+function UTF8IsCombining(const AChar: PChar): Boolean;
+begin
+  Result :=
+   ( (AChar[0] = #$CC) ) or                   // Combining Diacritical Marks (belongs to previos char) 0300-036F
+   ( (AChar[0] = #$CD) and (AChar[1] in [#$80..#$AF]) ) or                        // Combining Diacritical Marks
+   ( (AChar[0] = #$D8) and (AChar[1] in [#$90..#$9A]) ) or                        // Arabic 0610 (d890)..061A (d89a)
+   ( (AChar[0] = #$D9) and (AChar[1] in [#$8b..#$9f, #$B0]) ) or                  // Arabic 064B (d98b)..065F (d99f) // 0670 (d9b0)
+
+   ( (AChar[0] = #$DB) and (AChar[1] in [#$96..#$9C, #$9F..#$A4, #$A7..#$A8, #$AA..#$AD]) ) or // Arabic 06D6 (db96)..  .. ..06ED (dbad)
+   ( (AChar[0] = #$E0) and (AChar[1] = #$A3) and (AChar[2] in [#$A4..#$BE]) ) or  // Arabic 08E4 (e0a3a4) ..08FE (e0a3be)
+
+   ( (AChar[0] = #$E1) and (AChar[1] = #$B7) ) or                                 // Combining Diacritical Marks Supplement 1DC0-1DFF (e1b780)
+   ( (AChar[0] = #$E2) and (AChar[1] = #$83) and (AChar[2] in [#$90..#$FF]) ) or  // Combining Diacritical Marks for Symbols 20D0-20FF
+   ( (AChar[0] = #$EF) and (AChar[1] = #$B8) and (AChar[2] in [#$A0..#$AF]) );    // Combining half Marks FE20-FE2F
+end;
+
+{$ENDIF}
+
+//---
+
+function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string;
+begin
+  {$IFDEF ReallyUseUTF16}
+  Result := UTF16Copy(s, StartCharIndex, CharCount);
+  {$ELSE}
+  Result := UTF8Copy(s, StartCharIndex, CharCount);
+  {$ENDIF}
+end;
+
+function CodePointLength(const s: string): NativeInt;
+begin
+  {$IFDEF ReallyUseUTF16}
+  Result := UTF16Length(s);
+  {$ELSE}
+  Result := UTF8Length(s);
+  {$ENDIF}
+end;
+
+function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt;
+begin
+  {$IFDEF ReallyUseUTF16}
+  Result := UTF16Pos(SearchForText, SearchInText, StartPos);
+  {$ELSE}
+  Result := UTF8Pos(SearchForText, SearchInText, StartPos);
+  {$ENDIF}
+end;
+
+function CodeUnitCount(p: PChar): integer;
+begin
+  {$IFDEF ReallyUseUTF16}
+  if TCharacter.IsHighSurrogate(p^) then
+    Result := 2
+  else
+    Result := 1
+  {$ELSE}
+  Result := UTF8CharacterLength(p);
+  {$ENDIF}
+end;
+
+function IsCombining(const AChar: PChar): Boolean;
+// Note: there are many more rules for combining codepoints.
+//  The diacritical marks hare are only a subset.
+begin
+  {$IFDEF ReallyUseUTF16}
+  Result := UTF16IsCombining(AChar);
+  {$ELSE}
+  Result := UTF8IsCombining(AChar);
+  {$ENDIF}
+end;
+
+function CodePointToWinCP(const s: string): AnsiString;
+begin
+  {$IFDEF ReallyUseUTF16}
+  {$IFDEF FPC}
+   // ToDo: Don't convert through UTF-8.
+   Result := UTF8ToWinCP(UTF16ToUTF8(s));
+   {$ELSE}
+   Result := s; // s is UnicodeString in Delphi. Conversion may be lossy.
+   {$ENDIF}
+  {$ELSE}
+   Result := UTF8ToWinCP(s);
+  {$ENDIF}
+end;
+
+function WinCPToCodePoint(const s: AnsiString): string;
+begin
+  {$IFDEF ReallyUseUTF16}
+  {$IFDEF FPC}
+   // ToDo: Don't convert through UTF-8.
+   Result := UTF8ToUTF16(WinCPToUTF8(s));
+   {$ELSE}
+   Result := s; // Result is UnicodeString in Delphi.
+   {$ENDIF}
+  {$ELSE}
+   Result := WinCPToUTF8(s);
+  {$ENDIF}
+end;
+
+{ TUnicodeEnumeratorBase }
+
+constructor TUnicodeEnumeratorBase.Create(const A: String);
+begin
+  fCurrentPos := PChar(A); // Note: if A='' then PChar(A) returns a pointer to a #0 string
+  fEndPos := fCurrentPos + length(A);
+end;
+
+{ TCodePointEnumerator }
+
+function TCodePointEnumerator.MoveNext: Boolean;
+begin
+  if fCurrentPos < fEndPos then
+  begin
+    fCurrentCodeUnitCount := CodeUnitCount(fCurrentPos);
+    SetLength(fCurrent, fCurrentCodeUnitCount);
+    Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char));
+    inc(fCurrentPos, fCurrentCodeUnitCount);
+    Result := true;
+  end else
+    Result := false;
+end;
+
+{ TUnicodeCharacterEnumerator }
+
+function TUnicodeCharacterEnumerator.MoveNext: Boolean;
+var
+  NextCP: PChar;
+  NextCUCount: Integer;
+begin
+  if fCurrentPos < fEndPos then
+  begin
+    fCurrentCodePointCount := 0;
+    NextCP := fCurrentPos;
+    repeat
+      NextCUCount := CodeUnitCount(NextCP); // Prepare for combining diacritical marks.
+      Inc(NextCP, NextCUCount); // Prepare for combining diacritical marks.
+      Inc(fCurrentCodePointCount);
+    until not IsCombining(NextCP);
+    fCurrentCodeUnitCount := NextCP - fCurrentPos;  // Pointer arithmetics.
+    SetLength(fCurrent, fCurrentCodeUnitCount);
+    Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char));
+    inc(fCurrentPos, fCurrentCodeUnitCount);
+    Result := true;
+  end else
+    Result := false;
+end;
+
+//---
+// Enumerator
+//---
+{$IFDEF FPC}
+operator Enumerator(A: String): TCodePointEnumerator;
+begin
+  Result := TCodePointEnumerator.Create(A);
+end;
+{$ENDIF}
+
+end.
+
--- a/components/lazutils/lazutf16.pas
+++ b/components/lazutils/lazutf16.pas
@ -17,24 +17,40 @@
 }
 unit LazUTF16;

-{$mode objfpc}{$H+}
+{$IFDEF FPC}
+ {$mode objfpc}{$H+}
+{$ENDIF}

 interface

 uses
-  Classes, SysUtils, lazutf8;
+  Classes, SysUtils
+{$IFDEF FPC}
+  ,lazutf8
+{$ENDIF}
+  ;
+
+{$IFnDEF FPC}
+type
+  PtrInt = NativeInt;
+{$ENDIF}

 function UTF16CharacterLength(p: PWideChar): integer;
-function UTF16Length(const s: widestring): PtrInt;
-function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
+function UTF16Length(const s: widestring): PtrInt; overload;
+function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt; overload;
+function UTF16Copy(const s: UnicodeString; StartCharIndex, CharCount: PtrInt): Unicodestring;
+function UTF16CharStart(P: PWideChar; Len, CharIndex: PtrInt): PWideChar;
+function UTF16Pos(const SearchForText, SearchInText: UnicodeString; StartPos: PtrInt = 1): PtrInt;
 function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
 function UnicodeToUTF16(u: cardinal): widestring;
 function IsUTF16CharValid(AChar, ANextChar: WideChar): Boolean;
 function IsUTF16StringValid(AWideStr: widestring): Boolean;
-function Utf16StringReplace(const S, OldPattern, NewPattern: WideString;  Flags: TReplaceFlags): WideString;
+function Utf16StringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString;

 function UnicodeLowercase(u: cardinal): cardinal;
+{$IFDEF FPC}
 function UTF8LowerCaseViaTables(const s: string): string;
+{$ENDIF}

 implementation

@ -70,6 +86,107 @@ begin
  end;
 end;

+function UTF16Copy(const s: UnicodeString; StartCharIndex, CharCount: PtrInt): Unicodestring;
+// returns substring
+var
+  StartPos: PWideChar;
+  EndPos: PWideChar;
+  MaxBytes: PtrInt;
+begin
+  StartPos:=UTF16CharStart(PWideChar(s),length(s),StartCharIndex-1);
+  if StartPos=nil then
+    Result:=''
+  else begin
+    MaxBytes:=PtrInt(PWideChar(s)+length(s)-StartPos);
+    EndPos:=UTF16CharStart(StartPos,MaxBytes,CharCount);
+    if EndPos=nil then
+      Result:=copy(s,StartPos-PWideChar(s)+1,MaxBytes)
+    else
+      Result:=copy(s,StartPos-PWideChar(s)+1,EndPos-StartPos);
+  end;
+end;
+
+function UTF16CharStart(P: PWideChar; Len, CharIndex: PtrInt): PWideChar;
+// Len is the length in words of P.
+// CharIndex is the position of the desired UnicodeChar (starting at 0).
+var
+  CharLen: LongInt;
+begin
+  Result:=P;
+  if Result=nil then Exit;
+  while (CharIndex>0) and (Len>0) do
+  begin
+    CharLen:=UTF16CharacterLength(Result);
+    dec(Len,CharLen);
+    dec(CharIndex);
+    inc(Result,CharLen);
+  end;
+  if (CharIndex<>0) or (Len<0) then
+    Result:=nil;
+end;
+
+function IndexOfWideChar(Const Buf: PWideChar; Len: PtrInt; b: WideChar): PtrInt;
+begin
+  for Result:=0 to len-1 do
+    if buf[Result]=b then
+      Exit;
+  Result:=-1;
+end;
+
+// Helper used by UTF16Pos
+function UTF16PosP(SearchForText: PWideChar; SearchForTextLen: PtrInt;
+  SearchInText: PWideChar; SearchInTextLen: PtrInt): PWideChar;
+// returns the position where SearchInText starts in SearchForText
+// returns nil if not found
+var
+  p: PtrInt;
+begin
+  Result:=nil;
+  if (SearchForText=nil) or (SearchForTextLen=0) or (SearchInText=nil) then
+    exit;
+  while SearchInTextLen>0 do begin
+    p:=IndexOfWideChar(SearchInText,SearchInTextLen,SearchForText^);
+    if p<0 then exit;
+    inc(SearchInText, p);
+    dec(SearchInTextLen, p);
+    if SearchInTextLen<SearchForTextLen then exit;
+    if CompareMem(SearchInText,SearchForText,SearchForTextLen * 2) then
+      exit(SearchInText);
+    inc(SearchInText);
+    dec(SearchInTextLen);
+  end;
+end;
+
+function UTF16Pos(const SearchForText, SearchInText: UnicodeString; StartPos: PtrInt = 1): PtrInt;
+// returns the character index, where the SearchForText starts in SearchInText
+// an optional StartPos can be given (in UTF-16 codepoints, not in word)
+// returns 0 if not found
+var
+  i: PtrInt;
+  p: PWideChar;
+  StartPosP: PWideChar;
+begin
+  Result:=0;
+  if StartPos=1 then
+  begin
+    i:=System.Pos(SearchForText,SearchInText);
+    if i>0 then
+      Result:=UTF16Length(PWideChar(SearchInText),i-1)+1;
+  end
+  else if StartPos>1 then
+  begin
+    // skip
+    StartPosP:=UTF16CharStart(PWideChar(SearchInText),Length(SearchInText),StartPos-1);
+    if StartPosP=nil then exit;
+    // search
+    p:=UTF16PosP(PWideChar(SearchForText),length(SearchForText),
+                StartPosP,length(SearchInText)+PWideChar(SearchInText)-StartPosP);
+    // get UTF-8 position
+    if p=nil then exit;
+    Result:=StartPos+UTF16Length(StartPosP,p-StartPosP);
+  end;
+end;
+
 function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
 var
  w1: cardinal;
@ -138,7 +255,7 @@ begin
  end;
 end;

-//Same as SysUtil.StringReplace but for WideStrings/UncodeStrings, since it's not available in fpc yet
+//Same as SysUtil.StringReplace but for WideStrings/UnicodeStrings, since it's not available in fpc yet
 function Utf16StringReplace(const S, OldPattern, NewPattern: WideString;  Flags: TReplaceFlags): WideString;
 var
  Srch, OldP, RemS: WideString; // Srch and OldP can contain WideUpperCase versions of S,OldPattern
@ -897,8 +1014,7 @@ begin
  end;
 end;

-
-
+{$IFDEF FPC}
 function UTF8LowercaseDynLength(const s: string): string;
 var
  Buf: shortstring;
@ -1014,6 +1130,7 @@ begin
    end;
  until false;
 end;
+{$ENDIF}

 initialization
  InitUnicodeTables;