mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-04-27 04:53:39 +02:00
LazUtils: New unit LazUnicode for encoding agnostic code. Supports Delphi. LazUTF16 is updated.
git-svn-id: trunk@52670 -
This commit is contained in:
parent
503a8f7442
commit
dc90c9173e
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -3015,6 +3015,7 @@ components/lazutils/lazloggerbase.pas svneol=native#text/pascal
|
||||
components/lazutils/lazloggerdummy.pas svneol=native#text/pascal
|
||||
components/lazutils/lazloggerprofiling.pas svneol=native#text/pascal
|
||||
components/lazutils/lazmethodlist.pas svneol=native#text/pascal
|
||||
components/lazutils/lazunicode.pas svneol=native#text/plain
|
||||
components/lazutils/lazutf16.pas svneol=native#text/pascal
|
||||
components/lazutils/lazutf8.pas svneol=native#text/pascal
|
||||
components/lazutils/lazutf8classes.pas svneol=native#text/pascal
|
||||
|
282
components/lazutils/lazunicode.pas
Normal file
282
components/lazutils/lazunicode.pas
Normal file
@ -0,0 +1,282 @@
|
||||
{
|
||||
*****************************************************************************
|
||||
This file is part of LazUtils.
|
||||
|
||||
See the file COPYING.modifiedLGPL.txt, included in this distribution,
|
||||
for details about the license.
|
||||
*****************************************************************************
|
||||
|
||||
This unit provides encoding agnostic unicode string manipulation functions and
|
||||
an enumerator. It works transparently with UTF-8 and UTF-16 encodings,
|
||||
thus allowing one source code to work between :
|
||||
1. Lazarus with its UTF-8 solution.
|
||||
2. Future FPC and Lazarus with Delphi compatible UTF-16 solution.
|
||||
3. Delphi, where String = UnicodeString.
|
||||
}
|
||||
unit LazUnicode;
|
||||
|
||||
{$IFDEF FPC}
|
||||
{$mode objfpc}{$H+}
|
||||
{$IF FPC_FULLVERSION < 30000}
|
||||
{$ERROR needs at least FPC 3.0}
|
||||
{$ENDIF}
|
||||
{$ENDIF}
|
||||
|
||||
// For testing the UTF16 version.
|
||||
{$IF DEFINED(FPC) and DEFINED(UseUTF16)}
|
||||
{$ModeSwitch UnicodeStrings} // Sets also FPC_UNICODESTRINGS ...
|
||||
{$DEFINE FPC_UNICODESTRINGS} // but Lazarus editor hi-lighting doesn't know it.
|
||||
{$ENDIF}
|
||||
|
||||
{$IF DEFINED(FPC_UNICODESTRINGS) or not DEFINED(FPC)}
|
||||
{$DEFINE ReallyUseUTF16} // FPC with UTF-16 or Delphi
|
||||
{$ENDIF}
|
||||
|
||||
interface
|
||||
|
||||
uses
|
||||
Classes, SysUtils
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
,character, LazUTF16
|
||||
{$ENDIF}
|
||||
{$IFDEF FPC}
|
||||
,LazUTF8
|
||||
{$ENDIF}
|
||||
;
|
||||
|
||||
// Helper functions for codepoints. They change behavior depending on ModeSwitch.
|
||||
function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string;
|
||||
function CodePointLength(const s: string): NativeInt;
|
||||
function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt;
|
||||
function CodeUnitCount(p: PChar): integer;
|
||||
function IsCombining(const AChar: PChar): Boolean;
|
||||
|
||||
function CodePointToWinCP(const s: string): AnsiString;
|
||||
function WinCPToCodePoint(const s: AnsiString): string;
|
||||
|
||||
type
|
||||
// Base class for CodePoint and Character enumerators.
|
||||
TUnicodeEnumeratorBase = class
|
||||
private
|
||||
fCurrent: String;
|
||||
fCurrentPos, fEndPos: PChar;
|
||||
fCurrentCodeUnitCount: Integer;
|
||||
public
|
||||
constructor Create(const A: String);
|
||||
property Current: String read fCurrent;
|
||||
property CurrentCodeUnitCount: Integer read fCurrentCodeUnitCount;
|
||||
end;
|
||||
|
||||
{ TCodePointEnumerator }
|
||||
|
||||
// Traverse Unicode codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
|
||||
TCodePointEnumerator = class(TUnicodeEnumeratorBase)
|
||||
public
|
||||
function MoveNext: Boolean;
|
||||
end;
|
||||
|
||||
{ TUnicodeCharacterEnumerator }
|
||||
|
||||
// Traverse Unicode (user perceived) characters, including accented characters
|
||||
// with combined codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
|
||||
TUnicodeCharacterEnumerator = class(TUnicodeEnumeratorBase)
|
||||
private
|
||||
fCurrentCodePointCount: Integer;
|
||||
public
|
||||
property CurrentCodePointCount: Integer read fCurrentCodePointCount;
|
||||
function MoveNext: Boolean;
|
||||
end;
|
||||
|
||||
{$IFDEF FPC}
|
||||
// Enumerator for CodePoints is used for for-in loop now.
|
||||
operator Enumerator(A: String): TCodePointEnumerator;
|
||||
|
||||
// This enumerator combines diacritical marks. Not enabled by default because
|
||||
// there are many more rules for combining codepoints.
|
||||
//operator Enumerator(A: String): TUnicodeCharacterEnumerator;
|
||||
{$ENDIF}
|
||||
|
||||
implementation
|
||||
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
|
||||
function UTF16IsCombining(const AChar: PWideChar): Boolean;
|
||||
var
|
||||
ch: WideChar;
|
||||
begin
|
||||
ch := AChar[0];
|
||||
Result := // Combining Diacritical Marks (belongs to previos char)
|
||||
( (ch >= #$300) and (ch <= #$36F) ) or // 0300-036F
|
||||
( (ch >= #$610) and (ch <= #$61A) ) or // Arabic 0610..061A
|
||||
( (ch >= #$64B) and (ch <= #$65F) ) or // Arabic 064B..065F
|
||||
( ch = #$670) or // Arabic 0670
|
||||
( (ch >= #$6D6) and (ch <= #$6DC) ) or // Arabic 06D6..06DC
|
||||
( (ch >= #$6DF) and (ch <= #$6E4) ) or // Arabic 06DF..06E4
|
||||
( (ch >= #$6E7) and (ch <= #$6E8) ) or // Arabic 06E7..06E8
|
||||
( (ch >= #$6EA) and (ch <= #$6ED) ) or // Arabic 06EA..06ED
|
||||
( (ch >= #$8E4) and (ch <= #$8FE) ) or // Arabic 08E4..08FE
|
||||
( (ch >= #$1DC0) and (ch <= #$1DFF) ) or // Combining Diacritical Marks Supplement 1DC0-1DFF
|
||||
( (ch >= #$20D0) and (ch <= #$20FF) ) or // Combining Diacritical Marks for Symbols 20D0-20FF
|
||||
( (ch >= #$FE20) and (ch <= #$FE2F) ); // Combining half Marks FE20-FE2F
|
||||
end;
|
||||
|
||||
{$ELSE}
|
||||
|
||||
function UTF8IsCombining(const AChar: PChar): Boolean;
|
||||
begin
|
||||
Result :=
|
||||
( (AChar[0] = #$CC) ) or // Combining Diacritical Marks (belongs to previos char) 0300-036F
|
||||
( (AChar[0] = #$CD) and (AChar[1] in [#$80..#$AF]) ) or // Combining Diacritical Marks
|
||||
( (AChar[0] = #$D8) and (AChar[1] in [#$90..#$9A]) ) or // Arabic 0610 (d890)..061A (d89a)
|
||||
( (AChar[0] = #$D9) and (AChar[1] in [#$8b..#$9f, #$B0]) ) or // Arabic 064B (d98b)..065F (d99f) // 0670 (d9b0)
|
||||
|
||||
( (AChar[0] = #$DB) and (AChar[1] in [#$96..#$9C, #$9F..#$A4, #$A7..#$A8, #$AA..#$AD]) ) or // Arabic 06D6 (db96).. .. ..06ED (dbad)
|
||||
( (AChar[0] = #$E0) and (AChar[1] = #$A3) and (AChar[2] in [#$A4..#$BE]) ) or // Arabic 08E4 (e0a3a4) ..08FE (e0a3be)
|
||||
|
||||
( (AChar[0] = #$E1) and (AChar[1] = #$B7) ) or // Combining Diacritical Marks Supplement 1DC0-1DFF (e1b780)
|
||||
( (AChar[0] = #$E2) and (AChar[1] = #$83) and (AChar[2] in [#$90..#$FF]) ) or // Combining Diacritical Marks for Symbols 20D0-20FF
|
||||
( (AChar[0] = #$EF) and (AChar[1] = #$B8) and (AChar[2] in [#$A0..#$AF]) ); // Combining half Marks FE20-FE2F
|
||||
end;
|
||||
|
||||
{$ENDIF}
|
||||
|
||||
//---
|
||||
|
||||
function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string;
|
||||
begin
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
Result := UTF16Copy(s, StartCharIndex, CharCount);
|
||||
{$ELSE}
|
||||
Result := UTF8Copy(s, StartCharIndex, CharCount);
|
||||
{$ENDIF}
|
||||
end;
|
||||
|
||||
function CodePointLength(const s: string): NativeInt;
|
||||
begin
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
Result := UTF16Length(s);
|
||||
{$ELSE}
|
||||
Result := UTF8Length(s);
|
||||
{$ENDIF}
|
||||
end;
|
||||
|
||||
function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt;
|
||||
begin
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
Result := UTF16Pos(SearchForText, SearchInText, StartPos);
|
||||
{$ELSE}
|
||||
Result := UTF8Pos(SearchForText, SearchInText, StartPos);
|
||||
{$ENDIF}
|
||||
end;
|
||||
|
||||
function CodeUnitCount(p: PChar): integer;
|
||||
begin
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
if TCharacter.IsHighSurrogate(p^) then
|
||||
Result := 2
|
||||
else
|
||||
Result := 1
|
||||
{$ELSE}
|
||||
Result := UTF8CharacterLength(p);
|
||||
{$ENDIF}
|
||||
end;
|
||||
|
||||
function IsCombining(const AChar: PChar): Boolean;
|
||||
// Note: there are many more rules for combining codepoints.
|
||||
// The diacritical marks hare are only a subset.
|
||||
begin
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
Result := UTF16IsCombining(AChar);
|
||||
{$ELSE}
|
||||
Result := UTF8IsCombining(AChar);
|
||||
{$ENDIF}
|
||||
end;
|
||||
|
||||
function CodePointToWinCP(const s: string): AnsiString;
|
||||
begin
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
{$IFDEF FPC}
|
||||
// ToDo: Don't convert through UTF-8.
|
||||
Result := UTF8ToWinCP(UTF16ToUTF8(s));
|
||||
{$ELSE}
|
||||
Result := s; // s is UnicodeString in Delphi. Conversion may be lossy.
|
||||
{$ENDIF}
|
||||
{$ELSE}
|
||||
Result := UTF8ToWinCP(s);
|
||||
{$ENDIF}
|
||||
end;
|
||||
|
||||
function WinCPToCodePoint(const s: AnsiString): string;
|
||||
begin
|
||||
{$IFDEF ReallyUseUTF16}
|
||||
{$IFDEF FPC}
|
||||
// ToDo: Don't convert through UTF-8.
|
||||
Result := UTF8ToUTF16(WinCPToUTF8(s));
|
||||
{$ELSE}
|
||||
Result := s; // Result is UnicodeString in Delphi.
|
||||
{$ENDIF}
|
||||
{$ELSE}
|
||||
Result := WinCPToUTF8(s);
|
||||
{$ENDIF}
|
||||
end;
|
||||
|
||||
{ TUnicodeEnumeratorBase }
|
||||
|
||||
constructor TUnicodeEnumeratorBase.Create(const A: String);
|
||||
begin
|
||||
fCurrentPos := PChar(A); // Note: if A='' then PChar(A) returns a pointer to a #0 string
|
||||
fEndPos := fCurrentPos + length(A);
|
||||
end;
|
||||
|
||||
{ TCodePointEnumerator }
|
||||
|
||||
function TCodePointEnumerator.MoveNext: Boolean;
|
||||
begin
|
||||
if fCurrentPos < fEndPos then
|
||||
begin
|
||||
fCurrentCodeUnitCount := CodeUnitCount(fCurrentPos);
|
||||
SetLength(fCurrent, fCurrentCodeUnitCount);
|
||||
Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char));
|
||||
inc(fCurrentPos, fCurrentCodeUnitCount);
|
||||
Result := true;
|
||||
end else
|
||||
Result := false;
|
||||
end;
|
||||
|
||||
{ TUnicodeCharacterEnumerator }
|
||||
|
||||
function TUnicodeCharacterEnumerator.MoveNext: Boolean;
|
||||
var
|
||||
NextCP: PChar;
|
||||
NextCUCount: Integer;
|
||||
begin
|
||||
if fCurrentPos < fEndPos then
|
||||
begin
|
||||
fCurrentCodePointCount := 0;
|
||||
NextCP := fCurrentPos;
|
||||
repeat
|
||||
NextCUCount := CodeUnitCount(NextCP); // Prepare for combining diacritical marks.
|
||||
Inc(NextCP, NextCUCount); // Prepare for combining diacritical marks.
|
||||
Inc(fCurrentCodePointCount);
|
||||
until not IsCombining(NextCP);
|
||||
fCurrentCodeUnitCount := NextCP - fCurrentPos; // Pointer arithmetics.
|
||||
SetLength(fCurrent, fCurrentCodeUnitCount);
|
||||
Move(fCurrentPos^, fCurrent[1], fCurrentCodeUnitCount*SizeOf(Char));
|
||||
inc(fCurrentPos, fCurrentCodeUnitCount);
|
||||
Result := true;
|
||||
end else
|
||||
Result := false;
|
||||
end;
|
||||
|
||||
//---
|
||||
// Enumerator
|
||||
//---
|
||||
{$IFDEF FPC}
|
||||
operator Enumerator(A: String): TCodePointEnumerator;
|
||||
begin
|
||||
Result := TCodePointEnumerator.Create(A);
|
||||
end;
|
||||
{$ENDIF}
|
||||
|
||||
end.
|
||||
|
@ -17,24 +17,40 @@
|
||||
}
|
||||
unit LazUTF16;
|
||||
|
||||
{$mode objfpc}{$H+}
|
||||
{$IFDEF FPC}
|
||||
{$mode objfpc}{$H+}
|
||||
{$ENDIF}
|
||||
|
||||
interface
|
||||
|
||||
uses
|
||||
Classes, SysUtils, lazutf8;
|
||||
Classes, SysUtils
|
||||
{$IFDEF FPC}
|
||||
,lazutf8
|
||||
{$ENDIF}
|
||||
;
|
||||
|
||||
{$IFnDEF FPC}
|
||||
type
|
||||
PtrInt = NativeInt;
|
||||
{$ENDIF}
|
||||
|
||||
function UTF16CharacterLength(p: PWideChar): integer;
|
||||
function UTF16Length(const s: widestring): PtrInt;
|
||||
function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
|
||||
function UTF16Length(const s: widestring): PtrInt; overload;
|
||||
function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt; overload;
|
||||
function UTF16Copy(const s: UnicodeString; StartCharIndex, CharCount: PtrInt): Unicodestring;
|
||||
function UTF16CharStart(P: PWideChar; Len, CharIndex: PtrInt): PWideChar;
|
||||
function UTF16Pos(const SearchForText, SearchInText: UnicodeString; StartPos: PtrInt = 1): PtrInt;
|
||||
function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
|
||||
function UnicodeToUTF16(u: cardinal): widestring;
|
||||
function IsUTF16CharValid(AChar, ANextChar: WideChar): Boolean;
|
||||
function IsUTF16StringValid(AWideStr: widestring): Boolean;
|
||||
function Utf16StringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString;
|
||||
function Utf16StringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString;
|
||||
|
||||
function UnicodeLowercase(u: cardinal): cardinal;
|
||||
{$IFDEF FPC}
|
||||
function UTF8LowerCaseViaTables(const s: string): string;
|
||||
{$ENDIF}
|
||||
|
||||
implementation
|
||||
|
||||
@ -70,6 +86,107 @@ begin
|
||||
end;
|
||||
end;
|
||||
|
||||
function UTF16Copy(const s: UnicodeString; StartCharIndex, CharCount: PtrInt): Unicodestring;
|
||||
// returns substring
|
||||
var
|
||||
StartPos: PWideChar;
|
||||
EndPos: PWideChar;
|
||||
MaxBytes: PtrInt;
|
||||
begin
|
||||
StartPos:=UTF16CharStart(PWideChar(s),length(s),StartCharIndex-1);
|
||||
if StartPos=nil then
|
||||
Result:=''
|
||||
else begin
|
||||
MaxBytes:=PtrInt(PWideChar(s)+length(s)-StartPos);
|
||||
EndPos:=UTF16CharStart(StartPos,MaxBytes,CharCount);
|
||||
if EndPos=nil then
|
||||
Result:=copy(s,StartPos-PWideChar(s)+1,MaxBytes)
|
||||
else
|
||||
Result:=copy(s,StartPos-PWideChar(s)+1,EndPos-StartPos);
|
||||
end;
|
||||
end;
|
||||
|
||||
function UTF16CharStart(P: PWideChar; Len, CharIndex: PtrInt): PWideChar;
|
||||
// Len is the length in words of P.
|
||||
// CharIndex is the position of the desired UnicodeChar (starting at 0).
|
||||
var
|
||||
CharLen: LongInt;
|
||||
begin
|
||||
Result:=P;
|
||||
if Result=nil then Exit;
|
||||
while (CharIndex>0) and (Len>0) do
|
||||
begin
|
||||
CharLen:=UTF16CharacterLength(Result);
|
||||
dec(Len,CharLen);
|
||||
dec(CharIndex);
|
||||
inc(Result,CharLen);
|
||||
end;
|
||||
if (CharIndex<>0) or (Len<0) then
|
||||
Result:=nil;
|
||||
end;
|
||||
|
||||
function IndexOfWideChar(Const Buf: PWideChar; Len: PtrInt; b: WideChar): PtrInt;
|
||||
begin
|
||||
for Result:=0 to len-1 do
|
||||
if buf[Result]=b then
|
||||
Exit;
|
||||
Result:=-1;
|
||||
end;
|
||||
|
||||
// Helper used by UTF16Pos
|
||||
function UTF16PosP(SearchForText: PWideChar; SearchForTextLen: PtrInt;
|
||||
SearchInText: PWideChar; SearchInTextLen: PtrInt): PWideChar;
|
||||
// returns the position where SearchInText starts in SearchForText
|
||||
// returns nil if not found
|
||||
var
|
||||
p: PtrInt;
|
||||
begin
|
||||
Result:=nil;
|
||||
if (SearchForText=nil) or (SearchForTextLen=0) or (SearchInText=nil) then
|
||||
exit;
|
||||
while SearchInTextLen>0 do begin
|
||||
p:=IndexOfWideChar(SearchInText,SearchInTextLen,SearchForText^);
|
||||
if p<0 then exit;
|
||||
inc(SearchInText, p);
|
||||
dec(SearchInTextLen, p);
|
||||
if SearchInTextLen<SearchForTextLen then exit;
|
||||
if CompareMem(SearchInText,SearchForText,SearchForTextLen * 2) then
|
||||
exit(SearchInText);
|
||||
inc(SearchInText);
|
||||
dec(SearchInTextLen);
|
||||
end;
|
||||
end;
|
||||
|
||||
function UTF16Pos(const SearchForText, SearchInText: UnicodeString; StartPos: PtrInt = 1): PtrInt;
|
||||
// returns the character index, where the SearchForText starts in SearchInText
|
||||
// an optional StartPos can be given (in UTF-16 codepoints, not in word)
|
||||
// returns 0 if not found
|
||||
var
|
||||
i: PtrInt;
|
||||
p: PWideChar;
|
||||
StartPosP: PWideChar;
|
||||
begin
|
||||
Result:=0;
|
||||
if StartPos=1 then
|
||||
begin
|
||||
i:=System.Pos(SearchForText,SearchInText);
|
||||
if i>0 then
|
||||
Result:=UTF16Length(PWideChar(SearchInText),i-1)+1;
|
||||
end
|
||||
else if StartPos>1 then
|
||||
begin
|
||||
// skip
|
||||
StartPosP:=UTF16CharStart(PWideChar(SearchInText),Length(SearchInText),StartPos-1);
|
||||
if StartPosP=nil then exit;
|
||||
// search
|
||||
p:=UTF16PosP(PWideChar(SearchForText),length(SearchForText),
|
||||
StartPosP,length(SearchInText)+PWideChar(SearchInText)-StartPosP);
|
||||
// get UTF-8 position
|
||||
if p=nil then exit;
|
||||
Result:=StartPos+UTF16Length(StartPosP,p-StartPosP);
|
||||
end;
|
||||
end;
|
||||
|
||||
function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
|
||||
var
|
||||
w1: cardinal;
|
||||
@ -138,7 +255,7 @@ begin
|
||||
end;
|
||||
end;
|
||||
|
||||
//Same as SysUtil.StringReplace but for WideStrings/UncodeStrings, since it's not available in fpc yet
|
||||
//Same as SysUtil.StringReplace but for WideStrings/UnicodeStrings, since it's not available in fpc yet
|
||||
function Utf16StringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString;
|
||||
var
|
||||
Srch, OldP, RemS: WideString; // Srch and OldP can contain WideUpperCase versions of S,OldPattern
|
||||
@ -897,8 +1014,7 @@ begin
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
|
||||
{$IFDEF FPC}
|
||||
function UTF8LowercaseDynLength(const s: string): string;
|
||||
var
|
||||
Buf: shortstring;
|
||||
@ -1014,6 +1130,7 @@ begin
|
||||
end;
|
||||
until false;
|
||||
end;
|
||||
{$ENDIF}
|
||||
|
||||
initialization
|
||||
InitUnicodeTables;
|
||||
|
Loading…
Reference in New Issue
Block a user