mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-05-04 22:24:01 +02:00
323 lines
10 KiB
ObjectPascal
323 lines
10 KiB
ObjectPascal
{
|
|
*****************************************************************************
|
|
This file is part of LazUtils.
|
|
|
|
See the file COPYING.modifiedLGPL.txt, included in this distribution,
|
|
for details about the license.
|
|
*****************************************************************************
|
|
|
|
This unit provides encoding agnostic unicode string manipulation functions and
|
|
an enumerator. It works transparently with UTF-8 and UTF-16 encodings,
|
|
thus allowing one source code to work between :
|
|
1. Lazarus with its UTF-8 solution.
|
|
2. Future FPC and Lazarus with Delphi compatible UTF-16 solution.
|
|
3. Delphi, where String = UnicodeString.
|
|
}
|
|
unit LazUnicode;
|
|
|
|
{$IFDEF FPC}
|
|
{$mode objfpc}{$H+}{$inline on}
|
|
{$ENDIF}
|
|
|
|
// For testing the UTF16 version.
|
|
{$IF DEFINED(FPC) and DEFINED(UseUTF16)}
|
|
{$ModeSwitch UnicodeStrings} // Sets also FPC_UNICODESTRINGS.
|
|
{$ENDIF}
|
|
|
|
{$IF DEFINED(FPC_UNICODESTRINGS) or not DEFINED(FPC)}
|
|
{$DEFINE ReallyUseUTF16} // FPC with UTF-16 or Delphi
|
|
{$ENDIF}
|
|
|
|
interface
|
|
|
|
uses
|
|
Classes, SysUtils
|
|
{$IFDEF ReallyUseUTF16}
|
|
,character, LazUTF16
|
|
{$ENDIF}
|
|
{$IFDEF FPC}
|
|
,LazUTF8
|
|
{$ENDIF}
|
|
;
|
|
|
|
// Helper functions for codepoints. They change behavior depending on ModeSwitch.
|
|
function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string; inline;
|
|
function CodePointLength(const s: string): NativeInt; inline;
|
|
function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt; inline;
|
|
function CodePointSize(p: PChar): integer; inline;
|
|
function IsCombining(const AChar: PChar): Boolean; {$IFDEF FPC}inline;{$ENDIF}
|
|
|
|
function UnicodeToWinCP(const s: string): AnsiString;
|
|
function WinCPToUnicode(const s: AnsiString): string;
|
|
|
|
function StringOfCodePoint(ACodePoint: String; N: Integer): String;
|
|
|
|
type
|
|
// Base class for CodePoint and Character enumerators.
|
|
|
|
{ TUnicodeEnumeratorBase }
|
|
|
|
TUnicodeEnumeratorBase = class
|
|
private
|
|
fSrcPos, fEndPos: PChar; // Pointers to source string.
|
|
// Preset variables for different codepoint/character lengths.
|
|
// Current will be assigned to one of them.
|
|
fCurOne, fCurTwo, fCurThree, fCurLong: String;
|
|
fCurrent: String; // Current separated codepoint/character.
|
|
fCurrentCodeUnitCount: Integer; // Number of CodeUnits (Pascal Char) in Current.
|
|
procedure UpdateCurrent(aCount: integer);
|
|
public
|
|
constructor Create(const A: String);
|
|
property Current: String read fCurrent;
|
|
property CurrentCodeUnitCount: Integer read fCurrentCodeUnitCount;
|
|
end;
|
|
|
|
{ TCodePointEnumerator }
|
|
|
|
// Traverse Unicode codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
|
|
TCodePointEnumerator = class(TUnicodeEnumeratorBase)
|
|
public
|
|
function MoveNext: Boolean;
|
|
end;
|
|
|
|
{ TUnicodeCharacterEnumerator }
|
|
|
|
// Traverse Unicode (user perceived) characters, including accented characters
|
|
// with combined codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
|
|
TUnicodeCharacterEnumerator = class(TUnicodeEnumeratorBase)
|
|
private
|
|
fCurrentCodePointCount: Integer; // Number of CodePoints in Current.
|
|
public
|
|
property CurrentCodePointCount: Integer read fCurrentCodePointCount;
|
|
function MoveNext: Boolean;
|
|
end;
|
|
|
|
{$IFDEF FPC}
|
|
// Enumerator for CodePoints could be used for the for-in loop.
|
|
//operator Enumerator(A: String): TCodePointEnumerator;
|
|
|
|
// This enumerator combines diacritical marks.
|
|
// It is used by default although there are more rules for combining codepoints.
|
|
// Diacritical marks cover rules for most western languages.
|
|
operator Enumerator(A: String): TUnicodeCharacterEnumerator;
|
|
{$ENDIF}
|
|
|
|
implementation
|
|
|
|
{$IFDEF ReallyUseUTF16}
|
|
|
|
function UTF16IsCombining(const AChar: PWideChar): Boolean;
|
|
var
|
|
ch: WideChar;
|
|
begin
|
|
ch := AChar[0];
|
|
Result := // Combining Diacritical Marks (belongs to previos char)
|
|
( (ch >= #$300) and (ch <= #$36F) ) or // 0300-036F
|
|
( (ch >= #$610) and (ch <= #$61A) ) or // Arabic 0610..061A
|
|
( (ch >= #$64B) and (ch <= #$65F) ) or // Arabic 064B..065F
|
|
( ch = #$670) or // Arabic 0670
|
|
( (ch >= #$6D6) and (ch <= #$6DC) ) or // Arabic 06D6..06DC
|
|
( (ch >= #$6DF) and (ch <= #$6E4) ) or // Arabic 06DF..06E4
|
|
( (ch >= #$6E7) and (ch <= #$6E8) ) or // Arabic 06E7..06E8
|
|
( (ch >= #$6EA) and (ch <= #$6ED) ) or // Arabic 06EA..06ED
|
|
( (ch >= #$8E4) and (ch <= #$8FE) ) or // Arabic 08E4..08FE
|
|
( (ch >= #$1DC0) and (ch <= #$1DFF) ) or // Combining Diacritical Marks Supplement 1DC0-1DFF
|
|
( (ch >= #$20D0) and (ch <= #$20FF) ) or // Combining Diacritical Marks for Symbols 20D0-20FF
|
|
( (ch >= #$FE20) and (ch <= #$FE2F) ); // Combining half Marks FE20-FE2F
|
|
end;
|
|
|
|
{$ELSE}
|
|
|
|
function UTF8IsCombining(const AChar: PChar): Boolean;
|
|
begin
|
|
Result :=
|
|
( (AChar[0] = #$CC) ) or // Combining Diacritical Marks (belongs to previos char) 0300-036F
|
|
( (AChar[0] = #$CD) and (AChar[1] in [#$80..#$AF]) ) or // Combining Diacritical Marks
|
|
( (AChar[0] = #$D8) and (AChar[1] in [#$90..#$9A]) ) or // Arabic 0610 (d890)..061A (d89a)
|
|
( (AChar[0] = #$D9) and (AChar[1] in [#$8b..#$9f, #$B0]) ) or // Arabic 064B (d98b)..065F (d99f) // 0670 (d9b0)
|
|
|
|
( (AChar[0] = #$DB) and (AChar[1] in [#$96..#$9C, #$9F..#$A4, #$A7..#$A8, #$AA..#$AD]) ) or // Arabic 06D6 (db96).. .. ..06ED (dbad)
|
|
( (AChar[0] = #$E0) and (AChar[1] = #$A3) and (AChar[2] in [#$A4..#$BE]) ) or // Arabic 08E4 (e0a3a4) ..08FE (e0a3be)
|
|
|
|
( (AChar[0] = #$E1) and (AChar[1] = #$B7) ) or // Combining Diacritical Marks Supplement 1DC0-1DFF (e1b780)
|
|
( (AChar[0] = #$E2) and (AChar[1] = #$83) and (AChar[2] in [#$90..#$FF]) ) or // Combining Diacritical Marks for Symbols 20D0-20FF
|
|
( (AChar[0] = #$EF) and (AChar[1] = #$B8) and (AChar[2] in [#$A0..#$AF]) ); // Combining half Marks FE20-FE2F
|
|
end;
|
|
|
|
{$ENDIF}
|
|
|
|
//---
|
|
|
|
function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string;
|
|
// Copy CharCount CodePoints from s, starting from StartCharIndex'th CodePoints.
|
|
begin
|
|
{$IFDEF ReallyUseUTF16}
|
|
Result := UTF16Copy(s, StartCharIndex, CharCount);
|
|
{$ELSE}
|
|
Result := UTF8Copy(s, StartCharIndex, CharCount);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
function CodePointLength(const s: string): NativeInt;
|
|
// Number of CodePoints in s.
|
|
begin
|
|
{$IFDEF ReallyUseUTF16}
|
|
Result := UTF16Length(s);
|
|
{$ELSE}
|
|
Result := UTF8LengthFast(s);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt;
|
|
// Position of SearchForText in CodePoints.
|
|
begin
|
|
{$IFDEF ReallyUseUTF16}
|
|
Result := UTF16Pos(SearchForText, SearchInText, StartPos);
|
|
{$ELSE}
|
|
Result := UTF8Pos(SearchForText, SearchInText, StartPos);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
function CodePointSize(p: PChar): integer;
|
|
// Returns the number of CodeUnits in one CodePoint pointed by p.
|
|
begin
|
|
{$IFDEF ReallyUseUTF16}
|
|
if TCharacter.IsHighSurrogate(p^) then
|
|
Result := 2
|
|
else
|
|
Result := 1
|
|
{$ELSE}
|
|
Result := UTF8CodepointSizeFast(p);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
function IsCombining(const AChar: PChar): Boolean;
|
|
// Note: there are many more rules for combining codepoints.
|
|
// The diacritical marks here are only a subset.
|
|
begin
|
|
{$IFDEF ReallyUseUTF16}
|
|
Result := UTF16IsCombining(AChar);
|
|
{$ELSE}
|
|
Result := UTF8IsCombining(AChar);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
function UnicodeToWinCP(const s: string): AnsiString;
|
|
// Convert s to Windows system codepage. The Unicode encoding of s depends on mode.
|
|
begin
|
|
{$IFDEF ReallyUseUTF16}
|
|
{$IFDEF FPC}
|
|
// ToDo: Don't convert through UTF-8.
|
|
Result := UTF8ToWinCP(UTF16ToUTF8(s));
|
|
{$ELSE}
|
|
Result := s; // s is UnicodeString in Delphi. Conversion may be lossy.
|
|
{$ENDIF}
|
|
{$ELSE}
|
|
Result := UTF8ToWinCP(s);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
function WinCPToUnicode(const s: AnsiString): string;
|
|
// Convert Windows system codepage s to Unicode (encoding depends on mode).
|
|
begin
|
|
{$IFDEF ReallyUseUTF16}
|
|
{$IFDEF FPC}
|
|
// ToDo: Don't convert through UTF-8.
|
|
Result := UTF8ToUTF16(WinCPToUTF8(s));
|
|
{$ELSE}
|
|
Result := s; // Result is UnicodeString in Delphi.
|
|
{$ENDIF}
|
|
{$ELSE}
|
|
Result := WinCPToUTF8(s);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
function StringOfCodePoint(ACodePoint: String; N: Integer): String;
|
|
// Like StringOfChar
|
|
{$IFDEF ReallyUseUTF16}
|
|
var
|
|
i: Integer;
|
|
{$ENDIF}
|
|
begin
|
|
{$IFDEF ReallyUseUTF16}
|
|
Result := '';
|
|
for i := 1 to N do
|
|
Result := Result + ACodePoint;
|
|
{$ELSE}
|
|
Result := Utf8StringOfChar(ACodePoint, N);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
{ TUnicodeEnumeratorBase }
|
|
|
|
constructor TUnicodeEnumeratorBase.Create(const A: String);
|
|
begin
|
|
fSrcPos := PChar(A); // Note: if A='' then PChar(A) returns a pointer to a #0 string
|
|
fEndPos := fSrcPos + length(A);
|
|
SetLength(fCurOne, 1); // Space for the most common codepoint/character lengths.
|
|
SetLength(fCurTwo, 2);
|
|
SetLength(fCurThree, 3);
|
|
end;
|
|
|
|
procedure TUnicodeEnumeratorBase.UpdateCurrent(aCount: integer);
|
|
// Copy the needed bytes to fCurrent which then holds a codepoint or "character".
|
|
begin
|
|
fCurrentCodeUnitCount := aCount;
|
|
Assert(aCount<>0, 'TUnicodeEnumeratorBase.UpdateCurrent: aCount=0.');
|
|
case aCount of
|
|
1: fCurrent := fCurOne; // Assignment does not copy but reference count changes.
|
|
2: fCurrent := fCurTwo;
|
|
3: fCurrent := fCurThree;
|
|
else begin
|
|
SetLength(fCurLong, aCount);
|
|
fCurrent := fCurLong;
|
|
end;
|
|
end;
|
|
Move(fSrcPos^, fCurrent[1], aCount*SizeOf(Char));
|
|
Assert(Length(fCurrent)=aCount, 'TUnicodeEnumeratorBase.UpdateCurrent: Length(fCurrent)<>aCount.');
|
|
inc(fSrcPos, aCount);
|
|
end;
|
|
|
|
{ TCodePointEnumerator }
|
|
|
|
function TCodePointEnumerator.MoveNext: Boolean;
|
|
begin
|
|
Result := fSrcPos < fEndPos;
|
|
if Result then
|
|
UpdateCurrent(CodePointSize(fSrcPos));
|
|
end;
|
|
|
|
{ TUnicodeCharacterEnumerator }
|
|
|
|
function TUnicodeCharacterEnumerator.MoveNext: Boolean;
|
|
var
|
|
NextCP: PChar;
|
|
NextCUCount: Integer;
|
|
begin
|
|
Result := fSrcPos < fEndPos;
|
|
if Result then
|
|
begin
|
|
fCurrentCodePointCount := 0;
|
|
NextCP := fSrcPos;
|
|
repeat
|
|
NextCUCount := CodePointSize(NextCP);
|
|
Inc(NextCP, NextCUCount); // Prepare for combining diacritical marks.
|
|
Inc(fCurrentCodePointCount);
|
|
until not IsCombining(NextCP);
|
|
UpdateCurrent(NextCP - fSrcPos); // Pointer arithmetics.
|
|
end;
|
|
end;
|
|
|
|
//---
|
|
// Enumerator
|
|
//---
|
|
{$IFDEF FPC}
|
|
operator Enumerator(A: String): TUnicodeCharacterEnumerator;
|
|
begin
|
|
Result := TUnicodeCharacterEnumerator.Create(A);
|
|
end;
|
|
{$ENDIF}
|
|
|
|
end.
|
|
|