+ added and implemented a TUnicodeStringExtendedGraphemeClustersEnumerator class

in the graphemebreakproperty unit - an enumerator that splits a UTF-16 string
  into extended grapheme clusters (i.e. user-perceived characters), conforming
  to the Unicode 9.0 algorithm:
  https://www.unicode.org/reports/tr29/tr29-29.html

git-svn-id: trunk@48726 -
This commit is contained in:
nickysn 2021-02-20 01:24:46 +00:00
parent 4ab485225c
commit 8c06529ae2

View File

@ -55,6 +55,29 @@ type
gbpGlue_After_Zwj,
gbpE_Base_GAZ);
{ TUnicodeStringExtendedGraphemeClustersEnumerator

  Walks a UTF-16 UnicodeString and yields one extended grapheme cluster
  (user-perceived character) per MoveNext call. The object is its own
  enumerable: GetEnumerator hands back Self. }
TUnicodeStringExtendedGraphemeClustersEnumerator = class
private
  { the string being split }
  FStr: UnicodeString;
  { FCurrentIndexStart..FCurrentIndexEnd: inclusive, 1-based code unit range
    of the current cluster.
    FNextIndexEnd: index of the last code unit of the lookahead code point. }
  FCurrentIndexStart, FCurrentIndexEnd, FNextIndexEnd: SizeInt;
  { lookahead code point (not yet part of the cluster) and its property }
  FNextGBP: TGraphemeBreakProperty;
  FNextCodePoint: UCS4Char;
  { code point most recently appended to the cluster and its property }
  FCurrentGBP: TGraphemeBreakProperty;
  FCurrentCodePoint: UCS4Char;
  { number of consecutive Regional_Indicator code points, ending at the
    current one }
  FRI_Sequence_Length: Integer;
  { True while the current code point ends an
    (E_Base | E_Base_GAZ) Extend* sequence }
  FE_Base_EBG_Extend_Sequence: Boolean;
  { returns the substring of FStr covered by the current cluster }
  function GetCurrent: UnicodeString;
  { decodes the next code point of FStr into the lookahead fields }
  procedure FetchNextChar;
public
  constructor Create(const S: UnicodeString);
  function GetEnumerator: TUnicodeStringExtendedGraphemeClustersEnumerator;
  { advances to the next cluster; False when the string is exhausted }
  function MoveNext: Boolean;
  property Current: UnicodeString read GetCurrent;
end;
function GetGraphemeBreakProperty(Ch: UCS4Char): TGraphemeBreakProperty;
implementation
@ -64,4 +87,94 @@ begin
{$I graphemebreakproperty_code.inc}
end;
{ TUnicodeStringExtendedGraphemeClustersEnumerator }
{ Returns the text of the current grapheme cluster, i.e. the code units of
  FStr from FCurrentIndexStart up to and including FCurrentIndexEnd. }
function TUnicodeStringExtendedGraphemeClustersEnumerator.GetCurrent: UnicodeString;
var
  ClusterLen: SizeInt;
begin
  { both bounds are inclusive, hence the + 1 }
  ClusterLen := FCurrentIndexEnd - FCurrentIndexStart + 1;
  Result := Copy(FStr, FCurrentIndexStart, ClusterLen);
end;
{ Advances the lookahead by one code point.

  On return FNextIndexEnd is the index of the last UTF-16 code unit of the
  lookahead code point, FNextCodePoint is its decoded value and FNextGBP is
  its grapheme break property. Past the end of the string the code point is
  reported as 0. }
procedure TUnicodeStringExtendedGraphemeClustersEnumerator.FetchNextChar;
var
  StrLen: SizeInt;
  LowUnit: UCS4Char;
begin
  StrLen := Length(FStr);
  Inc(FNextIndexEnd);
  if FNextIndexEnd > StrLen then
    { ran off the end of the string }
    FNextCodePoint := 0
  else
  begin
    FNextCodePoint := Ord(FStr[FNextIndexEnd]);
    { a high surrogate is only combined when a code unit follows it ... }
    if (FNextCodePoint >= $D800) and (FNextCodePoint <= $DBFF) and (FNextIndexEnd < StrLen) then
    begin
      LowUnit := Ord(FStr[FNextIndexEnd + 1]);
      { ... and that code unit is a low surrogate; an unpaired surrogate is
        kept as-is and treated as a code point of its own }
      if (LowUnit >= $DC00) and (LowUnit <= $DFFF) then
      begin
        Inc(FNextIndexEnd);
        FNextCodePoint := $10000 + (((FNextCodePoint - $D800) shl 10) or (LowUnit - $DC00));
      end;
    end;
  end;
  FNextGBP := GetGraphemeBreakProperty(FNextCodePoint);
end;
{ Creates an enumerator over the extended grapheme clusters of S and loads
  the first code point of S into the lookahead fields. }
constructor TUnicodeStringExtendedGraphemeClustersEnumerator.Create(const S: UnicodeString);
begin
  { rule-tracking state: no RI run and no emoji base sequence seen yet }
  FE_Base_EBG_Extend_Sequence := False;
  FRI_Sequence_Length := 0;
  { all positions start at 0, i.e. just before the first (1-based) code unit }
  FNextIndexEnd := 0;
  FCurrentIndexEnd := 0;
  FCurrentIndexStart := 0;
  FStr := S;
  { prime the one code point lookahead used by MoveNext }
  FetchNextChar;
end;
{ Lets the object be used directly in a for..in loop: it serves as its own
  enumerator. }
function TUnicodeStringExtendedGraphemeClustersEnumerator.GetEnumerator: TUnicodeStringExtendedGraphemeClustersEnumerator;
begin
  Exit(Self);
end;
{ Advances to the next extended grapheme cluster.

  Returns False when no code units are left; otherwise returns True and
  FCurrentIndexStart..FCurrentIndexEnd delimit the new cluster. Code points
  are appended to the cluster one at a time until a boundary is found between
  the current code point and the lookahead, or until the string ends. }
function TUnicodeStringExtendedGraphemeClustersEnumerator.MoveNext: Boolean;
var
  KeepTogether: Boolean;
begin
  { the new cluster starts right behind the previous one }
  FCurrentIndexStart := FCurrentIndexEnd + 1;
  Result := FCurrentIndexStart <= Length(FStr);
  if not Result then
    Exit;

  while True do
  begin
    { append the lookahead code point to the cluster }
    FCurrentCodePoint := FNextCodePoint;
    FCurrentGBP := FNextGBP;
    FCurrentIndexEnd := FNextIndexEnd;

    { length of the run of regional indicators ending at the current code point }
    if FCurrentGBP <> gpbRegional_Indicator then
      FRI_Sequence_Length := 0
    else
      Inc(FRI_Sequence_Length);

    { are we at the end of an (E_Base | E_Base_GAZ) Extend* sequence? }
    if FCurrentGBP in [gbpE_Base, gbpE_Base_GAZ] then
      FE_Base_EBG_Extend_Sequence := True
    else
      FE_Base_EBG_Extend_Sequence := FE_Base_EBG_Extend_Sequence and (FCurrentGBP = gbpExtend);

    FetchNextChar;
    { always break at the end of the text }
    if FNextIndexEnd > Length(FStr) then
      Break;

    if (FCurrentGBP = gbpCR) and (FNextGBP = gbpLF) then
      { Do not break between a CR and LF. }
      KeepTogether := True
    else if (FCurrentGBP in [gbpControl, gbpCR, gbpLF]) or (FNextGBP in [gbpControl, gbpCR, gbpLF]) then
      { Otherwise, break before and after controls. }
      KeepTogether := False
    else
      KeepTogether :=
        { Do not break Hangul syllable sequences. }
        ((FCurrentGBP = gbpL) and (FNextGBP in [gbpL, gbpV, gbpLV, gbpLVT])) or
        ((FCurrentGBP in [gbpLV, gbpV]) and (FNextGBP in [gbpV, gbpT])) or
        ((FCurrentGBP in [gbpLVT, gbpT]) and (FNextGBP = gbpT)) or
        { Do not break before extending characters or ZWJ. }
        (FNextGBP in [gbpExtend, gbpZWJ]) or
        { Only for extended grapheme clusters:
          do not break before SpacingMarks, or after Prepend characters. }
        (FCurrentGBP = gbpPrepend) or (FNextGBP = gbpSpacingMark) or
        { Do not break within emoji modifier sequences or emoji zwj sequences. }
        ((FCurrentGBP = gbpZWJ) and (FNextGBP in [gbpGlue_After_Zwj, gbpE_Base_GAZ])) or
        (FE_Base_EBG_Extend_Sequence and (FNextGBP = gbpE_Modifier)) or
        { Do not break within emoji flag sequences: no break between two
          regional indicators when an odd number of them precedes the
          break point. }
        ((FCurrentGBP = gpbRegional_Indicator) and (FNextGBP = gpbRegional_Indicator) and Odd(FRI_Sequence_Length));

    { Otherwise, break everywhere. }
    if not KeepTogether then
      Break;
  end;
end;
end.