mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-12 02:48:24 +02:00
498 lines
12 KiB
ObjectPascal
498 lines
12 KiB
ObjectPascal
{ UnicodeSet implementation.
|
|
|
|
Copyright (c) 2013-2015 by Inoussa OUEDRAOGO
|
|
|
|
The source code is distributed under the Library GNU
|
|
General Public License with the following modification:
|
|
|
|
- object files and libraries linked into an application may be
|
|
distributed without source code.
|
|
|
|
If you didn't receive a copy of the file COPYING, contact:
|
|
Free Software Foundation
|
|
675 Mass Ave
|
|
Cambridge, MA 02139
|
|
USA
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
}
|
|
unit unicodeset;
|
|
|
|
{$mode delphi}{$H+}
|
|
{$scopedenums on}
|
|
|
|
interface
|
|
|
|
uses
|
|
SysUtils,
|
|
grbtree, helper;
|
|
|
|
type
|
|
|
|
EUnicodeSetException = class(Exception)
|
|
end;
|
|
|
|
TUnicodeSet = class;
|
|
|
|
{ TPatternParser }
|
|
|
|
TPatternParser = class
|
|
private
|
|
FBufferStr : UnicodeString;
|
|
FBuffer : PUnicodeChar;
|
|
FBufferLength : Integer;
|
|
FSet : TUnicodeSet;
|
|
FPosition : Integer;
|
|
FSpecialChar: Boolean;
|
|
private
|
|
procedure Error(const AMsg : string; const AArgs : array of const);overload;inline;
|
|
procedure Error(const AMsg : string);overload;inline;
|
|
procedure SetBuffer(const APattern : PUnicodeChar; const ALength : Integer);
|
|
procedure CheckEOF();inline;overload;
|
|
procedure CheckEOF(ALength : Integer);overload;inline;
|
|
procedure UnexpectedEOF();inline;
|
|
function IsThis(AItem : UnicodeString; const APosition : Integer) : Boolean;overload;
|
|
function IsThis(AItem : UnicodeString) : Boolean;overload;inline;
|
|
procedure Expect(AItem : UnicodeString; const APosition : Integer);overload;inline;
|
|
procedure Expect(AItem : UnicodeString);overload;inline;
|
|
procedure SkipSpaces();inline;
|
|
function NextChar() : TUnicodeCodePoint;
|
|
procedure ParseItem();
|
|
procedure DoParse();
|
|
property SpecialChar : Boolean read FSpecialChar;
|
|
public
|
|
procedure Parse(const APattern : PUnicodeChar; const ALength : Integer);overload;
|
|
procedure Parse(const APattern : UnicodeString);overload;inline;
|
|
property CurrentSet : TUnicodeSet read FSet write FSet;
|
|
end;
|
|
|
|
TUnicodeCodePointArrayComparator = class
|
|
public
|
|
// Return
|
|
// * if A>B then 1
|
|
// * if A=B then 0
|
|
// * if A<B then -1
|
|
class function Compare(const A, B : TUnicodeCodePointArray) : Integer;static;inline;
|
|
end;
|
|
|
|
{ TUnicodeSet }
|
|
|
|
TUnicodeSet = class
|
|
private type
|
|
TItem = TUnicodeCodePointArray;
|
|
TTree = TRBTree<TItem,TUnicodeCodePointArrayComparator>;
|
|
public type
|
|
TIterator = TTree.TIterator;
|
|
private
|
|
FTree : TTree;
|
|
FParser : TPatternParser;
|
|
private
|
|
procedure CreateParser();inline;
|
|
function InternalContains(const AString : UnicodeString) : Boolean;overload;
|
|
public
|
|
constructor Create();
|
|
destructor Destroy;override;
|
|
procedure Add(AChar : TUnicodeCodePoint);inline;overload;
|
|
procedure Add(AString : TUnicodeCodePointArray);inline;overload;
|
|
procedure AddRange(const AStart, AEnd : TUnicodeCodePoint);inline;
|
|
procedure AddPattern(const APattern : UnicodeString);inline;overload;
|
|
procedure AddPattern(const APattern : RawByteString);inline;overload;
|
|
function CreateIterator() : TIterator;
|
|
function Contains(const AString : array of TUnicodeCodePoint) : Boolean;overload;
|
|
function Contains(const AChar : TUnicodeCodePoint) : Boolean;inline;overload;
|
|
function Contains(const AChar : UnicodeChar) : Boolean;inline;overload;
|
|
function Contains(const AChar : AnsiChar) : Boolean;inline;overload;
|
|
function Contains(const AString : UnicodeString) : Boolean;overload;
|
|
function Contains(const AString : RawByteString) : Boolean;overload;
|
|
end;
|
|
|
|
resourcestring
|
|
SInvalidLength = 'Invalid length value : "%d".';
|
|
SInvalidPosition = 'Invalid position : "%d".';
|
|
SInvalidRangeLimits = 'Invalid range limits : ["%x" , "%x"].';
|
|
SExpectedBut = 'Expects "%s" but got "%s..." .';
|
|
SUnexpectedEOF = 'Unexpected end of file.';
|
|
|
|
implementation
|
|
uses
|
|
unicodedata;
|
|
|
|
function ToArray(const AItem : TUnicodeCodePoint) : TUnicodeCodePointArray;inline;
|
|
begin
|
|
SetLength(Result,1);
|
|
Result[Low(Result)] := AItem;
|
|
end;
|
|
|
|
function CompareItem(const Item1, Item2 : TUnicodeCodePointArray): Integer;
|
|
var
|
|
a, b : ^TUnicodeCodePoint;
|
|
i, ha, hb : Integer;
|
|
begin
|
|
if (Pointer(Item1) = Pointer(Item2)) then
|
|
exit(0);
|
|
if (Item1 = nil) then
|
|
exit(-1);
|
|
if (Item2 = nil) then
|
|
exit(1);
|
|
a := @Item1[0];
|
|
b := @Item2[0];
|
|
Result := 1;
|
|
ha := Length(Item1) - 1;
|
|
hb := Length(Item2) - 1;
|
|
for i := 0 to ha do begin
|
|
if (i > hb) then
|
|
exit;
|
|
if (a^ < b^) then
|
|
exit(-1);
|
|
if (a^ > b^) then
|
|
exit(1);
|
|
Inc(a);
|
|
Inc(b);
|
|
end;
|
|
if (ha = hb) then
|
|
exit(0);
|
|
exit(-1);
|
|
end;
|
|
|
|
{ TUnicodeCodePointArrayComparator }
|
|
|
|
class function TUnicodeCodePointArrayComparator.Compare(const A, B : TUnicodeCodePointArray): Integer;
|
|
begin
|
|
Result := CompareItem(A,B);
|
|
end;
|
|
|
|
{ TPatternParser }
|
|
|
|
procedure TPatternParser.Error(const AMsg: string; const AArgs: array of const);
|
|
begin
|
|
raise EUnicodeSetException.CreateFmt(AMsg,AArgs);
|
|
end;
|
|
|
|
procedure TPatternParser.Error(const AMsg: string);
|
|
begin
|
|
raise EUnicodeSetException.Create(AMsg);
|
|
end;
|
|
|
|
procedure TPatternParser.SetBuffer(
|
|
const APattern : PUnicodeChar;
|
|
const ALength : Integer
|
|
);
|
|
begin
|
|
FPosition := 0;
|
|
if (ALength <= 1) then begin
|
|
FBufferStr := '';
|
|
FBuffer := nil;
|
|
FBufferLength := 0;
|
|
exit;
|
|
end;
|
|
FBufferLength := ALength;
|
|
SetLength(FBufferStr,FBufferLength);
|
|
FBuffer := @FBufferStr[1];
|
|
Move(APattern^,FBuffer^,(FBufferLength*SizeOf(FBuffer^)));
|
|
end;
|
|
|
|
procedure TPatternParser.CheckEOF();
|
|
begin
|
|
CheckEOF(0);
|
|
end;
|
|
|
|
procedure TPatternParser.CheckEOF(ALength : Integer);
|
|
begin
|
|
if (ALength < 0) then
|
|
Error(SInvalidLength,[ALength]);
|
|
if ((FPosition+ALength) >= FBufferLength) then
|
|
UnexpectedEOF();
|
|
end;
|
|
|
|
procedure TPatternParser.UnexpectedEOF();
|
|
begin
|
|
Error(SUnexpectedEOF);
|
|
end;
|
|
|
|
function TPatternParser.IsThis(AItem: UnicodeString; const APosition: Integer): Boolean;
|
|
var
|
|
i, k, c : Integer;
|
|
begin
|
|
if (APosition < 0) then
|
|
Error(SInvalidPosition,[APosition]);
|
|
Result := False;
|
|
c := Length(AItem);
|
|
if (c = 0) then
|
|
exit;
|
|
i := APosition;
|
|
k := i + c;
|
|
if (k >= FBufferLength) then
|
|
exit;
|
|
if CompareMem(@AItem[1], @FBuffer[APosition],c) then
|
|
Result := True;
|
|
end;
|
|
|
|
function TPatternParser.IsThis(AItem : UnicodeString) : Boolean;
|
|
begin
|
|
Result := IsThis(AItem,FPosition);
|
|
end;
|
|
|
|
procedure TPatternParser.Expect(AItem: UnicodeString; const APosition: Integer);
|
|
begin
|
|
if not IsThis(AItem,APosition) then
|
|
Error(SExpectedBut,[AItem,Copy(FBuffer,APosition,Length(AItem))]);
|
|
end;
|
|
|
|
procedure TPatternParser.Expect(AItem: UnicodeString);
|
|
begin
|
|
Expect(AItem,FPosition);
|
|
end;
|
|
|
|
procedure TPatternParser.SkipSpaces();
|
|
begin
|
|
while (FPosition < FBufferLength) do begin
|
|
if (FBuffer[FPosition] <> ' ') then
|
|
Break;
|
|
Inc(FPosition);
|
|
end;
|
|
end;
|
|
|
|
function TPatternParser.NextChar(): TUnicodeCodePoint;
|
|
var
|
|
i : Integer;
|
|
c : UnicodeChar;
|
|
cp : TUnicodeCodePoint;
|
|
s : UnicodeString;
|
|
begin
|
|
SkipSpaces();
|
|
CheckEOF();
|
|
c := FBuffer[FPosition];
|
|
cp := Ord(c);
|
|
Inc(FPosition);
|
|
if (c = '\') and (FPosition < FBufferLength) then begin
|
|
if IsThis('\') then begin
|
|
Inc(FPosition);
|
|
CheckEOF();
|
|
cp := Ord(FBuffer[FPosition]);
|
|
Inc(FPosition);
|
|
end else if IsThis('u') then begin
|
|
Inc(FPosition);
|
|
CheckEOF(4);
|
|
s := Copy(FBufferStr,(FPosition+1),4);
|
|
Inc(FPosition,4);
|
|
if not TryStrToInt(string('$'+s),i) then
|
|
Error(SExpectedBut,['\uXXXX',s]);
|
|
cp := i;
|
|
end;
|
|
end;
|
|
if (cp <= MAX_WORD) and UnicodeIsLowSurrogate(UnicodeChar(Word(cp))) then begin
|
|
SkipSpaces();
|
|
CheckEOF();
|
|
c := UnicodeChar(Word(cp));
|
|
if UnicodeIsSurrogatePair(c,FBuffer[FPosition]) then begin
|
|
cp := ToUCS4(c,FBuffer[FPosition]);
|
|
Inc(FPosition);
|
|
end;
|
|
end;
|
|
FSpecialChar := (cp = Ord('{')) or (cp = Ord('}'));
|
|
Result := cp;
|
|
end;
|
|
|
|
function CompareTo(const A : TUnicodeCodePoint; const B : UnicodeChar) : Boolean;inline;
|
|
begin
|
|
Result := (A = Ord(B));
|
|
end;
|
|
|
|
procedure TPatternParser.ParseItem();
|
|
var
|
|
cp, lastCp : TUnicodeCodePoint;
|
|
charCount, k : Integer;
|
|
cpa : TUnicodeCodePointArray;
|
|
begin
|
|
SkipSpaces();
|
|
Expect('[');
|
|
charCount := 0;
|
|
Inc(FPosition);
|
|
cp:=0;
|
|
while (FPosition < FBufferLength) do begin
|
|
lastCp := cp;
|
|
cp := NextChar();
|
|
if CompareTo(cp,']') then
|
|
Break;
|
|
if SpecialChar and (cp = Ord('{')) then begin
|
|
SetLength(cpa,12);
|
|
k := 0;
|
|
while True do begin
|
|
cp := NextChar();
|
|
if SpecialChar and (cp = Ord('}')) then
|
|
break;
|
|
if (k >= Length(cpa)) then
|
|
SetLength(cpa,(2*k));
|
|
cpa[k] := cp;
|
|
k := k+1;
|
|
end;
|
|
if (k > 0) then begin
|
|
SetLength(cpa,k);
|
|
FSet.Add(cpa);
|
|
end;
|
|
end else begin
|
|
if CompareTo(cp,'-') then begin
|
|
if (charCount = 0) then
|
|
Error(SExpectedBut,['<char>','-']);
|
|
cp := NextChar();
|
|
FSet.AddRange(lastCp,cp);
|
|
end else begin
|
|
FSet.Add(cp);
|
|
end;
|
|
end;
|
|
Inc(charCount);
|
|
end;
|
|
end;
|
|
|
|
procedure TPatternParser.DoParse();
|
|
begin
|
|
SkipSpaces();
|
|
while (FPosition < FBufferLength) do begin
|
|
ParseItem();
|
|
SkipSpaces();
|
|
end;
|
|
end;
|
|
|
|
procedure TPatternParser.Parse(const APattern: PUnicodeChar; const ALength: Integer);
|
|
begin
|
|
if (ALength < 2) then
|
|
exit;
|
|
SetBuffer(APattern,ALength);
|
|
DoParse();
|
|
end;
|
|
|
|
procedure TPatternParser.Parse(const APattern : UnicodeString);
|
|
begin
|
|
Parse(@APattern[1],Length(APattern));
|
|
end;
|
|
|
|
{ TUnicodeSet }
|
|
|
|
procedure TUnicodeSet.CreateParser();
|
|
begin
|
|
if (FParser = nil) then begin
|
|
FParser := TPatternParser.Create();
|
|
FParser.CurrentSet := Self;
|
|
end;
|
|
end;
|
|
|
|
function TUnicodeSet.InternalContains(const AString: UnicodeString): Boolean;
|
|
var
|
|
u4 : UCS4String;
|
|
c, i : Integer;
|
|
cpa : TUnicodeCodePointArray;
|
|
begin
|
|
u4 := UnicodeStringToUCS4String(AString);
|
|
c := Length(u4)-1;
|
|
if (c = 1) then
|
|
exit(Contains(u4[0]));
|
|
SetLength(cpa,c);
|
|
for i := 0 to c-1 do
|
|
cpa[i] := u4[i];
|
|
Result := Contains(cpa);
|
|
end;
|
|
|
|
constructor TUnicodeSet.Create;
|
|
begin
|
|
FTree := TTree.Create();
|
|
end;
|
|
|
|
destructor TUnicodeSet.Destroy;
|
|
begin
|
|
FParser.Free();
|
|
FTree.Free();
|
|
inherited Destroy;
|
|
end;
|
|
|
|
procedure TUnicodeSet.Add(AChar: TUnicodeCodePoint);
|
|
begin
|
|
FTree.Insert(ToArray(AChar));
|
|
end;
|
|
|
|
procedure TUnicodeSet.Add(AString: TUnicodeCodePointArray);
|
|
begin
|
|
if (AString <> nil) then
|
|
FTree.Insert(AString);
|
|
end;
|
|
|
|
procedure TUnicodeSet.AddRange(const AStart, AEnd : TUnicodeCodePoint);
|
|
var
|
|
i : Integer;
|
|
begin
|
|
if (AStart > AEnd) then
|
|
raise EUnicodeSetException.CreateFmt(SInvalidRangeLimits,[AStart,AEnd]);
|
|
for i := AStart to AEnd do
|
|
Add(i);
|
|
end;
|
|
|
|
procedure TUnicodeSet.AddPattern(const APattern : UnicodeString);
|
|
begin
|
|
CreateParser();
|
|
FParser.Parse(APattern);
|
|
end;
|
|
|
|
procedure TUnicodeSet.AddPattern(const APattern: RawByteString);
|
|
var
|
|
us : UnicodeString;
|
|
begin
|
|
us := UnicodeString(APattern);
|
|
AddPattern(us);
|
|
end;
|
|
|
|
function TUnicodeSet.CreateIterator() : TIterator;
|
|
begin
|
|
Result := FTree.CreateForwardIterator();
|
|
end;
|
|
|
|
function TUnicodeSet.Contains(const AString : array of TUnicodeCodePoint) : Boolean;
|
|
var
|
|
c : Integer;
|
|
x : TUnicodeCodePointArray;
|
|
begin
|
|
Result := False;
|
|
c := Length(AString);
|
|
if (c = 0) then
|
|
exit;
|
|
SetLength(x,c);
|
|
Move(AString[Low(AString)],x[Low(x)],(c*SizeOf(x[0])));
|
|
if (FTree.FindNode(x) <> nil) then
|
|
Result := True;
|
|
end;
|
|
|
|
function TUnicodeSet.Contains(const AChar : TUnicodeCodePoint) : Boolean;
|
|
begin
|
|
Result := Contains([AChar]);
|
|
end;
|
|
|
|
function TUnicodeSet.Contains(const AChar : UnicodeChar) : Boolean;
|
|
begin
|
|
Result := Contains(TUnicodeCodePoint(Ord(AChar)));
|
|
end;
|
|
|
|
function TUnicodeSet.Contains(const AChar : AnsiChar) : Boolean;
|
|
begin
|
|
Result := Contains(TUnicodeCodePoint(Ord(AChar)));
|
|
end;
|
|
|
|
function TUnicodeSet.Contains(const AString: UnicodeString): Boolean;
|
|
begin
|
|
if (AString = '') then
|
|
exit(Contains([]));
|
|
if (Length(AString) = 1) then
|
|
exit(Contains(AString[1]));
|
|
Result := InternalContains(AString);
|
|
end;
|
|
|
|
function TUnicodeSet.Contains(const AString: RawByteString): Boolean;
|
|
var
|
|
us : UnicodeString;
|
|
begin
|
|
us := UnicodeString(AString);
|
|
Result := Contains(us);
|
|
end;
|
|
|
|
end.
|
|
|