fpc/utils/unicode/unicodeset.pas
Jonas Maebe f0753d85ad * fix the Upgrade to Unicode version 7 and CLDR 27 from r32814
(mantis , fixes tuca* tests)

git-svn-id: trunk@33708 -
2016-05-19 15:44:22 +00:00

498 lines
12 KiB
ObjectPascal

{ UnicodeSet implementation.
Copyright (c) 2013-2015 by Inoussa OUEDRAOGO
The source code is distributed under the Library GNU
General Public License with the following modification:
- object files and libraries linked into an application may be
distributed without source code.
If you didn't receive a copy of the file COPYING, contact:
Free Software Foundation
675 Mass Ave
Cambridge, MA 02139
USA
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
}
unit unicodeset;
{$mode delphi}{$H+}
{$scopedenums on}
interface
uses
SysUtils,
grbtree, helper;
type
EUnicodeSetException = class(Exception)
end;
TUnicodeSet = class;
{ TPatternParser }
TPatternParser = class
private
FBufferStr : UnicodeString;
FBuffer : PUnicodeChar;
FBufferLength : Integer;
FSet : TUnicodeSet;
FPosition : Integer;
FSpecialChar: Boolean;
private
procedure Error(const AMsg : string; const AArgs : array of const);overload;inline;
procedure Error(const AMsg : string);overload;inline;
procedure SetBuffer(const APattern : PUnicodeChar; const ALength : Integer);
procedure CheckEOF();inline;overload;
procedure CheckEOF(ALength : Integer);overload;inline;
procedure UnexpectedEOF();inline;
function IsThis(AItem : UnicodeString; const APosition : Integer) : Boolean;overload;
function IsThis(AItem : UnicodeString) : Boolean;overload;inline;
procedure Expect(AItem : UnicodeString; const APosition : Integer);overload;inline;
procedure Expect(AItem : UnicodeString);overload;inline;
procedure SkipSpaces();inline;
function NextChar() : TUnicodeCodePoint;
procedure ParseItem();
procedure DoParse();
property SpecialChar : Boolean read FSpecialChar;
public
procedure Parse(const APattern : PUnicodeChar; const ALength : Integer);overload;
procedure Parse(const APattern : UnicodeString);overload;inline;
property CurrentSet : TUnicodeSet read FSet write FSet;
end;
TUnicodeCodePointArrayComparator = class
public
// Return
// * if A>B then 1
// * if A=B then 0
// * if A<B then -1
class function Compare(const A, B : TUnicodeCodePointArray) : Integer;static;inline;
end;
{ TUnicodeSet }
TUnicodeSet = class
private type
TItem = TUnicodeCodePointArray;
TTree = TRBTree<TItem,TUnicodeCodePointArrayComparator>;
public type
TIterator = TTree.TIterator;
private
FTree : TTree;
FParser : TPatternParser;
private
procedure CreateParser();inline;
function InternalContains(const AString : UnicodeString) : Boolean;overload;
public
constructor Create();
destructor Destroy;override;
procedure Add(AChar : TUnicodeCodePoint);inline;overload;
procedure Add(AString : TUnicodeCodePointArray);inline;overload;
procedure AddRange(const AStart, AEnd : TUnicodeCodePoint);inline;
procedure AddPattern(const APattern : UnicodeString);inline;overload;
procedure AddPattern(const APattern : RawByteString);inline;overload;
function CreateIterator() : TIterator;
function Contains(const AString : array of TUnicodeCodePoint) : Boolean;overload;
function Contains(const AChar : TUnicodeCodePoint) : Boolean;inline;overload;
function Contains(const AChar : UnicodeChar) : Boolean;inline;overload;
function Contains(const AChar : AnsiChar) : Boolean;inline;overload;
function Contains(const AString : UnicodeString) : Boolean;overload;
function Contains(const AString : RawByteString) : Boolean;overload;
end;
resourcestring
SInvalidLength = 'Invalid length value : "%d".';
SInvalidPosition = 'Invalid position : "%d".';
SInvalidRangeLimits = 'Invalid range limits : ["%x" , "%x"].';
SExpectedBut = 'Expects "%s" but got "%s..." .';
SUnexpectedEOF = 'Unexpected end of file.';
implementation
uses
unicodedata;
function ToArray(const AItem : TUnicodeCodePoint) : TUnicodeCodePointArray;inline;
begin
SetLength(Result,1);
Result[Low(Result)] := AItem;
end;
function CompareItem(const Item1, Item2 : TUnicodeCodePointArray): Integer;
var
a, b : ^TUnicodeCodePoint;
i, ha, hb : Integer;
begin
if (Pointer(Item1) = Pointer(Item2)) then
exit(0);
if (Item1 = nil) then
exit(-1);
if (Item2 = nil) then
exit(1);
a := @Item1[0];
b := @Item2[0];
Result := 1;
ha := Length(Item1) - 1;
hb := Length(Item2) - 1;
for i := 0 to ha do begin
if (i > hb) then
exit;
if (a^ < b^) then
exit(-1);
if (a^ > b^) then
exit(1);
Inc(a);
Inc(b);
end;
if (ha = hb) then
exit(0);
exit(-1);
end;
{ TUnicodeCodePointArrayComparator }
class function TUnicodeCodePointArrayComparator.Compare(const A, B : TUnicodeCodePointArray): Integer;
begin
Result := CompareItem(A,B);
end;
{ TPatternParser }
procedure TPatternParser.Error(const AMsg: string; const AArgs: array of const);
begin
raise EUnicodeSetException.CreateFmt(AMsg,AArgs);
end;
procedure TPatternParser.Error(const AMsg: string);
begin
raise EUnicodeSetException.Create(AMsg);
end;
procedure TPatternParser.SetBuffer(
const APattern : PUnicodeChar;
const ALength : Integer
);
begin
FPosition := 0;
if (ALength <= 1) then begin
FBufferStr := '';
FBuffer := nil;
FBufferLength := 0;
exit;
end;
FBufferLength := ALength;
SetLength(FBufferStr,FBufferLength);
FBuffer := @FBufferStr[1];
Move(APattern^,FBuffer^,(FBufferLength*SizeOf(FBuffer^)));
end;
procedure TPatternParser.CheckEOF();
begin
CheckEOF(0);
end;
procedure TPatternParser.CheckEOF(ALength : Integer);
begin
if (ALength < 0) then
Error(SInvalidLength,[ALength]);
if ((FPosition+ALength) >= FBufferLength) then
UnexpectedEOF();
end;
procedure TPatternParser.UnexpectedEOF();
begin
Error(SUnexpectedEOF);
end;
function TPatternParser.IsThis(AItem: UnicodeString; const APosition: Integer): Boolean;
var
i, k, c : Integer;
begin
if (APosition < 0) then
Error(SInvalidPosition,[APosition]);
Result := False;
c := Length(AItem);
if (c = 0) then
exit;
i := APosition;
k := i + c;
if (k >= FBufferLength) then
exit;
if CompareMem(@AItem[1], @FBuffer[APosition],c) then
Result := True;
end;
function TPatternParser.IsThis(AItem : UnicodeString) : Boolean;
begin
Result := IsThis(AItem,FPosition);
end;
procedure TPatternParser.Expect(AItem: UnicodeString; const APosition: Integer);
begin
if not IsThis(AItem,APosition) then
Error(SExpectedBut,[AItem,Copy(FBuffer,APosition,Length(AItem))]);
end;
procedure TPatternParser.Expect(AItem: UnicodeString);
begin
Expect(AItem,FPosition);
end;
procedure TPatternParser.SkipSpaces();
begin
while (FPosition < FBufferLength) do begin
if (FBuffer[FPosition] <> ' ') then
Break;
Inc(FPosition);
end;
end;
function TPatternParser.NextChar(): TUnicodeCodePoint;
var
i : Integer;
c : UnicodeChar;
cp : TUnicodeCodePoint;
s : UnicodeString;
begin
SkipSpaces();
CheckEOF();
c := FBuffer[FPosition];
cp := Ord(c);
Inc(FPosition);
if (c = '\') and (FPosition < FBufferLength) then begin
if IsThis('\') then begin
Inc(FPosition);
CheckEOF();
cp := Ord(FBuffer[FPosition]);
Inc(FPosition);
end else if IsThis('u') then begin
Inc(FPosition);
CheckEOF(4);
s := Copy(FBufferStr,(FPosition+1),4);
Inc(FPosition,4);
if not TryStrToInt(string('$'+s),i) then
Error(SExpectedBut,['\uXXXX',s]);
cp := i;
end;
end;
if (cp <= MAX_WORD) and UnicodeIsLowSurrogate(UnicodeChar(Word(cp))) then begin
SkipSpaces();
CheckEOF();
c := UnicodeChar(Word(cp));
if UnicodeIsSurrogatePair(c,FBuffer[FPosition]) then begin
cp := ToUCS4(c,FBuffer[FPosition]);
Inc(FPosition);
end;
end;
FSpecialChar := (cp = Ord('{')) or (cp = Ord('}'));
Result := cp;
end;
function CompareTo(const A : TUnicodeCodePoint; const B : UnicodeChar) : Boolean;inline;
begin
Result := (A = Ord(B));
end;
procedure TPatternParser.ParseItem();
var
cp, lastCp : TUnicodeCodePoint;
charCount, k : Integer;
cpa : TUnicodeCodePointArray;
begin
SkipSpaces();
Expect('[');
charCount := 0;
Inc(FPosition);
cp:=0;
while (FPosition < FBufferLength) do begin
lastCp := cp;
cp := NextChar();
if CompareTo(cp,']') then
Break;
if SpecialChar and (cp = Ord('{')) then begin
SetLength(cpa,12);
k := 0;
while True do begin
cp := NextChar();
if SpecialChar and (cp = Ord('}')) then
break;
if (k >= Length(cpa)) then
SetLength(cpa,(2*k));
cpa[k] := cp;
k := k+1;
end;
if (k > 0) then begin
SetLength(cpa,k);
FSet.Add(cpa);
end;
end else begin
if CompareTo(cp,'-') then begin
if (charCount = 0) then
Error(SExpectedBut,['<char>','-']);
cp := NextChar();
FSet.AddRange(lastCp,cp);
end else begin
FSet.Add(cp);
end;
end;
Inc(charCount);
end;
end;
procedure TPatternParser.DoParse();
begin
SkipSpaces();
while (FPosition < FBufferLength) do begin
ParseItem();
SkipSpaces();
end;
end;
procedure TPatternParser.Parse(const APattern: PUnicodeChar; const ALength: Integer);
begin
if (ALength < 2) then
exit;
SetBuffer(APattern,ALength);
DoParse();
end;
procedure TPatternParser.Parse(const APattern : UnicodeString);
begin
Parse(@APattern[1],Length(APattern));
end;
{ TUnicodeSet }
procedure TUnicodeSet.CreateParser();
begin
if (FParser = nil) then begin
FParser := TPatternParser.Create();
FParser.CurrentSet := Self;
end;
end;
function TUnicodeSet.InternalContains(const AString: UnicodeString): Boolean;
var
u4 : UCS4String;
c, i : Integer;
cpa : TUnicodeCodePointArray;
begin
u4 := UnicodeStringToUCS4String(AString);
c := Length(u4)-1;
if (c = 1) then
exit(Contains(u4[0]));
SetLength(cpa,c);
for i := 0 to c-1 do
cpa[i] := u4[i];
Result := Contains(cpa);
end;
constructor TUnicodeSet.Create;
begin
FTree := TTree.Create();
end;
destructor TUnicodeSet.Destroy;
begin
FParser.Free();
FTree.Free();
inherited Destroy;
end;
procedure TUnicodeSet.Add(AChar: TUnicodeCodePoint);
begin
FTree.Insert(ToArray(AChar));
end;
procedure TUnicodeSet.Add(AString: TUnicodeCodePointArray);
begin
if (AString <> nil) then
FTree.Insert(AString);
end;
procedure TUnicodeSet.AddRange(const AStart, AEnd : TUnicodeCodePoint);
var
i : Integer;
begin
if (AStart > AEnd) then
raise EUnicodeSetException.CreateFmt(SInvalidRangeLimits,[AStart,AEnd]);
for i := AStart to AEnd do
Add(i);
end;
procedure TUnicodeSet.AddPattern(const APattern : UnicodeString);
begin
CreateParser();
FParser.Parse(APattern);
end;
procedure TUnicodeSet.AddPattern(const APattern: RawByteString);
var
us : UnicodeString;
begin
us := UnicodeString(APattern);
AddPattern(us);
end;
function TUnicodeSet.CreateIterator() : TIterator;
begin
Result := FTree.CreateForwardIterator();
end;
function TUnicodeSet.Contains(const AString : array of TUnicodeCodePoint) : Boolean;
var
c : Integer;
x : TUnicodeCodePointArray;
begin
Result := False;
c := Length(AString);
if (c = 0) then
exit;
SetLength(x,c);
Move(AString[Low(AString)],x[Low(x)],(c*SizeOf(x[0])));
if (FTree.FindNode(x) <> nil) then
Result := True;
end;
function TUnicodeSet.Contains(const AChar : TUnicodeCodePoint) : Boolean;
begin
Result := Contains([AChar]);
end;
function TUnicodeSet.Contains(const AChar : UnicodeChar) : Boolean;
begin
Result := Contains(TUnicodeCodePoint(Ord(AChar)));
end;
function TUnicodeSet.Contains(const AChar : AnsiChar) : Boolean;
begin
Result := Contains(TUnicodeCodePoint(Ord(AChar)));
end;
function TUnicodeSet.Contains(const AString: UnicodeString): Boolean;
begin
if (AString = '') then
exit(Contains([]));
if (Length(AString) = 1) then
exit(Contains(AString[1]));
Result := InternalContains(AString);
end;
function TUnicodeSet.Contains(const AString: RawByteString): Boolean;
var
us : UnicodeString;
begin
us := UnicodeString(AString);
Result := Contains(us);
end;
end.