mirror of
				https://gitlab.com/freepascal.org/fpc/source.git
				synced 2025-10-31 04:31:35 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			498 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			ObjectPascal
		
	
	
	
	
	
			
		
		
	
	
			498 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			ObjectPascal
		
	
	
	
	
	
| {   UnicodeSet implementation.
 | |
| 
 | |
|     Copyright (c) 2013-2015 by Inoussa OUEDRAOGO
 | |
| 
 | |
|     The source code is distributed under the Library GNU
 | |
|     General Public License with the following modification:
 | |
| 
 | |
|         - object files and libraries linked into an application may be
 | |
|           distributed without source code.
 | |
| 
 | |
|     If you didn't receive a copy of the file COPYING, contact:
 | |
|           Free Software Foundation
 | |
|           675 Mass Ave
 | |
|           Cambridge, MA  02139
 | |
|           USA
 | |
| 
 | |
|     This program is distributed in the hope that it will be useful,
 | |
|     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 | |
| }
 | |
| unit unicodeset;
 | |
| 
 | |
| {$mode delphi}{$H+}
 | |
| {$scopedenums on}
 | |
| 
 | |
| interface
 | |
| 
 | |
| uses
 | |
|   SysUtils,
 | |
|   grbtree, helper;
 | |
| 
 | |
| type
 | |
| 
 | |
|   EUnicodeSetException = class(Exception)
 | |
|   end;
 | |
| 
 | |
|   TUnicodeSet = class;
 | |
| 
 | |
|   { TPatternParser }
 | |
| 
 | |
|   TPatternParser = class
 | |
|   private
 | |
|     FBufferStr : UnicodeString;
 | |
|     FBuffer : PUnicodeChar;
 | |
|     FBufferLength : Integer;
 | |
|     FSet : TUnicodeSet;
 | |
|     FPosition : Integer;
 | |
|     FSpecialChar: Boolean;
 | |
|   private
 | |
|     procedure Error(const AMsg : string; const AArgs : array of const);overload;inline;
 | |
|     procedure Error(const AMsg : string);overload;inline;
 | |
|     procedure SetBuffer(const APattern : PUnicodeChar; const ALength : Integer);
 | |
|     procedure CheckEOF();inline;overload;
 | |
|     procedure CheckEOF(ALength : Integer);overload;inline;
 | |
|     procedure UnexpectedEOF();inline;
 | |
|     function IsThis(AItem : UnicodeString; const APosition : Integer) : Boolean;overload;
 | |
|     function IsThis(AItem : UnicodeString) : Boolean;overload;inline;
 | |
|     procedure Expect(AItem : UnicodeString; const APosition : Integer);overload;inline;
 | |
|     procedure Expect(AItem : UnicodeString);overload;inline;
 | |
|     procedure SkipSpaces();inline;
 | |
|     function NextChar() : TUnicodeCodePoint;
 | |
|     procedure ParseItem();
 | |
|     procedure DoParse();
 | |
|     property SpecialChar : Boolean read FSpecialChar;
 | |
|   public
 | |
|     procedure Parse(const APattern : PUnicodeChar; const ALength : Integer);overload;
 | |
|     procedure Parse(const APattern : UnicodeString);overload;inline;
 | |
|     property CurrentSet : TUnicodeSet read FSet write FSet;
 | |
|   end;
 | |
| 
 | |
|   TUnicodeCodePointArrayComparator = class
 | |
|   public
 | |
|     // Return
 | |
|     //    * if A>B then  1
 | |
|     //    * if A=B then  0
 | |
|     //    * if A<B then -1
 | |
|     class function Compare(const A, B : TUnicodeCodePointArray) : Integer;static;inline;
 | |
|   end;
 | |
| 
 | |
|   { TUnicodeSet }
 | |
| 
 | |
|   TUnicodeSet = class
 | |
|   private type
 | |
|       TItem = TUnicodeCodePointArray;
 | |
|       TTree = TRBTree<TItem,TUnicodeCodePointArrayComparator>;
 | |
|   public type
 | |
|     TIterator = TTree.TIterator;
 | |
|   private
 | |
|     FTree : TTree;
 | |
|     FParser : TPatternParser;
 | |
|   private
 | |
|     procedure CreateParser();inline;
 | |
|     function InternalContains(const AString : UnicodeString) : Boolean;overload;
 | |
|   public
 | |
|     constructor Create();
 | |
|     destructor Destroy;override;
 | |
|     procedure Add(AChar : TUnicodeCodePoint);inline;overload;
 | |
|     procedure Add(AString : TUnicodeCodePointArray);inline;overload;
 | |
|     procedure AddRange(const AStart, AEnd : TUnicodeCodePoint);inline;
 | |
|     procedure AddPattern(const APattern : UnicodeString);inline;overload;
 | |
|     procedure AddPattern(const APattern : RawByteString);inline;overload;
 | |
|     function CreateIterator() : TIterator;
 | |
|     function Contains(const AString : array of TUnicodeCodePoint) : Boolean;overload;
 | |
|     function Contains(const AChar : TUnicodeCodePoint) : Boolean;inline;overload;
 | |
|     function Contains(const AChar : UnicodeChar) : Boolean;inline;overload;
 | |
|     function Contains(const AChar : AnsiChar) : Boolean;inline;overload;
 | |
|     function Contains(const AString : UnicodeString) : Boolean;overload;
 | |
|     function Contains(const AString : RawByteString) : Boolean;overload;
 | |
|   end;
 | |
| 
 | |
| resourcestring
 | |
|   SInvalidLength = 'Invalid length value : "%d".';
 | |
|   SInvalidPosition = 'Invalid position : "%d".';
 | |
|   SInvalidRangeLimits = 'Invalid range limits : ["%x" , "%x"].';
 | |
|   SExpectedBut = 'Expects "%s" but got "%s..." .';
 | |
|   SUnexpectedEOF = 'Unexpected end of file.';
 | |
| 
 | |
| implementation
 | |
| uses
 | |
|   unicodedata;
 | |
| 
 | |
| function ToArray(const AItem : TUnicodeCodePoint) : TUnicodeCodePointArray;inline;
 | |
| begin
 | |
|   SetLength(Result,1);
 | |
|   Result[Low(Result)] := AItem;
 | |
| end;
 | |
| 
 | |
| function CompareItem(const Item1, Item2 : TUnicodeCodePointArray): Integer;
 | |
| var
 | |
|   a, b : ^TUnicodeCodePoint;
 | |
|   i, ha, hb : Integer;
 | |
| begin
 | |
|   if (Pointer(Item1) = Pointer(Item2)) then
 | |
|     exit(0);
 | |
|   if (Item1 = nil) then
 | |
|     exit(-1);
 | |
|   if (Item2 = nil) then
 | |
|     exit(1);
 | |
|   a := @Item1[0];
 | |
|   b := @Item2[0];
 | |
|   Result := 1;
 | |
|   ha := Length(Item1) - 1;
 | |
|   hb := Length(Item2) - 1;
 | |
|   for i := 0 to ha do begin
 | |
|     if (i > hb) then
 | |
|       exit;
 | |
|     if (a^ < b^) then
 | |
|       exit(-1);
 | |
|     if (a^ > b^) then
 | |
|       exit(1);
 | |
|     Inc(a);
 | |
|     Inc(b);
 | |
|   end;
 | |
|   if (ha = hb) then
 | |
|     exit(0);
 | |
|   exit(-1);
 | |
| end;
 | |
| 
 | |
| { TUnicodeCodePointArrayComparator }
 | |
| 
 | |
| class function TUnicodeCodePointArrayComparator.Compare(const A, B : TUnicodeCodePointArray): Integer;
 | |
| begin
 | |
|   Result := CompareItem(A,B);
 | |
| end;
 | |
| 
 | |
| { TPatternParser }
 | |
| 
 | |
| procedure TPatternParser.Error(const AMsg: string; const AArgs: array of const);
 | |
| begin
 | |
|   raise EUnicodeSetException.CreateFmt(AMsg,AArgs);
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.Error(const AMsg: string);
 | |
| begin
 | |
|   raise EUnicodeSetException.Create(AMsg);
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.SetBuffer(
 | |
|   const APattern : PUnicodeChar;
 | |
|   const ALength  : Integer
 | |
| );
 | |
| begin
 | |
|   FPosition := 0;
 | |
|   if (ALength <= 1) then begin
 | |
|     FBufferStr := '';
 | |
|     FBuffer := nil;
 | |
|     FBufferLength := 0;
 | |
|     exit;
 | |
|   end;
 | |
|   FBufferLength := ALength;
 | |
|   SetLength(FBufferStr,FBufferLength);
 | |
|   FBuffer := @FBufferStr[1];
 | |
|   Move(APattern^,FBuffer^,(FBufferLength*SizeOf(FBuffer^)));
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.CheckEOF();
 | |
| begin
 | |
|   CheckEOF(0);
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.CheckEOF(ALength : Integer);
 | |
| begin
 | |
|   if (ALength < 0) then
 | |
|     Error(SInvalidLength,[ALength]);
 | |
|   if ((FPosition+ALength) >= FBufferLength) then
 | |
|     UnexpectedEOF();
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.UnexpectedEOF();
 | |
| begin
 | |
|   Error(SUnexpectedEOF);
 | |
| end;
 | |
| 
 | |
| function TPatternParser.IsThis(AItem: UnicodeString; const APosition: Integer): Boolean;
 | |
| var
 | |
|   i, k, c : Integer;
 | |
| begin
 | |
|   if (APosition < 0) then
 | |
|     Error(SInvalidPosition,[APosition]);
 | |
|   Result := False;
 | |
|   c := Length(AItem);
 | |
|   if (c = 0) then
 | |
|     exit;
 | |
|   i := APosition;
 | |
|   k := i + c;
 | |
|   if (k >= FBufferLength) then
 | |
|     exit;
 | |
|   if CompareMem(@AItem[1], @FBuffer[APosition],c) then
 | |
|     Result := True;
 | |
| end;
 | |
| 
 | |
| function TPatternParser.IsThis(AItem : UnicodeString) : Boolean;
 | |
| begin
 | |
|   Result := IsThis(AItem,FPosition);
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.Expect(AItem: UnicodeString; const APosition: Integer);
 | |
| begin
 | |
|   if not IsThis(AItem,APosition) then
 | |
|     Error(SExpectedBut,[AItem,Copy(FBuffer,APosition,Length(AItem))]);
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.Expect(AItem: UnicodeString);
 | |
| begin
 | |
|   Expect(AItem,FPosition);
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.SkipSpaces();
 | |
| begin
 | |
|   while (FPosition < FBufferLength) do begin
 | |
|     if (FBuffer[FPosition] <> ' ') then
 | |
|       Break;
 | |
|     Inc(FPosition);
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| function TPatternParser.NextChar(): TUnicodeCodePoint;
 | |
| var
 | |
|   i : Integer;
 | |
|   c : UnicodeChar;
 | |
|   cp : TUnicodeCodePoint;
 | |
|   s : UnicodeString;
 | |
| begin
 | |
|   SkipSpaces();
 | |
|   CheckEOF();
 | |
|   c := FBuffer[FPosition];
 | |
|   cp := Ord(c);
 | |
|   Inc(FPosition);
 | |
|   if (c = '\') and (FPosition < FBufferLength) then begin
 | |
|     if IsThis('\') then begin
 | |
|       Inc(FPosition);
 | |
|       CheckEOF();
 | |
|       cp := Ord(FBuffer[FPosition]);
 | |
|       Inc(FPosition);
 | |
|     end else if IsThis('u') then begin
 | |
|       Inc(FPosition);
 | |
|       CheckEOF(4);
 | |
|       s := Copy(FBufferStr,(FPosition+1),4);
 | |
|       Inc(FPosition,4);
 | |
|       if not TryStrToInt(string('$'+s),i) then
 | |
|         Error(SExpectedBut,['\uXXXX',s]);
 | |
|       cp := i;
 | |
|     end;
 | |
|   end;
 | |
|   if (cp <= MAX_WORD) and UnicodeIsLowSurrogate(UnicodeChar(Word(cp))) then begin
 | |
|     SkipSpaces();
 | |
|     CheckEOF();
 | |
|     c := UnicodeChar(Word(cp));
 | |
|     if UnicodeIsSurrogatePair(c,FBuffer[FPosition]) then begin
 | |
|       cp := ToUCS4(c,FBuffer[FPosition]);
 | |
|       Inc(FPosition);
 | |
|     end;
 | |
|   end;
 | |
|   FSpecialChar := (cp = Ord('{')) or (cp = Ord('}'));
 | |
|   Result := cp;
 | |
| end;
 | |
| 
 | |
| function CompareTo(const A : TUnicodeCodePoint; const B : UnicodeChar) : Boolean;inline;
 | |
| begin
 | |
|   Result := (A = Ord(B));
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.ParseItem();
 | |
| var
 | |
|   cp, lastCp : TUnicodeCodePoint;
 | |
|   charCount, k : Integer;
 | |
|   cpa : TUnicodeCodePointArray;
 | |
| begin
 | |
|   SkipSpaces();
 | |
|   Expect('[');
 | |
|   charCount := 0;
 | |
|   Inc(FPosition);
 | |
|   cp:=0;
 | |
|   while (FPosition < FBufferLength) do begin
 | |
|     lastCp := cp;
 | |
|     cp := NextChar();
 | |
|     if CompareTo(cp,']') then
 | |
|       Break;
 | |
|     if SpecialChar and (cp = Ord('{')) then begin
 | |
|       SetLength(cpa,12);
 | |
|       k := 0;
 | |
|       while True do begin
 | |
|         cp := NextChar();
 | |
|         if SpecialChar and (cp = Ord('}')) then
 | |
|           break;
 | |
|         if (k >= Length(cpa)) then
 | |
|           SetLength(cpa,(2*k));
 | |
|         cpa[k] := cp;
 | |
|         k := k+1;
 | |
|       end;
 | |
|       if (k > 0) then begin
 | |
|         SetLength(cpa,k);
 | |
|         FSet.Add(cpa);
 | |
|       end;
 | |
|     end else begin
 | |
|       if CompareTo(cp,'-') then begin
 | |
|         if (charCount = 0) then
 | |
|           Error(SExpectedBut,['<char>','-']);
 | |
|         cp := NextChar();
 | |
|         FSet.AddRange(lastCp,cp);
 | |
|       end else begin
 | |
|         FSet.Add(cp);
 | |
|       end;
 | |
|     end;
 | |
|     Inc(charCount);
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.DoParse();
 | |
| begin
 | |
|   SkipSpaces();
 | |
|   while (FPosition < FBufferLength) do begin
 | |
|     ParseItem();
 | |
|     SkipSpaces();
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.Parse(const APattern: PUnicodeChar; const ALength: Integer);
 | |
| begin
 | |
|   if (ALength < 2) then
 | |
|     exit;
 | |
|   SetBuffer(APattern,ALength);
 | |
|   DoParse();
 | |
| end;
 | |
| 
 | |
| procedure TPatternParser.Parse(const APattern : UnicodeString);
 | |
| begin
 | |
|   Parse(@APattern[1],Length(APattern));
 | |
| end;
 | |
| 
 | |
| { TUnicodeSet }
 | |
| 
 | |
| procedure TUnicodeSet.CreateParser();
 | |
| begin
 | |
|   if (FParser = nil) then begin
 | |
|     FParser := TPatternParser.Create();
 | |
|     FParser.CurrentSet := Self;
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| function TUnicodeSet.InternalContains(const AString: UnicodeString): Boolean;
 | |
| var
 | |
|   u4 : UCS4String;
 | |
|   c, i : Integer;
 | |
|   cpa : TUnicodeCodePointArray;
 | |
| begin
 | |
|   u4 := UnicodeStringToUCS4String(AString);
 | |
|   c := Length(u4)-1;
 | |
|   if (c = 1) then
 | |
|     exit(Contains(u4[0]));
 | |
|   SetLength(cpa,c);
 | |
|   for i := 0 to c-1 do
 | |
|     cpa[i] := u4[i];
 | |
|   Result := Contains(cpa);
 | |
| end;
 | |
| 
 | |
| constructor TUnicodeSet.Create;
 | |
| begin
 | |
|   FTree := TTree.Create();
 | |
| end;
 | |
| 
 | |
| destructor TUnicodeSet.Destroy;
 | |
| begin
 | |
|   FParser.Free();
 | |
|   FTree.Free();
 | |
|   inherited Destroy;
 | |
| end;
 | |
| 
 | |
| procedure TUnicodeSet.Add(AChar: TUnicodeCodePoint);
 | |
| begin
 | |
|   FTree.Insert(ToArray(AChar));
 | |
| end;
 | |
| 
 | |
| procedure TUnicodeSet.Add(AString: TUnicodeCodePointArray);
 | |
| begin
 | |
|   if (AString <> nil) then
 | |
|     FTree.Insert(AString);
 | |
| end;
 | |
| 
 | |
| procedure TUnicodeSet.AddRange(const AStart, AEnd : TUnicodeCodePoint);
 | |
| var
 | |
|   i : Integer;
 | |
| begin
 | |
|   if (AStart > AEnd) then
 | |
|     raise EUnicodeSetException.CreateFmt(SInvalidRangeLimits,[AStart,AEnd]);
 | |
|   for i := AStart to AEnd do
 | |
|     Add(i);
 | |
| end;
 | |
| 
 | |
| procedure TUnicodeSet.AddPattern(const APattern : UnicodeString);
 | |
| begin
 | |
|   CreateParser();
 | |
|   FParser.Parse(APattern);
 | |
| end;
 | |
| 
 | |
| procedure TUnicodeSet.AddPattern(const APattern: RawByteString);
 | |
| var
 | |
|   us : UnicodeString;
 | |
| begin
 | |
|   us := UnicodeString(APattern);
 | |
|   AddPattern(us);
 | |
| end;
 | |
| 
 | |
| function TUnicodeSet.CreateIterator() : TIterator;
 | |
| begin
 | |
|   Result := FTree.CreateForwardIterator();
 | |
| end;
 | |
| 
 | |
| function TUnicodeSet.Contains(const AString : array of TUnicodeCodePoint) : Boolean;
 | |
| var
 | |
|   c : Integer;
 | |
|   x : TUnicodeCodePointArray;
 | |
| begin
 | |
|   Result := False;
 | |
|   c := Length(AString);
 | |
|   if (c = 0) then
 | |
|     exit;
 | |
|   SetLength(x,c);
 | |
|   Move(AString[Low(AString)],x[Low(x)],(c*SizeOf(x[0])));
 | |
|   if (FTree.FindNode(x) <> nil) then
 | |
|     Result := True;
 | |
| end;
 | |
| 
 | |
| function TUnicodeSet.Contains(const AChar : TUnicodeCodePoint) : Boolean;
 | |
| begin
 | |
|   Result := Contains([AChar]);
 | |
| end;
 | |
| 
 | |
| function TUnicodeSet.Contains(const AChar : UnicodeChar) : Boolean;
 | |
| begin
 | |
|   Result := Contains(TUnicodeCodePoint(Ord(AChar)));
 | |
| end;
 | |
| 
 | |
| function TUnicodeSet.Contains(const AChar : AnsiChar) : Boolean;
 | |
| begin
 | |
|   Result := Contains(TUnicodeCodePoint(Ord(AChar)));
 | |
| end;
 | |
| 
 | |
| function TUnicodeSet.Contains(const AString: UnicodeString): Boolean;
 | |
| begin
 | |
|   if (AString = '') then
 | |
|     exit(Contains([]));
 | |
|   if (Length(AString) = 1) then
 | |
|     exit(Contains(AString[1]));
 | |
|   Result := InternalContains(AString);
 | |
| end;
 | |
| 
 | |
| function TUnicodeSet.Contains(const AString: RawByteString): Boolean;
 | |
| var
 | |
|   us : UnicodeString;
 | |
| begin
 | |
|   us := UnicodeString(AString);
 | |
|   Result := Contains(us);
 | |
| end;
 | |
| 
 | |
| end.
 | |
| 
 | 
