mirror of
				https://gitlab.com/freepascal.org/lazarus/lazarus.git
				synced 2025-10-31 09:21:43 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			323 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			ObjectPascal
		
	
	
	
	
	
			
		
		
	
	
			323 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			ObjectPascal
		
	
	
	
	
	
| {
 | |
|  *****************************************************************************
 | |
|   This file is part of LazUtils.
 | |
| 
 | |
|   See the file COPYING.modifiedLGPL.txt, included in this distribution,
 | |
|   for details about the license.
 | |
|  *****************************************************************************
 | |
| 
 | |
|   This unit provides encoding agnostic unicode string manipulation functions and
 | |
|   an enumerator. It works transparently with UTF-8 and UTF-16 encodings,
 | |
|   thus allowing one source code to work between :
 | |
|   1. Lazarus with its UTF-8 solution.
 | |
|   2. Future FPC and Lazarus with Delphi compatible UTF-16 solution.
 | |
|   3. Delphi, where String = UnicodeString.
 | |
| }
 | |
| unit LazUnicode;
 | |
| 
 | |
| {$IFDEF FPC}
 | |
|  {$mode objfpc}{$H+}{$inline on}
 | |
| {$ENDIF}
 | |
| 
 | |
| // For testing the UTF16 version.
 | |
| {$IF DEFINED(FPC) and DEFINED(UseUTF16)}
 | |
|  {$ModeSwitch UnicodeStrings}   // Sets also FPC_UNICODESTRINGS.
 | |
| {$ENDIF}
 | |
| 
 | |
| {$IF DEFINED(FPC_UNICODESTRINGS) or not DEFINED(FPC)}
 | |
|  {$DEFINE ReallyUseUTF16}       // FPC with UTF-16 or Delphi
 | |
| {$ENDIF}
 | |
| 
 | |
| interface
 | |
| 
 | |
| uses
 | |
|   Classes, SysUtils
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|    ,character, LazUTF16
 | |
|   {$ENDIF}
 | |
|   {$IFDEF FPC}
 | |
|    ,LazUTF8
 | |
|   {$ENDIF}
 | |
|   ;
 | |
| 
 | |
|   // Helper functions for codepoints. They change behavior depending on ModeSwitch.
 | |
|   function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string; inline;
 | |
|   function CodePointLength(const s: string): NativeInt; inline;
 | |
|   function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt; inline;
 | |
|   function CodePointSize(p: PChar): integer; inline;
 | |
|   function IsCombining(const AChar: PChar): Boolean; {$IFDEF FPC}inline;{$ENDIF}
 | |
| 
 | |
|   function UnicodeToWinCP(const s: string): AnsiString;
 | |
|   function WinCPToUnicode(const s: AnsiString): string;
 | |
| 
 | |
|   function StringOfCodePoint(ACodePoint: String; N: Integer): String;
 | |
| 
 | |
| type
 | |
|   // Base class for CodePoint and Character enumerators.
 | |
| 
 | |
|   { TUnicodeEnumeratorBase }
 | |
| 
 | |
|   TUnicodeEnumeratorBase = class
 | |
|   private
 | |
|     fSrcPos, fEndPos: PChar;        // Pointers to source string.
 | |
|     // Preset variables for different codepoint/character lengths.
 | |
|     // Current will be assigned to one of them.
 | |
|     fCurOne, fCurTwo, fCurThree, fCurLong: String;
 | |
|     fCurrent: String;               // Current separated codepoint/character.
 | |
|     fCurrentCodeUnitCount: Integer; // Number of CodeUnits (Pascal Char) in Current.
 | |
|     procedure UpdateCurrent(aCount: integer);
 | |
|   public
 | |
|     constructor Create(const A: String);
 | |
|     property Current: String read fCurrent;
 | |
|     property CurrentCodeUnitCount: Integer read fCurrentCodeUnitCount;
 | |
|   end;
 | |
| 
 | |
|   { TCodePointEnumerator }
 | |
| 
 | |
|   // Traverse Unicode codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
 | |
|   TCodePointEnumerator = class(TUnicodeEnumeratorBase)
 | |
|   public
 | |
|     function MoveNext: Boolean;
 | |
|   end;
 | |
| 
 | |
|   { TUnicodeCharacterEnumerator }
 | |
| 
 | |
|   // Traverse Unicode (user perceived) characters, including accented characters
 | |
|   //  with combined codepoints. Uses UTF-8 or UTF-16 depending on $ModeSwitch.
 | |
|   TUnicodeCharacterEnumerator = class(TUnicodeEnumeratorBase)
 | |
|   private
 | |
|     fCurrentCodePointCount: Integer; // Number of CodePoints in Current.
 | |
|   public
 | |
|     property CurrentCodePointCount: Integer read fCurrentCodePointCount;
 | |
|     function MoveNext: Boolean;
 | |
|   end;
 | |
| 
 | |
|   {$IFDEF FPC}
 | |
|   // Enumerator for CodePoints could be used for the for-in loop.
 | |
|   //operator Enumerator(A: String): TCodePointEnumerator;
 | |
| 
 | |
|   // This enumerator combines diacritical marks.
 | |
|   // It is used by default although there are more rules for combining codepoints.
 | |
|   // Diacritical marks cover rules for most western languages.
 | |
|   operator Enumerator(A: String): TUnicodeCharacterEnumerator;
 | |
|   {$ENDIF}
 | |
| 
 | |
| implementation
 | |
| 
 | |
| {$IFDEF ReallyUseUTF16}
 | |
| 
 | |
| function UTF16IsCombining(const AChar: PWideChar): Boolean;
 | |
| var
 | |
|   ch: WideChar;
 | |
| begin
 | |
|   ch := AChar[0];
 | |
|   Result :=           // Combining Diacritical Marks (belongs to previos char)
 | |
|     ( (ch >= #$300) and (ch <= #$36F) ) or    // 0300-036F
 | |
|     ( (ch >= #$610) and (ch <= #$61A) ) or    // Arabic 0610..061A
 | |
|     ( (ch >= #$64B) and (ch <= #$65F) ) or    // Arabic 064B..065F
 | |
|     (  ch = #$670) or                         // Arabic 0670
 | |
|     ( (ch >= #$6D6) and (ch <= #$6DC) ) or    // Arabic 06D6..06DC
 | |
|     ( (ch >= #$6DF) and (ch <= #$6E4) ) or    // Arabic 06DF..06E4
 | |
|     ( (ch >= #$6E7) and (ch <= #$6E8) ) or    // Arabic 06E7..06E8
 | |
|     ( (ch >= #$6EA) and (ch <= #$6ED) ) or    // Arabic 06EA..06ED
 | |
|     ( (ch >= #$8E4) and (ch <= #$8FE) ) or    // Arabic 08E4..08FE
 | |
|     ( (ch >= #$1DC0) and (ch <= #$1DFF) ) or  // Combining Diacritical Marks Supplement 1DC0-1DFF
 | |
|     ( (ch >= #$20D0) and (ch <= #$20FF) ) or  // Combining Diacritical Marks for Symbols 20D0-20FF
 | |
|     ( (ch >= #$FE20) and (ch <= #$FE2F) );    // Combining half Marks FE20-FE2F
 | |
| end;
 | |
| 
 | |
| {$ELSE}
 | |
| 
 | |
| function UTF8IsCombining(const AChar: PChar): Boolean;
 | |
| begin
 | |
|   Result :=
 | |
|    ( (AChar[0] = #$CC) ) or                   // Combining Diacritical Marks (belongs to previos char) 0300-036F
 | |
|    ( (AChar[0] = #$CD) and (AChar[1] in [#$80..#$AF]) ) or                        // Combining Diacritical Marks
 | |
|    ( (AChar[0] = #$D8) and (AChar[1] in [#$90..#$9A]) ) or                        // Arabic 0610 (d890)..061A (d89a)
 | |
|    ( (AChar[0] = #$D9) and (AChar[1] in [#$8b..#$9f, #$B0]) ) or                  // Arabic 064B (d98b)..065F (d99f) // 0670 (d9b0)
 | |
| 
 | |
|    ( (AChar[0] = #$DB) and (AChar[1] in [#$96..#$9C, #$9F..#$A4, #$A7..#$A8, #$AA..#$AD]) ) or // Arabic 06D6 (db96)..  .. ..06ED (dbad)
 | |
|    ( (AChar[0] = #$E0) and (AChar[1] = #$A3) and (AChar[2] in [#$A4..#$BE]) ) or  // Arabic 08E4 (e0a3a4) ..08FE (e0a3be)
 | |
| 
 | |
|    ( (AChar[0] = #$E1) and (AChar[1] = #$B7) ) or                                 // Combining Diacritical Marks Supplement 1DC0-1DFF (e1b780)
 | |
|    ( (AChar[0] = #$E2) and (AChar[1] = #$83) and (AChar[2] in [#$90..#$FF]) ) or  // Combining Diacritical Marks for Symbols 20D0-20FF
 | |
|    ( (AChar[0] = #$EF) and (AChar[1] = #$B8) and (AChar[2] in [#$A0..#$AF]) );    // Combining half Marks FE20-FE2F
 | |
| end;
 | |
| 
 | |
| {$ENDIF}
 | |
| 
 | |
| //---
 | |
| 
 | |
| function CodePointCopy(const s: string; StartCharIndex, CharCount: NativeInt): string;
 | |
| // Copy CharCount CodePoints from s, starting from StartCharIndex'th CodePoints.
 | |
| begin
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|   Result := UTF16Copy(s, StartCharIndex, CharCount);
 | |
|   {$ELSE}
 | |
|   Result := UTF8Copy(s, StartCharIndex, CharCount);
 | |
|   {$ENDIF}
 | |
| end;
 | |
| 
 | |
| function CodePointLength(const s: string): NativeInt;
 | |
| // Number of CodePoints in s.
 | |
| begin
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|   Result := UTF16Length(s);
 | |
|   {$ELSE}
 | |
|   Result := UTF8LengthFast(s);
 | |
|   {$ENDIF}
 | |
| end;
 | |
| 
 | |
| function CodePointPos(const SearchForText, SearchInText: string; StartPos: NativeInt = 1): NativeInt;
 | |
| // Position of SearchForText in CodePoints.
 | |
| begin
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|   Result := UTF16Pos(SearchForText, SearchInText, StartPos);
 | |
|   {$ELSE}
 | |
|   Result := UTF8Pos(SearchForText, SearchInText, StartPos);
 | |
|   {$ENDIF}
 | |
| end;
 | |
| 
 | |
| function CodePointSize(p: PChar): integer;
 | |
| // Returns the number of CodeUnits in one CodePoint pointed by p.
 | |
| begin
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|   if TCharacter.IsHighSurrogate(p^) then
 | |
|     Result := 2
 | |
|   else
 | |
|     Result := 1
 | |
|   {$ELSE}
 | |
|   Result := UTF8CodepointSizeFast(p);
 | |
|   {$ENDIF}
 | |
| end;
 | |
| 
 | |
| function IsCombining(const AChar: PChar): Boolean;
 | |
| // Note: there are many more rules for combining codepoints.
 | |
| //  The diacritical marks here are only a subset.
 | |
| begin
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|   Result := UTF16IsCombining(AChar);
 | |
|   {$ELSE}
 | |
|   Result := UTF8IsCombining(AChar);
 | |
|   {$ENDIF}
 | |
| end;
 | |
| 
 | |
| function UnicodeToWinCP(const s: string): AnsiString;
 | |
| // Convert s to Windows system codepage. The Unicode encoding of s depends on mode.
 | |
| begin
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|   {$IFDEF FPC}
 | |
|    // ToDo: Don't convert through UTF-8.
 | |
|    Result := UTF8ToWinCP(UTF16ToUTF8(s));
 | |
|    {$ELSE}
 | |
|    Result := s; // s is UnicodeString in Delphi. Conversion may be lossy.
 | |
|    {$ENDIF}
 | |
|   {$ELSE}
 | |
|    Result := UTF8ToWinCP(s);
 | |
|   {$ENDIF}
 | |
| end;
 | |
| 
 | |
| function WinCPToUnicode(const s: AnsiString): string;
 | |
| // Convert Windows system codepage s to Unicode (encoding depends on mode).
 | |
| begin
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|   {$IFDEF FPC}
 | |
|    // ToDo: Don't convert through UTF-8.
 | |
|    Result := UTF8ToUTF16(WinCPToUTF8(s));
 | |
|    {$ELSE}
 | |
|    Result := s; // Result is UnicodeString in Delphi.
 | |
|    {$ENDIF}
 | |
|   {$ELSE}
 | |
|    Result := WinCPToUTF8(s);
 | |
|   {$ENDIF}
 | |
| end;
 | |
| 
 | |
| function StringOfCodePoint(ACodePoint: String; N: Integer): String;
 | |
| // Like StringOfChar
 | |
| {$IFDEF ReallyUseUTF16}
 | |
| var
 | |
|   i: Integer;
 | |
| {$ENDIF}
 | |
| begin
 | |
|   {$IFDEF ReallyUseUTF16}
 | |
|   Result := '';
 | |
|   for i := 1 to N do
 | |
|     Result := Result + ACodePoint;
 | |
|   {$ELSE}
 | |
|   Result := Utf8StringOfChar(ACodePoint, N);
 | |
|   {$ENDIF}
 | |
| end;
 | |
| 
 | |
| { TUnicodeEnumeratorBase }
 | |
| 
 | |
| constructor TUnicodeEnumeratorBase.Create(const A: String);
 | |
| begin
 | |
|   fSrcPos := PChar(A); // Note: if A='' then PChar(A) returns a pointer to a #0 string
 | |
|   fEndPos := fSrcPos + length(A);
 | |
|   SetLength(fCurOne, 1); // Space for the most common codepoint/character lengths.
 | |
|   SetLength(fCurTwo, 2);
 | |
|   SetLength(fCurThree, 3);
 | |
| end;
 | |
| 
 | |
| procedure TUnicodeEnumeratorBase.UpdateCurrent(aCount: integer);
 | |
| // Copy the needed bytes to fCurrent which then holds a codepoint or "character".
 | |
| begin
 | |
|   fCurrentCodeUnitCount := aCount;
 | |
|   Assert(aCount<>0, 'TUnicodeEnumeratorBase.UpdateCurrent: aCount=0.');
 | |
|   case aCount of
 | |
|     1: fCurrent := fCurOne; // Assignment does not copy but reference count changes.
 | |
|     2: fCurrent := fCurTwo;
 | |
|     3: fCurrent := fCurThree;
 | |
|     else begin
 | |
|       SetLength(fCurLong, aCount);
 | |
|       fCurrent := fCurLong;
 | |
|     end;
 | |
|   end;
 | |
|   Move(fSrcPos^, fCurrent[1], aCount*SizeOf(Char));
 | |
|   Assert(Length(fCurrent)=aCount, 'TUnicodeEnumeratorBase.UpdateCurrent: Length(fCurrent)<>aCount.');
 | |
|   inc(fSrcPos, aCount);
 | |
| end;
 | |
| 
 | |
| { TCodePointEnumerator }
 | |
| 
 | |
| function TCodePointEnumerator.MoveNext: Boolean;
 | |
| begin
 | |
|   Result := fSrcPos < fEndPos;
 | |
|   if Result then
 | |
|     UpdateCurrent(CodePointSize(fSrcPos));
 | |
| end;
 | |
| 
 | |
| { TUnicodeCharacterEnumerator }
 | |
| 
 | |
| function TUnicodeCharacterEnumerator.MoveNext: Boolean;
 | |
| var
 | |
|   NextCP: PChar;
 | |
|   NextCUCount: Integer;
 | |
| begin
 | |
|   Result := fSrcPos < fEndPos;
 | |
|   if Result then
 | |
|   begin
 | |
|     fCurrentCodePointCount := 0;
 | |
|     NextCP := fSrcPos;
 | |
|     repeat
 | |
|       NextCUCount := CodePointSize(NextCP);
 | |
|       Inc(NextCP, NextCUCount);       // Prepare for combining diacritical marks.
 | |
|       Inc(fCurrentCodePointCount);
 | |
|     until not IsCombining(NextCP);
 | |
|     UpdateCurrent(NextCP - fSrcPos);  // Pointer arithmetics.
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| //---
 | |
| // Enumerator
 | |
| //---
 | |
| {$IFDEF FPC}
 | |
| operator Enumerator(A: String): TUnicodeCharacterEnumerator;
 | |
| begin
 | |
|   Result := TUnicodeCharacterEnumerator.Create(A);
 | |
| end;
 | |
| {$ENDIF}
 | |
| 
 | |
| end.
 | |
| 
 | 
