mirror of
				https://gitlab.com/freepascal.org/fpc/source.git
				synced 2025-10-31 07:31:49 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			473 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			ObjectPascal
		
	
	
	
	
	
			
		
		
	
	
			473 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			ObjectPascal
		
	
	
	
	
	
| {
 | |
| Author Mazen NEIFER
 | |
| Licence LGPL
 | |
| }
 | |
| unit UTF8BIDI;
 | |
| 
 | |
| {$mode objfpc}{$H+}
 | |
| 
 | |
| interface
 | |
| 
 | |
| uses
 | |
|   FreeBIDI;
 | |
| 
 | |
| type
 | |
|   TUCS32Char = Cardinal;
 | |
|   TUCS16Char = Word;
 | |
|   TUTF8Char = String[4];
 | |
|   TUTF8String = UTF8String;
 | |
| 
 | |
| {****************************Conversion routines*******************************}
 | |
| {Converts an UCS 16/32 bits charcater to UTF8 character}
 | |
| function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
 | |
| {Converts a wide char UCS 16 bits chcarcter to UTF8 character}
 | |
| function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
 | |
| {Converts a wide char UCS 16 bits string to UTF8 character}
 | |
| function UnicodeToUTF8(const Src:TString):TUTF8String;
 | |
| {Converts an UTF8 character to UCS 32 bits character}
 | |
| function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
 | |
| {Converts an UTF8 character to UCS 16 bits character}
 | |
| function UTF8ToUCS16(const UTF8Char:TUTF8Char):TUCS16Char;
 | |
| {Converts an UTF8 string to UCS 16 bits string}
 | |
| function UTF8ToUnicode(const Src:TUTF8String):TString;
 | |
| {Converts an UTF8 string to a double byte string}
 | |
| function UTF8ToDoubleByteString(const UTF8Str:TUTF8String):String;
 | |
| function UTF8ToDoubleByte(UTF8Str:PChar; Len:Cardinal; DBStr:PByte):Cardinal;
 | |
| {****************************Logical aspects***********************************}
 | |
| {Returns the number of logical characters}
 | |
| function LLength(const UTF8Str:TUTF8String):Cardinal;
 | |
| {Converts visual position to logical position}
 | |
| function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
 | |
| {Returns character at a given logical position according to paragraph direction}
 | |
| function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
 | |
| {****************************Visual aspects************************************}
 | |
| {Returns the number of visual characters}
 | |
| function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
 | |
| {Converts a logical position to a visual position}
 | |
| function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
 | |
| {Returns character at a given visual position according to paragraph direction}
 | |
| function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
 | |
| {Inserts a string into an other paying attention of RTL/LTR direction}
 | |
| procedure VInsert(const Src:TUTF8String; var Dest:TUTF8String; vp:Integer; pDir:TDirection);
 | |
| {Deletes a string into an other paying attention of RTL/LTR direction}
 | |
| procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
 | |
| {****************************Helper routines***********************************}
 | |
| {Returns direction of a character}
 | |
| function DirectionOf(Character:TUTF8Char):TDirection;
 | |
| {Returns contextual direction of caracter in a string}
 | |
| function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
 | |
| {Inserts a char as if it was typed using keyboard in the most user friendly way.
 | |
| Returns the new cursor position after insersion depending on the new visual text}
 | |
| function InsertChar(Src:TUTF8Char; var Dest:TUTF8String; vp:Integer; pDir:TDirection):Integer;
 | |
| {Returns a table mapping each visual position to its logical position in an UTF8*
 | |
| string}
 | |
| function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
 | |
| 
 | |
| implementation
 | |
| 
 | |
| function DumpStr(const s:TUTF8String):String;
 | |
| var
 | |
|   i:Integer;
 | |
| begin
 | |
|   Result := '';
 | |
|   for i:= 1 to Length(s) do
 | |
|     case s[i] of
 | |
|       #0..#127:
 | |
|          Result := Result + s[i];
 | |
|     else
 | |
|       Result := Result + '$' + HexStr(Ord(s[i]),2);
 | |
|     end;
 | |
| end;
 | |
| function ComputeCharLength(p:PChar):Cardinal;
 | |
| begin
 | |
|   if ord(p^)<%11000000
 | |
|   then
 | |
| {regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
 | |
|     Result:=1
 | |
|   else if ((ord(p^) and %11100000) = %11000000)
 | |
|   then
 | |
|     if (ord(p[1]) and %11000000) = %10000000 then
 | |
|       Result:=2
 | |
|     else
 | |
|       Result:=1
 | |
|   else if ((ord(p^) and %11110000) = %11100000)
 | |
|   then
 | |
|     if ((ord(p[1]) and %11000000) = %10000000)
 | |
|       and ((ord(p[2]) and %11000000) = %10000000)
 | |
|     then
 | |
|       Result:=3
 | |
|     else
 | |
|         Result:=1
 | |
|   else if ((ord(p^) and %11111000) = %11110000)
 | |
|   then
 | |
|     if ((ord(p[1]) and %11000000) = %10000000)
 | |
|     and ((ord(p[2]) and %11000000) = %10000000)
 | |
|     and ((ord(p[3]) and %11000000) = %10000000)
 | |
|     then
 | |
|       Result:=4
 | |
|     else
 | |
|       Result:=1
 | |
|   else
 | |
|     Result:=1
 | |
| end;
 | |
| 
 | |
| {****************************Conversion routines*******************************}
 | |
| function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
 | |
| begin
 | |
|   case aChar of
 | |
|     0..$7f:
 | |
|       begin
 | |
|         Result[1]:=char(aChar);
 | |
|         SetLength(UnicodeToUTF8,1);
 | |
|       end;
 | |
|     $80..$7ff:
 | |
|       begin
 | |
|         Result[1]:=char($c0 or (aChar shr 6));
 | |
|         Result[2]:=char($80 or (aChar and $3f));
 | |
|         SetLength(UnicodeToUTF8,2);
 | |
|       end;
 | |
|     $800..$ffff:
 | |
|       begin
 | |
|         SetLength(Result,3);
 | |
|         Result[1]:=char($e0 or (aChar shr 12));
 | |
|         Result[2]:=char($80 or ((aChar shr 6) and $3f));
 | |
|         Result[3]:=char($80 or (aChar and $3f));
 | |
|       end;
 | |
|     $10000..$1fffff:
 | |
|       begin
 | |
|         SetLength(UnicodeToUTF8,4);
 | |
|         Result[1]:=char($f0 or (aChar shr 18));
 | |
|         Result[2]:=char($80 or ((aChar shr 12) and $3f));
 | |
|         Result[3]:=char($80 or ((aChar shr 6) and $3f));
 | |
|         Result[4]:=char($80 or (aChar and $3f));
 | |
|       end;
 | |
|   else
 | |
|     SetLength(UnicodeToUTF8, 0);
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
 | |
| var
 | |
|   c:TUCS16Char absolute aChar;
 | |
| begin
 | |
|   case c of
 | |
|     0..$7f:
 | |
|       begin
 | |
|         Result[1]:=char(c);
 | |
|         SetLength(UnicodeToUTF8,1);
 | |
|       end;
 | |
|     $80..$7ff:
 | |
|       begin
 | |
|         Result[1]:=char($c0 or (c shr 6));
 | |
|         Result[2]:=char($80 or (c and $3f));
 | |
|         SetLength(UnicodeToUTF8,2);
 | |
|       end;
 | |
|   else
 | |
|     SetLength(UnicodeToUTF8, 0);
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| function UnicodeToUTF8(const Src:TString):TUTF8String;
 | |
| var
 | |
|   vp:Integer;
 | |
| begin
 | |
|   vp := 1;
 | |
|   Result := '';
 | |
|   for vp := 1 to Length(Src) do
 | |
|     Result += UnicodeToUTF8(Src[vp]);
 | |
| end;
 | |
| 
 | |
| function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
 | |
| begin
 | |
|   case ComputeCharLength(@UTF8Char[1]) of
 | |
|     1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
 | |
|       Result := ord(UTF8Char[1]);
 | |
|     2:
 | |
|       Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
 | |
|                 or (ord(UTF8Char[2]) and %00111111);
 | |
|     3:
 | |
|       Result := ((ord(UTF8Char[1]) and %00011111) shl 12)
 | |
|                 or ((ord(UTF8Char[1]) and %00111111) shl 6)
 | |
|                 or (ord(UTF8Char[2]) and %00111111);
 | |
|     4:
 | |
|       Result := ((ord(UTF8Char[1]) and %00011111) shl 18)
 | |
|                 or ((ord(UTF8Char[2]) and %00111111) shl 12)
 | |
|                 or ((ord(UTF8Char[3]) and %00111111) shl 6)
 | |
|                 or (ord(UTF8Char[4]) and %00111111);
 | |
|   else
 | |
|     Result := $FFFFFFFF;
 | |
|   end
 | |
| end;
 | |
| 
 | |
| function UTF8ToUCS16(const UTF8Char:TUTF8Char):TUCS16Char;
 | |
| begin
 | |
|   case Length(UTF8Char) of
 | |
|     1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
 | |
|       Result := ord(UTF8Char[1]);
 | |
|     2:
 | |
|       Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
 | |
|                 or (ord(UTF8Char[2]) and %00111111);
 | |
|   else
 | |
|     Result := $FFFF;
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| 
 | |
| function UTF8ToUnicode(const Src:TUTF8String):TString;
 | |
| var
 | |
|   lp, vp:Integer;
 | |
|   c:TUTF8Char;
 | |
| begin
 | |
|   lp := 1;
 | |
|   vp := 0;
 | |
|   SetLength(Result, Length(Src));
 | |
|   while lp <= Length(Src) do
 | |
|   begin
 | |
|     vp += 1;
 | |
|     c := LCharOf(Src, lp);
 | |
|     Result[vp] := WideChar(UTF8ToUCS16(c));
 | |
|     lp += Length(c);
 | |
|   end;
 | |
|   SetLength(Result, vp);
 | |
| end;
 | |
| 
 | |
| function UTF8ToDoubleByteString(const UTF8Str: TUTF8String): string;
 | |
| var
 | |
|   Len: Integer;
 | |
| begin
 | |
|   Len:=VLength(UTF8Str, drLTR);
 | |
|   SetLength(Result,Len*2);
 | |
|   if Len=0 then exit;
 | |
|   UTF8ToDoubleByte(PChar(UTF8Str),length(UTF8Str),PByte(Result));
 | |
| end;
 | |
| 
 | |
| function UTF8ToDoubleByte(UTF8Str: PChar; Len:Cardinal; DBStr: PByte):Cardinal;
 | |
| var
 | |
|   SrcPos: PChar;
 | |
|   CharLen: LongInt;
 | |
|   DestPos: PByte;
 | |
|   u: Cardinal;
 | |
| begin
 | |
|   SrcPos:=UTF8Str;
 | |
|   DestPos:=DBStr;
 | |
|   Result:=0;
 | |
|   while Len>0 do begin
 | |
|     u:=UTF8ToUCS32(SrcPos);
 | |
|     DestPos^:=byte((u shr 8) and $ff);
 | |
|     inc(DestPos);
 | |
|     DestPos^:=byte(u and $ff);
 | |
|     inc(DestPos);
 | |
|     inc(SrcPos,CharLen);
 | |
|     dec(Len,CharLen);
 | |
|     inc(Result);
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| {****************************Logical aspects***********************************}
 | |
| function LLength(const UTF8Str:TUTF8String):Cardinal;
 | |
| begin
 | |
|   Result := Length(UTF8Str);
 | |
| end;
 | |
| 
 | |
| function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
 | |
| var
 | |
|   v2l:TVisualToLogical;
 | |
|   i:integer;
 | |
| begin
 | |
|   v2l := VisualToLogical(UTF8Str, pDir);
 | |
|   if vp <= v2l[0]
 | |
|   then
 | |
|     Result := v2l[vp]
 | |
|   else
 | |
|     Result := Length(UTF8Str) + 1;
 | |
| end;
 | |
| 
 | |
| function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
 | |
| begin
 | |
|   if lp > Length(UTF8String)
 | |
|   then
 | |
|     Exit('');
 | |
|   while(lp > 0) and ((Ord(UTF8String[lp]) and $F0) in [$80..$B0]) do
 | |
| begin
 | |
|     Dec(lp);
 | |
| end;
 | |
|   if lp = 0
 | |
|   then
 | |
|     Exit('');
 | |
|   Move(UTF8String[lp], Result[1], SizeOf(TUTF8Char) - 1);
 | |
|   SetLength(Result, ComputeCharLength(@Result[1]));
 | |
| end;
 | |
| {****************************Visual aspects************************************}
 | |
| function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
 | |
| begin
 | |
|   Result := FreeBIDI.VLength(UTF8ToUnicode(Src), pDir);
 | |
| end;
 | |
| 
 | |
| function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
 | |
| var
 | |
|   v2l:TVisualToLogical;
 | |
|   vp:Integer;
 | |
| begin
 | |
|   v2l := VisualToLogical(UTF8Str, pDir);
 | |
|   for vp := 1 to v2l[0] do
 | |
|   if lp = v2l[vp]
 | |
|   then
 | |
|     begin
 | |
|       Exit(vp);
 | |
|     end;
 | |
|   Result := v2l[0];
 | |
| end;
 | |
| 
 | |
| function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
 | |
| begin
 | |
| end;
 | |
| 
 | |
| 
 | |
| function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
 | |
| var
 | |
|   CharLen: LongInt;
 | |
| begin
 | |
|   Result:=LCharOf(UTF8Str,LPos(UTF8Str, vp, dir));
 | |
| end;
 | |
| 
 | |
| {****************************Helper routines***********************************}
 | |
| function DirectionOf(Character:TUTF8Char):TDirection;
 | |
| begin
 | |
|   case Character[1] of
 | |
|     #9,#32,
 | |
|     '/',
 | |
|     '{','}',
 | |
|     '[',']',
 | |
|     '(',')':
 | |
|       Result := drNONE;
 | |
|     #$D8,#$D9:
 | |
|       Result := drRTL;
 | |
|   else
 | |
|     Result := drLTR;
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
 | |
| var
 | |
|   c:TUTF8Char;
 | |
|   lDir,rDir:TDirection;
 | |
|   p:Integer;
 | |
| begin
 | |
|   if(lp <= 0)
 | |
|   then
 | |
|     lp := 1;
 | |
| {Seek for proper character direction}
 | |
|   c := LCharOf(UTF8String, lp);
 | |
|   lDir := DirectionOf(c);
 | |
| {Seek for left character direction if it is neutral}
 | |
|   p := lp;
 | |
|   while(p > 1) and (lDir = drNONE)do
 | |
|   begin
 | |
|     c := LCharOf(UTF8String, p - 1);
 | |
|     lDir := DirectionOf(c);
 | |
|     p := p - Length(c);
 | |
|   end;
 | |
| {Seek for right character direction if it is neutral}
 | |
|   p := lp;
 | |
|   repeat
 | |
|     c := LCharOf(UTF8String, p);
 | |
|     rDir := DirectionOf(c);
 | |
|     p := p + Length(c);
 | |
|   until(p > Length(UTF8String)) or (rDir <> drNONE);
 | |
|   if(lDir = rDir)
 | |
|   then
 | |
|     Result := rDir
 | |
|   else
 | |
|     Result := pDir;
 | |
| end;
 | |
| 
 | |
| function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
 | |
|   procedure Insert(value:Byte; var v2l:TVisualToLogical; InsPos:Byte);
 | |
|     var
 | |
|       l:Byte;
 | |
|     begin
 | |
|       if v2l[0] < 255
 | |
|       then
 | |
|         Inc(InsPos);
 | |
|       if InsPos > v2l[0]
 | |
|       then
 | |
|         InsPos := v2l[0];
 | |
|       for l := v2l[0] downto InsPos do
 | |
|         v2l[l] := v2l[l-1];
 | |
|       v2l[InsPos] := Value;
 | |
|     end;
 | |
| var
 | |
|   lp, vp : Integer;
 | |
|   cDir,lDir:TDirection;
 | |
|   Character:TUTF8Char;
 | |
| i:Integer;
 | |
| begin
 | |
|   Result[0] := 0;
 | |
|   lp := 1;
 | |
|   vp := 1;
 | |
|   lDir := drNONE;
 | |
|   while lp <= Length(UTF8String) do
 | |
|   begin
 | |
|     Character := LCharOf(UTF8String, lp);
 | |
|     cDir := DirectionOf(UTF8String, lp, pDir);
 | |
|     Inc(Result[0]);
 | |
|     case cDir of
 | |
|       drRTL:
 | |
|         begin
 | |
|           lDir := drRTL;
 | |
|         end;
 | |
|       drLTR:
 | |
|         begin
 | |
|           lDir := drLTR;
 | |
|           vp := Result[0];
 | |
|         end;
 | |
|     else
 | |
|       vp := Result[0];
 | |
|     end;
 | |
|     Insert(lp, Result, vp);
 | |
|     Inc(lp, Length(Character));
 | |
|   end;
 | |
| end;
 | |
| 
 | |
| function InsertChar(Src:TUTF8Char; var Dest:TUTF8String; vp:Integer; pDir:TDirection):Integer;
 | |
| var
 | |
|   temp:TString;
 | |
|   c:TCharacter;
 | |
| begin
 | |
|   temp := UTF8ToUnicode(Dest);
 | |
|   c := WideChar(UTF8ToUCS16(Src));
 | |
|   Result := FreeBIDI.InsertChar(c, temp, vp, pDir);
 | |
|   Dest := UnicodeToUTF8(temp);
 | |
| end;
 | |
| 
 | |
| procedure VInsert(const Src:TUTF8String;var Dest:TUTF8String; vp:Integer; pDir:TDirection);
 | |
|   function VStr(const Src:TUTF8String; pDir:TDirection):TUTF8String;
 | |
|   var
 | |
|     v2lSrc:TVisualToLogical;
 | |
|     i:Integer;
 | |
|   begin
 | |
|     v2lSrc := VisualToLogical(Src,pDir);
 | |
|     Result := '';
 | |
|     for i := 1 to v2lSrc[0] do
 | |
|       Result := Result + LCharOf(Src,v2lSrc[i]);
 | |
|   end;
 | |
| var
 | |
|   vSrc,vDest:TUTF8String;
 | |
| begin
 | |
|   vSrc := VStr(Src,pDir);
 | |
|   vDest := VStr(Dest,pDir);
 | |
|   Insert(vSrc, vDest, vp);
 | |
|   Dest := VStr(vDest, pDir);
 | |
| end;
 | |
| 
 | |
| procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
 | |
| var
 | |
|   temp:TString;
 | |
| begin
 | |
|   temp := UTF8ToUnicode(str);
 | |
|   FreeBIDI.VDelete(temp, vp, len, pDir);
 | |
|   str := UnicodeToUTF8(temp);
 | |
| end;
 | |
| 
 | |
| end.
 | 
