mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-10-24 12:01:39 +02:00
473 lines
12 KiB
ObjectPascal
473 lines
12 KiB
ObjectPascal
{
|
|
Author Mazen NEIFER
|
|
Licence LGPL
|
|
}
|
|
unit UTF8BIDI;
|
|
|
|
{$mode objfpc}{$H+}
|
|
|
|
interface
|
|
|
|
uses
|
|
FreeBIDI;
|
|
|
|
type
|
|
TUCS32Char = Cardinal;
|
|
TUCS16Char = Word;
|
|
TUTF8Char = String[4];
|
|
TUTF8String = UTF8String;
|
|
|
|
{****************************Conversion routines*******************************}
|
|
{Converts an UCS 16/32 bits charcater to UTF8 character}
|
|
function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
|
|
{Converts a wide char UCS 16 bits chcarcter to UTF8 character}
|
|
function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
|
|
{Converts a wide char UCS 16 bits string to UTF8 character}
|
|
function UnicodeToUTF8(const Src:TString):TUTF8String;
|
|
{Converts an UTF8 character to UCS 32 bits character}
|
|
function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
|
|
{Converts an UTF8 character to UCS 16 bits character}
|
|
function UTF8ToUCS16(const UTF8Char:TUTF8Char):TUCS16Char;
|
|
{Converts an UTF8 string to UCS 16 bits string}
|
|
function UTF8ToUnicode(const Src:TUTF8String):TString;
|
|
{Converts an UTF8 string to a double byte string}
|
|
function UTF8ToDoubleByteString(const UTF8Str:TUTF8String):String;
|
|
function UTF8ToDoubleByte(UTF8Str:PChar; Len:Cardinal; DBStr:PByte):Cardinal;
|
|
{****************************Logical aspects***********************************}
|
|
{Returns the number of logical characters}
|
|
function LLength(const UTF8Str:TUTF8String):Cardinal;
|
|
{Converts visual position to logical position}
|
|
function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
|
|
{Returns character at a given logical position according to paragraph direction}
|
|
function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
|
|
{****************************Visual aspects************************************}
|
|
{Returns the number of visual characters}
|
|
function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
|
|
{Converts a logical position to a visual position}
|
|
function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
|
|
{Returns character at a given visual position according to paragraph direction}
|
|
function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
|
|
{Inserts a string into an other paying attention of RTL/LTR direction}
|
|
procedure VInsert(const Src:TUTF8String; var Dest:TUTF8String; vp:Integer; pDir:TDirection);
|
|
{Deletes a string into an other paying attention of RTL/LTR direction}
|
|
procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
|
|
{****************************Helper routines***********************************}
|
|
{Returns direction of a character}
|
|
function DirectionOf(Character:TUTF8Char):TDirection;
|
|
{Returns contextual direction of caracter in a string}
|
|
function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
|
|
{Inserts a char as if it was typed using keyboard in the most user friendly way.
|
|
Returns the new cursor position after insersion depending on the new visual text}
|
|
function InsertChar(Src:TUTF8Char; var Dest:TUTF8String; vp:Integer; pDir:TDirection):Integer;
|
|
{Returns a table mapping each visual position to its logical position in an UTF8*
|
|
string}
|
|
function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
|
|
|
|
implementation
|
|
|
|
function DumpStr(const s:TUTF8String):String;
|
|
var
|
|
i:Integer;
|
|
begin
|
|
Result := '';
|
|
for i:= 1 to Length(s) do
|
|
case s[i] of
|
|
#0..#127:
|
|
Result := Result + s[i];
|
|
else
|
|
Result := Result + '$' + HexStr(Ord(s[i]),2);
|
|
end;
|
|
end;
|
|
function ComputeCharLength(p:PChar):Cardinal;
|
|
begin
|
|
if ord(p^)<%11000000
|
|
then
|
|
{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
|
|
Result:=1
|
|
else if ((ord(p^) and %11100000) = %11000000)
|
|
then
|
|
if (ord(p[1]) and %11000000) = %10000000 then
|
|
Result:=2
|
|
else
|
|
Result:=1
|
|
else if ((ord(p^) and %11110000) = %11100000)
|
|
then
|
|
if ((ord(p[1]) and %11000000) = %10000000)
|
|
and ((ord(p[2]) and %11000000) = %10000000)
|
|
then
|
|
Result:=3
|
|
else
|
|
Result:=1
|
|
else if ((ord(p^) and %11111000) = %11110000)
|
|
then
|
|
if ((ord(p[1]) and %11000000) = %10000000)
|
|
and ((ord(p[2]) and %11000000) = %10000000)
|
|
and ((ord(p[3]) and %11000000) = %10000000)
|
|
then
|
|
Result:=4
|
|
else
|
|
Result:=1
|
|
else
|
|
Result:=1
|
|
end;
|
|
|
|
{****************************Conversion routines*******************************}
|
|
function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
|
|
begin
|
|
case aChar of
|
|
0..$7f:
|
|
begin
|
|
Result[1]:=char(aChar);
|
|
SetLength(UnicodeToUTF8,1);
|
|
end;
|
|
$80..$7ff:
|
|
begin
|
|
Result[1]:=char($c0 or (aChar shr 6));
|
|
Result[2]:=char($80 or (aChar and $3f));
|
|
SetLength(UnicodeToUTF8,2);
|
|
end;
|
|
$800..$ffff:
|
|
begin
|
|
SetLength(Result,3);
|
|
Result[1]:=char($e0 or (aChar shr 12));
|
|
Result[2]:=char($80 or ((aChar shr 6) and $3f));
|
|
Result[3]:=char($80 or (aChar and $3f));
|
|
end;
|
|
$10000..$1fffff:
|
|
begin
|
|
SetLength(UnicodeToUTF8,4);
|
|
Result[1]:=char($f0 or (aChar shr 18));
|
|
Result[2]:=char($80 or ((aChar shr 12) and $3f));
|
|
Result[3]:=char($80 or ((aChar shr 6) and $3f));
|
|
Result[4]:=char($80 or (aChar and $3f));
|
|
end;
|
|
else
|
|
SetLength(UnicodeToUTF8, 0);
|
|
end;
|
|
end;
|
|
|
|
function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
|
|
var
|
|
c:TUCS16Char absolute aChar;
|
|
begin
|
|
case c of
|
|
0..$7f:
|
|
begin
|
|
Result[1]:=char(c);
|
|
SetLength(UnicodeToUTF8,1);
|
|
end;
|
|
$80..$7ff:
|
|
begin
|
|
Result[1]:=char($c0 or (c shr 6));
|
|
Result[2]:=char($80 or (c and $3f));
|
|
SetLength(UnicodeToUTF8,2);
|
|
end;
|
|
else
|
|
SetLength(UnicodeToUTF8, 0);
|
|
end;
|
|
end;
|
|
|
|
function UnicodeToUTF8(const Src:TString):TUTF8String;
|
|
var
|
|
vp:Integer;
|
|
begin
|
|
vp := 1;
|
|
Result := '';
|
|
for vp := 1 to Length(Src) do
|
|
Result += UnicodeToUTF8(Src[vp]);
|
|
end;
|
|
|
|
function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
|
|
begin
|
|
case ComputeCharLength(@UTF8Char[1]) of
|
|
1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
|
|
Result := ord(UTF8Char[1]);
|
|
2:
|
|
Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
|
|
or (ord(UTF8Char[2]) and %00111111);
|
|
3:
|
|
Result := ((ord(UTF8Char[1]) and %00011111) shl 12)
|
|
or ((ord(UTF8Char[1]) and %00111111) shl 6)
|
|
or (ord(UTF8Char[2]) and %00111111);
|
|
4:
|
|
Result := ((ord(UTF8Char[1]) and %00011111) shl 18)
|
|
or ((ord(UTF8Char[2]) and %00111111) shl 12)
|
|
or ((ord(UTF8Char[3]) and %00111111) shl 6)
|
|
or (ord(UTF8Char[4]) and %00111111);
|
|
else
|
|
Result := $FFFFFFFF;
|
|
end
|
|
end;
|
|
|
|
function UTF8ToUCS16(const UTF8Char:TUTF8Char):TUCS16Char;
|
|
begin
|
|
case Length(UTF8Char) of
|
|
1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
|
|
Result := ord(UTF8Char[1]);
|
|
2:
|
|
Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
|
|
or (ord(UTF8Char[2]) and %00111111);
|
|
else
|
|
Result := $FFFF;
|
|
end;
|
|
end;
|
|
|
|
|
|
function UTF8ToUnicode(const Src:TUTF8String):TString;
|
|
var
|
|
lp, vp:Integer;
|
|
c:TUTF8Char;
|
|
begin
|
|
lp := 1;
|
|
vp := 0;
|
|
SetLength(Result, Length(Src));
|
|
while lp <= Length(Src) do
|
|
begin
|
|
vp += 1;
|
|
c := LCharOf(Src, lp);
|
|
Result[vp] := WideChar(UTF8ToUCS16(c));
|
|
lp += Length(c);
|
|
end;
|
|
SetLength(Result, vp);
|
|
end;
|
|
|
|
function UTF8ToDoubleByteString(const UTF8Str: TUTF8String): string;
|
|
var
|
|
Len: Integer;
|
|
begin
|
|
Len:=VLength(UTF8Str, drLTR);
|
|
SetLength(Result,Len*2);
|
|
if Len=0 then exit;
|
|
UTF8ToDoubleByte(PChar(UTF8Str),length(UTF8Str),PByte(Result));
|
|
end;
|
|
|
|
function UTF8ToDoubleByte(UTF8Str: PChar; Len:Cardinal; DBStr: PByte):Cardinal;
|
|
var
|
|
SrcPos: PChar;
|
|
CharLen: LongInt;
|
|
DestPos: PByte;
|
|
u: Cardinal;
|
|
begin
|
|
SrcPos:=UTF8Str;
|
|
DestPos:=DBStr;
|
|
Result:=0;
|
|
while Len>0 do begin
|
|
u:=UTF8ToUCS32(SrcPos);
|
|
DestPos^:=byte((u shr 8) and $ff);
|
|
inc(DestPos);
|
|
DestPos^:=byte(u and $ff);
|
|
inc(DestPos);
|
|
inc(SrcPos,CharLen);
|
|
dec(Len,CharLen);
|
|
inc(Result);
|
|
end;
|
|
end;
|
|
|
|
{****************************Logical aspects***********************************}
|
|
function LLength(const UTF8Str:TUTF8String):Cardinal;
|
|
begin
|
|
Result := Length(UTF8Str);
|
|
end;
|
|
|
|
function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
|
|
var
|
|
v2l:TVisualToLogical;
|
|
i:integer;
|
|
begin
|
|
v2l := VisualToLogical(UTF8Str, pDir);
|
|
if vp <= v2l[0]
|
|
then
|
|
Result := v2l[vp]
|
|
else
|
|
Result := Length(UTF8Str) + 1;
|
|
end;
|
|
|
|
function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
|
|
begin
|
|
if lp > Length(UTF8String)
|
|
then
|
|
Exit('');
|
|
while(lp > 0) and ((Ord(UTF8String[lp]) and $F0) in [$80..$B0]) do
|
|
begin
|
|
Dec(lp);
|
|
end;
|
|
if lp = 0
|
|
then
|
|
Exit('');
|
|
Move(UTF8String[lp], Result[1], SizeOf(TUTF8Char) - 1);
|
|
SetLength(Result, ComputeCharLength(@Result[1]));
|
|
end;
|
|
{****************************Visual aspects************************************}
|
|
function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
|
|
begin
|
|
Result := FreeBIDI.VLength(UTF8ToUnicode(Src), pDir);
|
|
end;
|
|
|
|
function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
|
|
var
|
|
v2l:TVisualToLogical;
|
|
vp:Integer;
|
|
begin
|
|
v2l := VisualToLogical(UTF8Str, pDir);
|
|
for vp := 1 to v2l[0] do
|
|
if lp = v2l[vp]
|
|
then
|
|
begin
|
|
Exit(vp);
|
|
end;
|
|
Result := v2l[0];
|
|
end;
|
|
|
|
function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
|
|
begin
|
|
end;
|
|
|
|
|
|
function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
|
|
var
|
|
CharLen: LongInt;
|
|
begin
|
|
Result:=LCharOf(UTF8Str,LPos(UTF8Str, vp, dir));
|
|
end;
|
|
|
|
{****************************Helper routines***********************************}
|
|
function DirectionOf(Character:TUTF8Char):TDirection;
|
|
begin
|
|
case Character[1] of
|
|
#9,#32,
|
|
'/',
|
|
'{','}',
|
|
'[',']',
|
|
'(',')':
|
|
Result := drNONE;
|
|
#$D8,#$D9:
|
|
Result := drRTL;
|
|
else
|
|
Result := drLTR;
|
|
end;
|
|
end;
|
|
|
|
function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
|
|
var
|
|
c:TUTF8Char;
|
|
lDir,rDir:TDirection;
|
|
p:Integer;
|
|
begin
|
|
if(lp <= 0)
|
|
then
|
|
lp := 1;
|
|
{Seek for proper character direction}
|
|
c := LCharOf(UTF8String, lp);
|
|
lDir := DirectionOf(c);
|
|
{Seek for left character direction if it is neutral}
|
|
p := lp;
|
|
while(p > 1) and (lDir = drNONE)do
|
|
begin
|
|
c := LCharOf(UTF8String, p - 1);
|
|
lDir := DirectionOf(c);
|
|
p := p - Length(c);
|
|
end;
|
|
{Seek for right character direction if it is neutral}
|
|
p := lp;
|
|
repeat
|
|
c := LCharOf(UTF8String, p);
|
|
rDir := DirectionOf(c);
|
|
p := p + Length(c);
|
|
until(p > Length(UTF8String)) or (rDir <> drNONE);
|
|
if(lDir = rDir)
|
|
then
|
|
Result := rDir
|
|
else
|
|
Result := pDir;
|
|
end;
|
|
|
|
function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
|
|
procedure Insert(value:Byte; var v2l:TVisualToLogical; InsPos:Byte);
|
|
var
|
|
l:Byte;
|
|
begin
|
|
if v2l[0] < 255
|
|
then
|
|
Inc(InsPos);
|
|
if InsPos > v2l[0]
|
|
then
|
|
InsPos := v2l[0];
|
|
for l := v2l[0] downto InsPos do
|
|
v2l[l] := v2l[l-1];
|
|
v2l[InsPos] := Value;
|
|
end;
|
|
var
|
|
lp, vp : Integer;
|
|
cDir,lDir:TDirection;
|
|
Character:TUTF8Char;
|
|
i:Integer;
|
|
begin
|
|
Result[0] := 0;
|
|
lp := 1;
|
|
vp := 1;
|
|
lDir := drNONE;
|
|
while lp <= Length(UTF8String) do
|
|
begin
|
|
Character := LCharOf(UTF8String, lp);
|
|
cDir := DirectionOf(UTF8String, lp, pDir);
|
|
Inc(Result[0]);
|
|
case cDir of
|
|
drRTL:
|
|
begin
|
|
lDir := drRTL;
|
|
end;
|
|
drLTR:
|
|
begin
|
|
lDir := drLTR;
|
|
vp := Result[0];
|
|
end;
|
|
else
|
|
vp := Result[0];
|
|
end;
|
|
Insert(lp, Result, vp);
|
|
Inc(lp, Length(Character));
|
|
end;
|
|
end;
|
|
|
|
function InsertChar(Src:TUTF8Char; var Dest:TUTF8String; vp:Integer; pDir:TDirection):Integer;
|
|
var
|
|
temp:TString;
|
|
c:TCharacter;
|
|
begin
|
|
temp := UTF8ToUnicode(Dest);
|
|
c := WideChar(UTF8ToUCS16(Src));
|
|
Result := FreeBIDI.InsertChar(c, temp, vp, pDir);
|
|
Dest := UnicodeToUTF8(temp);
|
|
end;
|
|
|
|
procedure VInsert(const Src:TUTF8String;var Dest:TUTF8String; vp:Integer; pDir:TDirection);
|
|
function VStr(const Src:TUTF8String; pDir:TDirection):TUTF8String;
|
|
var
|
|
v2lSrc:TVisualToLogical;
|
|
i:Integer;
|
|
begin
|
|
v2lSrc := VisualToLogical(Src,pDir);
|
|
Result := '';
|
|
for i := 1 to v2lSrc[0] do
|
|
Result := Result + LCharOf(Src,v2lSrc[i]);
|
|
end;
|
|
var
|
|
vSrc,vDest:TUTF8String;
|
|
begin
|
|
vSrc := VStr(Src,pDir);
|
|
vDest := VStr(Dest,pDir);
|
|
Insert(vSrc, vDest, vp);
|
|
Dest := VStr(vDest, pDir);
|
|
end;
|
|
|
|
procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
|
|
var
|
|
temp:TString;
|
|
begin
|
|
temp := UTF8ToUnicode(str);
|
|
FreeBIDI.VDelete(temp, vp, len, pDir);
|
|
str := UnicodeToUTF8(temp);
|
|
end;
|
|
|
|
end.
|