* improve UTF8 support

+ add BIDI support based on widechar widestring which eases
  BIDI algo implementation
This commit is contained in:
mazen 2004-09-10 16:18:06 +00:00
parent 0ae95485ef
commit dcfd027099
2 changed files with 429 additions and 46 deletions

348
rtl/objpas/freebidi.pp Normal file
View File

@ -0,0 +1,348 @@
{
Author Mazen NEIFER
Licence LGPL
}
unit FreeBIDI;
{$mode objfpc}{$H+}
interface
type
TCharacter = WideChar;
TString = WideSTring;
TDirection=(
drNONE,
drRTL,
drLTR
);
TVisualToLogical = Array[Byte]Of Byte;
TFontInfoPtr = Pointer;
TCharWidthRoutine = function(Character:TCharacter;FontInfo:TFontInfoPtr):Integer;
var
FontInfoPtr:TFontInfoPtr;
CharWidth:TCharWidthRoutine;
{****************************Logical aspects***********************************}
{Returns the number of logical characters}
function LLength(const Src:TString):Cardinal;
{Converts visual position to logical position}
function LPos(const Src:TString; vp:Integer; pDir:TDirection):Cardinal;
{****************************Visual aspects************************************}
{Returns the number of visual characters}
function VLength(const Src:TString; pDir:TDirection):Cardinal;
{Converts a logical position to a visual position}
function VPos(const Src:TString; lp:Integer; pDir, cDir:TDirection):Cardinal;
function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
{Returns character at a given visual position according to paragraph direction}
function VCharOf(Src:TString; vp:Integer; dir:TDirection):TCharacter;
{Inserts a string into an other paying attention of RTL/LTR direction}
procedure VInsert(const Src:TString; var Dest:TString; vp:Integer; pDir:TDirection);
{Deletes a string into an other paying attention of RTL/LTR direction}
procedure VDelete(var str:TString; vp, len:Integer; pDir:TDirection);
{****************************Helper routines***********************************}
{Returns direction of a character}
function DirectionOf(Character:TCharacter):TDirection;
{Returns contextual direction of caracter in a string}
function DirectionOf(Src:TString; lp:Integer; pDir:TDirection):TDirection;
{Inserts a char as if it was typed using keyboard in the most user friendly way.
Returns the new cursor position after insersion depending on the new visual text}
function InsertChar(Src:TCharacter; var Dest:TString; vp:Integer; pDir:TDirection):Integer;
{Returns a table mapping each visual position to its logical position in an UTF8*
string}
function VisualToLogical(const Src:TString; pDir:TDirection):TVisualToLogical;
implementation
function DefaultCharWidth(Character:TCharacter; FontInfoPtr:TFontInfoPtr):Integer;
begin
case Character of
#9:
Result := 8;
else
Result := 1;
end;
end;
function DumpStr(const Src:TString):String;
var
i:Integer;
begin
Result := '';
for i:= 1 to Length(Src) do
case Src[i] of
#0..#127:
Result := Result + Src[i];
else
Result := Result + '$' + HexStr(Ord(Src[i]),4);
end;
end;
function ComputeCharLength(p:PChar):Cardinal;
begin
if ord(p^)<%11000000
then
{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
Result:=1
else if ((ord(p^) and %11100000) = %11000000)
then
if (ord(p[1]) and %11000000) = %10000000 then
Result:=2
else
Result:=1
else if ((ord(p^) and %11110000) = %11100000)
then
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
then
Result:=3
else
Result:=1
else if ((ord(p^) and %11111000) = %11110000)
then
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000)
then
Result:=4
else
Result:=1
else
Result:=1
end;
{****************************Logical aspects***********************************}
function LLength(const Src:TString):Cardinal;
begin
Result := Length(Src);
end;
function LPos(const Src:TString; vp:Integer; pDir:TDirection):Cardinal;
var
v2l:TVisualToLogical;
i:integer;
begin
v2l := VisualToLogical(Src, pDir);
if vp <= v2l[0]
then
Result := v2l[vp]
else
Result := Length(Src) + 1;
end;
{****************************Visual aspects************************************}
function VLength(const Src:TString; pDir:TDirection):Cardinal;
var
Count:Integer;
begin
Result := 0;
Count := Length(Src);
while (Count>0) do
begin
Result += CharWidth(Src[Count], FontInfoPtr);
Count -= 1;
end;
end;
function VPos(const Src:TString; lp:Integer; pDir, cDir:TDirection):Cardinal;
var
v2l:TVisualToLogical;
vp:Integer;
begin
v2l := VisualToLogical(Src, pDir);
for vp := 1 to v2l[0] do
if lp = v2l[vp]
then
begin
Exit(vp);
end;
Result := v2l[0];
end;
function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
begin
end;
function VCharOf(Src:TString; vp:Integer; dir:TDirection):TCharacter;
var
CharLen: LongInt;
begin
Result := Src[LPos(Src, vp, dir)];
end;
{****************************Helper routines***********************************}
function DirectionOf(Character:TCharacter):TDirection;
begin
case Character of
#9,#32,
'/',
'{','}',
'[',']',
'(',')':
Result := drNONE;
#$0600..#$06FF:
Result := drRTL;
else
Result := drLTR;
end;
end;
function DirectionOf(Src:TString; lp:Integer; pDir:TDirection):TDirection;
var
c:TCharacter;
lDir,rDir:TDirection;
p:Integer;
begin
if(lp <= 0)
then
lp := 1;
{Seek for proper character direction}
c := Src[lp];
lDir := DirectionOf(c);
{Seek for left character direction if it is neutral}
p := lp;
while(p > 1) and (lDir = drNONE)do
begin
c := Src[p - 1];
lDir := DirectionOf(c);
p := p - Length(c);
end;
{Seek for right character direction if it is neutral}
p := lp;
repeat
c := Src[p];
rDir := DirectionOf(c);
p := p + Length(c);
until(p > Length(Src)) or (rDir <> drNONE);
if(lDir = rDir)
then
Result := rDir
else
Result := pDir;
end;
function VisualToLogical(const Src:TString; pDir:TDirection):TVisualToLogical;
procedure Insert(value:Byte; var v2l:TVisualToLogical; InsPos:Byte);
var
l:Byte;
begin
if v2l[0] < 255
then
Inc(InsPos);
if InsPos > v2l[0]
then
InsPos := v2l[0];
for l := v2l[0] downto InsPos do
v2l[l] := v2l[l-1];
v2l[InsPos] := Value;
end;
var
lp, vp : Integer;
cDir,lDir:TDirection;
Character:TCharacter;
i:Integer;
begin
Result[0] := 0;
lp := 1;
vp := 1;
lDir := drNONE;
while lp <= Length(Src) do
begin
Character := Src[lp];
cDir := DirectionOf(Src, lp, pDir);
Inc(Result[0]);
case cDir of
drRTL:
begin
lDir := drRTL;
end;
drLTR:
begin
lDir := drLTR;
vp := Result[0];
end;
else
vp := Result[0];
end;
Insert(lp, Result, vp);
Inc(lp, Length(Character));
end;
end;
function InsertChar(Src:TCharacter; var Dest:TString; vp:Integer; pDir:TDirection):Integer;
var
v2l:TVisualToLogical;
lp,rvp:Integer;
c:TCharacter;
begin
v2l := VisualToLogical(Dest, pDir);
rvp := v2l[0];
if vp > rvp
then
begin
lp := Length(Dest) + 1
end
else
lp := v2l[vp];
c := Dest[lp];
if DirectionOf(c) = drRTL
then
begin
lp := lp + Length(c);
rvp := rvp + 1;
end;
case DirectionOf(Src) of
drRTL:
begin
Result := vp;
while (Result > 0) and (DirectionOf(Dest[v2l[Result]]) <> drLTR) do
Result := Result - 1;
while (Result < vp) and (DirectionOf(Dest[v2l[Result]]) <> drRTL) do
Result := Result + 1;
end;
drLTR:
begin
Result := rvp + 1;
end;
else
begin
Result := rvp + 1;
end;
end;
Insert(Src, Dest, lp);
end;
procedure VInsert(const Src:TString;var Dest:TString; vp:Integer; pDir:TDirection);
function VStr(const Src:TString; pDir:TDirection):TString;
var
v2lSrc:TVisualToLogical;
i:Integer;
begin
v2lSrc := VisualToLogical(Src,pDir);
Result := '';
for i := 1 to v2lSrc[0] do
Result := Result + Src[v2lSrc[i]];
end;
var
vSrc,vDest:TString;
begin
vSrc := VStr(Src,pDir);
vDest := VStr(Dest,pDir);
Insert(vSrc, vDest, vp);
Dest := VStr(vDest, pDir);
end;
procedure VDelete(var str:TString; vp, len:Integer; pDir:TDirection);
var
v2l:TVisualToLogical;
i:Integer;
begin
v2l := VisualToLogical(str, pDir);
for i := 1 to v2l[0] do
if(v2l[i] < vp) and (v2l[i] > vp + len)
then
Delete(str, v2l[i], 1);
end;
end.

View File

@ -8,17 +8,14 @@ unit UTF8BIDI;
interface
uses
FreeBIDI;
type
TUCS32Char = Cardinal;
TUCS16Char = Word;
TUTF8Char = String[4];
TUTF8String = UTF8String;
TDirection=(
drNONE,
drRTL,
drLTR
);
TVisualToLogical = Array[Byte]Of Byte;
{****************************Conversion routines*******************************}
{Converts an UCS 16/32 bits charcater to UTF8 character}
@ -26,7 +23,9 @@ function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
{Converts a wide char UCS 16 bits chcarcter to UTF8 character}
function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
{Converts an UTF8 character to UCS 32 bits character}
function UTF8ToUnicode(const UTF8Char:TUTF8Char):TUCS32Char;
function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
{Converts an UTF8 string to UCS 16 bits string}
function UTF8ToUnicode(const Src:TUTF8String):TString;
{Converts an UTF8 string to a double byte string}
function UTF8ToDoubleByteString(const UTF8Str:TUTF8String):String;
function UTF8ToDoubleByte(UTF8Str:PChar; Len:Cardinal; DBStr:PByte):Cardinal;
@ -39,11 +38,9 @@ function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
{****************************Visual aspects************************************}
{Returns the number of visual characters}
function VLength(const UTF8Str:TUTF8String):Cardinal;
function VLength(p: PChar; Count:Cardinal):Cardinal;
function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
{Converts a logical position to a visual position}
function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
{Returns character at a given visual position according to paragraph direction}
function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
{Inserts a string into an other paying attention of RTL/LTR direction}
@ -52,7 +49,7 @@ procedure VInsert(const Src:TUTF8String; var Dest:TUTF8String; vp:Integer; pDir:
procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
{****************************Helper routines***********************************}
{Returns direction of a character}
function DirectionOf(UTF8Char:TUTF8Char):TDirection;
function DirectionOf(Character:TUTF8Char):TDirection;
{Returns contextual direction of caracter in a string}
function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
{Inserts a char as if it was typed using keyboard in the most user friendly way.
@ -150,7 +147,7 @@ begin
Result := UnicodeToUTF8(Word(aChar));
end;
function UTF8ToUnicode(const UTF8Char:TUTF8Char):TUCS32Char;
function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
begin
case ComputeCharLength(@UTF8Char[1]) of
1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
@ -172,11 +169,26 @@ begin
end
end;
function UTF8ToUnicode(const Src:TUTF8String):TString;
var
lp:Integer;
c:TUTF8Char;
begin
lp := 1;
Result := '';
while lp <= Length(Src) do
begin
c := LCharOf(Src, lp);
Result += WideChar(UTF8ToUCS32(c));
lp += Length(c);
end;
end;
function UTF8ToDoubleByteString(const UTF8Str: TUTF8String): string;
var
Len: Integer;
begin
Len:=VLength(UTF8Str);
Len:=VLength(UTF8Str, drLTR);
SetLength(Result,Len*2);
if Len=0 then exit;
UTF8ToDoubleByte(PChar(UTF8Str),length(UTF8Str),PByte(Result));
@ -193,7 +205,7 @@ begin
DestPos:=DBStr;
Result:=0;
while Len>0 do begin
u:=UTF8ToUnicode(SrcPos);
u:=UTF8ToUCS32(SrcPos);
DestPos^:=byte((u shr 8) and $ff);
inc(DestPos);
DestPos^:=byte(u and $ff);
@ -203,6 +215,7 @@ begin
inc(Result);
end;
end;
{****************************Logical aspects***********************************}
function LLength(const UTF8Str:TUTF8String):Cardinal;
begin
@ -229,7 +242,6 @@ begin
Exit('');
while(lp > 0) and ((Ord(UTF8String[lp]) and $F0) in [$80..$B0]) do
begin
writeln(lp,' ',HexStr(Ord(UTF8String[lp]),2),'!',HexStr(Ord(UTF8String[lp]) and $F0,2));
Dec(lp);
end;
if lp = 0
@ -239,22 +251,9 @@ end;
SetLength(Result, ComputeCharLength(@Result[1]));
end;
{****************************Visual aspects************************************}
function VLength(const UTF8Str:TUTF8String):Cardinal;
function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
begin
Result := VLength(PChar(UTF8Str),LLength(UTF8Str));
end;
function VLength(p:PChar; Count:Cardinal):Cardinal;
var
CharLen: LongInt;
begin
Result := 0;
while (Count>0) do begin
inc(Result);
CharLen:=ComputeCharLength(p);
inc(p,CharLen);
dec(Count,CharLen);
end;
Result := VLength(UTF8ToUnicode(Src), pDir);
end;
function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
@ -285,10 +284,14 @@ begin
end;
{****************************Helper routines***********************************}
function DirectionOf(UTF8Char:TUTF8Char):TDirection;
function DirectionOf(Character:TUTF8Char):TDirection;
begin
case UTF8Char[1] of
#9,#32,'/','{','}','[',']','(',')':
case Character[1] of
#9,#32,
'/',
'{','}',
'[',']',
'(',')':
Result := drNONE;
#$D8,#$D9:
Result := drRTL;
@ -300,15 +303,35 @@ end;
function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
var
c:TUTF8Char;
lDir,rDir:TDirection;
p:Integer;
begin
if(lp <= 0)
then
lp := 1;
{Seek for proper character direction}
c := LCharOf(UTF8String, lp);
Result := DirectionOf(c);
while(lp > 1) and (Result = drNONE)do
lDir := DirectionOf(c);
{Seek for left character direction if it is neutral}
p := lp;
while(p > 1) and (lDir = drNONE)do
begin
c := LCharOf(UTF8String, lp - 1);
Result := DirectionOf(c);
lp := lp - Length(c);
c := LCharOf(UTF8String, p - 1);
lDir := DirectionOf(c);
p := p - Length(c);
end;
{Seek for right character direction if it is neutral}
p := lp;
repeat
c := LCharOf(UTF8String, p);
rDir := DirectionOf(c);
p := p + Length(c);
until(p > Length(UTF8String)) or (rDir <> drNONE);
if(lDir = rDir)
then
Result := rDir
else
Result := pDir;
end;
function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
@ -328,13 +351,14 @@ function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualT
end;
var
lp, vp : Integer;
cDir:TDirection;
cDir,lDir:TDirection;
Character:TUTF8Char;
i:Integer;
begin
Result[0] := 0;
lp := 1;
vp := 1;
lDir := drNONE;
while lp <= Length(UTF8String) do
begin
Character := LCharOf(UTF8String, lp);
@ -343,11 +367,11 @@ begin
case cDir of
drRTL:
begin
pDir := drRTL;
lDir := drRTL;
end;
drLTR:
begin
pDir := drLTR;
lDir := drLTR;
vp := Result[0];
end;
else
@ -384,13 +408,10 @@ begin
drRTL:
begin
Result := vp;
Write(Result);
while (Result > 0) and (DirectionOf(LCharOf(Dest, v2l[Result])) <> drLTR) do
Result := Result - 1;
Write('-->',Result);
while (Result < vp) and (DirectionOf(LCharOf(Dest, v2l[Result])) <> drRTL) do
Result := Result + 1;
WriteLn('-->',Result)
end;
drLTR:
begin
@ -402,12 +423,26 @@ WriteLn('-->',Result)
end;
end;
Insert(Src, Dest, lp);
WriteLn('vp=',vp,' lp=',lp,' len=', Length(Dest),' Result=', Result);
end;
procedure VInsert(const Src:TUTF8String;var Dest:TUTF8String; vp:Integer; pDir:TDirection);
function VStr(const Src:TUTF8String; pDir:TDirection):TUTF8String;
var
v2lSrc:TVisualToLogical;
i:Integer;
begin
v2lSrc := VisualToLogical(Src,pDir);
Result := '';
for i := 1 to v2lSrc[0] do
Result := Result + LCharOf(Src,v2lSrc[i]);
end;
var
vSrc,vDest:TUTF8String;
begin
Insert(Src, Dest, LPos(Dest, vp, pDir));
vSrc := VStr(Src,pDir);
vDest := VStr(Dest,pDir);
Insert(vSrc, vDest, vp);
Dest := VStr(vDest, pDir);
end;
procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);