+ widestringmanager.codepointlengthproc added, which can be used to

determine the length of a multi-byte character. The return values
    are defined to be the same as those of POSIX' mblen: -1 =
    invalid/incomplete sequence, 0 = , > 0 = length of sequence in
    bytes.
  + default implementation for widestringmanager.codepointlengthproc
    (assumes all code points have length 1) and Unix implementation
    (based on mb(r)len); Windows implementation is still required
  * replaced default implementation of
    widestringmanager.CharLengthPCharProc with strlen() of the input
    instead of an error (correct if all code points have length 1,
    still needs Windows implementation)
  + implemented fpc_text_read_{wide,unicode}str() and
    fpc_text_read_widechar() (mantis ); fpc_text_read_widechar()
    uses the new widestringmanager.codepointlengthproc()
  + unicodestring support for readstr/writestr
  * fixed declaration of fpc_Write_Text_UnicodeStr (unicodestring
    instead of widestring parameter)
  * extended test/twide*.pp tests to test the new/fixed functionality

git-svn-id: trunk@16533 -
This commit is contained in:
Jonas Maebe 2010-12-10 14:10:01 +00:00
parent 9410f7d5d3
commit f4c31ecf3c
10 changed files with 376 additions and 26 deletions

View File

@ -471,7 +471,7 @@ Procedure fpc_Write_Text_AnsiStr (Len : Longint; Var f : Text; const S : AnsiStr
Procedure fpc_Write_Text_WideStr (Len : Longint; Var f : Text; const S : WideString); compilerproc;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
Procedure fpc_Write_Text_UnicodeStr (Len : Longint; Var f : Text; const S : WideString); compilerproc;
Procedure fpc_Write_Text_UnicodeStr (Len : Longint; Var f : Text; const S : UnicodeString); compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
Procedure fpc_Write_Text_SInt(Len : Longint;var t : Text;l : ValSInt); compilerproc;
Procedure fpc_Write_Text_UInt(Len : Longint;var t : Text;l : ValUInt); compilerproc;
@ -504,16 +504,22 @@ function fpc_SetupWriteStr_Shortstr(out s: shortstring): PText; compilerproc;
function fpc_SetupWriteStr_Ansistr(out s: ansistring): PText; compilerproc;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
function fpc_SetupWriteStr_Widestr(out s: widestring): PText; compilerproc;
function fpc_SetupWriteStr_Unicodestr(out s: unicodestring): PText; compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
function fpc_SetupWriteStr_Widestr(out s: widestring): PText; compilerproc;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
function fpc_SetupReadStr_Shortstr(const s: shortstring): PText; compilerproc;
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
function fpc_SetupReadStr_Ansistr(const s: ansistring): PText; compilerproc;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
function fpc_SetupReadStr_Widestr(const s: widestring): PText; compilerproc;
function fpc_SetupReadStr_Unicodestr(const s: unicodestring): PText; compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
function fpc_SetupReadStr_Widestr(const s: widestring): PText; compilerproc;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
{$endif FPC_HAS_FEATURE_TEXTIO}
{$ifdef FPC_HAS_FEATURE_VARIANTS}
@ -541,7 +547,16 @@ Procedure fpc_Read_Text_PChar_As_Array(var f : Text;out s : array of char; zerob
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
Procedure fpc_Read_Text_AnsiStr(var f : Text;out s : AnsiString); compilerproc;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
Procedure fpc_Read_Text_UnicodeStr(var f : Text;out us : UnicodeString); compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
Procedure fpc_Read_Text_WideStr(var f : Text;out ws : WideString); compilerproc;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
Procedure fpc_Read_Text_Char(var f : Text; out c : char); compilerproc;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
procedure fpc_Read_Text_WideChar(var f : Text; out wc: widechar); compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
Procedure fpc_Read_Text_Char_Iso(var f : Text; out c : char); compilerproc;
Procedure fpc_Read_Text_SInt(var f : Text; out l :ValSInt); compilerproc;
Procedure fpc_Read_Text_UInt(var f : Text; out u :ValUInt); compilerproc;

View File

@ -689,8 +689,8 @@ begin
end;
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
Procedure fpc_Write_Text_UnicodeStr (Len : Longint; Var f : Text; const S : WideString); iocheck; compilerproc;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
Procedure fpc_Write_Text_UnicodeStr (Len : Longint; Var f : Text; const S : UnicodeString); iocheck; compilerproc;
{
Writes a UnicodeString to the Text file T
}
@ -714,7 +714,7 @@ begin
else InOutRes:=103;
end;
end;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
@ -1288,7 +1288,7 @@ End;
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
Procedure fpc_Read_Text_AnsiStr(var f : Text;out s : AnsiString); iocheck; compilerproc;
Procedure fpc_Read_Text_AnsiStr(var f : Text;out s : AnsiString); [public, alias: 'FPC_READ_TEXT_ANSISTR']; iocheck; compilerproc;
var
slen,len : SizeInt;
Begin
@ -1302,10 +1302,36 @@ Begin
// Set actual length
SetLength(S,Slen);
End;
Procedure fpc_Read_Text_AnsiStr_Intern(var f : Text;out s : AnsiString); [external name 'FPC_READ_TEXT_ANSISTR'];
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
procedure fpc_Read_Text_Char(var f : Text; out c: char); iocheck;compilerproc;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
Procedure fpc_Read_Text_UnicodeStr(var f : Text;out us : UnicodeString); iocheck; compilerproc;
var
s: AnsiString;
Begin
// all standard input is assumed to be ansi-encoded
fpc_Read_Text_AnsiStr_Intern(f,s);
// Convert to unicodestring
us:=s;
End;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
Procedure fpc_Read_Text_WideStr(var f : Text;out ws : WideString); iocheck; compilerproc;
var
s: AnsiString;
Begin
// all standard input is assumed to be ansi-encoded
fpc_Read_Text_AnsiStr_Intern(f,s);
// Convert to widestring
ws:=s;
End;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
procedure fpc_Read_Text_Char(var f : Text; out c: char); [public, alias: 'FPC_READ_TEXT_CHAR']; iocheck;compilerproc;
Begin
c:=#0;
If not CheckRead(f) then
@ -1319,6 +1345,49 @@ Begin
inc(TextRec(f).BufPos);
end;
procedure fpc_Read_Text_Char_intern(var f : Text; out c: char); iocheck; [external name 'FPC_READ_TEXT_CHAR'];
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
procedure fpc_Read_Text_WideChar(var f : Text; out wc: widechar); iocheck;compilerproc;
var
ws: widestring;
i: longint;
{ maximum code point length is 6 characters (with UTF-8) }
str: array[0..5] of char;
Begin
fillchar(str[0],sizeof(str),0);
for i:=low(str) to high(str) do
begin
fpc_Read_Text_Char_intern(f,str[i]);
case widestringmanager.CodePointLengthProc(@str[0],i+1) of
-1: { possibly incomplete code point, try with an extra character }
;
0: { null character }
begin
wc:=#0;
exit;
end;
else
begin
{ valid code point -> convert to widestring}
widestringmanager.Ansi2WideMoveProc(@str[0],ws,i+1);
{ has to be exactly one widechar }
if length(ws)=1 then
begin
wc:=ws[1];
exit
end
else
break;
end;
end;
end;
{ invalid widechar input }
inoutres:=106;
end;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
procedure fpc_Read_Text_Char_Iso(var f : Text; out c: char); iocheck;compilerproc;
Begin
@ -1604,6 +1673,22 @@ end;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
procedure WriteStrUnicode(var t: textrec);
var
temp: ansistring;
str: punicodestring;
begin
if (t.bufpos=0) then
exit;
str:=punicodestring(ppointer(@t.userdata[StrPtrIndex])^);
setlength(temp,t.bufpos);
move(t.bufptr^,temp[1],t.bufpos);
str^:=str^+temp;
t.bufpos:=0;
end;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
procedure WriteStrWide(var t: textrec);
var
temp: ansistring;
@ -1617,8 +1702,7 @@ begin
str^:=str^+temp;
t.bufpos:=0;
end;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
procedure SetupWriteStrCommon(out t: textrec);
begin
@ -1657,6 +1741,20 @@ end;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
function fpc_SetupWriteStr_Unicodestr(out s: unicodestring): PText; compilerproc;
begin
setupwritestrcommon(ReadWriteStrText);
PPointer(@ReadWriteStrText.userdata[StrPtrIndex])^:=@s;
// automatically done by out-semantics
// setlength(s,0);
ReadWriteStrText.InOutFunc:=@WriteStrUnicode;
ReadWriteStrText.FlushFunc:=@WriteStrUnicode;
result:=@ReadWriteStrText;
end;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
function fpc_SetupWriteStr_Widestr(out s: widestring): PText; compilerproc;
begin
setupwritestrcommon(ReadWriteStrText);
@ -1667,7 +1765,7 @@ begin
ReadWriteStrText.FlushFunc:=@WriteStrWide;
result:=@ReadWriteStrText;
end;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
procedure ReadAnsiStrFinal(var t: textrec);
@ -1763,7 +1861,7 @@ end;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
function fpc_SetupReadStr_Widestr(const s: widestring): PText; compilerproc;
function fpc_SetupReadStr_Unicodestr(const s: unicodestring): PText; compilerproc;
begin
{ we use an ansistring to avoid code duplication, and let the }
{ assignment convert the widestring to an equivalent ansistring }
@ -1772,6 +1870,16 @@ end;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
function fpc_SetupReadStr_Widestr(const s: widestring): PText; compilerproc;
begin
{ we use an ansistring to avoid code duplication, and let the }
{ assignment convert the widestring to an equivalent ansistring }
result:=fpc_SetupReadStr_Ansistr_Intern(s);
end;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
{*****************************************************************************
Initializing
*****************************************************************************}

View File

@ -67,7 +67,16 @@ Type
}
CompareWideStringProc : function(const s1, s2 : WideString) : PtrInt;
CompareTextWideStringProc : function(const s1, s2 : WideString): PtrInt;
{ return value: number of code points in the string. Whenever an invalid
code point is encountered, all characters part of this invalid code point
are considered to form one "character" and the next character is
considered to be the start of a new (possibly also invalid) code point }
CharLengthPCharProc : function(const Str: PChar): PtrInt;
{ return value:
-1 if incomplete or invalid code point
0 if NULL character,
> 0 if that's the length in bytes of the code point }
CodePointLengthProc : function(const Str: PChar; MaxLookAead: PtrInt): Ptrint;
UpperAnsiStringProc : function(const s : ansistring) : ansistring;
LowerAnsiStringProc : function(const s : ansistring) : ansistring;

View File

@ -88,6 +88,21 @@ begin
end;
function DefaultCharLengthPChar(const Str: PChar): PtrInt;
begin
DefaultCharLengthPChar:=length(Str);
end;
function DefaultCodePointLength(const Str: PChar; MaxLookAead: PtrInt): Ptrint;
begin
if str[0]<>#0 then
DefaultCodePointLength:=1
else
DefaultCodePointLength:=0;
end;
Procedure GetUnicodeStringManager (Var Manager : TUnicodeStringManager);
begin
manager:=widestringmanager;
@ -2506,13 +2521,6 @@ function CompareTextUnicodeString(const s1, s2 : UnicodeString): PtrInt;
begin
unimplementedunicodestring;
end;
function CharLengthPChar(const Str: PChar): PtrInt;
begin
unimplementedunicodestring;
end;
{$warnings on}
procedure initunicodestringmanager;
@ -2535,7 +2543,8 @@ procedure initunicodestringmanager;
{$endif HAS_WIDESTRINGMANAGER}
widestringmanager.CompareWideStringProc:=@CompareUnicodeString;
widestringmanager.CompareTextWideStringProc:=@CompareTextUnicodeString;
widestringmanager.CharLengthPCharProc:=@CharLengthPChar;
widestringmanager.CharLengthPCharProc:=@DefaultCharLengthPChar;
widestringmanager.CodePointLengthProc:=@DefaultCodePointLength;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
end;

View File

@ -1357,7 +1357,16 @@ function Win32CompareTextUnicodeString(const s1, s2 : UnicodeString) : PtrInt;
are relevant already for the system unit }
procedure InitWin32Widestrings;
begin
{ return value: number of code points in the string. Whenever an invalid
code point is encountered, all characters part of this invalid code point
are considered to form one "character" and the next character is
considered to be the start of a new (possibly also invalid) code point }
//!!! CharLengthPCharProc : function(const Str: PChar): PtrInt;
{ return value:
-1 if incomplete or invalid code point
0 if NULL character,
> 0 if that's the length in bytes of the code point }
//!!!! CodePointLengthProc : function(const Str: PChar; MaxLookAead: PtrInt): Ptrint;
widestringmanager.CompareWideStringProc:=@Win32CompareWideString;
widestringmanager.CompareTextWideStringProc:=@Win32CompareTextWideString;
widestringmanager.UpperAnsiStringProc:=@Win32AnsiUpperCase;

View File

@ -5,6 +5,7 @@ uses
var
w : widestring;
u : unicodestring;
a : ansistring;
begin
@ -14,6 +15,15 @@ begin
halt(1);
a:=w;
if a[1]<>'A' then
halt(1);
halt(2);
writeln('ok');
a:='A';
u:=a;
if u[1]<>#65 then
halt(3);
a:=u;
if a[1]<>'A' then
halt(4);
writeln('ok');
end.

View File

@ -6,6 +6,7 @@ uses
var
i : longint;
w,w2 : widestring;
u,u2 : unicodestring;
a : ansistring;
begin
@ -17,4 +18,12 @@ begin
a:=w;
w2:=a;
end;
setlength(u,1000);
for i:=1 to 1000 do
u[i]:=widechar(i);
for i:=1 to 10 do
begin
a:=u;
u2:=a;
end;
end.

View File

@ -5,32 +5,76 @@
{$codepage utf-8}
{$mode objfpc}
uses
{$ifdef unix}
cwstring,
{$endif}
sysutils;
SysUtils;
{$i+}
var
t: text;
w: widestring;
u: unicodestring;
a: ansistring;
wc: widechar;
begin
assign(t,'twide3.txt');
rewrite(t);
writeln(t,'łóżka');
close(t);
reset(t);
try
read(t,wc);
if wc<>'ł' then
raise Exception.create('wrong widechar read: '+inttostr(ord(wc))+'<>'+inttostr(ord('ł')));
except
close(t);
// erase(t);
raise;
end;
reset(t);
try
readln(t,a);
w:=a;
if (w<>'łóżka') then
raise Exception.create('wrong string read');
raise Exception.create('wrong ansistring read');
except
close(t);
erase(t);
raise;
end;
reset(t);
try
readln(t,w);
if (w<>'łóżka') then
raise Exception.create('wrong widestring read');
except
close(t);
erase(t);
raise;
end;
reset(t);
try
readln(t,u);
if (u<>'łóżka') then
raise Exception.create('wrong unicodestring read');
finally
close(t);
erase(t);
end;
readstr(u,a);
if u<>a then
raise Exception.create('wrong readstr(u,a)');
readstr(w,a);
if w<>u then
raise Exception.create('wrong readstr(w,a)');
end.

View File

@ -2,6 +2,7 @@
var
ws: widestring;
uns: unicodestring;
us: UCS4String;
begin
// the compiler does not yet support characters which require
@ -42,4 +43,15 @@ begin
(ws[7]<>#$d87e) or
(ws[8]<>#$dc04) then
halt(3);
uns:='鳣ćçŹ'#$d87e#$dc04;
if (length(uns)<>8) or
(uns[1]<>'é') or
(uns[2]<>'ł') or
(uns[3]<>'Ł') or
(uns[4]<>'ć') or
(uns[5]<>'ç') or
(uns[6]<>'Ź') or
(uns[7]<>#$d87e) or
(uns[8]<>#$dc04) then
halt(4);
end.

View File

@ -13,11 +13,12 @@ procedure doerror(i : integer);
end;
{ normal upper case testing }
{ normal upper case testing (widestring) }
procedure testupper;
var
s: ansistring;
w1,w2,w3,w4: widestring;
u1,u2,u3,u4: unicodestring;
i: longint;
begin
w1:='aé'#0'èàł'#$d87e#$dc04;
@ -72,11 +73,74 @@ begin
doerror(21);
if (w4 <> w2) then
doerror(22);
end;
{ normal lower case testing }
{ normal upper case testing (unicodestring) }
procedure testupperu;
var
s: ansistring;
w1,w2,w3,w4: widestring;
u1,u2,u3,u4: unicodestring;
i: longint;
begin
w1:='aé'#0'èàł'#$d87e#$dc04;
w2:='AÉ'#0'ÈÀŁ'#$d87e#$dc04;
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original upper: ',w2);
{$endif print}
s:=w1;
{$ifdef print}
writeln('ansi: ',s);
{$endif print}
w3:=s;
w4:=AnsiUpperCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=unicodeuppercase(w1);
{$ifdef print}
writeln('wideupper: ',w1);
writeln('original upper: ',w2);
writeln('ansiupper: ',w4);
{$endif print}
if (w1 <> w2) then
doerror(1);
if (w4 <> w2) then
doerror(2);
w1:='aéèàł'#$d87e#$dc04;
w2:='AÉÈÀŁ'#$d87e#$dc04;
s:=w1;
w3:=s;
w4:=AnsiStrUpper(pchar(s));
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=unicodeuppercase(w1);
{$ifdef print}
writeln('unicodeupper: ',w1);
writeln('ansistrupper: ',w4);
{$endif print}
if (w1 <> w2) then
doerror(21);
if (w4 <> w2) then
doerror(22);
end;
{ normal lower case testing (widestring) }
procedure testlower;
var
s: ansistring;
@ -135,6 +199,63 @@ begin
end;
{ normal lower case testing (unicodestring) }
procedure testloweru;
var
s: ansistring;
w1,w2,w3,w4: unicodestring;
i: longint;
begin
w1:='AÉ'#0'ÈÀŁ'#$d87e#$dc04;
w2:='aé'#0'èàł'#$d87e#$dc04;
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original lower: ',w2);
{$endif print}
s:=w1;
w3:=s;
w4:=AnsiLowerCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=unicodelowercase(w1);
{$ifdef print}
writeln('unicodelower: ',w1);
writeln('ansilower: ',w4);
{$endif print}
if (w1 <> w2) then
doerror(3);
if (w4 <> w2) then
doerror(4);
w1:='AÉÈÀŁ'#$d87e#$dc04;
w2:='aéèàł'#$d87e#$dc04;
s:=w1;
w3:=s;
w4:=AnsiStrLower(pchar(s));
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=unicodelowercase(w1);
{$ifdef print}
writeln('unicodelower: ',w1);
writeln('ansistrlower: ',w4);
{$endif print}
if (w1 <> w2) then
doerror(3);
if (w4 <> w2) then
doerror(4);
end;
{ upper case testing with a missing utf-16 pair at the end }
procedure testupperinvalid;
@ -377,8 +498,12 @@ end;
begin
testupper;
writeln;
testupperu;
writeln;
testlower;
writeln;
testloweru;
writeln;
writeln;
testupperinvalid;
writeln;