* fixed wchar_t type (was: widechar, now is cint/cint32/long depending on

platform)
  + mbstate_t type for all unixes except BeOS (doesn't exist for BeOS)
  + implemented UpperAnsiStringProc/LowerAnsiStringProc for unix
  * fixed Ansi2UCS4Move in cwstring (although it isn't used anywhere
    currently)
  + test for Upper/LowerAnsiString

git-svn-id: trunk@9393 -
This commit is contained in:
Jonas Maebe 2007-12-05 17:42:35 +00:00
parent 997d7b33db
commit 68595c8b72
8 changed files with 685 additions and 10 deletions

1
.gitattributes vendored
View File

@ -7300,6 +7300,7 @@ tests/test/twide2.pp svneol=native#text/plain
tests/test/twide3.pp svneol=native#text/plain
tests/test/twide4.pp svneol=native#text/plain
tests/test/twide5.pp svneol=native#text/plain
tests/test/twide6.pp svneol=native#text/plain
tests/test/twrstr1.pp svneol=native#text/plain
tests/test/twrstr2.pp svneol=native#text/plain
tests/test/twrstr3.pp svneol=native#text/plain

View File

@ -90,7 +90,7 @@ type
pTime = ^time_t;
ptime_t = ^time_t;
wchar_t = widechar;
wchar_t = cint32;
pwchar_t = ^wchar_t;
socklen_t= cuint32;

View File

@ -77,9 +77,9 @@ type
pTime = ^time_t;
ptime_t = ^time_t;
wchar_t = widechar;
wchar_t = cint32;
pwchar_t = ^wchar_t;
wint_t = cint;
wint_t = cint32;
socklen_t= cuint32;
TSocklen = socklen_t;
@ -163,6 +163,13 @@ type
end;
pstatfs = ^tstatfs;
mbstate_t = record
case byte of
0: (__mbstate8: array[0..127] of char);
1: (_mbstateL: clonglong); { for alignment }
end;
pmbstate_t = ^mbstate_t;
pthread_t = pointer;
pthread_attr_t = record sig: clong; opaque: array[0..{$ifdef cpu64}56{$else}36{$endif}-1] of byte; end;
pthread_mutex_t = {$i pmutext.inc}

View File

@ -80,7 +80,7 @@ type
pUid = ^Uid_t;
wint_t = cint32;
wchar_t = widechar;
wchar_t = cint32;
pwchar_t = ^wchar_t;
@ -186,6 +186,13 @@ type
end;
PStatFS=^TStatFS;
mbstate_t = record
case byte of
0: (__mbstate8: array[0..127] of char);
1: (_mbstateL: cint64); { for alignment }
end;
pmbstate_t = ^mbstate_t;
ITimerVal= Record
It_Interval,
It_Value : TimeVal;

View File

@ -102,7 +102,7 @@ Type
pTime = ^time_t;
ptime_t = ^time_t;
wchar_t = widechar;
wchar_t = cint32;
pwchar_t = ^wchar_t;
{$ifdef cpu64}
@ -154,6 +154,18 @@ Type
end;
PStatFS=^TStatFS;
mbstate_value_t = record
case byte of
0: (__wch: wint_t);
1: (__wchb: array[0..3] of char);
end;
mbstate_t = record
__count: cint;
__value: mbstate_value_t;
end;
pmbstate_t = ^mbstate_t;
pthread_t = culong;
sched_param = record

View File

@ -129,7 +129,11 @@ Type
uint_t = cuint;
wchar_t = widechar;
{$ifdef cpu64}
wchar_t = cint;
{$else cpu64}
wchar_t = clong;
{$endif cpu64}
pwchar_t = ^wchar_t;
uid_t = cuint32; { used for user ID type }
@ -168,6 +172,15 @@ Type
end;
PStatFS=^TStatFS;
mbstate_t = record
{$ifdef cpu64}
__filler: array[0..3] of clong;
{$else cpu64}
__filler: array[0..5] of cint;
{$endif cpu64}
end;
pmbstate_t = ^mbstate_t;
clock32_t = int32_t;
timeval32 = record

View File

@ -14,6 +14,7 @@
**********************************************************************}
{$mode objfpc}
{$inline on}
unit cwstring;
@ -45,11 +46,22 @@ Const
{$endif}
{ helper functions from libc }
function tolower(__wc:cint):cint;cdecl;external libiconvname name 'tolower';
function toupper(__wc:cint):cint;cdecl;external libiconvname name 'toupper';
function towlower(__wc:wint_t):wint_t;cdecl;external libiconvname name 'towlower';
function towupper(__wc:wint_t):wint_t;cdecl;external libiconvname name 'towupper';
function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external libiconvname name 'wcscoll';
function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external libiconvname name 'strcoll';
function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
{$ifndef beos}
function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
{$else beos}
function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
{$endif beos}
const
{$ifdef linux}
@ -253,6 +265,92 @@ procedure Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt);
end;
(*
function LowerWideString(const s : WideString) : WideString;
var
i, slen : SizeInt;
{$if sizeof(wchar_t) = 4}
nc : wint_t;
resindex : SizeInt;
len : longint;
valid : boolean;
{$elseif sizeof(wchar_t) = 2}
p : PWideChar;
{$endif sizeof(wchar_t)}
begin
slen:=length(s);
SetLength(result,slen);
{$if sizeof(wint_t) = 4}
i:=1;
resindex:=1;
while (i<=slen) do
begin
nc:=utf16toutf32(s,i,len,valid);
inc(i,len);
if (valid) then
ConcatUTF32ToWideStr(towlower(nc),result,resindex)
else
{ do nothing }
pwidechar(@result[i])^:=s[i];
end;
{ adjust length }
setlength(result,resindex-1);
{$elseif sizeof(wchar_t) = 2}
{ avoid unique calls for each character }
p:=@result[1];
{ this will fail for surrogate pairs, but that's inherent }
{ to choosing sizeof(wint_t)=2, and nothing we can help }
for i:=1 to length(s) do
p[i]:=WideChar(towlower(wint_t(s[i])));
{$else}
{$error Unsupported wchar_t size}
{$endif}
end;
function UpperWideString(const s : WideString) : WideString;
var
i, slen : SizeInt;
{$if sizeof(wint_t) = 4}
nc : wint_t;
resindex : SizeInt;
len : longint;
valid : boolean;
{$elseif sizeof(wchar_t) = 2}
p : PWideChar;
{$endif sizeof(wchar_t)}
begin
slen:=length(s);
SetLength(result,slen);
{$if sizeof(wchar_t) = 4}
i:=1;
resindex:=1;
while (i<=slen) do
begin
nc:=utf16toutf32(s,i,len,valid);
inc(i,len);
if (valid) then
ConcatUTF32ToWideStr (towupper(nc),result,resindex)
else
{ do nothing }
pwidechar(@result[i])^:=s[i];
end;
{ adjust length }
setlength(result,resindex-1);
{$elseif sizeof(wchar_t) = 2}
{ avoid unique calls for each character }
p:=@result[1];
{ this will fail for surrogate pairs, but that's inherent }
{ to choosing sizeof(wint_t)=2, and nothing we can help }
for i:=1 to length(s) do
p[i]:=WideChar(towupper(wint_t(s[i])));
{$else}
{$error Unsupported wchar_t size}
{$endif}
end;
*)
function LowerWideString(const s : WideString) : WideString;
var
i : SizeInt;
@ -273,11 +371,193 @@ function UpperWideString(const s : WideString) : WideString;
end;
procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
begin
if (len>length(s)) then
if (length(s) < 10*256) then
setlength(s,length(s)+10)
else
setlength(s,length(s)+length(s) shr 8);
end;
procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
begin
EnsureAnsiLen(s,index);
pchar(@s[index])^:=c;
inc(index);
end;
{ concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
{$ifndef beos}
procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
{$else not beos}
procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
{$endif beos}
var
p : pchar;
mblen : size_t;
begin
{ we know that s is unique -> avoid uniquestring calls}
p:=@s[index];
if (nc<$7f) then
ConcatCharToAnsiStr(char(nc),s,index)
else
begin
{ en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
{ -> 10 should be enough? Should actually use MB_CUR_MAX, but }
{ that's a libc macro mapped to internal functions/variables }
{ and thus not a stable external API on systems where libc }
{ breaks backwards compatibility every now and then }
EnsureAnsiLen(s,index+10);
{$ifndef beos}
mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
{$else not beos}
mblen:=wctomb(p,wchar_t(nc));
{$endif not beos}
if (mblen<>size_t(-1)) then
inc(index,mblen)
else
begin
{ invalid wide char }
p^:='?';
inc(index);
end;
end;
end;
function LowerAnsiString(const s : AnsiString) : AnsiString;
var
i, slen,
resindex : SizeInt;
mblen : size_t;
{$ifndef beos}
ombstate,
nmbstate : mbstate_t;
{$endif beos}
wc : wchar_t;
begin
fillchar(ombstate,sizeof(ombstate),0);
fillchar(nmbstate,sizeof(nmbstate),0);
slen:=length(s);
SetLength(result,slen+10);
i:=1;
resindex:=1;
while (i<=slen) do
begin
if (s[i]<=#127) then
begin
ConcatCharToAnsiStr(char(tolower(cint(s[i]))),result,resindex);
inc(i)
end
else
begin
{$ifndef beos}
mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
{$else not beos}
mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
{$endif not beos}
case mblen of
size_t(-2):
begin
{ partial invalid character, copy literally }
while (i<=slen) do
begin
ConcatCharToAnsiStr(s[i],result,resindex);
inc(i);
end;
end;
size_t(-1), 0:
begin
{ invalid or null character }
ConcatCharToAnsiStr(s[i],result,resindex);
inc(i);
end;
else
begin
{ a valid sequence }
{ even if mblen = 1, the lowercase version may have a }
{ different length }
{ We can't do anything special if wchar_t is 16 bit... }
{$ifndef beos}
ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
{$else not beos}
ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
{$endif not beos}
inc(i,mblen);
end;
end;
end;
end;
SetLength(result,resindex-1);
end;
function UpperAnsiString(const s : AnsiString) : AnsiString;
var
i, slen,
resindex : SizeInt;
mblen : size_t;
ombstate,
nmbstate : mbstate_t;
wc : wchar_t;
begin
fillchar(ombstate,sizeof(ombstate),0);
fillchar(nmbstate,sizeof(nmbstate),0);
slen:=length(s);
SetLength(result,slen+10);
i:=1;
resindex:=1;
while (i<=slen) do
begin
if (s[i]<=#127) then
begin
ConcatCharToAnsiStr(char(toupper(cint(s[i]))),result,resindex);
inc(i)
end
else
begin
mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
case mblen of
size_t(-2):
begin
{ partial invalid character, copy literally }
while (i<=slen) do
begin
ConcatCharToAnsiStr(s[i],result,resindex);
inc(i);
end;
end;
size_t(-1), 0:
begin
{ invalid or null character }
ConcatCharToAnsiStr(s[i],result,resindex);
inc(i);
end;
else
begin
{ a valid sequence }
{ even if mblen = 1, the uppercase version may have a }
{ different length }
{ We can't do anything special if wchar_t is 16 bit... }
ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
inc(i,mblen);
end;
end;
end;
end;
SetLength(result,resindex-1);
end;
procedure Ansi2UCS4Move(source:pchar;var dest:UCS4String;len:SizeInt);
var
outlength,
outoffset,
outleft : size_t;
err: cint;
srcpos,
destpos: pchar;
mynil : pchar;
@ -294,7 +574,22 @@ procedure Ansi2UCS4Move(source:pchar;var dest:UCS4String;len:SizeInt);
outleft:=outlength*4;
while iconv(iconv_ansi2ucs4,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
begin
case fpgetCerrno of
err:=fpgetCerrno;
case err of
ESysEINVAL,
ESysEILSEQ:
begin
{ skip and set to '?' }
inc(srcpos);
dec(len);
plongint(destpos)^:=longint('?');
inc(destpos,4);
dec(outleft,4);
{ reset }
iconv(iconv_ansi2ucs4,@mynil,@my0,@mynil,@my0);
if err=ESysEINVAL then
break;
end;
ESysE2BIG:
begin
outoffset:=destpos-pchar(dest);
@ -375,9 +670,10 @@ begin
CompareTextWideStringProc:=@CompareTextWideString;
{
CharLengthPCharProc
UpperAnsiStringProc
LowerAnsiStringProc
}
UpperAnsiStringProc:=@UpperAnsiString;
LowerAnsiStringProc:=@LowerAnsiString;
{
CompareStrAnsiStringProc
CompareTextAnsiStringProc
}

339
tests/test/twide6.pp Normal file
View File

@ -0,0 +1,339 @@
{$codepage utf-8}
uses
{$ifdef unix}
cwstring,
{$endif}
sysutils;
{ normal upper case testing }
procedure testupper;
var
s: ansistring;
w1,w2,w3,w4: widestring;
i: longint;
begin
w1:='aé'#0'èàł'#$d87e#$dc04;
w2:='AÉ'#0'ÈÀŁ'#$d87e#$dc04;
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original upper: ',w2);
{$endif print}
s:=w1;
w3:=s;
w4:=AnsiUpperCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=wideuppercase(w1);
{$ifdef print}
writeln('wideupper: ',w1);
writeln('ansiupper: ',w4);
{$endif print}
if (w1 <> w2) then
halt(1);
if (w4 <> w2) then
halt(2);
end;
{ normal lower case testing }
procedure testlower;
var
s: ansistring;
w1,w2,w3,w4: widestring;
i: longint;
begin
w1:='AÉ'#0'ÈÀŁ'#$d87e#$dc04;
w2:='aé'#0'èàł'#$d87e#$dc04;
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original lower: ',w2);
{$endif print}
s:=w1;
w3:=s;
w4:=AnsiLowerCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=widelowercase(w1);
{$ifdef print}
writeln('widelower: ',w1);
writeln('ansilower: ',w4);
{$endif print}
if (w1 <> w2) then
halt(3);
if (w4 <> w2) then
halt(4);
end;
{ upper case testing with a missing utf-16 pair at the end }
procedure testupperinvalid;
var
s: ansistring;
w1,w2,w3,w4: widestring;
i: longint;
begin
{ missing utf-16 pair at end }
w1:='aé'#0'èàł'#$d87e;
w2:='AÉ'#0'ÈÀŁ'#$d87e;
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original upper: ',w2);
{$endif print}
s:=w1;
w3:=s;
w4:=AnsiUpperCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=wideuppercase(w1);
{$ifdef print}
writeln('wideupper: ',w1);
writeln('ansiupper: ',w4);
{$endif print}
if (w1 <> w2) then
halt(5);
if (w4 <> w2) then
halt(6);
end;
{ lower case testing with a missing utf-16 pair at the end }
procedure testlowerinvalid;
var
s: ansistring;
w1,w2,w3,w4: widestring;
i: longint;
begin
{ missing utf-16 pair at end}
w1:='AÉ'#0'ÈÀŁ'#$d87e;
w2:='aé'#0'èàł'#$d87e;
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original lower: ',w2);
{$endif print}
s:=w1;
w3:=s;
w4:=AnsiLowerCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=widelowercase(w1);
{$ifdef print}
writeln('widelower: ',w1);
writeln('ansilower: ',w4);
{$endif print}
if (w1 <> w2) then
halt(7);
if (w4 <> w2) then
halt(8);
end;
{ upper case testing with a missing utf-16 pair at the end, followed by a normal char }
procedure testupperinvalid1;
var
s: ansistring;
w1,w2,w3,w4: widestring;
i: longint;
begin
{ missing utf-16 pair at end with char after it}
w1:='aé'#0'èàł'#$d87e'j';
w2:='AÉ'#0'ÈÀŁ'#$d87e'J';
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original upper: ',w2);
{$endif print}
s:=w1;
w3:=s;
w4:=AnsiUpperCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=wideuppercase(w1);
{$ifdef print}
writeln('wideupper: ',w1);
writeln('ansiupper: ',w4);
{$endif print}
if (w1 <> w2) then
halt(9);
if (w4 <> w2) then
halt(10);
end;
{ lower case testing with a missing utf-16 pair at the end, followed by a normal char }
procedure testlowerinvalid1;
var
s: ansistring;
w1,w2,w3,w4: widestring;
i: longint;
begin
{ missing utf-16 pair at end with char after it}
w1:='AÉ'#0'ÈÀŁ'#$d87e'J';
w2:='aé'#0'èàł'#$d87e'j';
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original lower: ',w2);
{$endif print}
s:=w1;
w3:=s;
w4:=AnsiLowerCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=widelowercase(w1);
{$ifdef print}
writeln('widelower: ',w1);
writeln('ansilower: ',w4);
{$endif print}
if (w1 <> w2) then
halt(11);
if (w4 <> w2) then
halt(12);
end;
{ upper case testing with corrupting the utf-8 string after conversion }
procedure testupperinvalid2;
var
s: ansistring;
w1,w2,w3,w4: widestring;
i: longint;
begin
w1:='aé'#0'èàł'#$d87e#$dc04'ö';
w2:='AÉ'#0'ÈÀŁ'#$d87e#$dc04'Ö';
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original upper: ',w2);
{$endif print}
s:=w1;
{ truncate the last utf-8 character }
setlength(s,length(s)-1);
w3:=s;
{ adjust checking values for new length due to corruption }
if length(w3)<>length(w2) then
begin
setlength(w2,length(w3));
setlength(w1,length(w3));
end;
w4:=AnsiUpperCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=wideuppercase(w1);
{$ifdef print}
writeln('wideupper: ',w1);
writeln('ansiupper: ',w4);
{$endif print}
if (w1 <> w2) then
halt(13);
if (w4 <> w2) then
halt(14);
end;
{ lower case testing with corrupting the utf-8 string after conversion }
procedure testlowerinvalid2;
var
s: ansistring;
w1,w2,w3,w4: widestring;
i: longint;
begin
w1:='AÉ'#0'ÈÀŁ'#$d87e#$dc04'Ö';
w2:='aé'#0'èàł'#$d87e#$dc04'ö';
{$ifdef print}
// the utf-8 output can confuse the testsuite parser
writeln('original: ',w1);
writeln('original lower: ',w2);
{$endif print}
s:=w1;
{ truncate the last utf-8 character }
setlength(s,length(s)-1);
w3:=s;
{ adjust checking values for new length due to corruption }
if length(w3)<>length(w2) then
begin
setlength(w2,length(w3));
setlength(w1,length(w3));
end;
w4:=AnsiLowerCase(s);
{ filter out unsupported characters }
for i:=1 to length(w3) do
if w3[i]='?' then
begin
w2[i]:='?';
w1[i]:='?';
end;
w1:=widelowercase(w1);
{$ifdef print}
writeln('widelower: ',w1);
writeln('ansilower: ',w4);
{$endif print}
if (w1 <> w2) then
halt(15);
if (w4 <> w2) then
halt(16);
end;
begin
testupper;
writeln;
testlower;
writeln;
writeln;
testupperinvalid;
writeln;
testlowerinvalid;
writeln;
writeln;
testupperinvalid1;
writeln;
testlowerinvalid1;
writeln;
writeln;
testupperinvalid2;
writeln;
testlowerinvalid2;
end.