fpc/rtl/unix/cwstring.pp

1224 lines
34 KiB
ObjectPascal
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
This file is part of the Free Pascal run time library.
Copyright (c) 2005 by Florian Klaempfl,
member of the Free Pascal development team.
libc based wide string support
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
{$mode objfpc}
{$modeswitch unicodestrings-}
{$h-}
{$inline on}
{$implicitexceptions off}
{$IFNDEF FPC_DOTTEDUNITS}
unit cwstring;
{$ENDIF FPC_DOTTEDUNITS}
interface
procedure SetCWidestringManager;
implementation
{$linklib c}
// Linux (and maybe glibc platforms in general), have iconv in glibc.
{$if defined(linux) or defined(solaris)}
{$define iconv_is_in_libc}
{$endif}
{$ifdef netbsd}
{$ifndef DISABLE_ICONV_LIBC}
{$define iconv_is_in_libc}
{$endif}
{$endif}
{$ifdef dragonfly}
{ iconv function are included inside libc for DragonFly version 6.3 }
{ See https://man.dragonflybsd.org/?command=iconv&section=3 }
{$ifndef DISABLE_ICONV_LIBC}
{$define iconv_is_in_libc}
{$endif}
{$endif}
{ Modern FreeBSD releases also claim to have iconv
function inside libc, at least
this is stated explicitly in iconv(3) man
of FreeBSD from 10.0 release
PM 2023-05-05 }
{$ifdef freebsd}
{$ifndef DISABLE_ICONV_LIBC}
{$define iconv_is_in_libc}
{$endif}
{$endif}
{$ifndef iconv_is_in_libc}
{$if defined(haiku)}
{$linklib textencoding}
{$else}
{$linklib iconv}
{$endif}
{$define useiconv}
{$endif not iconv_is_in_libc}
{$i rtldefs.inc}
{$IFDEF FPC_DOTTEDUNITS}
Uses
UnixApi.Base,
System.CTypes,
UnixApi.Unix,
UnixApi.Types,
System.InitC,
System.DynLibs,
UnixApi.CP;
{$ELSE FPC_DOTTEDUNITS}
Uses
BaseUnix,
ctypes,
unix,
unixtype,
initc,
dynlibs,
unixcp;
{$ENDIF FPC_DOTTEDUNITS}
Const
{$ifndef useiconv}
libiconvname='c'; // is in libc under Linux.
libprefix='lib';
{$else}
{$ifdef haiku}
libiconvname='textencoding'; // is in libtextencoding under Haiku
libprefix='lib';
{$else}
{$if defined(darwin) or defined(aix)}
libiconvname='libiconv';
libprefix='';
{$else}
libiconvname='iconv';
libprefix='lib';
{$endif}
{$endif}
{$endif}
{ helper functions from libc }
function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
function strcoll (__s1:PAnsiChar; __s2:PAnsiChar):cint;cdecl;external clib name 'strcoll';
{$ifdef netbsd}
{ NetBSD has a new setlocale function defined in /usr/include/locale.h
that should be used }
function setlocale(category: cint; locale: PAnsiChar): PAnsiChar; cdecl; external clib name '__setlocale50';
{$else}
function setlocale(category: cint; locale: PAnsiChar): PAnsiChar; cdecl; external clib name 'setlocale';
{$endif}
{$if not(defined(beos) and not defined(haiku))}
function mbrtowc(pwc: pwchar_t; const s: PAnsiChar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
function wcrtomb(s: PAnsiChar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
function mbrlen(const s: PAnsiChar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
{$else beos}
function mbtowc(pwc: pwchar_t; const s: PAnsiChar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
function wctomb(s: PAnsiChar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
function mblen(const s: PAnsiChar; n: size_t): size_t; cdecl; external clib name 'mblen';
{$endif beos}
const
{$if defined(linux)}
__LC_CTYPE = 0;
LC_ALL = 6;
_NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
_NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
CODESET = _NL_CTYPE_CODESET_NAME;
{$elseif defined(darwin)}
CODESET = 0;
LC_ALL = 0;
{$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
__LC_CTYPE = 0;
LC_ALL = 0;
_NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
_NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
CODESET = 0; // _NL_CTYPE_CODESET_NAME;
{$elseif defined(solaris)}
{$define ACCEPT_646}
CODESET=49;
LC_ALL = 6;
{$elseif defined(beos)}
{$ifdef haiku}
CODESET= 0; // Checked for Haiku
LC_ALL = 0; // Checked for Haiku
{$else}
{$warning check correct value for BeOS}
CODESET=49;
LC_ALL = 6; // Checked for BeOS
{$endif}
ESysEILSEQ = EILSEQ;
{$elseif defined(OpenBSD)}
{$define ACCEPT_646}
CODESET = 51;
LC_ALL = 0;
{$elseif defined(NetBSD)}
{$define ACCEPT_646}
CODESET = 51;
LC_ALL = 0;
{$elseif defined(aix)}
CODESET = 49;
LC_ALL = -1;
{$elseif defined(dragonfly)}
CODESET = 0;
LC_ALL = 0;
__LC_CTYPE = 0;
_NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
_NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
{$else not aix}
{$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
// and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
{$endif}
{ unicode encoding name }
{$ifdef FPC_LITTLE_ENDIAN}
unicode_encoding2 = 'UTF-16LE';
unicode_encoding4 = 'UCS-4LE';
{$else FPC_LITTLE_ENDIAN}
{$ifdef AIX}
unicode_encoding2 = 'UTF-16';
{$else AIX}
unicode_encoding2 = 'UTF-16BE';
unicode_encoding4 = 'UCS-4BE';
{$endif AIX}
{$endif FPC_LITTLE_ENDIAN}
{ en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
{ -> 10 should be enough? Should actually use MB_CUR_MAX, but }
{ that's a libc macro mapped to internal functions/variables }
{ and thus not a stable external API on systems where libc }
{ breaks backwards compatibility every now and then }
MB_CUR_MAX = 10;
{ Requests for iconvctl }
ICONV_TRIVIALP = 0; // int *argument
ICONV_GET_TRANSLITERATE = 1; // int *argument
ICONV_SET_TRANSLITERATE = 2; // const int *argument
ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
type
piconv_t = ^iconv_t;
iconv_t = pointer;
nl_item = cint;
{$ifdef haiku}
function nl_langinfo(__item:nl_item):PAnsiChar;cdecl;external 'root' name 'nl_langinfo';
{$else}
{$ifndef beos}
function nl_langinfo(__item:nl_item):PAnsiChar;cdecl;external libiconvname name 'nl_langinfo';
{$endif}
{$endif}
{$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
function iconv_open(__tocode:PAnsiChar; __fromcode:PAnsiChar):iconv_t;cdecl;external libiconvname name 'iconv_open';
function iconv(__cd:iconv_t; __inbuf:PPAnsiChar; __inbytesleft:psize_t; __outbuf:PPAnsiChar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
const
iconvctlname='iconvctl';
{$else}
function iconv_open(__tocode:PAnsiChar; __fromcode:PAnsiChar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
function iconv(__cd:iconv_t; __inbuf:PPAnsiChar; __inbytesleft:psize_t; __outbuf:PPAnsiChar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
const
iconvctlname='libiconvctl';
{$endif}
var
iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
Const
UTF8Name : ShortString = 'UTF-8';
TransLitname : ShortString ='//TRANSLIT';
threadvar
iconv_ansi2wide,
iconv_wide2ansi : iconv_t;
{ since we cache the iconv_t converters, we have to do the same
for the DefaultSystemCodePage variable since if it changes, we
have to re-initialize the converters too. We can't do that via
a callback in the widestring manager because DefaultSystemCodePage
is not a threadvar and we can't automatically change this in all
threads }
current_DefaultSystemCodePage: TSystemCodePage;
procedure InitThread;
var
transliterate: cint;
{$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
iconvindex: longint;
{$endif}
iconvname, toencoding: shortstring;
begin
current_DefaultSystemCodePage:=DefaultSystemCodePage;
{$if declared(iconvindex)}
iconvindex:=GetCodepageData(DefaultSystemCodePage);
if iconvindex<>-1 then
iconvname:=UnixCpMap[iconvindex].name
else
{ default to UTF-8 on Unix platforms }
iconvname:=UTF8Name;
{$else}
{ Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
iconvname:='UTF-8';
{$endif}
toencoding:=iconvname;
if not assigned(iconvctl) then
toencoding:=toencoding+TransLitName+#0;
iconvname:=iconvname+#0;
iconv_wide2ansi:=iconv_open(PAnsiChar(@toencoding[1]),unicode_encoding2);
iconv_ansi2wide:=iconv_open(unicode_encoding2,PAnsiChar(@iconvname[1]));
if assigned(iconvctl) and
(iconv_wide2ansi<>iconv_t(-1)) then
begin
transliterate:=1;
iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
end;
end;
procedure FiniThread;
begin
if (iconv_wide2ansi <> iconv_t(-1)) then
iconv_close(iconv_wide2ansi);
if (iconv_ansi2wide <> iconv_t(-1)) then
iconv_close(iconv_ansi2wide);
end;
{$if defined(beos) and not defined(haiku)}
function nl_langinfo(__item:nl_item):PAnsiChar;
begin
{$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
// Now implement the minimum required to correctly initialize WideString support
case __item of
CODESET : Result := 'UTF-8'; // BeOS use UTF-8
else
begin
Assert(False, 'nl_langinfo was called with an unknown nl_item value');
Result := '';
end;
end;
end;
{$endif}
function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: PAnsiChar; cp_is_from: boolean): iconv_t;
var
iconvindex: longint;
toencoding: rawbytestring;
transliterate: cint;
begin
{ TODO: add caching (then we also don't need separate code for
the default system page and other ones)
-- typecasting an ansistring function result to PAnsiChar is
unsafe normally, but these are constant strings -> no
problem }
open_iconv_for_cps:=iconv_t(-1);
iconvindex:=GetCodepageData(cp);
if iconvindex=-1 then
exit;
repeat
if cp_is_from then
open_iconv_for_cps:=iconv_open(otherencoding,PAnsiChar(UnixCpMap[iconvindex].name))
else
begin
toencoding:=UnixCpMap[iconvindex].name;
if not assigned(iconvctl) then
toencoding:=toencoding+'//TRANSLIT';
open_iconv_for_cps:=iconv_open(PAnsiChar(toencoding),otherencoding);
end;
inc(iconvindex);
until (open_iconv_for_cps<>iconv_t(-1)) or
(iconvindex>high(UnixCpMap)) or
(UnixCpMap[iconvindex].cp<>cp);
if not cp_is_from and
(open_iconv_for_cps<>iconv_t(-1)) and
assigned(iconvctl) then
begin
transliterate:=1;
iconvctl(open_iconv_for_cps,ICONV_SET_TRANSLITERATE,@transliterate);
end;
end;
{$ifdef aix}
{$i cwstraix.inc}
{$endif aix}
procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
var
outlength,
outoffset,
srclen,
outleft : size_t;
use_iconv: iconv_t;
srcpos : pwidechar;
destpos: PAnsiChar;
mynil : PAnsiChar;
my0 : size_t;
err : longint;
transliterate: cint;
free_iconv: boolean;
{$ifdef aix}
intermediate: rawbytestring;
{$endif aix}
begin
{$ifdef aix}
{ AIX libiconv does not support converting cp866 to anything else except
for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
if cp=866 then
begin
Wide2AnsiMove(source,intermediate,28595,len);
if handle_aix_intermediate(PAnsiChar(intermediate),28595,cp,dest,len) then
exit;
end;
{$endif aix}
if (cp=DefaultSystemCodePage) then
begin
{ update iconv converter in case the DefaultSystemCodePage has been
changed }
if current_DefaultSystemCodePage<>DefaultSystemCodePage then
begin
FiniThread;
InitThread;
end;
use_iconv:=iconv_wide2ansi;
free_iconv:=false;
end
else
begin
use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
if (use_iconv<>iconv_t(-1)) and
assigned(iconvctl) then
begin
transliterate:=1;
iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
end;
free_iconv:=true;
end;
{ unsupported encoding -> default move }
if use_iconv=iconv_t(-1) then
begin
DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
exit;
end;
mynil:=nil;
my0:=0;
{ rought estimation }
setlength(dest,len*3);
outlength:=len*3;
srclen:=len*2;
srcpos:=source;
destpos:=PAnsiChar(dest);
outleft:=outlength;
while iconv(use_iconv,PPAnsiChar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
begin
err:=fpgetCerrno;
case err of
{ last character is incomplete sequence }
ESysEINVAL,
{ incomplete sequence in the middle }
ESysEILSEQ:
begin
{ skip and set to '?' }
inc(srcpos);
dec(srclen,2);
destpos^:='?';
inc(destpos);
dec(outleft);
{ reset }
iconv(use_iconv,@mynil,@my0,@mynil,@my0);
if err=ESysEINVAL then
break;
end;
ESysE2BIG:
begin
outoffset:=destpos-PAnsiChar(dest);
{ extend }
setlength(dest,outlength+len*3);
inc(outleft,len*3);
inc(outlength,len*3);
{ string could have been moved }
destpos:=PAnsiChar(dest)+outoffset;
end;
else
runerror(231);
end;
end;
// truncate string
setlength(dest,length(dest)-outleft);
SetCodePage(dest,cp,false);
if free_iconv then
iconv_close(use_iconv);
end;
procedure Ansi2WideMove(source:PAnsiChar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
var
outlength,
outoffset,
outleft : size_t;
use_iconv: iconv_t;
srcpos,
destpos: PAnsiChar;
mynil : PAnsiChar;
my0 : size_t;
err: cint;
iconvindex: longint;
free_iconv: boolean;
{$ifdef aix}
intermediate: rawbytestring;
{$endif aix}
begin
{$ifdef aix}
{ AIX libiconv does not support converting cp866 to anything else except
for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
if cp=866 then
begin
if handle_aix_intermediate(source,cp,cp,intermediate,len) then
source:=PAnsiChar(intermediate);
end;
{$endif aix}
if (cp=DefaultSystemCodePage) then
begin
{ update iconv converter in case the DefaultSystemCodePage has been
changed }
if current_DefaultSystemCodePage<>DefaultSystemCodePage then
begin
FiniThread;
InitThread;
end;
use_iconv:=iconv_ansi2wide;
free_iconv:=false;
// Using Unicode RTL, we can end up here when iconv_ansi2wide is not yet initialized
if (use_iconv=Nil) then
begin
DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
exit;
end;
end
else
begin
{ TODO: add caching (then we also don't need separate code for
the default system page and other ones)
-- typecasting an ansistring function result to PAnsiChar is
unsafe normally, but these are constant strings -> no
problem }
use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
free_iconv:=true;
end;
{ unsupported encoding -> default move }
if use_iconv=iconv_t(-1) then
begin
DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
exit;
end;
mynil:=nil;
my0:=0;
// extra space
outlength:=len+1;
setlength(dest,outlength);
srcpos:=source;
destpos:=PAnsiChar(dest);
outleft:=outlength*2;
while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
begin
err:=fpgetCerrno;
case err of
ESysEINVAL,
ESysEILSEQ:
begin
{ skip and set to '?' }
inc(srcpos);
dec(len);
pwidechar(destpos)^:='?';
inc(destpos,2);
dec(outleft,2);
{ reset }
iconv(use_iconv,@mynil,@my0,@mynil,@my0);
if err=ESysEINVAL then
break;
end;
ESysE2BIG:
begin
outoffset:=destpos-PAnsiChar(dest);
{ extend }
setlength(dest,outlength+len);
inc(outleft,len*2);
inc(outlength,len);
{ string could have been moved }
destpos:=PAnsiChar(dest)+outoffset;
end;
else
runerror(231);
end;
end;
// truncate string
setlength(dest,length(dest)-outleft div 2);
if free_iconv then
iconv_close(use_iconv);
end;
function LowerWideString(const s : WideString) : WideString;
var
i : SizeInt;
begin
SetLength(result,length(s));
for i:=0 to length(s)-1 do
pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
end;
function UpperWideString(const s : WideString) : WideString;
var
i : SizeInt;
begin
SetLength(result,length(s));
for i:=0 to length(s)-1 do
pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
end;
procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
begin
if (len>length(s)) then
if (length(s) < 10*256) then
setlength(s,length(s)+10)
else
setlength(s,length(s)+length(s) shr 8);
end;
procedure ConcatCharToAnsiStr(const c: AnsiChar; var S: AnsiString; var index: SizeInt);
begin
EnsureAnsiLen(s,index);
PAnsiChar(@s[index])^:=c;
inc(index);
end;
{ concatenates an utf-32 Char to a ansistring. S *must* be unique when entering. }
{$if not(defined(beos) and not defined(haiku))}
procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
{$else not beos}
procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
{$endif beos}
var
p : PAnsiChar;
mblen : size_t;
begin
{ we know that s is unique -> avoid uniquestring calls}
p:=@s[index];
if (nc<=127) then
ConcatCharToAnsiStr(AnsiChar(nc),s,index)
else
begin
EnsureAnsiLen(s,index+MB_CUR_MAX);
{$if not(defined(beos) and not defined(haiku))}
mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
{$else not beos}
mblen:=wctomb(p,wchar_t(nc));
{$endif not beos}
if (mblen<>size_t(-1)) then
inc(index,mblen)
else
begin
{ invalid wide AnsiChar }
p^:='?';
inc(index);
end;
end;
end;
function LowerAnsiString(const s : AnsiString) : AnsiString;
var
i, slen,
resindex : SizeInt;
mblen : size_t;
{$if not(defined(beos) and not defined(haiku))}
ombstate,
nmbstate : mbstate_t;
{$endif beos}
wc : wchar_t;
begin
{$if not(defined(beos) and not defined(haiku))}
fillchar(ombstate,sizeof(ombstate),0);
fillchar(nmbstate,sizeof(nmbstate),0);
{$endif beos}
slen:=length(s);
SetLength(result,slen+10);
i:=1;
resindex:=1;
while (i<=slen) do
begin
if (s[i]<=#127) then
begin
wc:=wchar_t(s[i]);
mblen:= 1;
end
else
{$if not(defined(beos) and not defined(haiku))}
mblen:=mbrtowc(@wc, PAnsiChar(@s[i]), slen-i+1, @ombstate);
{$else not beos}
mblen:=mbtowc(@wc, PAnsiChar(@s[i]), slen-i+1);
{$endif not beos}
case mblen of
size_t(-2):
begin
{ partial invalid character, copy literally }
while (i<=slen) do
begin
ConcatCharToAnsiStr(s[i],result,resindex);
inc(i);
end;
end;
size_t(-1), 0:
begin
{ invalid or null character }
ConcatCharToAnsiStr(s[i],result,resindex);
inc(i);
end;
else
begin
{ a valid sequence }
{ even if mblen = 1, the lowercase version may have a }
{ different length }
{ We can't do anything special if wchar_t is 16 bit... }
{$if not(defined(beos) and not defined(haiku))}
ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
{$else not beos}
ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
{$endif not beos}
inc(i,mblen);
end;
end;
end;
SetLength(result,resindex-1);
end;
function UpperAnsiString(const s : AnsiString) : AnsiString;
var
i, slen,
resindex : SizeInt;
mblen : size_t;
{$if not(defined(beos) and not defined(haiku))}
ombstate,
nmbstate : mbstate_t;
{$endif beos}
wc : wchar_t;
begin
{$if not(defined(beos) and not defined(haiku))}
fillchar(ombstate,sizeof(ombstate),0);
fillchar(nmbstate,sizeof(nmbstate),0);
{$endif beos}
slen:=length(s);
SetLength(result,slen+10);
i:=1;
resindex:=1;
while (i<=slen) do
begin
if (s[i]<=#127) then
begin
wc:=wchar_t(s[i]);
mblen:= 1;
end
else
{$if not(defined(beos) and not defined(haiku))}
mblen:=mbrtowc(@wc, PAnsiChar(@s[i]), slen-i+1, @ombstate);
{$else not beos}
mblen:=mbtowc(@wc, PAnsiChar(@s[i]), slen-i+1);
{$endif beos}
case mblen of
size_t(-2):
begin
{ partial invalid character, copy literally }
while (i<=slen) do
begin
ConcatCharToAnsiStr(s[i],result,resindex);
inc(i);
end;
end;
size_t(-1), 0:
begin
{ invalid or null character }
ConcatCharToAnsiStr(s[i],result,resindex);
inc(i);
end;
else
begin
{ a valid sequence }
{ even if mblen = 1, the uppercase version may have a }
{ different length }
{ We can't do anything special if wchar_t is 16 bit... }
{$if not(defined(beos) and not defined(haiku))}
ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
{$else not beos}
ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
{$endif not beos}
inc(i,mblen);
end;
end;
end;
SetLength(result,resindex-1);
end;
function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
var
i, slen,
destindex : SizeInt;
uch : UCS4Char;
begin
slen:=length(s);
setlength(result,slen+1);
i:=1;
destindex:=0;
while (i<=slen) do
begin
uch:=UCS4Char(s[i]);
if (uch=0) then
result[destindex]:=32
else if (uch<=$d7ff) or (uch>=$e000) then
result[destindex]:=uch
else if (uch<=$dbff) and
(i<slen) and
(s[i+1]>=#$dc00) and
(s[i+1]<=#$dfff) then
begin
result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
inc(i);
end
else { invalid surrogate pair }
result[destindex]:=uch;
inc(i);
inc(destindex);
end;
result[destindex]:=UCS4Char(0);
{ Trimming length in this particular case is just a waste of time,
because result will be interpreted as null-terminated and discarded
almost immediately }
end;
function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
{$if not(defined (aix) and defined(cpupowerpc32))}
var
hs1,hs2 : UCS4String;
us1,us2 : WideString;
begin
{ wcscoll interprets null chars as end-of-string -> filter out }
if coIgnoreCase in Options then
begin
us1:=UpperWideString(s1);
us2:=UpperWideString(s2);
end
else
begin
us1:=s1;
us2:=s2;
end;
hs1:=WideStringToUCS4StringNoNulls(us1);
hs2:=WideStringToUCS4StringNoNulls(us2);
result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
end;
{$else}
{ AIX/PPC32 has a 16 bit wchar_t }
var
i, len: longint;
us1,us2 : WideString;
hs1, hs2: array of widechar;
begin
if coIgnoreCase in Options then
begin
us1:=UpperWideString(s1);
us2:=UpperWideString(s2);
end
else
begin
us1:=s1;
us2:=s2;
end;
len:=length(us1);
setlength(hs1,len+1);
for i:=1 to len do
if us1[i]<>#0 then
hs1[i-1]:=us1[i]
else
hs1[i-1]:=#32;
hs1[len]:=#0;
len:=length(us2);
setlength(hs2,len+1);
for i:=1 to len do
if us2[i]<>#0 then
hs2[i-1]:=us2[i]
else
hs2[i-1]:=#32;
hs2[len]:=#0;
result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
end;
{$endif}
{ return value: number of code points in the string. Whenever an invalid
code point is encountered, all characters part of this invalid code point
are considered to form one "character" and the next character is
considered to be the start of a new (possibly also invalid) code point }
function CharLengthPChar(const Str: PAnsiChar): PtrInt;
var
nextlen: ptrint;
s: PAnsiChar;
{$if not(defined(beos) and not defined(haiku))}
mbstate: mbstate_t;
{$endif not beos}
begin
result:=0;
s:=str;
{$if not(defined(beos) and not defined(haiku))}
fillchar(mbstate,sizeof(mbstate),0);
{$endif not beos}
repeat
{$if defined(beos) and not defined(haiku)}
nextlen:=ptrint(mblen(s,MB_CUR_MAX));
{$else beos}
nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
{$endif beos}
{ skip invalid/incomplete sequences }
if (nextlen<0) then
nextlen:=1;
inc(result,1);
inc(s,nextlen);
until (nextlen=0);
end;
function CodePointLength(const Str: PAnsiChar; maxlookahead: ptrint): PtrInt;
{$if not(defined(beos) and not defined(haiku))}
var
mbstate: mbstate_t;
{$endif not beos}
begin
{$if defined(beos) and not defined(haiku)}
result:=ptrint(mblen(str,maxlookahead));
{$else beos}
fillchar(mbstate,sizeof(mbstate),0);
result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
{ mbrlen can also return -2 for "incomplete but potially valid character
and data has been processed" }
if result<0 then
result:=-1;
{$endif beos}
end;
function StrCompAnsiIntern(s1,s2 : PAnsiChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
var
a,b: PAnsiChar;
i: PtrInt;
begin
if not(canmodifys1) then
getmem(a,len1+1)
else
a:=s1;
for i:=0 to len1-1 do
if s1[i]<>#0 then
a[i]:=s1[i]
else
a[i]:=#32;
a[len1]:=#0;
if not(canmodifys2) then
getmem(b,len2+1)
else
b:=s2;
for i:=0 to len2-1 do
if s2[i]<>#0 then
b[i]:=s2[i]
else
b[i]:=#32;
b[len2]:=#0;
result:=strcoll(a,b);
if not(canmodifys1) then
freemem(a);
if not(canmodifys2) then
freemem(b);
end;
function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
begin
result:=StrCompAnsiIntern(PAnsiChar(s1),PAnsiChar(s2),length(s1),length(s2),false,false);
end;
function StrCompAnsi(s1,s2 : PAnsiChar): PtrInt;
begin
result:=strcoll(s1,s2);
end;
function AnsiCompareText(const S1, S2: ansistring): PtrInt;
var
a, b: AnsiString;
begin
a:=UpperAnsistring(s1);
b:=UpperAnsistring(s2);
result:=StrCompAnsiIntern(PAnsiChar(a),PAnsiChar(b),length(a),length(b),true,true);
end;
function AnsiStrIComp(S1, S2: PAnsiChar): PtrInt;
begin
result:=AnsiCompareText(ansistring(s1),ansistring(s2));
end;
function AnsiStrLComp(S1, S2: PAnsiChar; MaxLen: PtrUInt): PtrInt;
var
a, b: PAnsiChar;
begin
if (maxlen=0) then
exit(0);
if (s1[maxlen]<>#0) then
begin
getmem(a,maxlen+1);
move(s1^,a^,maxlen);
a[maxlen]:=#0;
end
else
a:=s1;
if (s2[maxlen]<>#0) then
begin
getmem(b,maxlen+1);
move(s2^,b^,maxlen);
b[maxlen]:=#0;
end
else
b:=s2;
result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
if (a<>s1) then
freemem(a);
if (b<>s2) then
freemem(b);
end;
function AnsiStrLIComp(S1, S2: PAnsiChar; MaxLen: PtrUInt): PtrInt;
var
a, b: ansistring;
begin
if (maxlen=0) then
exit(0);
setlength(a,maxlen);
move(s1^,a[1],maxlen);
setlength(b,maxlen);
move(s2^,b[1],maxlen);
result:=AnsiCompareText(a,b);
end;
procedure ansi2pchar(const s: ansistring; const orgp: PAnsiChar; out p: PAnsiChar);
var
newlen: sizeint;
begin
newlen:=length(s);
if newlen>strlen(orgp) then
fpc_rangeerror;
p:=orgp;
if (newlen>0) then
move(s[1],p[0],newlen);
p[newlen]:=#0;
end;
function AnsiStrLower(Str: PAnsiChar): PAnsiChar;
var
temp: ansistring;
begin
temp:=loweransistring(str);
ansi2pchar(temp,str,result);
end;
function AnsiStrUpper(Str: PAnsiChar): PAnsiChar;
var
temp: ansistring;
begin
temp:=upperansistring(str);
ansi2pchar(temp,str,result);
end;
function envvarset(const varname: PAnsiChar): boolean;
var
varval: PAnsiChar;
begin
varval:=fpgetenv(varname);
result:=
assigned(varval) and
(varval[0]<>#0);
end;
function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
var
langinfo: PAnsiChar;
begin
{$ifdef FPCRTL_FILESYSTEM_UTF8}
if stdcp=scpFileSystemSingleByte then
begin
result:=CP_UTF8;
exit;
end;
{$endif}
{ if none of the relevant LC_* environment variables are set, fall back to
UTF-8 (this happens under some versions of OS X for GUI applications, which
otherwise get CP_ASCII) }
if envvarset('LC_ALL') or
envvarset('LC_CTYPE') or
envvarset('LANG') then
begin
langinfo:=nl_langinfo(CODESET);
{ there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
that causes it to return an empty string of UTF-8 locales
-> patch up (and in general, UTF-8 is a good default on
Unix platforms) }
if not assigned(langinfo) or
(langinfo^=#0) then
langinfo:='UTF-8';
Result:=GetCodepageByName(ansistring(langinfo));
end
else
Result:={$IFDEF FPC_DOTTEDUNITS}UnixApi.CP{$ELSE}unixcp{$ENDIF}.GetSystemCodepage;
end;
procedure SetStdIOCodePage(var T: Text); inline;
begin
case TextRec(T).Mode of
fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
end;
end;
procedure SetStdIOCodePages; inline;
begin
SetStdIOCodePage(Input);
SetStdIOCodePage(Output);
SetStdIOCodePage(ErrOutput);
SetStdIOCodePage(StdOut);
SetStdIOCodePage(StdErr);
end;
var
OrgWideStringManager: TUnicodeStringManager;
Procedure SetCWideStringManager;
Var
CWideStringManager : TUnicodeStringManager;
begin
GetUnicodeStringManager(OrgWideStringManager);
CWideStringManager:=OrgWideStringManager;
With CWideStringManager do
begin
Wide2AnsiMoveProc:=@Wide2AnsiMove;
Ansi2WideMoveProc:=@Ansi2WideMove;
UpperWideStringProc:=@UpperWideString;
LowerWideStringProc:=@LowerWideString;
CompareWideStringProc:=@CompareWideString;
// CompareTextWideStringProc:=@CompareTextWideString;
CharLengthPCharProc:=@CharLengthPChar;
CodePointLengthProc:=@CodePointLength;
UpperAnsiStringProc:=@UpperAnsiString;
LowerAnsiStringProc:=@LowerAnsiString;
CompareStrAnsiStringProc:=@CompareStrAnsiString;
CompareTextAnsiStringProc:=@AnsiCompareText;
StrCompAnsiStringProc:=@StrCompAnsi;
StrICompAnsiStringProc:=@AnsiStrIComp;
StrLCompAnsiStringProc:=@AnsiStrLComp;
StrLICompAnsiStringProc:=@AnsiStrLIComp;
StrLowerAnsiStringProc:=@AnsiStrLower;
StrUpperAnsiStringProc:=@AnsiStrUpper;
ThreadInitProc:=@InitThread;
ThreadFiniProc:=@FiniThread;
{ Unicode }
Unicode2AnsiMoveProc:=@Wide2AnsiMove;
Ansi2UnicodeMoveProc:=@Ansi2WideMove;
UpperUnicodeStringProc:=@UpperWideString;
LowerUnicodeStringProc:=@LowerWideString;
CompareUnicodeStringProc:=@CompareWideString;
{ CodePage }
GetStandardCodePageProc:=@GetStandardCodePage;
end;
SetUnicodeStringManager(CWideStringManager);
end;
var
iconvlib:TLibHandle;
initialization
SetCWideStringManager;
{ you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
{ with the information from the environment variables according to POSIX }
{ (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
setlocale(LC_ALL,'');
{ load iconv library and iconvctl function }
iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
if iconvlib=0 then
iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix+'.6');
if iconvlib<>0 then
pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
{ set the DefaultSystemCodePage }
DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
SetStdIOCodePages;
{ init conversion tables for main program }
InitThread;
finalization
{ fini conversion tables for main program }
FiniThread;
{ unload iconv library }
if iconvlib<>0 then
FreeLibrary(iconvlib);
{ restore previous (probably default) widestring manager so that subsequent calls
into the widestring manager won't trigger the finalized functionality }
SetWideStringManager(OrgWideStringManager);
end.