{ This file is part of the Free Pascal run time library. Copyright (c) 1999-2005 by Florian Klaempfl, Copyright (c) 2011 by Jonas Maebe, members of the Free Pascal development team. This file implements support routines for UTF-8 strings with FPC See the file COPYING.FPC, included in this distribution, for details about the copyright. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. **********************************************************************} {$i wustrings.inc} { This file contains the implementation of the UnicodeString type, which on the Java platforms is an alias for java.lang.String } Function NewUnicodeString(Len : SizeInt) : JLString; { Allocate a new UnicodeString on the heap. initialize it to zero length and reference count 1. } var data: array of jchar; begin setlength(data,len); result:=JLString.create(data); end; procedure fpc_UnicodeStr_To_ShortStr (out res: ShortString;const S2 : UnicodeString); [Public, alias: 'FPC_UNICODESTR_TO_SHORTSTR'];compilerproc; { Converts a UnicodeString to a ShortString; } Var Size : SizeInt; temp : ansistring; begin res:=''; Size:=Length(S2); if Size>0 then begin temp:=s2; res:=temp; end; end; Function fpc_ShortStr_To_UnicodeStr (Const S2 : ShortString): UnicodeString;compilerproc; { Converts a ShortString to a UnicodeString; } Var Size : SizeInt; begin result:=''; Size:=Length(S2); if Size>0 then result:=unicodestring(JLString.Create(TJByteArray(ShortstringClass(@S2).fdata),0,length(S2))); end; Function fpc_UnicodeStr_To_AnsiStr (const S2 : UnicodeString): AnsiString; compilerproc; { Converts a UnicodeString to an AnsiString } Var Size : SizeInt; begin result:=Ansistring(AnsistringClass.Create(s2)); end; Function fpc_AnsiStr_To_UnicodeStr (Const S2 : AnsiString): UnicodeString; compilerproc; { Converts an AnsiString to a UnicodeString; } Var Size : SizeInt; begin if length(s2)=0 then result:='' else result:=AnsistringClass(S2).toString; end; Function fpc_UnicodeStr_To_WideStr (const S2 : UnicodeString): WideString; compilerproc; begin result:=s2; end; Function fpc_WideStr_To_UnicodeStr (Const S2 : WideString): UnicodeString; compilerproc; begin result:=s2; end; function fpc_UnicodeStr_Concat (const S1,S2 : UnicodeString): UnicodeString; compilerproc; Var sb: JLStringBuilder; begin { only assign if s1 or s2 is empty } if (length(S1)=0) then begin result:=s2; exit; end; if (length(S2)=0) then begin result:=s1; exit; end; sb:=JLStringBuilder.create(S1); sb.append(s2); result:=sb.toString; end; function fpc_UnicodeStr_Concat_multi (const sarr:array of Unicodestring): unicodestring; compilerproc; Var i : Longint; Size,NewSize : SizeInt; sb: JLStringBuilder; begin { First calculate size of the result so we can allocate a StringBuilder of the right size } NewSize:=0; for i:=low(sarr) to high(sarr) do inc(Newsize,length(sarr[i])); sb:=JLStringBuilder.create(NewSize); for i:=low(sarr) to high(sarr) do begin if length(sarr[i])>0 then sb.append(sarr[i]); end; result:=sb.toString; end; Function fpc_Char_To_UChar(const c : AnsiChar): UnicodeChar; compilerproc; var str: JLString; arr: array of jbyte; begin setlength(arr,1); arr[0]:=ord(c); result:=JLString.create(arr,0,1).charAt(0); end; Function fpc_Char_To_UnicodeStr(const c : AnsiChar): UnicodeString; compilerproc; { Converts a AnsiChar to a UnicodeString; } var str: JLString; arr: array of jbyte; begin setlength(arr,1); arr[0]:=ord(c); result:=JLString.create(arr,0,1); end; Function fpc_UChar_To_Char(const c : UnicodeChar): AnsiChar; compilerproc; { Converts a UnicodeChar to a AnsiChar; } var arrb: array of jbyte; arrw: array of jchar; str: JLString; begin setlength(arrw,1); arrw[0]:=c; str:=JLString.create(arrw); arrb:=str.getbytes(); result:=chr(arrb[0]); end; Function fpc_WChar_To_UnicodeStr(const c : WideChar): UnicodeString; compilerproc; { Converts a WideChar to a UnicodeString; } var arrw: array of jchar; begin setlength(arrw,1); arrw[0]:=c; result:=JLString.create(arrw); end; Function fpc_Char_To_WChar(const c : AnsiChar): WideChar; compilerproc; { Converts a AnsiChar to a WideChar; } var str: JLString; arr: array of jbyte; begin setlength(arr,1); arr[0]:=ord(c); result:=JLString.create(arr,0,1).charAt(0); end; Function fpc_WChar_To_Char(const c : WideChar): AnsiChar; compilerproc; { Converts a WideChar to a AnsiChar; } var arrb: array of jbyte; arrw: array of jchar; begin setlength(arrw,1); arrw[0]:=c; arrb:=JLString.create(arrw).getbytes(); result:=chr(arrb[0]); end; procedure fpc_WChar_To_ShortStr(out res : shortstring;const c : WideChar) compilerproc; { Converts a WideChar to a ShortString; } var u: unicodestring; begin u:=c; res:=u; end; Function fpc_UChar_To_UnicodeStr(const c : UnicodeChar): UnicodeString; compilerproc; { Converts a UnicodeChar to a UnicodeString; } var arr: array[0..0] of UnicodeChar; begin arr[0]:=c; result:=JLString.create(arr); end; Function fpc_UChar_To_AnsiStr(const c : UnicodeChar): AnsiString; compilerproc; { Converts a UnicodeChar to a AnsiString; } var u: unicodestring; begin u:=c; result:=u; end; (* Function fpc_PChar_To_UnicodeStr(const p : pchar): UnicodeString; compilerproc; Var L : SizeInt; begin if (not assigned(p)) or (p[0]=#0) Then begin fpc_pchar_to_unicodestr := ''; exit; end; l:=IndexChar(p^,-1,#0); widestringmanager.Ansi2UnicodeMoveProc(P,fpc_PChar_To_UnicodeStr,l); end; *) Function fpc_CharArray_To_UnicodeStr(const arr: array of ansichar; zerobased: boolean = true): UnicodeString; compilerproc; var i,j : SizeInt; localarr: array of jbyte; foundnull: boolean; begin if (zerobased) then begin if (arr[0]=#0) Then begin fpc_chararray_to_unicodestr := ''; exit; end; foundnull:=false; for i:=low(arr) to high(arr) do if arr[i]=#0 then begin foundnull:=true; break; end; if not foundnull then i := high(arr)+1; end else i := high(arr)+1; setlength(localarr,i); for j:=0 to i-1 do localarr[j]:=ord(arr[j]); result:=JLString.create(localarr,0,i); end; (* function fpc_UnicodeCharArray_To_ShortStr(const arr: array of unicodechar; zerobased: boolean = true): shortstring;[public,alias:'FPC_UNICODECHARARRAY_TO_SHORTSTR']; compilerproc; var l: longint; index: longint; len: byte; temp: ansistring; foundnull: boolean; begin l := high(arr)+1; if l>=256 then l:=255 else if l<0 then l:=0; if zerobased then begin foundnull:=false; for index:=low(arr) to l-1 do if arr[index]=#0 then begin foundnull:=true; break; end; if not foundnull then len := l else len := index; end else len := l; result:=JLString.create(arr,0,l); end; Function fpc_UnicodeCharArray_To_AnsiStr(const arr: array of unicodechar; zerobased: boolean = true): AnsiString; compilerproc; var i : SizeInt; begin if (zerobased) then begin i:=IndexWord(arr,high(arr)+1,0); if i = -1 then i := high(arr)+1; end else i := high(arr)+1; SetLength(fpc_UnicodeCharArray_To_AnsiStr,i); widestringmanager.Unicode2AnsiMoveProc (punicodechar(@arr),fpc_UnicodeCharArray_To_AnsiStr,i); end; *) Function fpc_UnicodeCharArray_To_UnicodeStr(const arr: array of unicodechar; zerobased: boolean = true): UnicodeString; compilerproc; var i : SizeInt; foundnull : boolean; begin if (zerobased) then begin foundnull:=false; for i:=low(arr) to high(arr) do if arr[i]=#0 then begin foundnull:=true; break; end; if not foundnull then i := high(arr)+1; end else i := high(arr)+1; result:=JLString.create(arr,0,i); end; Function real_widechararray_to_unicodestr(const arr: array of widechar; zerobased: boolean): Unicodestring; var i : SizeInt; foundnull : boolean; begin if (zerobased) then begin foundnull:=false; for i:=low(arr) to high(arr) do if arr[i]=#0 then begin foundnull:=true; break; end; if not foundnull then i := high(arr)+1; end else i := high(arr)+1; result:=JLString.create(arr,0,i); end; Function fpc_WideCharArray_To_UnicodeStr(const arr: array of widechar; zerobased: boolean = true): UnicodeString; compilerproc; begin result:=real_widechararray_to_unicodestr(arr,zerobased); end; { due to their names, the following procedures should be in wstrings.inc, however, the compiler generates code using this functions on all platforms } procedure fpc_WideCharArray_To_ShortStr(out res : shortstring;const arr: array of widechar; zerobased: boolean = true);[public,alias:'FPC_WIDECHARARRAY_TO_SHORTSTR']; compilerproc; begin res:=real_widechararray_to_unicodestr(arr,zerobased); end; Function fpc_WideCharArray_To_AnsiStr(const arr: array of widechar; zerobased: boolean = true): AnsiString; compilerproc; begin result:=real_widechararray_to_unicodestr(arr,zerobased); end; procedure fpc_unicodestr_to_chararray(out res: array of AnsiChar; const src: UnicodeString); compilerproc; var i, len: SizeInt; temp: array of jbyte; begin len := length(src); { make sure we don't dereference src if it can be nil (JM) } if len > 0 then begin temp:=JLString(src).getBytes; if len > length(temp) then len := length(temp); for i := 0 to len-1 do res[i] := chr(temp[i]); end; end; procedure fpc_unicodestr_to_unicodechararray(out res: array of unicodechar; const src: UnicodeString); compilerproc; var len: SizeInt; begin len := length(src); { make sure we don't dereference src if it can be nil (JM) } if len > 0 then begin if len > high(res)+1 then len := high(res)+1; JLString(src).getChars(0,len,res,0); end; end; function fpc_unicodestr_setchar(const s: UnicodeString; const index: longint; const ch: unicodechar): UnicodeString; compilerproc; var sb: JLStringBuilder; begin sb:=JLStringBuilder.create(s); { string indexes are 1-based in Pascal, 0-based in Java } sb.setCharAt(index-1,ch); result:=sb.toString(); end; procedure fpc_ansistr_to_unicodechararray(out res: array of unicodechar; const src: AnsiString); compilerproc; var len: SizeInt; temp: unicodestring; begin len := length(src); { make sure we don't dereference src if it can be nil (JM) } if len > 0 then temp:=src; len := length(temp); if len > length(res) then len := length(res); JLString(temp).getChars(0,len,res,0); JUArrays.fill(res,len,high(res),#0); end; (* procedure fpc_shortstr_to_unicodechararray(out res: array of unicodechar; const src: ShortString); compilerproc; var len: longint; temp : unicodestring; begin len := length(src); { temp is initialized with an empty string, so no need to convert src in case it's also empty} if len > 0 then temp:=src; len := length(temp); if len > high(res)+1 then len := high(res)+1; JLString(temp).getChars(0,len,res,0); JUArrays.fill(res,len,high(res),#0); end; *) procedure fpc_ansistr_to_widechararray(out res: array of widechar; const src: AnsiString); compilerproc; var len: SizeInt; temp: widestring; begin len := length(src); { make sure we don't dereference src if it can be nil (JM) } if len > 0 then temp:=src; len := length(temp); if len > high(res)+1 then len := high(res)+1; JLString(temp).getChars(0,len,res,0); JUArrays.fill(res,len,high(res),#0); end; procedure fpc_shortstr_to_widechararray(out res: array of widechar; const src: ShortString); compilerproc; var len: longint; temp : unicodestring; begin len := length(src); { temp is initialized with an empty string, so no need to convert src in case it's also empty} if len > 0 then temp:=src; len := length(temp); if len > high(res)+1 then len := high(res)+1; JLString(temp).getChars(0,len,res,0); JUArrays.fill(res,len,high(res),#0); end; procedure fpc_unicodestr_to_widechararray(out res: array of widechar; const src: UnicodeString); compilerproc; var i, len: SizeInt; begin len := length(src); if len > length(res) then len := length(res); JLString(src).getChars(0,len,res,0); end; Function fpc_UnicodeStr_Compare(const S1,S2 : UnicodeString): SizeInt; compilerproc; { Compares 2 UnicodeStrings; The result is <0 if S10 if S1>S2 } Var MaxI,Temp : SizeInt; begin if JLObject(S1)=JLObject(S2) then begin result:=0; exit; end; result:=JLString(S1).compareTo(S2); end; Function fpc_UnicodeStr_Compare_Equal(const S1,S2 : UnicodeString): SizeInt; compilerproc; { Compares 2 UnicodeStrings for equality only; The result is 0 if S1=S2 <>0 if S1<>S2 } Var MaxI : SizeInt; begin result:=ord(not JLString(S1).equals(JLString(S2))); end; function fpc_UnicodeStr_SetLength(const S : UnicodeString; l : SizeInt): UnicodeString; compilerproc; { Sets The length of string S to L. Makes sure S is unique, and contains enough room. Returns new val } Var movelen: SizeInt; chars: array of widechar; strlen: SizeInt; begin if (l>0) then begin if JLObject(S)=nil then begin { Need a completely new string...} result:=NewUnicodeString(l); end { no need to create a new string, since Java strings are immutable } else begin strlen:=length(s); if l=strlen then result:=s else if (lLength(S)) or (Index+Size>Length(S)) then Size:=Length(S)-Index; If Size>0 then result:=JLString(s).subString(Index,Size) else result:=''; end; Function Pos (Const Substr : UnicodeString; Const Source : UnicodeString) : SizeInt; begin Pos:=0; if Length(SubStr)>0 then Pos:=JLString(Source).indexOf(SubStr)+1; end; { Faster version for a unicodechar alone } Function Pos (c : UnicodeChar; Const s : UnicodeString) : SizeInt; begin Pos:=0; if length(S)>0 then Pos:=JLString(s).indexOf(ord(c))+1; end; (* Function Pos (c : AnsiString; Const s : UnicodeString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif} begin result:=Pos(UnicodeString(c),s); end; Function Pos (c : ShortString; Const s : UnicodeString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif} begin result:=Pos(UnicodeString(c),s); end; Function Pos (c : UnicodeString; Const s : AnsiString) : SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif} begin result:=Pos(c,UnicodeString(s)); end; *) { Faster version for a char alone. Must be implemented because } { pos(c: char; const s: shortstring) also exists, so otherwise } { using pos(char,pchar) will always call the shortstring version } { (exact match for first argument), also with $h+ (JM) } Function Pos (c : AnsiChar; Const s : UnicodeString) : SizeInt; var i: SizeInt; wc : unicodechar; begin wc:=c; result:=Pos(wc,s); end; (* Procedure Delete (Var S : UnicodeString; Index,Size: SizeInt); Var LS : SizeInt; sb: JLStringBuilder; begin LS:=Length(S); if (Index>LS) or (Index<=0) or (Size<=0) then exit; { (Size+Index) will overflow if Size=MaxInt. } if Size>LS-Index then Size:=LS-Index+1; if Size<=LS-Index then begin Dec(Index); sb:=JLStringBuilder.Create(s); sb.delete(index,size); s:=sb.toString; end else s:=JLString(s).substring(0,index-1); end; Procedure Insert (Const Source : UnicodeString; Var S : UnicodeString; Index : SizeInt); var Temp : UnicodeString; LS : SizeInt; sb : JLStringBuilder; begin If Length(Source)=0 then exit; if index <= 0 then index := 1; Ls:=Length(S); if index > LS then index := LS+1; Dec(Index); sb:=JLStringBuilder.Create(S); sb.insert(Index,Source); S:=sb.toString; end; *) Function UpCase(c:UnicodeChar):UnicodeChar; begin result:=JLCharacter.toUpperCase(c); end; function UpCase(const s : UnicodeString) : UnicodeString; begin result:=JLString(s).toUpperCase; end; (* Procedure SetString (Out S : UnicodeString; Buf : PUnicodeChar; Len : SizeInt); begin SetLength(S,Len); If (Buf<>Nil) and (Len>0) then Move (Buf[0],S[1],Len*sizeof(UnicodeChar)); end; Procedure SetString (Out S : UnicodeString; Buf : PChar; Len : SizeInt); var BufLen: SizeInt; begin SetLength(S,Len); If (Buf<>Nil) and (Len>0) then widestringmanager.Ansi2UnicodeMoveProc(Buf,S,Len); end; {$ifndef FPUNONE} Function fpc_Val_Real_UnicodeStr(Const S : UnicodeString; out Code : ValSInt): ValReal; [public, alias:'FPC_VAL_REAL_UNICODESTR']; compilerproc; Var SS : String; begin fpc_Val_Real_UnicodeStr := 0; if length(S) > 255 then code := 256 else begin SS := S; Val(SS,fpc_Val_Real_UnicodeStr,code); end; end; {$endif} function fpc_val_enum_unicodestr(str2ordindex:pointer;const s:unicodestring;out code:valsint):longint;compilerproc; var ss:shortstring; begin if length(s)>255 then code:=256 else begin ss:=s; val(ss,fpc_val_enum_unicodestr,code); end; end; Function fpc_Val_Currency_UnicodeStr(Const S : UnicodeString; out Code : ValSInt): Currency; [public, alias:'FPC_VAL_CURRENCY_UNICODESTR']; compilerproc; Var SS : String; begin if length(S) > 255 then begin fpc_Val_Currency_UnicodeStr:=0; code := 256; end else begin SS := S; Val(SS,fpc_Val_Currency_UnicodeStr,code); end; end; Function fpc_Val_UInt_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): ValUInt; [public, alias:'FPC_VAL_UINT_UNICODESTR']; compilerproc; Var SS : ShortString; begin fpc_Val_UInt_UnicodeStr := 0; if length(S) > 255 then code := 256 else begin SS := S; Val(SS,fpc_Val_UInt_UnicodeStr,code); end; end; Function fpc_Val_SInt_UnicodeStr (DestSize: SizeInt; Const S : UnicodeString; out Code : ValSInt): ValSInt; [public, alias:'FPC_VAL_SINT_UNICODESTR']; compilerproc; Var SS : ShortString; begin fpc_Val_SInt_UnicodeStr:=0; if length(S)>255 then code:=256 else begin SS := S; fpc_Val_SInt_UnicodeStr := int_Val_SInt_ShortStr(DestSize,SS,Code); end; end; {$ifndef CPU64} Function fpc_Val_qword_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): qword; [public, alias:'FPC_VAL_QWORD_UNICODESTR']; compilerproc; Var SS : ShortString; begin fpc_Val_qword_UnicodeStr:=0; if length(S)>255 then code:=256 else begin SS := S; Val(SS,fpc_Val_qword_UnicodeStr,Code); end; end; Function fpc_Val_int64_UnicodeStr (Const S : UnicodeString; out Code : ValSInt): Int64; [public, alias:'FPC_VAL_INT64_UNICODESTR']; compilerproc; Var SS : ShortString; begin fpc_Val_int64_UnicodeStr:=0; if length(S)>255 then code:=256 else begin SS := S; Val(SS,fpc_Val_int64_UnicodeStr,Code); end; end; {$endif CPU64} {$ifndef FPUNONE} procedure fpc_UnicodeStr_Float(d : ValReal;len,fr,rt : SizeInt;out s : UnicodeString);compilerproc; var ss : shortstring; begin str_real(len,fr,d,treal_type(rt),ss); s:=ss; end; {$endif} procedure fpc_unicodestr_enum(ordinal,len:sizeint;typinfo,ord2strindex:pointer;out s:unicodestring);compilerproc; var ss:shortstring; begin fpc_shortstr_enum(ordinal,len,typinfo,ord2strindex,ss); s:=ss; end; procedure fpc_unicodestr_bool(b : boolean;len:sizeint;out s:unicodestring);compilerproc; var ss:shortstring; begin fpc_shortstr_bool(b,len,ss); s:=ss; end; {$ifdef FPC_HAS_STR_CURRENCY} procedure fpc_UnicodeStr_Currency(c : Currency;len,fr : SizeInt;out s : UnicodeString);compilerproc; var ss : shortstring; begin str(c:len:fr,ss); s:=ss; end; {$endif FPC_HAS_STR_CURRENCY} Procedure fpc_UnicodeStr_SInt(v : ValSint; Len : SizeInt; out S : UnicodeString);compilerproc; Var SS : ShortString; begin Str (v:Len,SS); S:=SS; end; Procedure fpc_UnicodeStr_UInt(v : ValUInt;Len : SizeInt; out S : UnicodeString);compilerproc; Var SS : ShortString; begin str(v:Len,SS); S:=SS; end; {$ifndef CPU64} Procedure fpc_UnicodeStr_Int64(v : Int64; Len : SizeInt; out S : UnicodeString);compilerproc; Var SS : ShortString; begin Str (v:Len,SS); S:=SS; end; Procedure fpc_UnicodeStr_Qword(v : Qword;Len : SizeInt; out S : UnicodeString);compilerproc; Var SS : ShortString; begin str(v:Len,SS); S:=SS; end; {$endif CPU64} *) (* { converts an utf-16 code point or surrogate pair to utf-32 } function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32']; var w: unicodechar; begin { UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF } { are the same in UTF-32 } w:=s[index]; if (w<=#$d7ff) or (w>=#$e000) then begin result:=UCS4Char(w); len:=1; end { valid surrogate pair? } else if (w<=#$dbff) and { w>=#$d7ff check not needed, checked above } (index=#$dc00) and (s[index+1]<=#$dfff) then { convert the surrogate pair to UTF-32 } begin result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000; len:=2; end else { invalid surrogate -> do nothing } begin result:=UCS4Char(w); len:=1; end; end; function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif} begin if assigned(Source) then Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0)) else Result:=0; end; function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar; SourceChars: SizeUInt): SizeUInt; var i,j : SizeUInt; w : word; lw : longword; len : longint; begin result:=0; if source=nil then exit; i:=0; j:=0; if assigned(Dest) then begin while (i=MaxDestBytes then break; Dest[j]:=char($c0 or (w shr 6)); Dest[j+1]:=char($80 or (w and $3f)); inc(j,2); end; $800..$d7ff,$e000..$ffff: begin if j+2>=MaxDestBytes then break; Dest[j]:=char($e0 or (w shr 12)); Dest[j+1]:=char($80 or ((w shr 6) and $3f)); Dest[j+2]:=char($80 or (w and $3f)); inc(j,3); end; $d800..$dbff: {High Surrogates} begin if j+3>=MaxDestBytes then break; if (i= $dc00) and (word(Source[i+1]) <= $dfff) then begin lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len)); Dest[j]:=char($f0 or (lw shr 18)); Dest[j+1]:=char($80 or ((lw shr 12) and $3f)); Dest[j+2]:=char($80 or ((lw shr 6) and $3f)); Dest[j+3]:=char($80 or (lw and $3f)); inc(j,4); inc(i); end; end; end; inc(i); end; if j>SizeUInt(MaxDestBytes-1) then j:=MaxDestBytes-1; Dest[j]:=#0; end else begin while i= $dc00) and (word(Source[i+1]) <= $dfff) then begin inc(j,4); inc(i); end; end; end; inc(i); end; end; result:=j+1; end; function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif} begin if assigned(Source) then Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source)) else Result:=0; end; function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt; const UNICODE_INVALID=63; var InputUTF8: SizeUInt; IBYTE: BYTE; OutputUnicode: SizeUInt; PRECHAR: SizeUInt; TempBYTE: BYTE; CharLen: SizeUint; LookAhead: SizeUInt; UC: SizeUInt; begin if not assigned(Source) then begin result:=0; exit; end; result:=SizeUInt(-1); InputUTF8:=0; OutputUnicode:=0; PreChar:=0; if Assigned(Dest) Then begin while (OutputUnicode13) and FALSE then begin //Expand to crlf, conform UTF-8. //This procedure will break the memory alocation by //FPC for the widestring, so never use it. Condition never true due the "and FALSE". if OutputUnicode+10 do begin TempBYTE:=(TempBYTE shl 1) and $FE; inc(CharLen); end; //Test for the "CharLen" conforms UTF-8 string //This means the 10xxxxxx pattern. if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then begin //Insuficient chars in string to decode //UTF-8 array. Fallback to single char. CharLen:= 1; end; for LookAhead := 1 to CharLen-1 do begin if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then begin //Invalid UTF-8 sequence, fallback. CharLen:= LookAhead; break; end; end; UC:=$FFFF; case CharLen of 1: begin //Not valid UTF-8 sequence UC:=UNICODE_INVALID; end; 2: begin //Two bytes UTF, convert it UC:=(byte(Source[InputUTF8]) and $1F) shl 6; UC:=UC or (byte(Source[InputUTF8+1]) and $3F); if UC <= $7F then begin //Invalid UTF sequence. UC:=UNICODE_INVALID; end; end; 3: begin //Three bytes, convert it to unicode UC:= (byte(Source[InputUTF8]) and $0F) shl 12; UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6); UC:= UC or ((byte(Source[InputUTF8+2]) and $3F)); if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then begin //Invalid UTF-8 sequence UC:= UNICODE_INVALID; End; end; 4: begin //Four bytes, convert it to two unicode characters UC:= (byte(Source[InputUTF8]) and $07) shl 18; UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12); UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6); UC:= UC or ((byte(Source[InputUTF8+3]) and $3F)); if (UC < $10000) or (UC > $10FFFF) then begin UC:= UNICODE_INVALID; end else begin { only store pair if room } dec(UC,$10000); if (OutputUnicode 0 then begin PreChar:=UC; Dest[OutputUnicode]:=WideChar(UC); inc(OutputUnicode); end; InputUTF8:= InputUTF8 + CharLen; end; end; Result:=OutputUnicode+1; end else begin while (InputUTF813) and FALSE then begin //Expand to crlf, conform UTF-8. //This procedure will break the memory alocation by //FPC for the widestring, so never use it. Condition never true due the "and FALSE". inc(OutputUnicode,2); PreChar:=10; end else begin inc(OutputUnicode); PreChar:=IBYTE; end; end else begin inc(OutputUnicode); PreChar:=IBYTE; end; inc(InputUTF8); end else begin TempByte:=IBYTE; CharLen:=0; while (TempBYTE and $80)<>0 do begin TempBYTE:=(TempBYTE shl 1) and $FE; inc(CharLen); end; //Test for the "CharLen" conforms UTF-8 string //This means the 10xxxxxx pattern. if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then begin //Insuficient chars in string to decode //UTF-8 array. Fallback to single char. CharLen:= 1; end; for LookAhead := 1 to CharLen-1 do begin if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then begin //Invalid UTF-8 sequence, fallback. CharLen:= LookAhead; break; end; end; UC:=$FFFF; case CharLen of 1: begin //Not valid UTF-8 sequence UC:=UNICODE_INVALID; end; 2: begin //Two bytes UTF, convert it UC:=(byte(Source[InputUTF8]) and $1F) shl 6; UC:=UC or (byte(Source[InputUTF8+1]) and $3F); if UC <= $7F then begin //Invalid UTF sequence. UC:=UNICODE_INVALID; end; end; 3: begin //Three bytes, convert it to unicode UC:= (byte(Source[InputUTF8]) and $0F) shl 12; UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6); UC:= UC or ((byte(Source[InputUTF8+2]) and $3F)); If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then begin //Invalid UTF-8 sequence UC:= UNICODE_INVALID; end; end; 4: begin //Four bytes, convert it to two unicode characters UC:= (byte(Source[InputUTF8]) and $07) shl 18; UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12); UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6); UC:= UC or ((byte(Source[InputUTF8+3]) and $3F)); if (UC < $10000) or (UC > $10FFFF) then UC:= UNICODE_INVALID else { extra character character } inc(OutputUnicode); end; 5,6,7: begin //Invalid UTF8 to unicode conversion, //mask it as invalid UNICODE too. UC:=UNICODE_INVALID; end; end; if CharLen > 0 then begin PreChar:=UC; inc(OutputUnicode); end; InputUTF8:= InputUTF8 + CharLen; end; end; Result:=OutputUnicode+1; end; end; function UTF8Encode(const s : Ansistring) : UTF8String; inline; begin Result:=UTF8Encode(UnicodeString(s)); end; function UTF8Encode(const s : UnicodeString) : UTF8String; var i : SizeInt; hs : UTF8String; begin result:=''; if s='' then exit; SetLength(hs,length(s)*3); i:=UnicodeToUtf8(pchar(hs),length(hs)+1,PUnicodeChar(s),length(s)); if i>0 then begin SetLength(hs,i-1); result:=hs; end; end; function UTF8Decode(const s : UTF8String): UnicodeString; var i : SizeInt; hs : UnicodeString; begin result:=''; if s='' then exit; SetLength(hs,length(s)); i:=Utf8ToUnicode(PUnicodeChar(hs),length(hs)+1,pchar(s),length(s)); if i>0 then begin SetLength(hs,i-1); result:=hs; end; end; function AnsiToUtf8(const s : ansistring): UTF8String;{$ifdef SYSTEMINLINE}inline;{$endif} begin Result:=Utf8Encode(s); end; function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inline;{$endif} begin Result:=Utf8Decode(s); end; function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String; var i, slen, destindex : SizeInt; len : longint; begin slen:=length(s); setlength(result,slen+1); i:=1; destindex:=0; while (i<=slen) do begin result[destindex]:=utf16toutf32(s,i,len); inc(destindex); inc(i,len); end; { destindex <= slen (surrogate pairs may have been merged) } { destindex+1 for terminating #0 (dynamic arrays are } { implicitely filled with zero) } setlength(result,destindex+1); end; { concatenates an utf-32 char to a unicodestring. S *must* be unique when entering. } procedure ConcatUTF32ToUnicodeStr(const nc: UCS4Char; var S: UnicodeString; var index: SizeInt); var p : PUnicodeChar; begin { if nc > $ffff, we need two places } if (index+ord(nc > $ffff)>length(s)) then if (length(s) < 10*256) then setlength(s,length(s)+10) else setlength(s,length(s)+length(s) shr 8); { we know that s is unique -> avoid uniquestring calls} p:=@s[index]; if (nc<$ffff) then begin p^:=unicodechar(nc); inc(index); end else if (dword(nc)<=$10ffff) then begin p^:=unicodechar((nc - $10000) shr 10 + $d800); (p+1)^:=unicodechar((nc - $10000) and $3ff + $dc00); inc(index,2); end else { invalid code point } begin p^:='?'; inc(index); end; end; function UCS4StringToUnicodeString(const s : UCS4String) : UnicodeString; var i : SizeInt; resindex : SizeInt; begin { skip terminating #0 } SetLength(result,length(s)-1); resindex:=1; for i:=0 to high(s)-1 do ConcatUTF32ToUnicodeStr(s[i],result,resindex); { adjust result length (may be too big due to growing } { for surrogate pairs) } setlength(result,resindex-1); end; function WideStringToUCS4String(const s : WideString) : UCS4String; var i, slen, destindex : SizeInt; len : longint; begin slen:=length(s); setlength(result,slen+1); i:=1; destindex:=0; while (i<=slen) do begin result[destindex]:=utf16toutf32(s,i,len); inc(destindex); inc(i,len); end; { destindex <= slen (surrogate pairs may have been merged) } { destindex+1 for terminating #0 (dynamic arrays are } { implicitely filled with zero) } setlength(result,destindex+1); end; { concatenates an utf-32 char to a widestring. S *must* be unique when entering. } procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt); var p : PWideChar; begin { if nc > $ffff, we need two places } if (index+ord(nc > $ffff)>length(s)) then if (length(s) < 10*256) then setlength(s,length(s)+10) else setlength(s,length(s)+length(s) shr 8); { we know that s is unique -> avoid uniquestring calls} p:=@s[index]; if (nc<$ffff) then begin p^:=widechar(nc); inc(index); end else if (dword(nc)<=$10ffff) then begin p^:=widechar((nc - $10000) shr 10 + $d800); (p+1)^:=widechar((nc - $10000) and $3ff + $dc00); inc(index,2); end else { invalid code point } begin p^:='?'; inc(index); end; end; function UCS4StringToWideString(const s : UCS4String) : WideString; var i : SizeInt; resindex : SizeInt; begin { skip terminating #0 } SetLength(result,length(s)-1); resindex:=1; for i:=0 to high(s)-1 do ConcatUTF32ToWideStr(s[i],result,resindex); { adjust result length (may be too big due to growing } { for surrogate pairs) } setlength(result,resindex-1); end; const SNoUnicodestrings = 'This binary has no unicodestrings support compiled in.'; SRecompileWithUnicodestrings = 'Recompile the application with a unicodestrings-manager in the program uses clause.'; *) function CompareUnicodeString(const s1, s2 : UnicodeString) : PtrInt; begin widestringmanager.collator.setStrength(JTCollator.IDENTICAL); result:=widestringmanager.collator.compare(s1,s2); end; function CompareTextUnicodeString(const s1, s2 : UnicodeString): PtrInt; begin widestringmanager.collator.setStrength(JTCollator.TERTIARY); result:=widestringmanager.collator.compare(s1,s2); end; constructor TUnicodeStringManager.create; begin end; procedure initunicodestringmanager; begin widestringmanager:=TUnicodeStringManager.create; widestringmanager.collator:=JTCollator.getInstance; end;