- Removed string conversion helpers that are not actually used by compiler code generation (duplicated pwidechar/punicodechar stuff).

- Removed one copy of duplicated utf8 encode/decode procedures.

git-svn-id: trunk@20262 -
This commit is contained in:
sergei 2012-02-05 18:01:04 +00:00
parent 3d2a27c66c
commit f07d02b427
4 changed files with 1 additions and 696 deletions

View File

@ -281,12 +281,7 @@ Function fpc_widestr_Copy (Const S : WideString; Index,Size : SizeInt) : WideSt
{$ifndef FPC_WINLIKEWIDESTRING}
function fpc_widestr_Unique(Var S : Pointer): Pointer; compilerproc;
{$endif FPC_WINLIKEWIDESTRING}
Function fpc_WChar_To_AnsiStr(const c : WideChar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): AnsiString; compilerproc;
Function fpc_WChar_To_WideStr(const c : WideChar): WideString; compilerproc;
Function fpc_UChar_To_WideStr(const c : WideChar): WideString; compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
Function fpc_PWideChar_To_WideStr(const p : pwidechar): widestring; compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$endif ndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
@ -312,12 +307,6 @@ Function fpc_Char_To_UnicodeStr(const c : Char): UnicodeString; compilerproc;
Function fpc_PChar_To_UnicodeStr(const p : pchar): UnicodeString; compilerproc;
Function fpc_CharArray_To_UnicodeStr(const arr: array of char; zerobased: boolean = true): UnicodeString; compilerproc;
procedure fpc_unicodestr_to_chararray(out res: array of char; const src: UnicodeString); compilerproc;
procedure fpc_shortstr_to_unicodechararray(out res: array of unicodechar; const src: ShortString); compilerproc;
procedure fpc_ansistr_to_unicodechararray(out res: array of unicodechar; const src: AnsiString); compilerproc;
procedure fpc_unicodestr_to_unicodechararray(out res: array of unicodechar; const src: UnicodeString); compilerproc;
procedure fpc_UnicodeCharArray_To_ShortStr(out res : shortstring;const arr: array of unicodechar; zerobased: boolean = true); compilerproc;
Function fpc_UnicodeCharArray_To_AnsiStr(const arr: array of unicodechar; {$ifdef FPC_HAS_CPSTRING}cp : TSystemCodePage;{$endif FPC_HAS_CPSTRING} zerobased: boolean = true): AnsiString; compilerproc;
Function fpc_UnicodeCharArray_To_UnicodeStr(const arr: array of unicodechar; zerobased: boolean = true): UnicodeString; compilerproc;
procedure fpc_WideCharArray_To_ShortStr(out res : shortstring;const arr: array of widechar; zerobased: boolean = true); compilerproc;
Function fpc_WideCharArray_To_AnsiStr(const arr: array of widechar; {$ifdef FPC_HAS_CPSTRING}cp : TSystemCodePage;{$endif FPC_HAS_CPSTRING} zerobased: boolean = true): AnsiString; compilerproc;
Function fpc_WideCharArray_To_WideStr(const arr: array of widechar; zerobased: boolean = true): WideString; compilerproc;
@ -341,31 +330,16 @@ function fpc_unicodestr_Unique(Var S : Pointer): Pointer; compilerproc;
Function fpc_Char_To_UChar(const c : Char): UnicodeChar; compilerproc;
Function fpc_UChar_To_Char(const c : UnicodeChar): Char; compilerproc;
Function fpc_UChar_To_UnicodeStr(const c : UnicodeChar): UnicodeString; compilerproc;
Function fpc_WChar_To_UnicodeStr(const c : WideChar): UnicodeString; compilerproc;
Function fpc_UChar_To_AnsiStr(const c : UnicodeChar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): AnsiString; compilerproc;
procedure fpc_WChar_To_ShortStr(out res : shortstring;const c : WideChar) compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
Function fpc_PUnicodeChar_To_AnsiStr(const p : punicodechar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): ansistring; compilerproc;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
Function fpc_PUnicodeChar_To_UnicodeStr(const p : punicodechar): unicodestring; compilerproc;
Function fpc_PWideChar_To_UnicodeStr(const p : pwidechar): unicodestring; compilerproc;
procedure fpc_PUnicodeChar_To_ShortStr(out res : shortstring;const p : punicodechar); compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
Function fpc_PWideChar_To_AnsiStr(const p : pwidechar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): ansistring; compilerproc;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
procedure fpc_PWideChar_To_ShortStr(out res : shortstring;const p : pwidechar); compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
Function fpc_Char_To_WChar(const c : Char): WideChar; compilerproc;
Function fpc_WChar_To_Char(const c : WideChar): Char; compilerproc;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{ $ifdef FPC_HAS_FEATURE_TEXTIO}
{ from text.inc }

View File

@ -316,39 +316,6 @@ Function fpc_WideStr_To_UnicodeStr (Const S2 : WideString): UnicodeString; compi
end;
Function fpc_PUnicodeChar_To_AnsiStr(const p : punicodechar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): ansistring; compilerproc;
var
Size : SizeInt;
{$ifndef FPC_HAS_CPSTRING}
cp : TSystemCodePage;
{$endif FPC_HAS_CPSTRING}
begin
{$ifndef FPC_HAS_CPSTRING}
cp:=DefaultSystemCodePage;
{$endif FPC_HAS_CPSTRING}
result:='';
if p=nil then
exit;
Size := IndexWord(p^, -1, 0);
if Size>0 then
widestringmanager.Unicode2AnsiMoveProc(P,result,cp,Size);
end;
Function fpc_PUnicodeChar_To_UnicodeStr(const p : punicodechar): unicodestring; compilerproc;
var
Size : SizeInt;
begin
result:='';
if p=nil then
exit;
Size := IndexWord(p^, -1, 0);
Setlength(result,Size);
if Size>0 then
Move(p^,PUnicodeChar(Pointer(result))^,Size*sizeof(UnicodeChar));
end;
Function fpc_PWideChar_To_UnicodeStr(const p : pwidechar): unicodestring; compilerproc;
var
Size : SizeInt;
@ -363,23 +330,6 @@ begin
end;
procedure fpc_PUnicodeChar_To_ShortStr(out res : shortstring;const p : punicodechar); compilerproc;
var
Size : SizeInt;
temp: ansistring;
begin
res:='';
if p=nil then
exit;
Size:=IndexWord(p^, high(PtrInt), 0);
if Size>0 then
begin
widestringmanager.Unicode2AnsiMoveProc(p,temp,DefaultSystemCodePage,Size);
res:=temp;
end;
end;
Function fpc_PWideChar_To_AnsiStr(const p : pwidechar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): ansistring; compilerproc;
var
Size : SizeInt;
@ -577,40 +527,6 @@ begin
end;
Function fpc_WChar_To_UnicodeStr(const c : WideChar): UnicodeString; compilerproc;
{
Converts a WideChar to a UnicodeString;
}
begin
Setlength (Result,1);
Result[1]:= c;
end;
Function fpc_Char_To_WChar(const c : Char): WideChar; compilerproc;
var
w: widestring;
begin
widestringmanager.Ansi2WideMoveProc(@c,DefaultSystemCodePage,w,1);
fpc_Char_To_WChar:=w[1];
end;
Function fpc_WChar_To_Char(const c : WideChar): Char; compilerproc;
{
Converts a WideChar to a Char;
}
var
s: ansistring;
begin
widestringmanager.Wide2AnsiMoveProc(@c, s, DefaultSystemCodePage, 1);
if length(s)=1 then
fpc_WChar_To_Char:= s[1]
else
fpc_WChar_To_Char:='?';
end;
procedure fpc_WChar_To_ShortStr(out res : shortstring;const c : WideChar) compilerproc;
{
Converts a WideChar to a ShortString;
@ -651,18 +567,6 @@ begin
end;
procedure fpc_UChar_To_ShortStr(out res : shortstring;const c : UnicodeChar) compilerproc;
{
Converts a UnicodeChar to a ShortString;
}
var
s: ansistring;
begin
widestringmanager.Unicode2AnsiMoveProc(@c,s,DefaultSystemCodePage,1);
res:=s;
end;
Function fpc_PChar_To_UnicodeStr(const p : pchar): UnicodeString; compilerproc;
Var
L : SizeInt;
@ -698,71 +602,6 @@ begin
end;
procedure fpc_UnicodeCharArray_To_ShortStr(out res : shortstring;const arr: array of unicodechar; zerobased: boolean = true);[public,alias:'FPC_UNICODECHARARRAY_TO_SHORTSTR']; compilerproc;
var
l: longint;
index: ptrint;
len: byte;
temp: ansistring;
begin
l := high(arr)+1;
if l>=high(res)+1 then
l:=high(res)
else if l<0 then
l:=0;
if zerobased then
begin
index:=IndexWord(arr[0],l,0);
if index<0 then
len:=l
else
len:=index;
end
else
len:=l;
widestringmanager.Unicode2AnsiMoveProc (punicodechar(@arr),temp,DefaultSystemCodePage,len);
res:=temp;
end;
Function fpc_UnicodeCharArray_To_AnsiStr(const arr: array of unicodechar; {$ifdef FPC_HAS_CPSTRING}cp : TSystemCodePage;{$endif FPC_HAS_CPSTRING}zerobased: boolean = true): AnsiString; compilerproc;
var
i : SizeInt;
{$ifndef FPC_HAS_CPSTRING}
cp : TSystemCodePage;
{$endif FPC_HAS_CPSTRING}
begin
{$ifndef FPC_HAS_CPSTRING}
cp:=DefaultSystemCodePage;
{$endif FPC_HAS_CPSTRING}
if (zerobased) then
begin
i:=IndexWord(arr,high(arr)+1,0);
if i = -1 then
i := high(arr)+1;
end
else
i := high(arr)+1;
widestringmanager.Unicode2AnsiMoveProc (punicodechar(@arr),fpc_UnicodeCharArray_To_AnsiStr,cp,i);
end;
Function fpc_UnicodeCharArray_To_UnicodeStr(const arr: array of unicodechar; zerobased: boolean = true): UnicodeString; compilerproc;
var
i : SizeInt;
begin
if (zerobased) then
begin
i:=IndexWord(arr,high(arr)+1,0);
if i = -1 then
i := high(arr)+1;
end
else
i := high(arr)+1;
SetLength(fpc_UnicodeCharArray_To_UnicodeStr,i);
Move(arr[0], Pointer(fpc_UnicodeCharArray_To_UnicodeStr)^,i*sizeof(UnicodeChar));
end;
Function fpc_WideCharArray_To_UnicodeStr(const arr: array of widechar; zerobased: boolean = true): UnicodeString; compilerproc;
var
i : SizeInt;
@ -866,61 +705,6 @@ begin
end;
procedure fpc_unicodestr_to_unicodechararray(out res: array of unicodechar; const src: UnicodeString); compilerproc;
var
len: SizeInt;
begin
len := length(src);
if len > length(res) then
len := length(res);
{$push}
{$r-}
{ make sure we don't try to access element 1 of the ansistring if it's nil }
if len > 0 then
move(src[1],res[0],len*SizeOf(UnicodeChar));
fillchar(res[len],(length(res)-len)*SizeOf(UnicodeChar),0);
{$pop}
end;
procedure fpc_ansistr_to_unicodechararray(out res: array of unicodechar; const src: AnsiString); compilerproc;
var
len: SizeInt;
temp: unicodestring;
begin
len := length(src);
{ make sure we don't dereference src if it can be nil (JM) }
if len > 0 then
widestringmanager.ansi2unicodemoveproc(pchar(@src[1]),DefaultSystemCodePage,temp,len);
len := length(temp);
if len > length(res) then
len := length(res);
{$push}
{$r-}
move(temp[1],res[0],len*sizeof(unicodechar));
fillchar(res[len],(length(res)-len)*SizeOf(UnicodeChar),0);
{$pop}
end;
procedure fpc_shortstr_to_unicodechararray(out res: array of unicodechar; const src: ShortString); compilerproc;
var
len: longint;
temp : unicodestring;
begin
len := length(src);
{ make sure we don't access char 1 if length is 0 (JM) }
if len > 0 then
widestringmanager.ansi2unicodemoveproc(pchar(@src[1]),DefaultSystemCodePage,temp,len);
len := length(temp);
if len > length(res) then
len := length(res);
{$push}
{$r-}
move(temp[1],res[0],len*sizeof(unicodechar));
fillchar(res[len],(length(res)-len)*SizeOf(UnicodeChar),0);
{$pop}
end;
procedure fpc_ansistr_to_widechararray(out res: array of widechar; const src: RawByteString); compilerproc;
var
len: SizeInt;

View File

@ -36,10 +36,6 @@ procedure DefaultAnsi2WideMove(source:pchar;cp : TSystemCodePage;var dest:widest
type
TWideStringManager = TUnicodeStringManager;
function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PWideChar; SourceChars: SizeUInt): SizeUInt;
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
function UTF8Encode(const s : WideString) : RawByteString;
{$ifdef MSWINDOWS}

View File

@ -349,27 +349,7 @@ Function fpc_Char_To_WideStr(const c : Char): WideString; compilerproc;
Converts a Char to a WideString;
}
begin
Setlength(fpc_Char_To_WideStr,1);
fpc_Char_To_WideStr[1]:=c;
end;
Function fpc_WChar_To_WideStr(const c : WideChar): WideString; compilerproc;
{
Converts a WideChar to a WideString;
}
begin
Setlength (fpc_WChar_To_WideStr,1);
fpc_WChar_To_WideStr[1]:= c;
end;
Function fpc_WChar_To_AnsiStr(const c : WideChar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): AnsiString; compilerproc;
{
Converts a WideChar to a AnsiString;
}
begin
widestringmanager.Wide2AnsiMoveProc(@c, fpc_WChar_To_AnsiStr,{$ifdef FPC_HAS_CPSTRING}cp{$else}TSystemCodePage(0){$endif FPC_HAS_CPSTRING}, 1);
widestringmanager.Ansi2WideMoveProc(@c,DefaultSystemCodePage,fpc_char_To_WideStr,1);
end;
@ -967,435 +947,6 @@ end;
{$endif CPU64}
{ converts an utf-16 code point or surrogate pair to utf-32 }
function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_WIDETOUTF32'];
var
w: widechar;
begin
{ UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
{ are the same in UTF-32 }
w:=s[index];
if (w<=#$d7ff) or
(w>=#$e000) then
begin
result:=UCS4Char(w);
len:=1;
end
{ valid surrogate pair? }
else if (w<=#$dbff) and
{ w>=#$d7ff check not needed, checked above }
(index<length(s)) and
(s[index+1]>=#$dc00) and
(s[index+1]<=#$dfff) then
{ convert the surrogate pair to UTF-32 }
begin
result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
len:=2;
end
else
{ invalid surrogate -> do nothing }
begin
result:=UCS4Char(w);
len:=1;
end;
end;
function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
if assigned(Source) then
Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0))
else
Result:=0;
end;
function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PWideChar; SourceChars: SizeUInt): SizeUInt;
var
i,j : SizeUInt;
w : word;
lw : longword;
len : longint;
begin
result:=0;
if source=nil then
exit;
i:=0;
j:=0;
if assigned(Dest) then
begin
while (i<SourceChars) and (j<MaxDestBytes) do
begin
w:=word(Source[i]);
case w of
0..$7f:
begin
Dest[j]:=char(w);
inc(j);
end;
$80..$7ff:
begin
if j+1>=MaxDestBytes then
break;
Dest[j]:=char($c0 or (w shr 6));
Dest[j+1]:=char($80 or (w and $3f));
inc(j,2);
end;
$800..$d7ff,$e000..$ffff:
begin
if j+2>=MaxDestBytes then
break;
Dest[j]:=char($e0 or (w shr 12));
Dest[j+1]:=char($80 or ((w shr 6) and $3f));
Dest[j+2]:=char($80 or (w and $3f));
inc(j,3);
end;
$d800..$dbff:
{High Surrogates}
begin
if j+3>=MaxDestBytes then
break;
if (i<sourcechars-1) and
(word(Source[i+1]) >= $dc00) and
(word(Source[i+1]) <= $dfff) then
begin
lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len));
Dest[j]:=char($f0 or (lw shr 18));
Dest[j+1]:=char($80 or ((lw shr 12) and $3f));
Dest[j+2]:=char($80 or ((lw shr 6) and $3f));
Dest[j+3]:=char($80 or (lw and $3f));
inc(j,4);
inc(i);
end;
end;
end;
inc(i);
end;
if j>SizeUInt(MaxDestBytes-1) then
j:=MaxDestBytes-1;
Dest[j]:=#0;
end
else
begin
while i<SourceChars do
begin
case word(Source[i]) of
$0..$7f:
inc(j);
$80..$7ff:
inc(j,2);
$800..$d7ff,$e000..$ffff:
inc(j,3);
$d800..$dbff:
begin
if (i<sourcechars-1) and
(word(Source[i+1]) >= $dc00) and
(word(Source[i+1]) <= $dfff) then
begin
inc(j,4);
inc(i);
end;
end;
end;
inc(i);
end;
end;
result:=j+1;
end;
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
if assigned(Source) then
Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source))
else
Result:=0;
end;
function UTF8ToUnicode(Dest: PWideChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
const
UNICODE_INVALID=63;
var
InputUTF8: SizeUInt;
IBYTE: BYTE;
OutputUnicode: SizeUInt;
PRECHAR: SizeUInt;
TempBYTE: BYTE;
CharLen: SizeUint;
LookAhead: SizeUInt;
UC: SizeUInt;
begin
if not assigned(Source) then
begin
result:=0;
exit;
end;
result:=SizeUInt(-1);
InputUTF8:=0;
OutputUnicode:=0;
PreChar:=0;
if Assigned(Dest) Then
begin
while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
begin
IBYTE:=byte(Source[InputUTF8]);
if (IBYTE and $80) = 0 then
begin
//One character US-ASCII, convert it to unicode
if IBYTE = 10 then
begin
If (PreChar<>13) and FALSE then
begin
//Expand to crlf, conform UTF-8.
//This procedure will break the memory alocation by
//FPC for the widestring, so never use it. Condition never true due the "and FALSE".
if OutputUnicode+1<MaxDestChars then
begin
Dest[OutputUnicode]:=WideChar(13);
inc(OutputUnicode);
Dest[OutputUnicode]:=WideChar(10);
inc(OutputUnicode);
PreChar:=10;
end
else
begin
Dest[OutputUnicode]:=WideChar(13);
inc(OutputUnicode);
end;
end
else
begin
Dest[OutputUnicode]:=WideChar(IBYTE);
inc(OutputUnicode);
PreChar:=IBYTE;
end;
end
else
begin
Dest[OutputUnicode]:=WideChar(IBYTE);
inc(OutputUnicode);
PreChar:=IBYTE;
end;
inc(InputUTF8);
end
else
begin
TempByte:=IBYTE;
CharLen:=0;
while (TempBYTE and $80)<>0 do
begin
TempBYTE:=(TempBYTE shl 1) and $FE;
inc(CharLen);
end;
//Test for the "CharLen" conforms UTF-8 string
//This means the 10xxxxxx pattern.
if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
begin
//Insuficient chars in string to decode
//UTF-8 array. Fallback to single char.
CharLen:= 1;
end;
for LookAhead := 1 to CharLen-1 do
begin
if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
begin
//Invalid UTF-8 sequence, fallback.
CharLen:= LookAhead;
break;
end;
end;
UC:=$FFFF;
case CharLen of
1: begin
//Not valid UTF-8 sequence
UC:=UNICODE_INVALID;
end;
2: begin
//Two bytes UTF, convert it
UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
if UC <= $7F then
begin
//Invalid UTF sequence.
UC:=UNICODE_INVALID;
end;
end;
3: begin
//Three bytes, convert it to unicode
UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
begin
//Invalid UTF-8 sequence
UC:= UNICODE_INVALID;
End;
end;
4: begin
//Four bytes, convert it to two unicode characters
UC:= (byte(Source[InputUTF8]) and $07) shl 18;
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
if (UC < $10000) or (UC > $10FFFF) then
begin
UC:= UNICODE_INVALID;
end
else
begin
{ only store pair if room }
dec(UC,$10000);
if (OutputUnicode<MaxDestChars-1) then
begin
Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
inc(OutputUnicode);
UC:=(UC and $3ff) + $DC00;
end
else
begin
InputUTF8:= InputUTF8 + CharLen;
{ don't store anything }
CharLen:=0;
end;
end;
end;
5,6,7: begin
//Invalid UTF8 to unicode conversion,
//mask it as invalid UNICODE too.
UC:=UNICODE_INVALID;
end;
end;
if CharLen > 0 then
begin
PreChar:=UC;
Dest[OutputUnicode]:=WideChar(UC);
inc(OutputUnicode);
end;
InputUTF8:= InputUTF8 + CharLen;
end;
end;
Result:=OutputUnicode+1;
end
else
begin
while (InputUTF8<SourceBytes) do
begin
IBYTE:=byte(Source[InputUTF8]);
if (IBYTE and $80) = 0 then
begin
//One character US-ASCII, convert it to unicode
if IBYTE = 10 then
begin
if (PreChar<>13) and FALSE then
begin
//Expand to crlf, conform UTF-8.
//This procedure will break the memory alocation by
//FPC for the widestring, so never use it. Condition never true due the "and FALSE".
inc(OutputUnicode,2);
PreChar:=10;
end
else
begin
inc(OutputUnicode);
PreChar:=IBYTE;
end;
end
else
begin
inc(OutputUnicode);
PreChar:=IBYTE;
end;
inc(InputUTF8);
end
else
begin
TempByte:=IBYTE;
CharLen:=0;
while (TempBYTE and $80)<>0 do
begin
TempBYTE:=(TempBYTE shl 1) and $FE;
inc(CharLen);
end;
//Test for the "CharLen" conforms UTF-8 string
//This means the 10xxxxxx pattern.
if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
begin
//Insuficient chars in string to decode
//UTF-8 array. Fallback to single char.
CharLen:= 1;
end;
for LookAhead := 1 to CharLen-1 do
begin
if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
begin
//Invalid UTF-8 sequence, fallback.
CharLen:= LookAhead;
break;
end;
end;
UC:=$FFFF;
case CharLen of
1: begin
//Not valid UTF-8 sequence
UC:=UNICODE_INVALID;
end;
2: begin
//Two bytes UTF, convert it
UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
if UC <= $7F then
begin
//Invalid UTF sequence.
UC:=UNICODE_INVALID;
end;
end;
3: begin
//Three bytes, convert it to unicode
UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
begin
//Invalid UTF-8 sequence
UC:= UNICODE_INVALID;
end;
end;
4: begin
//Four bytes, convert it to two unicode characters
UC:= (byte(Source[InputUTF8]) and $07) shl 18;
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
if (UC < $10000) or (UC > $10FFFF) then
UC:= UNICODE_INVALID
else
{ extra character character }
inc(OutputUnicode);
end;
5,6,7: begin
//Invalid UTF8 to unicode conversion,
//mask it as invalid UNICODE too.
UC:=UNICODE_INVALID;
end;
end;
if CharLen > 0 then
begin
PreChar:=UC;
inc(OutputUnicode);
end;
InputUTF8:= InputUTF8 + CharLen;
end;
end;
Result:=OutputUnicode+1;
end;
end;
function UTF8Encode(const s : WideString) : RawByteString;
var
i : SizeInt;