mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-13 14:49:47 +02:00
- Removed string conversion helpers that are not actually used by compiler code generation (duplicated pwidechar/punicodechar stuff).
- Removed one copy of duplicated utf8 encode/decode procedures. git-svn-id: trunk@20262 -
This commit is contained in:
parent
3d2a27c66c
commit
f07d02b427
@ -281,12 +281,7 @@ Function fpc_widestr_Copy (Const S : WideString; Index,Size : SizeInt) : WideSt
|
||||
{$ifndef FPC_WINLIKEWIDESTRING}
|
||||
function fpc_widestr_Unique(Var S : Pointer): Pointer; compilerproc;
|
||||
{$endif FPC_WINLIKEWIDESTRING}
|
||||
Function fpc_WChar_To_AnsiStr(const c : WideChar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): AnsiString; compilerproc;
|
||||
Function fpc_WChar_To_WideStr(const c : WideChar): WideString; compilerproc;
|
||||
Function fpc_UChar_To_WideStr(const c : WideChar): WideString; compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
Function fpc_PWideChar_To_WideStr(const p : pwidechar): widestring; compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
{$endif ndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
|
||||
@ -312,12 +307,6 @@ Function fpc_Char_To_UnicodeStr(const c : Char): UnicodeString; compilerproc;
|
||||
Function fpc_PChar_To_UnicodeStr(const p : pchar): UnicodeString; compilerproc;
|
||||
Function fpc_CharArray_To_UnicodeStr(const arr: array of char; zerobased: boolean = true): UnicodeString; compilerproc;
|
||||
procedure fpc_unicodestr_to_chararray(out res: array of char; const src: UnicodeString); compilerproc;
|
||||
procedure fpc_shortstr_to_unicodechararray(out res: array of unicodechar; const src: ShortString); compilerproc;
|
||||
procedure fpc_ansistr_to_unicodechararray(out res: array of unicodechar; const src: AnsiString); compilerproc;
|
||||
procedure fpc_unicodestr_to_unicodechararray(out res: array of unicodechar; const src: UnicodeString); compilerproc;
|
||||
procedure fpc_UnicodeCharArray_To_ShortStr(out res : shortstring;const arr: array of unicodechar; zerobased: boolean = true); compilerproc;
|
||||
Function fpc_UnicodeCharArray_To_AnsiStr(const arr: array of unicodechar; {$ifdef FPC_HAS_CPSTRING}cp : TSystemCodePage;{$endif FPC_HAS_CPSTRING} zerobased: boolean = true): AnsiString; compilerproc;
|
||||
Function fpc_UnicodeCharArray_To_UnicodeStr(const arr: array of unicodechar; zerobased: boolean = true): UnicodeString; compilerproc;
|
||||
procedure fpc_WideCharArray_To_ShortStr(out res : shortstring;const arr: array of widechar; zerobased: boolean = true); compilerproc;
|
||||
Function fpc_WideCharArray_To_AnsiStr(const arr: array of widechar; {$ifdef FPC_HAS_CPSTRING}cp : TSystemCodePage;{$endif FPC_HAS_CPSTRING} zerobased: boolean = true): AnsiString; compilerproc;
|
||||
Function fpc_WideCharArray_To_WideStr(const arr: array of widechar; zerobased: boolean = true): WideString; compilerproc;
|
||||
@ -341,31 +330,16 @@ function fpc_unicodestr_Unique(Var S : Pointer): Pointer; compilerproc;
|
||||
Function fpc_Char_To_UChar(const c : Char): UnicodeChar; compilerproc;
|
||||
Function fpc_UChar_To_Char(const c : UnicodeChar): Char; compilerproc;
|
||||
Function fpc_UChar_To_UnicodeStr(const c : UnicodeChar): UnicodeString; compilerproc;
|
||||
Function fpc_WChar_To_UnicodeStr(const c : WideChar): UnicodeString; compilerproc;
|
||||
Function fpc_UChar_To_AnsiStr(const c : UnicodeChar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): AnsiString; compilerproc;
|
||||
procedure fpc_WChar_To_ShortStr(out res : shortstring;const c : WideChar) compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
Function fpc_PUnicodeChar_To_AnsiStr(const p : punicodechar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): ansistring; compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
Function fpc_PUnicodeChar_To_UnicodeStr(const p : punicodechar): unicodestring; compilerproc;
|
||||
Function fpc_PWideChar_To_UnicodeStr(const p : pwidechar): unicodestring; compilerproc;
|
||||
procedure fpc_PUnicodeChar_To_ShortStr(out res : shortstring;const p : punicodechar); compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
Function fpc_PWideChar_To_AnsiStr(const p : pwidechar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): ansistring; compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
procedure fpc_PWideChar_To_ShortStr(out res : shortstring;const p : pwidechar); compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
Function fpc_Char_To_WChar(const c : Char): WideChar; compilerproc;
|
||||
Function fpc_WChar_To_Char(const c : WideChar): Char; compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
|
||||
{ $ifdef FPC_HAS_FEATURE_TEXTIO}
|
||||
{ from text.inc }
|
||||
|
@ -316,39 +316,6 @@ Function fpc_WideStr_To_UnicodeStr (Const S2 : WideString): UnicodeString; compi
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_PUnicodeChar_To_AnsiStr(const p : punicodechar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): ansistring; compilerproc;
|
||||
var
|
||||
Size : SizeInt;
|
||||
{$ifndef FPC_HAS_CPSTRING}
|
||||
cp : TSystemCodePage;
|
||||
{$endif FPC_HAS_CPSTRING}
|
||||
begin
|
||||
{$ifndef FPC_HAS_CPSTRING}
|
||||
cp:=DefaultSystemCodePage;
|
||||
{$endif FPC_HAS_CPSTRING}
|
||||
result:='';
|
||||
if p=nil then
|
||||
exit;
|
||||
Size := IndexWord(p^, -1, 0);
|
||||
if Size>0 then
|
||||
widestringmanager.Unicode2AnsiMoveProc(P,result,cp,Size);
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_PUnicodeChar_To_UnicodeStr(const p : punicodechar): unicodestring; compilerproc;
|
||||
var
|
||||
Size : SizeInt;
|
||||
begin
|
||||
result:='';
|
||||
if p=nil then
|
||||
exit;
|
||||
Size := IndexWord(p^, -1, 0);
|
||||
Setlength(result,Size);
|
||||
if Size>0 then
|
||||
Move(p^,PUnicodeChar(Pointer(result))^,Size*sizeof(UnicodeChar));
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_PWideChar_To_UnicodeStr(const p : pwidechar): unicodestring; compilerproc;
|
||||
var
|
||||
Size : SizeInt;
|
||||
@ -363,23 +330,6 @@ begin
|
||||
end;
|
||||
|
||||
|
||||
procedure fpc_PUnicodeChar_To_ShortStr(out res : shortstring;const p : punicodechar); compilerproc;
|
||||
var
|
||||
Size : SizeInt;
|
||||
temp: ansistring;
|
||||
begin
|
||||
res:='';
|
||||
if p=nil then
|
||||
exit;
|
||||
Size:=IndexWord(p^, high(PtrInt), 0);
|
||||
if Size>0 then
|
||||
begin
|
||||
widestringmanager.Unicode2AnsiMoveProc(p,temp,DefaultSystemCodePage,Size);
|
||||
res:=temp;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_PWideChar_To_AnsiStr(const p : pwidechar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): ansistring; compilerproc;
|
||||
var
|
||||
Size : SizeInt;
|
||||
@ -577,40 +527,6 @@ begin
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_WChar_To_UnicodeStr(const c : WideChar): UnicodeString; compilerproc;
|
||||
{
|
||||
Converts a WideChar to a UnicodeString;
|
||||
}
|
||||
begin
|
||||
Setlength (Result,1);
|
||||
Result[1]:= c;
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_Char_To_WChar(const c : Char): WideChar; compilerproc;
|
||||
var
|
||||
w: widestring;
|
||||
begin
|
||||
widestringmanager.Ansi2WideMoveProc(@c,DefaultSystemCodePage,w,1);
|
||||
fpc_Char_To_WChar:=w[1];
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_WChar_To_Char(const c : WideChar): Char; compilerproc;
|
||||
{
|
||||
Converts a WideChar to a Char;
|
||||
}
|
||||
var
|
||||
s: ansistring;
|
||||
begin
|
||||
widestringmanager.Wide2AnsiMoveProc(@c, s, DefaultSystemCodePage, 1);
|
||||
if length(s)=1 then
|
||||
fpc_WChar_To_Char:= s[1]
|
||||
else
|
||||
fpc_WChar_To_Char:='?';
|
||||
end;
|
||||
|
||||
|
||||
procedure fpc_WChar_To_ShortStr(out res : shortstring;const c : WideChar) compilerproc;
|
||||
{
|
||||
Converts a WideChar to a ShortString;
|
||||
@ -651,18 +567,6 @@ begin
|
||||
end;
|
||||
|
||||
|
||||
procedure fpc_UChar_To_ShortStr(out res : shortstring;const c : UnicodeChar) compilerproc;
|
||||
{
|
||||
Converts a UnicodeChar to a ShortString;
|
||||
}
|
||||
var
|
||||
s: ansistring;
|
||||
begin
|
||||
widestringmanager.Unicode2AnsiMoveProc(@c,s,DefaultSystemCodePage,1);
|
||||
res:=s;
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_PChar_To_UnicodeStr(const p : pchar): UnicodeString; compilerproc;
|
||||
Var
|
||||
L : SizeInt;
|
||||
@ -698,71 +602,6 @@ begin
|
||||
end;
|
||||
|
||||
|
||||
procedure fpc_UnicodeCharArray_To_ShortStr(out res : shortstring;const arr: array of unicodechar; zerobased: boolean = true);[public,alias:'FPC_UNICODECHARARRAY_TO_SHORTSTR']; compilerproc;
|
||||
var
|
||||
l: longint;
|
||||
index: ptrint;
|
||||
len: byte;
|
||||
temp: ansistring;
|
||||
begin
|
||||
l := high(arr)+1;
|
||||
if l>=high(res)+1 then
|
||||
l:=high(res)
|
||||
else if l<0 then
|
||||
l:=0;
|
||||
if zerobased then
|
||||
begin
|
||||
index:=IndexWord(arr[0],l,0);
|
||||
if index<0 then
|
||||
len:=l
|
||||
else
|
||||
len:=index;
|
||||
end
|
||||
else
|
||||
len:=l;
|
||||
widestringmanager.Unicode2AnsiMoveProc (punicodechar(@arr),temp,DefaultSystemCodePage,len);
|
||||
res:=temp;
|
||||
end;
|
||||
|
||||
Function fpc_UnicodeCharArray_To_AnsiStr(const arr: array of unicodechar; {$ifdef FPC_HAS_CPSTRING}cp : TSystemCodePage;{$endif FPC_HAS_CPSTRING}zerobased: boolean = true): AnsiString; compilerproc;
|
||||
var
|
||||
i : SizeInt;
|
||||
{$ifndef FPC_HAS_CPSTRING}
|
||||
cp : TSystemCodePage;
|
||||
{$endif FPC_HAS_CPSTRING}
|
||||
begin
|
||||
{$ifndef FPC_HAS_CPSTRING}
|
||||
cp:=DefaultSystemCodePage;
|
||||
{$endif FPC_HAS_CPSTRING}
|
||||
if (zerobased) then
|
||||
begin
|
||||
i:=IndexWord(arr,high(arr)+1,0);
|
||||
if i = -1 then
|
||||
i := high(arr)+1;
|
||||
end
|
||||
else
|
||||
i := high(arr)+1;
|
||||
widestringmanager.Unicode2AnsiMoveProc (punicodechar(@arr),fpc_UnicodeCharArray_To_AnsiStr,cp,i);
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_UnicodeCharArray_To_UnicodeStr(const arr: array of unicodechar; zerobased: boolean = true): UnicodeString; compilerproc;
|
||||
var
|
||||
i : SizeInt;
|
||||
begin
|
||||
if (zerobased) then
|
||||
begin
|
||||
i:=IndexWord(arr,high(arr)+1,0);
|
||||
if i = -1 then
|
||||
i := high(arr)+1;
|
||||
end
|
||||
else
|
||||
i := high(arr)+1;
|
||||
SetLength(fpc_UnicodeCharArray_To_UnicodeStr,i);
|
||||
Move(arr[0], Pointer(fpc_UnicodeCharArray_To_UnicodeStr)^,i*sizeof(UnicodeChar));
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_WideCharArray_To_UnicodeStr(const arr: array of widechar; zerobased: boolean = true): UnicodeString; compilerproc;
|
||||
var
|
||||
i : SizeInt;
|
||||
@ -866,61 +705,6 @@ begin
|
||||
end;
|
||||
|
||||
|
||||
procedure fpc_unicodestr_to_unicodechararray(out res: array of unicodechar; const src: UnicodeString); compilerproc;
|
||||
var
|
||||
len: SizeInt;
|
||||
begin
|
||||
len := length(src);
|
||||
if len > length(res) then
|
||||
len := length(res);
|
||||
{$push}
|
||||
{$r-}
|
||||
{ make sure we don't try to access element 1 of the ansistring if it's nil }
|
||||
if len > 0 then
|
||||
move(src[1],res[0],len*SizeOf(UnicodeChar));
|
||||
fillchar(res[len],(length(res)-len)*SizeOf(UnicodeChar),0);
|
||||
{$pop}
|
||||
end;
|
||||
|
||||
|
||||
procedure fpc_ansistr_to_unicodechararray(out res: array of unicodechar; const src: AnsiString); compilerproc;
|
||||
var
|
||||
len: SizeInt;
|
||||
temp: unicodestring;
|
||||
begin
|
||||
len := length(src);
|
||||
{ make sure we don't dereference src if it can be nil (JM) }
|
||||
if len > 0 then
|
||||
widestringmanager.ansi2unicodemoveproc(pchar(@src[1]),DefaultSystemCodePage,temp,len);
|
||||
len := length(temp);
|
||||
if len > length(res) then
|
||||
len := length(res);
|
||||
{$push}
|
||||
{$r-}
|
||||
move(temp[1],res[0],len*sizeof(unicodechar));
|
||||
fillchar(res[len],(length(res)-len)*SizeOf(UnicodeChar),0);
|
||||
{$pop}
|
||||
end;
|
||||
|
||||
procedure fpc_shortstr_to_unicodechararray(out res: array of unicodechar; const src: ShortString); compilerproc;
|
||||
var
|
||||
len: longint;
|
||||
temp : unicodestring;
|
||||
begin
|
||||
len := length(src);
|
||||
{ make sure we don't access char 1 if length is 0 (JM) }
|
||||
if len > 0 then
|
||||
widestringmanager.ansi2unicodemoveproc(pchar(@src[1]),DefaultSystemCodePage,temp,len);
|
||||
len := length(temp);
|
||||
if len > length(res) then
|
||||
len := length(res);
|
||||
{$push}
|
||||
{$r-}
|
||||
move(temp[1],res[0],len*sizeof(unicodechar));
|
||||
fillchar(res[len],(length(res)-len)*SizeOf(UnicodeChar),0);
|
||||
{$pop}
|
||||
end;
|
||||
|
||||
procedure fpc_ansistr_to_widechararray(out res: array of widechar; const src: RawByteString); compilerproc;
|
||||
var
|
||||
len: SizeInt;
|
||||
|
@ -36,10 +36,6 @@ procedure DefaultAnsi2WideMove(source:pchar;cp : TSystemCodePage;var dest:widest
|
||||
type
|
||||
TWideStringManager = TUnicodeStringManager;
|
||||
|
||||
function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PWideChar; SourceChars: SizeUInt): SizeUInt;
|
||||
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
|
||||
function UTF8Encode(const s : WideString) : RawByteString;
|
||||
|
||||
{$ifdef MSWINDOWS}
|
||||
|
@ -349,27 +349,7 @@ Function fpc_Char_To_WideStr(const c : Char): WideString; compilerproc;
|
||||
Converts a Char to a WideString;
|
||||
}
|
||||
begin
|
||||
Setlength(fpc_Char_To_WideStr,1);
|
||||
fpc_Char_To_WideStr[1]:=c;
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_WChar_To_WideStr(const c : WideChar): WideString; compilerproc;
|
||||
{
|
||||
Converts a WideChar to a WideString;
|
||||
}
|
||||
begin
|
||||
Setlength (fpc_WChar_To_WideStr,1);
|
||||
fpc_WChar_To_WideStr[1]:= c;
|
||||
end;
|
||||
|
||||
|
||||
Function fpc_WChar_To_AnsiStr(const c : WideChar{$ifdef FPC_HAS_CPSTRING};cp : TSystemCodePage{$endif FPC_HAS_CPSTRING}): AnsiString; compilerproc;
|
||||
{
|
||||
Converts a WideChar to a AnsiString;
|
||||
}
|
||||
begin
|
||||
widestringmanager.Wide2AnsiMoveProc(@c, fpc_WChar_To_AnsiStr,{$ifdef FPC_HAS_CPSTRING}cp{$else}TSystemCodePage(0){$endif FPC_HAS_CPSTRING}, 1);
|
||||
widestringmanager.Ansi2WideMoveProc(@c,DefaultSystemCodePage,fpc_char_To_WideStr,1);
|
||||
end;
|
||||
|
||||
|
||||
@ -967,435 +947,6 @@ end;
|
||||
|
||||
{$endif CPU64}
|
||||
|
||||
{ converts an utf-16 code point or surrogate pair to utf-32 }
|
||||
function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_WIDETOUTF32'];
|
||||
var
|
||||
w: widechar;
|
||||
begin
|
||||
{ UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
|
||||
{ are the same in UTF-32 }
|
||||
w:=s[index];
|
||||
if (w<=#$d7ff) or
|
||||
(w>=#$e000) then
|
||||
begin
|
||||
result:=UCS4Char(w);
|
||||
len:=1;
|
||||
end
|
||||
{ valid surrogate pair? }
|
||||
else if (w<=#$dbff) and
|
||||
{ w>=#$d7ff check not needed, checked above }
|
||||
(index<length(s)) and
|
||||
(s[index+1]>=#$dc00) and
|
||||
(s[index+1]<=#$dfff) then
|
||||
{ convert the surrogate pair to UTF-32 }
|
||||
begin
|
||||
result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
|
||||
len:=2;
|
||||
end
|
||||
else
|
||||
{ invalid surrogate -> do nothing }
|
||||
begin
|
||||
result:=UCS4Char(w);
|
||||
len:=1;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
begin
|
||||
if assigned(Source) then
|
||||
Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0))
|
||||
else
|
||||
Result:=0;
|
||||
end;
|
||||
|
||||
|
||||
function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PWideChar; SourceChars: SizeUInt): SizeUInt;
|
||||
var
|
||||
i,j : SizeUInt;
|
||||
w : word;
|
||||
lw : longword;
|
||||
len : longint;
|
||||
begin
|
||||
result:=0;
|
||||
if source=nil then
|
||||
exit;
|
||||
i:=0;
|
||||
j:=0;
|
||||
if assigned(Dest) then
|
||||
begin
|
||||
while (i<SourceChars) and (j<MaxDestBytes) do
|
||||
begin
|
||||
w:=word(Source[i]);
|
||||
case w of
|
||||
0..$7f:
|
||||
begin
|
||||
Dest[j]:=char(w);
|
||||
inc(j);
|
||||
end;
|
||||
$80..$7ff:
|
||||
begin
|
||||
if j+1>=MaxDestBytes then
|
||||
break;
|
||||
Dest[j]:=char($c0 or (w shr 6));
|
||||
Dest[j+1]:=char($80 or (w and $3f));
|
||||
inc(j,2);
|
||||
end;
|
||||
$800..$d7ff,$e000..$ffff:
|
||||
begin
|
||||
if j+2>=MaxDestBytes then
|
||||
break;
|
||||
Dest[j]:=char($e0 or (w shr 12));
|
||||
Dest[j+1]:=char($80 or ((w shr 6) and $3f));
|
||||
Dest[j+2]:=char($80 or (w and $3f));
|
||||
inc(j,3);
|
||||
end;
|
||||
$d800..$dbff:
|
||||
{High Surrogates}
|
||||
begin
|
||||
if j+3>=MaxDestBytes then
|
||||
break;
|
||||
if (i<sourcechars-1) and
|
||||
(word(Source[i+1]) >= $dc00) and
|
||||
(word(Source[i+1]) <= $dfff) then
|
||||
begin
|
||||
lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len));
|
||||
Dest[j]:=char($f0 or (lw shr 18));
|
||||
Dest[j+1]:=char($80 or ((lw shr 12) and $3f));
|
||||
Dest[j+2]:=char($80 or ((lw shr 6) and $3f));
|
||||
Dest[j+3]:=char($80 or (lw and $3f));
|
||||
inc(j,4);
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
|
||||
if j>SizeUInt(MaxDestBytes-1) then
|
||||
j:=MaxDestBytes-1;
|
||||
|
||||
Dest[j]:=#0;
|
||||
end
|
||||
else
|
||||
begin
|
||||
while i<SourceChars do
|
||||
begin
|
||||
case word(Source[i]) of
|
||||
$0..$7f:
|
||||
inc(j);
|
||||
$80..$7ff:
|
||||
inc(j,2);
|
||||
$800..$d7ff,$e000..$ffff:
|
||||
inc(j,3);
|
||||
$d800..$dbff:
|
||||
begin
|
||||
if (i<sourcechars-1) and
|
||||
(word(Source[i+1]) >= $dc00) and
|
||||
(word(Source[i+1]) <= $dfff) then
|
||||
begin
|
||||
inc(j,4);
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
result:=j+1;
|
||||
end;
|
||||
|
||||
|
||||
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
begin
|
||||
if assigned(Source) then
|
||||
Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source))
|
||||
else
|
||||
Result:=0;
|
||||
end;
|
||||
|
||||
|
||||
function UTF8ToUnicode(Dest: PWideChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
|
||||
const
|
||||
UNICODE_INVALID=63;
|
||||
var
|
||||
InputUTF8: SizeUInt;
|
||||
IBYTE: BYTE;
|
||||
OutputUnicode: SizeUInt;
|
||||
PRECHAR: SizeUInt;
|
||||
TempBYTE: BYTE;
|
||||
CharLen: SizeUint;
|
||||
LookAhead: SizeUInt;
|
||||
UC: SizeUInt;
|
||||
begin
|
||||
if not assigned(Source) then
|
||||
begin
|
||||
result:=0;
|
||||
exit;
|
||||
end;
|
||||
result:=SizeUInt(-1);
|
||||
InputUTF8:=0;
|
||||
OutputUnicode:=0;
|
||||
PreChar:=0;
|
||||
if Assigned(Dest) Then
|
||||
begin
|
||||
while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
|
||||
begin
|
||||
IBYTE:=byte(Source[InputUTF8]);
|
||||
if (IBYTE and $80) = 0 then
|
||||
begin
|
||||
//One character US-ASCII, convert it to unicode
|
||||
if IBYTE = 10 then
|
||||
begin
|
||||
If (PreChar<>13) and FALSE then
|
||||
begin
|
||||
//Expand to crlf, conform UTF-8.
|
||||
//This procedure will break the memory alocation by
|
||||
//FPC for the widestring, so never use it. Condition never true due the "and FALSE".
|
||||
if OutputUnicode+1<MaxDestChars then
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(13);
|
||||
inc(OutputUnicode);
|
||||
Dest[OutputUnicode]:=WideChar(10);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=10;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(13);
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(IBYTE);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(IBYTE);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
inc(InputUTF8);
|
||||
end
|
||||
else
|
||||
begin
|
||||
TempByte:=IBYTE;
|
||||
CharLen:=0;
|
||||
while (TempBYTE and $80)<>0 do
|
||||
begin
|
||||
TempBYTE:=(TempBYTE shl 1) and $FE;
|
||||
inc(CharLen);
|
||||
end;
|
||||
//Test for the "CharLen" conforms UTF-8 string
|
||||
//This means the 10xxxxxx pattern.
|
||||
if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
|
||||
begin
|
||||
//Insuficient chars in string to decode
|
||||
//UTF-8 array. Fallback to single char.
|
||||
CharLen:= 1;
|
||||
end;
|
||||
for LookAhead := 1 to CharLen-1 do
|
||||
begin
|
||||
if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
|
||||
((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence, fallback.
|
||||
CharLen:= LookAhead;
|
||||
break;
|
||||
end;
|
||||
end;
|
||||
UC:=$FFFF;
|
||||
case CharLen of
|
||||
1: begin
|
||||
//Not valid UTF-8 sequence
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
2: begin
|
||||
//Two bytes UTF, convert it
|
||||
UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
|
||||
UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
|
||||
if UC <= $7F then
|
||||
begin
|
||||
//Invalid UTF sequence.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
3: begin
|
||||
//Three bytes, convert it to unicode
|
||||
UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
|
||||
if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence
|
||||
UC:= UNICODE_INVALID;
|
||||
End;
|
||||
end;
|
||||
4: begin
|
||||
//Four bytes, convert it to two unicode characters
|
||||
UC:= (byte(Source[InputUTF8]) and $07) shl 18;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
|
||||
if (UC < $10000) or (UC > $10FFFF) then
|
||||
begin
|
||||
UC:= UNICODE_INVALID;
|
||||
end
|
||||
else
|
||||
begin
|
||||
{ only store pair if room }
|
||||
dec(UC,$10000);
|
||||
if (OutputUnicode<MaxDestChars-1) then
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
|
||||
inc(OutputUnicode);
|
||||
UC:=(UC and $3ff) + $DC00;
|
||||
end
|
||||
else
|
||||
begin
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
{ don't store anything }
|
||||
CharLen:=0;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
5,6,7: begin
|
||||
//Invalid UTF8 to unicode conversion,
|
||||
//mask it as invalid UNICODE too.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
if CharLen > 0 then
|
||||
begin
|
||||
PreChar:=UC;
|
||||
Dest[OutputUnicode]:=WideChar(UC);
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
end;
|
||||
end;
|
||||
Result:=OutputUnicode+1;
|
||||
end
|
||||
else
|
||||
begin
|
||||
while (InputUTF8<SourceBytes) do
|
||||
begin
|
||||
IBYTE:=byte(Source[InputUTF8]);
|
||||
if (IBYTE and $80) = 0 then
|
||||
begin
|
||||
//One character US-ASCII, convert it to unicode
|
||||
if IBYTE = 10 then
|
||||
begin
|
||||
if (PreChar<>13) and FALSE then
|
||||
begin
|
||||
//Expand to crlf, conform UTF-8.
|
||||
//This procedure will break the memory alocation by
|
||||
//FPC for the widestring, so never use it. Condition never true due the "and FALSE".
|
||||
inc(OutputUnicode,2);
|
||||
PreChar:=10;
|
||||
end
|
||||
else
|
||||
begin
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
inc(InputUTF8);
|
||||
end
|
||||
else
|
||||
begin
|
||||
TempByte:=IBYTE;
|
||||
CharLen:=0;
|
||||
while (TempBYTE and $80)<>0 do
|
||||
begin
|
||||
TempBYTE:=(TempBYTE shl 1) and $FE;
|
||||
inc(CharLen);
|
||||
end;
|
||||
//Test for the "CharLen" conforms UTF-8 string
|
||||
//This means the 10xxxxxx pattern.
|
||||
if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
|
||||
begin
|
||||
//Insuficient chars in string to decode
|
||||
//UTF-8 array. Fallback to single char.
|
||||
CharLen:= 1;
|
||||
end;
|
||||
for LookAhead := 1 to CharLen-1 do
|
||||
begin
|
||||
if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
|
||||
((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence, fallback.
|
||||
CharLen:= LookAhead;
|
||||
break;
|
||||
end;
|
||||
end;
|
||||
UC:=$FFFF;
|
||||
case CharLen of
|
||||
1: begin
|
||||
//Not valid UTF-8 sequence
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
2: begin
|
||||
//Two bytes UTF, convert it
|
||||
UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
|
||||
UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
|
||||
if UC <= $7F then
|
||||
begin
|
||||
//Invalid UTF sequence.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
3: begin
|
||||
//Three bytes, convert it to unicode
|
||||
UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
|
||||
If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence
|
||||
UC:= UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
4: begin
|
||||
//Four bytes, convert it to two unicode characters
|
||||
UC:= (byte(Source[InputUTF8]) and $07) shl 18;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
|
||||
if (UC < $10000) or (UC > $10FFFF) then
|
||||
UC:= UNICODE_INVALID
|
||||
else
|
||||
{ extra character character }
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
5,6,7: begin
|
||||
//Invalid UTF8 to unicode conversion,
|
||||
//mask it as invalid UNICODE too.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
if CharLen > 0 then
|
||||
begin
|
||||
PreChar:=UC;
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
end;
|
||||
end;
|
||||
Result:=OutputUnicode+1;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
function UTF8Encode(const s : WideString) : RawByteString;
|
||||
var
|
||||
i : SizeInt;
|
||||
|
Loading…
Reference in New Issue
Block a user