mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-08 10:48:12 +02:00
* fixed UTF8ToUnicode() based on patch by JoshyFun, and also added
support for 4-character UTF-8 codepoints (mantis #11791) * fixed UnicodeToUtf8() based on patch by A. J. Miller (mantis #13075) git-svn-id: trunk@12902 -
This commit is contained in:
parent
a43bda5052
commit
d67dbcf030
2
.gitattributes
vendored
2
.gitattributes
vendored
@ -8726,6 +8726,7 @@ tests/webtbs/tw11711.pp svneol=native#text/plain
|
||||
tests/webtbs/tw11762.pp svneol=native#text/plain
|
||||
tests/webtbs/tw11763.pp svneol=native#text/plain
|
||||
tests/webtbs/tw11786.pp svneol=native#text/plain
|
||||
tests/webtbs/tw11791.pp svneol=native#text/plain
|
||||
tests/webtbs/tw1181.pp svneol=native#text/plain
|
||||
tests/webtbs/tw11825.pp svneol=native#text/plain
|
||||
tests/webtbs/tw11846a.pp svneol=native#text/plain
|
||||
@ -8788,6 +8789,7 @@ tests/webtbs/tw1299.pp svneol=native#text/plain
|
||||
tests/webtbs/tw12993.pp svneol=native#text/plain
|
||||
tests/webtbs/tw13015.pp svneol=native#text/plain
|
||||
tests/webtbs/tw13019.pp svneol=native#text/plain
|
||||
tests/webtbs/tw13075.pp svneol=native#text/plain
|
||||
tests/webtbs/tw1310.pp svneol=native#text/plain
|
||||
tests/webtbs/tw13133.pp svneol=native#text/plain
|
||||
tests/webtbs/tw1318.pp svneol=native#text/plain
|
||||
|
@ -1857,6 +1857,40 @@ end;
|
||||
|
||||
{$endif CPU64}
|
||||
|
||||
{ converts an utf-16 code point or surrogate pair to utf-32 }
|
||||
function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32'];
|
||||
var
|
||||
w: unicodechar;
|
||||
begin
|
||||
{ UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
|
||||
{ are the same in UTF-32 }
|
||||
w:=s[index];
|
||||
if (w<=#$d7ff) or
|
||||
(w>=#$e000) then
|
||||
begin
|
||||
result:=UCS4Char(w);
|
||||
len:=1;
|
||||
end
|
||||
{ valid surrogate pair? }
|
||||
else if (w<=#$dbff) and
|
||||
{ w>=#$d7ff check not needed, checked above }
|
||||
(index<length(s)) and
|
||||
(s[index+1]>=#$dc00) and
|
||||
(s[index+1]<=#$dfff) then
|
||||
{ convert the surrogate pair to UTF-32 }
|
||||
begin
|
||||
result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
|
||||
len:=2;
|
||||
end
|
||||
else
|
||||
{ invalid surrogate -> do nothing }
|
||||
begin
|
||||
result:=UCS4Char(w);
|
||||
len:=1;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
begin
|
||||
if assigned(Source) then
|
||||
@ -1870,6 +1904,8 @@ function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar
|
||||
var
|
||||
i,j : SizeUInt;
|
||||
w : word;
|
||||
lw : longword;
|
||||
len : longint;
|
||||
begin
|
||||
result:=0;
|
||||
if source=nil then
|
||||
@ -1895,16 +1931,34 @@ function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar
|
||||
Dest[j+1]:=char($80 or (w and $3f));
|
||||
inc(j,2);
|
||||
end;
|
||||
else
|
||||
$800..$d7ff,$e000..$ffff:
|
||||
begin
|
||||
if j+2>=MaxDestBytes then
|
||||
break;
|
||||
Dest[j]:=char($e0 or (w shr 12));
|
||||
Dest[j+1]:=char($80 or ((w shr 6)and $3f));
|
||||
Dest[j+2]:=char($80 or (w and $3f));
|
||||
inc(j,3);
|
||||
if j+2>=MaxDestBytes then
|
||||
break;
|
||||
Dest[j]:=char($e0 or (w shr 12));
|
||||
Dest[j+1]:=char($80 or ((w shr 6) and $3f));
|
||||
Dest[j+2]:=char($80 or (w and $3f));
|
||||
inc(j,3);
|
||||
end;
|
||||
end;
|
||||
$d800..$dbff:
|
||||
{High Surrogates}
|
||||
begin
|
||||
if j+3>=MaxDestBytes then
|
||||
break;
|
||||
if (i<sourcechars-1) and
|
||||
(word(Source[i+1]) >= $dc00) and
|
||||
(word(Source[i+1]) <= $dfff) then
|
||||
begin
|
||||
lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len));
|
||||
Dest[j]:=char($f0 or (lw shr 18));
|
||||
Dest[j+1]:=char($80 or ((lw shr 12) and $3f));
|
||||
Dest[j+2]:=char($80 or ((lw shr 6) and $3f));
|
||||
Dest[j+3]:=char($80 or (lw and $3f));
|
||||
inc(j,4);
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
|
||||
@ -1922,8 +1976,18 @@ function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar
|
||||
inc(j);
|
||||
$80..$7ff:
|
||||
inc(j,2);
|
||||
else
|
||||
$800..$d7ff,$e000..$ffff:
|
||||
inc(j,3);
|
||||
$d800..$dbff:
|
||||
begin
|
||||
if (i<sourcechars-1) and
|
||||
(word(Source[i+1]) >= $dc00) and
|
||||
(word(Source[i+1]) <= $dfff) then
|
||||
begin
|
||||
inc(j,4);
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
@ -1941,88 +2005,285 @@ function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): Si
|
||||
end;
|
||||
|
||||
|
||||
function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
|
||||
|
||||
var
|
||||
i,j : SizeUInt;
|
||||
w: SizeUInt;
|
||||
b : byte;
|
||||
begin
|
||||
if not assigned(Source) then
|
||||
function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
|
||||
const
|
||||
UNICODE_INVALID=63;
|
||||
var
|
||||
InputUTF8: SizeUInt;
|
||||
IBYTE: BYTE;
|
||||
OutputUnicode: SizeUInt;
|
||||
PRECHAR: SizeUInt;
|
||||
TempBYTE: BYTE;
|
||||
CharLen: SizeUint;
|
||||
LookAhead: SizeUInt;
|
||||
UC: SizeUInt;
|
||||
begin
|
||||
result:=0;
|
||||
exit;
|
||||
if not assigned(Source) then
|
||||
begin
|
||||
result:=0;
|
||||
exit;
|
||||
end;
|
||||
result:=SizeUInt(-1);
|
||||
InputUTF8:=0;
|
||||
OutputUnicode:=0;
|
||||
PreChar:=0;
|
||||
if Assigned(Dest) Then
|
||||
begin
|
||||
while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
|
||||
begin
|
||||
IBYTE:=byte(Source[InputUTF8]);
|
||||
if (IBYTE and $80) = 0 then
|
||||
begin
|
||||
//One character US-ASCII, convert it to unicode
|
||||
if IBYTE = 10 then
|
||||
begin
|
||||
If (PreChar<>13) and FALSE then
|
||||
begin
|
||||
//Expand to crlf, conform UTF-8.
|
||||
//This procedure will break the memory alocation by
|
||||
//FPC for the widestring, so never use it. Condition never true due the "and FALSE".
|
||||
if OutputUnicode+1<MaxDestChars then
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(13);
|
||||
inc(OutputUnicode);
|
||||
Dest[OutputUnicode]:=WideChar(10);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=10;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(13);
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(IBYTE);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(IBYTE);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
inc(InputUTF8);
|
||||
end
|
||||
else
|
||||
begin
|
||||
TempByte:=IBYTE;
|
||||
CharLen:=0;
|
||||
while (TempBYTE and $80)<>0 do
|
||||
begin
|
||||
TempBYTE:=(TempBYTE shl 1) and $FE;
|
||||
inc(CharLen);
|
||||
end;
|
||||
//Test for the "CharLen" conforms UTF-8 string
|
||||
//This means the 10xxxxxx pattern.
|
||||
if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
|
||||
begin
|
||||
//Insuficient chars in string to decode
|
||||
//UTF-8 array. Fallback to single char.
|
||||
CharLen:= 1;
|
||||
end;
|
||||
for LookAhead := 1 to CharLen-1 do
|
||||
begin
|
||||
if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
|
||||
((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence, fallback.
|
||||
CharLen:= LookAhead;
|
||||
break;
|
||||
end;
|
||||
end;
|
||||
UC:=$FFFF;
|
||||
case CharLen of
|
||||
1: begin
|
||||
//Not valid UTF-8 sequence
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
2: begin
|
||||
//Two bytes UTF, convert it
|
||||
UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
|
||||
UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
|
||||
if UC <= $7F then
|
||||
begin
|
||||
//Invalid UTF sequence.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
3: begin
|
||||
//Three bytes, convert it to unicode
|
||||
UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
|
||||
if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence
|
||||
UC:= UNICODE_INVALID;
|
||||
End;
|
||||
end;
|
||||
4: begin
|
||||
//Four bytes, convert it to two unicode characters
|
||||
UC:= (byte(Source[InputUTF8]) and $07) shl 18;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
|
||||
if (UC < $10000) or (UC > $10FFFF) then
|
||||
begin
|
||||
UC:= UNICODE_INVALID;
|
||||
end
|
||||
else
|
||||
begin
|
||||
{ only store pair if room }
|
||||
dec(UC,$10000);
|
||||
if (OutputUnicode<MaxDestChars-1) then
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
|
||||
inc(OutputUnicode);
|
||||
UC:=(UC and $3ff) + $DC00;
|
||||
end
|
||||
else
|
||||
begin
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
{ don't store anything }
|
||||
CharLen:=0;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
5,6,7: begin
|
||||
//Invalid UTF8 to unicode conversion,
|
||||
//mask it as invalid UNICODE too.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
if CharLen > 0 then
|
||||
begin
|
||||
PreChar:=UC;
|
||||
Dest[OutputUnicode]:=WideChar(UC);
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
end;
|
||||
end;
|
||||
Result:=OutputUnicode+1;
|
||||
end
|
||||
else
|
||||
begin
|
||||
while (InputUTF8<SourceBytes) do
|
||||
begin
|
||||
IBYTE:=byte(Source[InputUTF8]);
|
||||
if (IBYTE and $80) = 0 then
|
||||
begin
|
||||
//One character US-ASCII, convert it to unicode
|
||||
if IBYTE = 10 then
|
||||
begin
|
||||
if (PreChar<>13) and FALSE then
|
||||
begin
|
||||
//Expand to crlf, conform UTF-8.
|
||||
//This procedure will break the memory alocation by
|
||||
//FPC for the widestring, so never use it. Condition never true due the "and FALSE".
|
||||
inc(OutputUnicode,2);
|
||||
PreChar:=10;
|
||||
end
|
||||
else
|
||||
begin
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
inc(InputUTF8);
|
||||
end
|
||||
else
|
||||
begin
|
||||
TempByte:=IBYTE;
|
||||
CharLen:=0;
|
||||
while (TempBYTE and $80)<>0 do
|
||||
begin
|
||||
TempBYTE:=(TempBYTE shl 1) and $FE;
|
||||
inc(CharLen);
|
||||
end;
|
||||
//Test for the "CharLen" conforms UTF-8 string
|
||||
//This means the 10xxxxxx pattern.
|
||||
if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
|
||||
begin
|
||||
//Insuficient chars in string to decode
|
||||
//UTF-8 array. Fallback to single char.
|
||||
CharLen:= 1;
|
||||
end;
|
||||
for LookAhead := 1 to CharLen-1 do
|
||||
begin
|
||||
if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
|
||||
((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence, fallback.
|
||||
CharLen:= LookAhead;
|
||||
break;
|
||||
end;
|
||||
end;
|
||||
UC:=$FFFF;
|
||||
case CharLen of
|
||||
1: begin
|
||||
//Not valid UTF-8 sequence
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
2: begin
|
||||
//Two bytes UTF, convert it
|
||||
UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
|
||||
UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
|
||||
if UC <= $7F then
|
||||
begin
|
||||
//Invalid UTF sequence.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
3: begin
|
||||
//Three bytes, convert it to unicode
|
||||
UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
|
||||
If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence
|
||||
UC:= UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
4: begin
|
||||
//Four bytes, convert it to two unicode characters
|
||||
UC:= (byte(Source[InputUTF8]) and $07) shl 18;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
|
||||
if (UC < $10000) or (UC > $10FFFF) then
|
||||
UC:= UNICODE_INVALID
|
||||
else
|
||||
{ extra character character }
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
5,6,7: begin
|
||||
//Invalid UTF8 to unicode conversion,
|
||||
//mask it as invalid UNICODE too.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
if CharLen > 0 then
|
||||
begin
|
||||
PreChar:=UC;
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
end;
|
||||
end;
|
||||
Result:=OutputUnicode+1;
|
||||
end;
|
||||
end;
|
||||
result:=SizeUInt(-1);
|
||||
i:=0;
|
||||
j:=0;
|
||||
if assigned(Dest) then
|
||||
begin
|
||||
while (j<MaxDestChars) and (i<SourceBytes) do
|
||||
begin
|
||||
b:=byte(Source[i]);
|
||||
w:=b;
|
||||
inc(i);
|
||||
// 2 or 3 bytes?
|
||||
if b>=$80 then
|
||||
begin
|
||||
w:=b and $3f;
|
||||
if i>=SourceBytes then
|
||||
exit;
|
||||
// 3 bytes?
|
||||
if (b and $20)<>0 then
|
||||
begin
|
||||
b:=byte(Source[i]);
|
||||
inc(i);
|
||||
if i>=SourceBytes then
|
||||
exit;
|
||||
if (b and $c0)<>$80 then
|
||||
exit;
|
||||
w:=(w shl 6) or (b and $3f);
|
||||
end;
|
||||
b:=byte(Source[i]);
|
||||
w:=(w shl 6) or (b and $3f);
|
||||
if (b and $c0)<>$80 then
|
||||
exit;
|
||||
inc(i);
|
||||
end;
|
||||
Dest[j]:=UnicodeChar(w);
|
||||
inc(j);
|
||||
end;
|
||||
if j>=MaxDestChars then j:=MaxDestChars-1;
|
||||
Dest[j]:=#0;
|
||||
end
|
||||
else
|
||||
begin
|
||||
while i<SourceBytes do
|
||||
begin
|
||||
b:=byte(Source[i]);
|
||||
inc(i);
|
||||
// 2 or 3 bytes?
|
||||
if b>=$80 then
|
||||
begin
|
||||
if i>=SourceBytes then
|
||||
exit;
|
||||
// 3 bytes?
|
||||
b := b and $3f;
|
||||
if (b and $20)<>0 then
|
||||
begin
|
||||
b:=byte(Source[i]);
|
||||
inc(i);
|
||||
if i>=SourceBytes then
|
||||
exit;
|
||||
if (b and $c0)<>$80 then
|
||||
exit;
|
||||
end;
|
||||
if (byte(Source[i]) and $c0)<>$80 then
|
||||
exit;
|
||||
inc(i);
|
||||
end;
|
||||
inc(j);
|
||||
end;
|
||||
end;
|
||||
result:=j+1;
|
||||
end;
|
||||
|
||||
|
||||
function UTF8Encode(const s : Ansistring) : UTF8String; inline;
|
||||
@ -2079,40 +2340,6 @@ function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inli
|
||||
end;
|
||||
|
||||
|
||||
{ converts an utf-16 code point or surrogate pair to utf-32 }
|
||||
function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32'];
|
||||
var
|
||||
w: unicodechar;
|
||||
begin
|
||||
{ UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
|
||||
{ are the same in UTF-32 }
|
||||
w:=s[index];
|
||||
if (w<=#$d7ff) or
|
||||
(w>=#$e000) then
|
||||
begin
|
||||
result:=UCS4Char(w);
|
||||
len:=1;
|
||||
end
|
||||
{ valid surrogate pair? }
|
||||
else if (w<=#$dbff) and
|
||||
{ w>=#$d7ff check not needed, checked above }
|
||||
(index<length(s)) and
|
||||
(s[index+1]>=#$dc00) and
|
||||
(s[index+1]<=#$dfff) then
|
||||
{ convert the surrogate pair to UTF-32 }
|
||||
begin
|
||||
result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
|
||||
len:=2;
|
||||
end
|
||||
else
|
||||
{ invalid surrogate -> do nothing }
|
||||
begin
|
||||
result:=UCS4Char(w);
|
||||
len:=1;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String;
|
||||
var
|
||||
i, slen,
|
||||
|
@ -1227,192 +1227,6 @@ end;
|
||||
|
||||
{$endif CPU64}
|
||||
|
||||
function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
begin
|
||||
if assigned(Source) then
|
||||
Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0))
|
||||
else
|
||||
Result:=0;
|
||||
end;
|
||||
|
||||
|
||||
function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PWideChar; SourceChars: SizeUInt): SizeUInt;
|
||||
var
|
||||
i,j : SizeUInt;
|
||||
w : word;
|
||||
begin
|
||||
result:=0;
|
||||
if source=nil then
|
||||
exit;
|
||||
i:=0;
|
||||
j:=0;
|
||||
if assigned(Dest) then
|
||||
begin
|
||||
while (i<SourceChars) and (j<MaxDestBytes) do
|
||||
begin
|
||||
w:=word(Source[i]);
|
||||
case w of
|
||||
0..$7f:
|
||||
begin
|
||||
Dest[j]:=char(w);
|
||||
inc(j);
|
||||
end;
|
||||
$80..$7ff:
|
||||
begin
|
||||
if j+1>=MaxDestBytes then
|
||||
break;
|
||||
Dest[j]:=char($c0 or (w shr 6));
|
||||
Dest[j+1]:=char($80 or (w and $3f));
|
||||
inc(j,2);
|
||||
end;
|
||||
else
|
||||
begin
|
||||
if j+2>=MaxDestBytes then
|
||||
break;
|
||||
Dest[j]:=char($e0 or (w shr 12));
|
||||
Dest[j+1]:=char($80 or ((w shr 6)and $3f));
|
||||
Dest[j+2]:=char($80 or (w and $3f));
|
||||
inc(j,3);
|
||||
end;
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
|
||||
if j>SizeUInt(MaxDestBytes-1) then
|
||||
j:=MaxDestBytes-1;
|
||||
|
||||
Dest[j]:=#0;
|
||||
end
|
||||
else
|
||||
begin
|
||||
while i<SourceChars do
|
||||
begin
|
||||
case word(Source[i]) of
|
||||
$0..$7f:
|
||||
inc(j);
|
||||
$80..$7ff:
|
||||
inc(j,2);
|
||||
else
|
||||
inc(j,3);
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
result:=j+1;
|
||||
end;
|
||||
|
||||
|
||||
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
begin
|
||||
if assigned(Source) then
|
||||
Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source))
|
||||
else
|
||||
Result:=0;
|
||||
end;
|
||||
|
||||
|
||||
function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
|
||||
|
||||
var
|
||||
i,j : SizeUInt;
|
||||
w: SizeUInt;
|
||||
b : byte;
|
||||
begin
|
||||
if not assigned(Source) then
|
||||
begin
|
||||
result:=0;
|
||||
exit;
|
||||
end;
|
||||
result:=SizeUInt(-1);
|
||||
i:=0;
|
||||
j:=0;
|
||||
if assigned(Dest) then
|
||||
begin
|
||||
while (j<MaxDestChars) and (i<SourceBytes) do
|
||||
begin
|
||||
b:=byte(Source[i]);
|
||||
w:=b;
|
||||
inc(i);
|
||||
// 2 or 3 bytes?
|
||||
if b>=$80 then
|
||||
begin
|
||||
w:=b and $3f;
|
||||
if i>=SourceBytes then
|
||||
exit;
|
||||
// 3 bytes?
|
||||
if (b and $20)<>0 then
|
||||
begin
|
||||
b:=byte(Source[i]);
|
||||
inc(i);
|
||||
if i>=SourceBytes then
|
||||
exit;
|
||||
if (b and $c0)<>$80 then
|
||||
exit;
|
||||
w:=(w shl 6) or (b and $3f);
|
||||
end;
|
||||
b:=byte(Source[i]);
|
||||
w:=(w shl 6) or (b and $3f);
|
||||
if (b and $c0)<>$80 then
|
||||
exit;
|
||||
inc(i);
|
||||
end;
|
||||
Dest[j]:=WideChar(w);
|
||||
inc(j);
|
||||
end;
|
||||
if j>=MaxDestChars then j:=MaxDestChars-1;
|
||||
Dest[j]:=#0;
|
||||
end
|
||||
else
|
||||
begin
|
||||
while i<SourceBytes do
|
||||
begin
|
||||
b:=byte(Source[i]);
|
||||
inc(i);
|
||||
// 2 or 3 bytes?
|
||||
if b>=$80 then
|
||||
begin
|
||||
if i>=SourceBytes then
|
||||
exit;
|
||||
// 3 bytes?
|
||||
b := b and $3f;
|
||||
if (b and $20)<>0 then
|
||||
begin
|
||||
b:=byte(Source[i]);
|
||||
inc(i);
|
||||
if i>=SourceBytes then
|
||||
exit;
|
||||
if (b and $c0)<>$80 then
|
||||
exit;
|
||||
end;
|
||||
if (byte(Source[i]) and $c0)<>$80 then
|
||||
exit;
|
||||
inc(i);
|
||||
end;
|
||||
inc(j);
|
||||
end;
|
||||
end;
|
||||
result:=j+1;
|
||||
end;
|
||||
|
||||
|
||||
function UTF8Encode(const s : WideString) : UTF8String;
|
||||
var
|
||||
i : SizeInt;
|
||||
hs : UTF8String;
|
||||
begin
|
||||
result:='';
|
||||
if s='' then
|
||||
exit;
|
||||
SetLength(hs,length(s)*3);
|
||||
i:=UnicodeToUtf8(pchar(hs),length(hs)+1,PWideChar(s),length(s));
|
||||
if i>0 then
|
||||
begin
|
||||
SetLength(hs,i-1);
|
||||
result:=hs;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
{ converts an utf-16 code point or surrogate pair to utf-32 }
|
||||
function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_WIDETOUTF32'];
|
||||
var
|
||||
@ -1447,6 +1261,420 @@ begin
|
||||
end;
|
||||
|
||||
|
||||
function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
begin
|
||||
if assigned(Source) then
|
||||
Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0))
|
||||
else
|
||||
Result:=0;
|
||||
end;
|
||||
|
||||
|
||||
function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PWideChar; SourceChars: SizeUInt): SizeUInt;
|
||||
var
|
||||
i,j : SizeUInt;
|
||||
w : word;
|
||||
lw : longword;
|
||||
len : longint;
|
||||
begin
|
||||
result:=0;
|
||||
if source=nil then
|
||||
exit;
|
||||
i:=0;
|
||||
j:=0;
|
||||
if assigned(Dest) then
|
||||
begin
|
||||
while (i<SourceChars) and (j<MaxDestBytes) do
|
||||
begin
|
||||
w:=word(Source[i]);
|
||||
case w of
|
||||
0..$7f:
|
||||
begin
|
||||
Dest[j]:=char(w);
|
||||
inc(j);
|
||||
end;
|
||||
$80..$7ff:
|
||||
begin
|
||||
if j+1>=MaxDestBytes then
|
||||
break;
|
||||
Dest[j]:=char($c0 or (w shr 6));
|
||||
Dest[j+1]:=char($80 or (w and $3f));
|
||||
inc(j,2);
|
||||
end;
|
||||
$800..$d7ff,$e000..$ffff:
|
||||
begin
|
||||
if j+2>=MaxDestBytes then
|
||||
break;
|
||||
Dest[j]:=char($e0 or (w shr 12));
|
||||
Dest[j+1]:=char($80 or ((w shr 6) and $3f));
|
||||
Dest[j+2]:=char($80 or (w and $3f));
|
||||
inc(j,3);
|
||||
end;
|
||||
$d800..$dbff:
|
||||
{High Surrogates}
|
||||
begin
|
||||
if j+3>=MaxDestBytes then
|
||||
break;
|
||||
if (i<sourcechars-1) and
|
||||
(word(Source[i+1]) >= $dc00) and
|
||||
(word(Source[i+1]) <= $dfff) then
|
||||
begin
|
||||
lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len));
|
||||
Dest[j]:=char($f0 or (lw shr 18));
|
||||
Dest[j+1]:=char($80 or ((lw shr 12) and $3f));
|
||||
Dest[j+2]:=char($80 or ((lw shr 6) and $3f));
|
||||
Dest[j+3]:=char($80 or (lw and $3f));
|
||||
inc(j,4);
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
|
||||
if j>SizeUInt(MaxDestBytes-1) then
|
||||
j:=MaxDestBytes-1;
|
||||
|
||||
Dest[j]:=#0;
|
||||
end
|
||||
else
|
||||
begin
|
||||
while i<SourceChars do
|
||||
begin
|
||||
case word(Source[i]) of
|
||||
$0..$7f:
|
||||
inc(j);
|
||||
$80..$7ff:
|
||||
inc(j,2);
|
||||
$800..$d7ff,$e000..$ffff:
|
||||
inc(j,3);
|
||||
$d800..$dbff:
|
||||
begin
|
||||
if (i<sourcechars-1) and
|
||||
(word(Source[i+1]) >= $dc00) and
|
||||
(word(Source[i+1]) <= $dfff) then
|
||||
begin
|
||||
inc(j,4);
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
inc(i);
|
||||
end;
|
||||
end;
|
||||
result:=j+1;
|
||||
end;
|
||||
|
||||
|
||||
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
begin
|
||||
if assigned(Source) then
|
||||
Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source))
|
||||
else
|
||||
Result:=0;
|
||||
end;
|
||||
|
||||
|
||||
|
||||
function UTF8ToUnicode(Dest: PWideChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
|
||||
const
|
||||
UNICODE_INVALID=63;
|
||||
var
|
||||
InputUTF8: SizeUInt;
|
||||
IBYTE: BYTE;
|
||||
OutputUnicode: SizeUInt;
|
||||
PRECHAR: SizeUInt;
|
||||
TempBYTE: BYTE;
|
||||
CharLen: SizeUint;
|
||||
LookAhead: SizeUInt;
|
||||
UC: SizeUInt;
|
||||
begin
|
||||
if not assigned(Source) then
|
||||
begin
|
||||
result:=0;
|
||||
exit;
|
||||
end;
|
||||
result:=SizeUInt(-1);
|
||||
InputUTF8:=0;
|
||||
OutputUnicode:=0;
|
||||
PreChar:=0;
|
||||
if Assigned(Dest) Then
|
||||
begin
|
||||
while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
|
||||
begin
|
||||
IBYTE:=byte(Source[InputUTF8]);
|
||||
if (IBYTE and $80) = 0 then
|
||||
begin
|
||||
//One character US-ASCII, convert it to unicode
|
||||
if IBYTE = 10 then
|
||||
begin
|
||||
If (PreChar<>13) and FALSE then
|
||||
begin
|
||||
//Expand to crlf, conform UTF-8.
|
||||
//This procedure will break the memory alocation by
|
||||
//FPC for the widestring, so never use it. Condition never true due the "and FALSE".
|
||||
if OutputUnicode+1<MaxDestChars then
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(13);
|
||||
inc(OutputUnicode);
|
||||
Dest[OutputUnicode]:=WideChar(10);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=10;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(13);
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(IBYTE);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(IBYTE);
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
inc(InputUTF8);
|
||||
end
|
||||
else
|
||||
begin
|
||||
TempByte:=IBYTE;
|
||||
CharLen:=0;
|
||||
while (TempBYTE and $80)<>0 do
|
||||
begin
|
||||
TempBYTE:=(TempBYTE shl 1) and $FE;
|
||||
inc(CharLen);
|
||||
end;
|
||||
//Test for the "CharLen" conforms UTF-8 string
|
||||
//This means the 10xxxxxx pattern.
|
||||
if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
|
||||
begin
|
||||
//Insuficient chars in string to decode
|
||||
//UTF-8 array. Fallback to single char.
|
||||
CharLen:= 1;
|
||||
end;
|
||||
for LookAhead := 1 to CharLen-1 do
|
||||
begin
|
||||
if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
|
||||
((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence, fallback.
|
||||
CharLen:= LookAhead;
|
||||
break;
|
||||
end;
|
||||
end;
|
||||
UC:=$FFFF;
|
||||
case CharLen of
|
||||
1: begin
|
||||
//Not valid UTF-8 sequence
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
2: begin
|
||||
//Two bytes UTF, convert it
|
||||
UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
|
||||
UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
|
||||
if UC <= $7F then
|
||||
begin
|
||||
//Invalid UTF sequence.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
3: begin
|
||||
//Three bytes, convert it to unicode
|
||||
UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
|
||||
if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence
|
||||
UC:= UNICODE_INVALID;
|
||||
End;
|
||||
end;
|
||||
4: begin
|
||||
//Four bytes, convert it to two unicode characters
|
||||
UC:= (byte(Source[InputUTF8]) and $07) shl 18;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
|
||||
if (UC < $10000) or (UC > $10FFFF) then
|
||||
begin
|
||||
UC:= UNICODE_INVALID;
|
||||
end
|
||||
else
|
||||
begin
|
||||
{ only store pair if room }
|
||||
dec(UC,$10000);
|
||||
if (OutputUnicode<MaxDestChars-1) then
|
||||
begin
|
||||
Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
|
||||
inc(OutputUnicode);
|
||||
UC:=(UC and $3ff) + $DC00;
|
||||
end
|
||||
else
|
||||
begin
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
{ don't store anything }
|
||||
CharLen:=0;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
5,6,7: begin
|
||||
//Invalid UTF8 to unicode conversion,
|
||||
//mask it as invalid UNICODE too.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
if CharLen > 0 then
|
||||
begin
|
||||
PreChar:=UC;
|
||||
Dest[OutputUnicode]:=WideChar(UC);
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
end;
|
||||
end;
|
||||
Result:=OutputUnicode+1;
|
||||
end
|
||||
else
|
||||
begin
|
||||
while (InputUTF8<SourceBytes) do
|
||||
begin
|
||||
IBYTE:=byte(Source[InputUTF8]);
|
||||
if (IBYTE and $80) = 0 then
|
||||
begin
|
||||
//One character US-ASCII, convert it to unicode
|
||||
if IBYTE = 10 then
|
||||
begin
|
||||
if (PreChar<>13) and FALSE then
|
||||
begin
|
||||
//Expand to crlf, conform UTF-8.
|
||||
//This procedure will break the memory alocation by
|
||||
//FPC for the widestring, so never use it. Condition never true due the "and FALSE".
|
||||
inc(OutputUnicode,2);
|
||||
PreChar:=10;
|
||||
end
|
||||
else
|
||||
begin
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
inc(OutputUnicode);
|
||||
PreChar:=IBYTE;
|
||||
end;
|
||||
inc(InputUTF8);
|
||||
end
|
||||
else
|
||||
begin
|
||||
TempByte:=IBYTE;
|
||||
CharLen:=0;
|
||||
while (TempBYTE and $80)<>0 do
|
||||
begin
|
||||
TempBYTE:=(TempBYTE shl 1) and $FE;
|
||||
inc(CharLen);
|
||||
end;
|
||||
//Test for the "CharLen" conforms UTF-8 string
|
||||
//This means the 10xxxxxx pattern.
|
||||
if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
|
||||
begin
|
||||
//Insuficient chars in string to decode
|
||||
//UTF-8 array. Fallback to single char.
|
||||
CharLen:= 1;
|
||||
end;
|
||||
for LookAhead := 1 to CharLen-1 do
|
||||
begin
|
||||
if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
|
||||
((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence, fallback.
|
||||
CharLen:= LookAhead;
|
||||
break;
|
||||
end;
|
||||
end;
|
||||
UC:=$FFFF;
|
||||
case CharLen of
|
||||
1: begin
|
||||
//Not valid UTF-8 sequence
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
2: begin
|
||||
//Two bytes UTF, convert it
|
||||
UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
|
||||
UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
|
||||
if UC <= $7F then
|
||||
begin
|
||||
//Invalid UTF sequence.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
3: begin
|
||||
//Three bytes, convert it to unicode
|
||||
UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
|
||||
If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
|
||||
begin
|
||||
//Invalid UTF-8 sequence
|
||||
UC:= UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
4: begin
|
||||
//Four bytes, convert it to two unicode characters
|
||||
UC:= (byte(Source[InputUTF8]) and $07) shl 18;
|
||||
UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
|
||||
UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
|
||||
UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
|
||||
if (UC < $10000) or (UC > $10FFFF) then
|
||||
UC:= UNICODE_INVALID
|
||||
else
|
||||
{ extra character character }
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
5,6,7: begin
|
||||
//Invalid UTF8 to unicode conversion,
|
||||
//mask it as invalid UNICODE too.
|
||||
UC:=UNICODE_INVALID;
|
||||
end;
|
||||
end;
|
||||
if CharLen > 0 then
|
||||
begin
|
||||
PreChar:=UC;
|
||||
inc(OutputUnicode);
|
||||
end;
|
||||
InputUTF8:= InputUTF8 + CharLen;
|
||||
end;
|
||||
end;
|
||||
Result:=OutputUnicode+1;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
function UTF8Encode(const s : WideString) : UTF8String;
|
||||
var
|
||||
i : SizeInt;
|
||||
hs : UTF8String;
|
||||
begin
|
||||
result:='';
|
||||
if s='' then
|
||||
exit;
|
||||
SetLength(hs,length(s)*3);
|
||||
i:=UnicodeToUtf8(pchar(hs),length(hs)+1,PWideChar(s),length(s));
|
||||
if i>0 then
|
||||
begin
|
||||
SetLength(hs,i-1);
|
||||
result:=hs;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
const
|
||||
SNoWidestrings = 'This binary has no widestrings support compiled in.';
|
||||
SRecompileWithWidestrings = 'Recompile the application with a widestrings-manager in the program uses clause.';
|
||||
|
1435
tests/webtbs/tw11791.pp
Normal file
1435
tests/webtbs/tw11791.pp
Normal file
File diff suppressed because it is too large
Load Diff
172
tests/webtbs/tw13075.pp
Normal file
172
tests/webtbs/tw13075.pp
Normal file
@ -0,0 +1,172 @@
|
||||
{$mode objfpc}{$H+}
|
||||
|
||||
|
||||
uses
|
||||
{$ifdef unix}
|
||||
cwstring,
|
||||
{$endif}
|
||||
Classes, SysUtils;
|
||||
|
||||
function localUnicodeToUTF8(u: cardinal; Buf: PChar): integer;
|
||||
|
||||
procedure RaiseInvalidUnicode;
|
||||
begin
|
||||
raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(u));
|
||||
end;
|
||||
|
||||
begin
|
||||
case u of
|
||||
0..$7f:
|
||||
begin
|
||||
Result:=1;
|
||||
Buf[0]:=char(byte(u));
|
||||
end;
|
||||
$80..$7ff:
|
||||
begin
|
||||
Result:=2;
|
||||
Buf[0]:=char(byte($c0 or (u shr 6)));
|
||||
Buf[1]:=char(byte($80 or (u and $3f)));
|
||||
end;
|
||||
$800..$ffff:
|
||||
begin
|
||||
Result:=3;
|
||||
Buf[0]:=char(byte($e0 or (u shr 12)));
|
||||
Buf[1]:=char(byte((u shr 6) and $3f) or $80);
|
||||
Buf[2]:=char(byte(u and $3f) or $80);
|
||||
end;
|
||||
$10000..$10ffff:
|
||||
begin
|
||||
Result:=4;
|
||||
Buf[0]:=char(byte($f0 or (u shr 18)));
|
||||
Buf[1]:=char(byte((u shr 12) and $3f) or $80);
|
||||
Buf[2]:=char(byte((u shr 6) and $3f) or $80);
|
||||
Buf[3]:=char(byte(u and $3f) or $80);
|
||||
end;
|
||||
else
|
||||
RaiseInvalidUnicode;
|
||||
end;
|
||||
end;
|
||||
|
||||
function localUnicodeToUTF8(u: cardinal): shortstring;
|
||||
begin
|
||||
Result[0]:=chr(localUnicodeToUTF8(u,@Result[1]));
|
||||
end;
|
||||
|
||||
|
||||
function localUnicodeToUTF16(u: cardinal): widestring;
|
||||
begin
|
||||
// u should be <= $10FFFF to fit into UTF-16
|
||||
|
||||
if u < $10000 then
|
||||
// Note: codepoints $D800 - $DFFF are reserved
|
||||
Result:=widechar(u)
|
||||
else
|
||||
Result:=widechar($D800+((u - $10000) shr 10))+widechar($DC00+((u - $10000) and $3ff));
|
||||
end;
|
||||
|
||||
|
||||
function UnicodeToCESU8(u: cardinal; Buf: PChar): integer;
|
||||
|
||||
procedure RaiseInvalidUnicode;
|
||||
begin
|
||||
raise Exception.Create('UnicodeToCESU8: invalid unicode: '+IntToStr(u));
|
||||
end;
|
||||
|
||||
var
|
||||
st: widestring;
|
||||
begin
|
||||
case u of
|
||||
0..$ffff:
|
||||
begin
|
||||
Result:=localUnicodeToUTF8(u,Buf);
|
||||
end;
|
||||
$10000..$10ffff:
|
||||
begin
|
||||
st := localUnicodeToUTF16(u);
|
||||
|
||||
Result:=6;
|
||||
Buf[0]:=char(byte($e0 or (ord(st[1]) shr 12)));
|
||||
Buf[1]:=char(byte((ord(st[1]) shr 6) and $3f) or $80);
|
||||
Buf[2]:=char(byte(ord(st[1]) and $3f) or $80);
|
||||
Buf[3]:=char(byte($e0 or (ord(st[2]) shr 12)));
|
||||
Buf[4]:=char(byte((ord(st[2]) shr 6) and $3f) or $80);
|
||||
Buf[5]:=char(byte(ord(st[2]) and $3f) or $80);
|
||||
end;
|
||||
else
|
||||
RaiseInvalidUnicode;
|
||||
end;
|
||||
end;
|
||||
|
||||
function UnicodeToCESU8(u: cardinal): utf8string;
|
||||
begin
|
||||
setlength(result,1000);
|
||||
setlength(result,UnicodeToCESU8(u,@Result[1]));
|
||||
end;
|
||||
|
||||
procedure dotest;
|
||||
var
|
||||
s1,s2: utf8string;
|
||||
w1,w2: unicodestring;
|
||||
s3,s4: utf8string;
|
||||
i: longint;
|
||||
begin
|
||||
s1 := localUnicodeToUTF8 ($10300);
|
||||
s2 := UnicodeToCESU8 ($10300);
|
||||
setlength(w1,20);
|
||||
setlength(w2,20);
|
||||
// -1 because UTF8ToUnicode returns a null-terminated string
|
||||
setlength(w1,UTF8ToUnicode(punicodechar(@w1[1]),length(w1),pchar(s1),Length(s1))-1);
|
||||
setlength(w2,UTF8ToUnicode(punicodechar(@w2[1]),length(w2),pchar(s2),Length(s2))-1);
|
||||
(*
|
||||
writeln('len: ',length(w1),' - "',w1,'"');
|
||||
write(' ');
|
||||
for i:= 1 to length(w1) do
|
||||
write('#$',hexstr(ord(w1[i]),4));
|
||||
writeln;
|
||||
writeln('len: ',length(w2),' - "',w2,'"');
|
||||
write(' ');
|
||||
for i:= 1 to length(w2) do
|
||||
write('#$',hexstr(ord(w2[i]),4));
|
||||
writeln;
|
||||
writeln;
|
||||
*)
|
||||
|
||||
setlength(s3,20);
|
||||
setlength(s4,20);
|
||||
// -1 because UnicodeToUTF8 returns a null-terminated string
|
||||
setlength(s3,UnicodeToUTF8(@s3[1],length(s3),punicodechar(@w1[1]),length(w1))-1);
|
||||
setlength(s4,UnicodeToUTF8(@s4[1],length(s4),punicodechar(@w2[1]),length(w2))-1);
|
||||
|
||||
if (s3<>s1) or
|
||||
{ invalid: CESU-8 }
|
||||
(w2<>'??') or
|
||||
(s4<>'??') then
|
||||
begin
|
||||
writeln('len: ',length(s1),' - "',s1,'"');
|
||||
write(' ');
|
||||
for i:= 1 to length(s1) do
|
||||
write('#$',hexstr(ord(s1[i]),2));
|
||||
writeln;
|
||||
writeln('len: ',length(s2),' - "',s2,'"');
|
||||
write(' ');
|
||||
for i:= 1 to length(s2) do
|
||||
write('#$',hexstr(ord(s2[i]),2));
|
||||
writeln;
|
||||
writeln('len: ',length(s3),' - "',s3,'"');
|
||||
write(' ');
|
||||
for i:= 1 to length(s3) do
|
||||
write('#$',hexstr(ord(s3[i]),2));
|
||||
writeln;
|
||||
writeln('len: ',length(s4),' - "',s4,'"');
|
||||
write(' ');
|
||||
for i:= 1 to length(s4) do
|
||||
write('#$',hexstr(ord(s4[i]),2));
|
||||
writeln;
|
||||
halt(1);
|
||||
end;
|
||||
end;
|
||||
|
||||
begin
|
||||
dotest;
|
||||
end.
|
||||
|
Loading…
Reference in New Issue
Block a user