mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-07 01:48:00 +02:00
194 lines
4.6 KiB
ObjectPascal
194 lines
4.6 KiB
ObjectPascal
{$mode objfpc}{$H+}
|
|
|
|
{$ifdef go32v2}
|
|
{$define USE_INTERNAL_UNICODE}
|
|
{$endif}
|
|
|
|
{$ifdef USE_INTERNAL_UNICODE}
|
|
{$define USE_FPWIDESTRING_UNIT}
|
|
{$define USE_UNICODEDUCET_UNIT}
|
|
{$define USE_CPALL_UNIT}
|
|
{$endif}
|
|
|
|
|
|
uses
|
|
{$ifndef USE_INTERNAL_UNICODE}
|
|
{$ifdef unix}
|
|
{$ifdef darwin}iosxwstr{$else}cwstring{$endif},
|
|
{$endif unix}
|
|
{$endif ndef USE_INTERNAL_UNICODE}
|
|
{$ifdef USE_UNICODEDUCET_UNIT}
|
|
unicodeducet,
|
|
{$endif}
|
|
{$ifdef USE_FPWIDESTRING_UNIT}
|
|
fpwidestring,
|
|
{$endif}
|
|
{$ifdef USE_CPALL_UNIT}
|
|
cpall,
|
|
{$endif}
|
|
Classes, SysUtils;
|
|
|
|
function localUnicodeToUTF8(u: cardinal; Buf: PChar): integer;
|
|
|
|
procedure RaiseInvalidUnicode;
|
|
begin
|
|
raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(u));
|
|
end;
|
|
|
|
begin
|
|
case u of
|
|
0..$7f:
|
|
begin
|
|
Result:=1;
|
|
Buf[0]:=char(byte(u));
|
|
end;
|
|
$80..$7ff:
|
|
begin
|
|
Result:=2;
|
|
Buf[0]:=char(byte($c0 or (u shr 6)));
|
|
Buf[1]:=char(byte($80 or (u and $3f)));
|
|
end;
|
|
$800..$ffff:
|
|
begin
|
|
Result:=3;
|
|
Buf[0]:=char(byte($e0 or (u shr 12)));
|
|
Buf[1]:=char(byte((u shr 6) and $3f) or $80);
|
|
Buf[2]:=char(byte(u and $3f) or $80);
|
|
end;
|
|
$10000..$10ffff:
|
|
begin
|
|
Result:=4;
|
|
Buf[0]:=char(byte($f0 or (u shr 18)));
|
|
Buf[1]:=char(byte((u shr 12) and $3f) or $80);
|
|
Buf[2]:=char(byte((u shr 6) and $3f) or $80);
|
|
Buf[3]:=char(byte(u and $3f) or $80);
|
|
end;
|
|
else
|
|
RaiseInvalidUnicode;
|
|
end;
|
|
end;
|
|
|
|
function localUnicodeToUTF8(u: cardinal): shortstring;
|
|
begin
|
|
Result[0]:=chr(localUnicodeToUTF8(u,@Result[1]));
|
|
end;
|
|
|
|
|
|
function localUnicodeToUTF16(u: cardinal): widestring;
|
|
begin
|
|
// u should be <= $10FFFF to fit into UTF-16
|
|
|
|
if u < $10000 then
|
|
// Note: codepoints $D800 - $DFFF are reserved
|
|
Result:=widechar(u)
|
|
else
|
|
Result:=widechar($D800+((u - $10000) shr 10))+widechar($DC00+((u - $10000) and $3ff));
|
|
end;
|
|
|
|
|
|
function UnicodeToCESU8(u: cardinal; Buf: PChar): integer;
|
|
|
|
procedure RaiseInvalidUnicode;
|
|
begin
|
|
raise Exception.Create('UnicodeToCESU8: invalid unicode: '+IntToStr(u));
|
|
end;
|
|
|
|
var
|
|
st: widestring;
|
|
begin
|
|
case u of
|
|
0..$ffff:
|
|
begin
|
|
Result:=localUnicodeToUTF8(u,Buf);
|
|
end;
|
|
$10000..$10ffff:
|
|
begin
|
|
st := localUnicodeToUTF16(u);
|
|
|
|
Result:=6;
|
|
Buf[0]:=char(byte($e0 or (ord(st[1]) shr 12)));
|
|
Buf[1]:=char(byte((ord(st[1]) shr 6) and $3f) or $80);
|
|
Buf[2]:=char(byte(ord(st[1]) and $3f) or $80);
|
|
Buf[3]:=char(byte($e0 or (ord(st[2]) shr 12)));
|
|
Buf[4]:=char(byte((ord(st[2]) shr 6) and $3f) or $80);
|
|
Buf[5]:=char(byte(ord(st[2]) and $3f) or $80);
|
|
end;
|
|
else
|
|
RaiseInvalidUnicode;
|
|
end;
|
|
end;
|
|
|
|
function UnicodeToCESU8(u: cardinal): utf8string;
|
|
begin
|
|
setlength(result,1000);
|
|
setlength(result,UnicodeToCESU8(u,@Result[1]));
|
|
end;
|
|
|
|
procedure dotest;
|
|
var
|
|
s1,s2: utf8string;
|
|
w1,w2: unicodestring;
|
|
s3,s4: utf8string;
|
|
i: longint;
|
|
begin
|
|
s1 := localUnicodeToUTF8 ($10300);
|
|
s2 := UnicodeToCESU8 ($10300);
|
|
setlength(w1,20);
|
|
setlength(w2,20);
|
|
// -1 because UTF8ToUnicode returns a null-terminated string
|
|
setlength(w1,UTF8ToUnicode(punicodechar(@w1[1]),length(w1),pchar(s1),Length(s1))-1);
|
|
setlength(w2,UTF8ToUnicode(punicodechar(@w2[1]),length(w2),pchar(s2),Length(s2))-1);
|
|
(*
|
|
writeln('len: ',length(w1),' - "',w1,'"');
|
|
write(' ');
|
|
for i:= 1 to length(w1) do
|
|
write('#$',hexstr(ord(w1[i]),4));
|
|
writeln;
|
|
writeln('len: ',length(w2),' - "',w2,'"');
|
|
write(' ');
|
|
for i:= 1 to length(w2) do
|
|
write('#$',hexstr(ord(w2[i]),4));
|
|
writeln;
|
|
writeln;
|
|
*)
|
|
|
|
setlength(s3,20);
|
|
setlength(s4,20);
|
|
// -1 because UnicodeToUTF8 returns a null-terminated string
|
|
setlength(s3,UnicodeToUTF8(@s3[1],length(s3),punicodechar(@w1[1]),length(w1))-1);
|
|
setlength(s4,UnicodeToUTF8(@s4[1],length(s4),punicodechar(@w2[1]),length(w2))-1);
|
|
|
|
if (s3<>s1) or
|
|
{ invalid: CESU-8 }
|
|
(w2<>'??') or
|
|
(s4<>'??') then
|
|
begin
|
|
writeln('len: ',length(s1),' - "',s1,'"');
|
|
write(' ');
|
|
for i:= 1 to length(s1) do
|
|
write('#$',hexstr(ord(s1[i]),2));
|
|
writeln;
|
|
writeln('len: ',length(s2),' - "',s2,'"');
|
|
write(' ');
|
|
for i:= 1 to length(s2) do
|
|
write('#$',hexstr(ord(s2[i]),2));
|
|
writeln;
|
|
writeln('len: ',length(s3),' - "',s3,'"');
|
|
write(' ');
|
|
for i:= 1 to length(s3) do
|
|
write('#$',hexstr(ord(s3[i]),2));
|
|
writeln;
|
|
writeln('len: ',length(s4),' - "',s4,'"');
|
|
write(' ');
|
|
for i:= 1 to length(s4) do
|
|
write('#$',hexstr(ord(s4[i]),2));
|
|
writeln;
|
|
halt(1);
|
|
end;
|
|
end;
|
|
|
|
begin
|
|
dotest;
|
|
end.
|
|
|