* Improved the UTF-16 to UCS-4 conversion routines: removed the function call made for every character, allocate the result memory only once, and share as much code as possible between the WideString and UnicodeString versions.

git-svn-id: trunk@20269 -
This commit is contained in:
sergei 2012-02-06 17:55:06 +00:00
parent 8dab34b31a
commit e94508d5eb
2 changed files with 69 additions and 82 deletions

View File

@ -1388,38 +1388,6 @@ end;
{$endif CPU64}
{ Decodes the UTF-16 code unit at S[index] into a UTF-32 value.
  - A code unit outside #$D800..#$DFFF is returned unchanged, len = 1.
  - A high surrogate (#$D800..#$DBFF) immediately followed by a low
    surrogate (#$DC00..#$DFFF) is combined into one value, len = 2.
  - Any other surrogate (unpaired or out of order) is returned unchanged,
    len = 1.
  len receives the number of UTF-16 code units consumed. }
function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32'];
  var
    lead, trail: unicodechar;
  begin
    lead:=s[index];
    { default outcome: the code unit is passed through as-is }
    result:=UCS4Char(lead);
    len:=1;
    { only a high surrogate that still has a successor can start a pair }
    if (lead>=#$d800) and (lead<=#$dbff) and (index<length(s)) then
      begin
        trail:=s[index+1];
        if (trail>=#$dc00) and (trail<=#$dfff) then
          begin
            { merge the two 10-bit halves and add the supplementary-plane offset }
            result:=(UCS4Char(lead)-$d800) shl 10 + (UCS4Char(trail)-$dc00) + $10000;
            len:=2;
          end;
      end;
  end;
function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
@ -1870,26 +1838,60 @@ function Utf8ToAnsi(const s : RawByteString) : RawByteString;{$ifdef SYSTEMINLIN
end;
{ Converts len UTF-16 code units starting at p into a zero-terminated
  UCS4String returned in res.

  The input is scanned twice: the first pass only counts how many UCS4
  elements are needed so that res is allocated exactly once, the second
  pass performs the conversion.

  A high surrogate (#$D800..#$DBFF) directly followed by a low surrogate
  (#$DC00..#$DFFF) yields one element; every other code unit, including an
  unpaired surrogate, is copied through unchanged as its own element.
  p is only dereferenced for indexes 0..len-1. }
procedure UCS4Encode(p: PWideChar; len: sizeint; out res: UCS4String);
  var
    i, reslen: sizeint;
    w: longint;
  begin
    reslen:=0;
    i:=0;
    { pass 1: calculate required length }
    while (i<len) do
      begin
        if (p[i]>=#$d800) and
           (p[i]<=#$dbff) and
           (i+1<len) and
           (p[i+1]>=#$dc00) and
           (p[i+1]<=#$dfff) then
          inc(i,2)   { valid surrogate pair -> a single element }
        else
          inc(i);    { BMP code unit or unpaired surrogate }
        inc(reslen);
      end;
    SetLength(res,reslen+1); { +1 for null termination }
    reslen:=0;
    i:=0;
    { pass 2: do conversion }
    while (i<len) do
      begin
        w:=ord(p[i]);
        if (w<=$d7ff) or (w>=$e000) then
          res[reslen]:=w
        else if (w<=$dbff) and
                (i+1<len) and
                (p[i+1]>=#$dc00) and
                (p[i+1]<=#$dfff) then
          begin
            { $d7c0 = $d800 - ($10000 shr 10): the subtraction removes the
              high-surrogate base and, after the shift, adds the $10000
              offset; xor $dc00 keeps the low 10 bits of the low surrogate }
            res[reslen]:=(UCS4Char(w-$d7c0) shl 10)+(UCS4Char(p[i+1]) xor $dc00);
            inc(i);
          end
        else { invalid surrogate pair: keep the code unit as it is }
          res[reslen]:=w;
        inc(i);
        inc(reslen);
      end;
    res[reslen]:=0;
  end;
{ Returns the UCS4String produced by UCS4Encode for the UTF-16 contents
  of the UnicodeString s. }
function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String;
  var
    units : SizeInt;
  begin
    units:=Length(s);
    UCS4Encode(PWideChar(s),units,result);
  end;
{ Returns the UCS4String produced by UCS4Encode for the UTF-16 contents
  of the WideString s. }
function WideStringToUCS4String(const s : WideString) : UCS4String;
  var
    units : SizeInt;
  begin
    units:=Length(s);
    UCS4Encode(PWideChar(s),units,result);
  end;
@ -1942,29 +1944,6 @@ function UCS4StringToUnicodeString(const s : UCS4String) : UnicodeString;
end;
{ Converts the WideString s to a UCS4String by decoding it one character
  at a time with utf16toutf32. The result is first sized for the worst case
  (one element per UTF-16 code unit plus a terminator) and trimmed to the
  number of elements actually written, plus one, afterwards. }
function WideStringToUCS4String(const s : WideString) : UCS4String;
  var
    srcpos, srclen,
    outcount : SizeInt;
    consumed : longint;
  begin
    srclen:=length(s);
    setlength(result,srclen+1);
    outcount:=0;
    srcpos:=1;
    while (srcpos<=srclen) do
      begin
        { consumed receives how many code units this character occupied }
        result[outcount]:=utf16toutf32(s,srcpos,consumed);
        srcpos:=srcpos+consumed;
        outcount:=outcount+1;
      end;
    { outcount <= srclen (surrogate pairs may have been merged);        }
    { outcount+1 keeps the terminating #0 (dynamic arrays are           }
    { implicitly filled with zero)                                      }
    setlength(result,outcount+1);
  end;
{ Concatenates a UTF-32 character to a WideString. S *must* be unique on entry. }
procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt);
var

View File

@ -643,14 +643,10 @@ function UpperAnsiString(const s : AnsiString) : AnsiString;
SetLength(result,resindex-1);
end;
{ No body here: the routine is bound through the link name
  'FPC_UTF16TOUTF32'. It takes a string, a position and returns a UCS4Char
  plus, in len, a count.
  NOTE(review): presumably len is the number of UTF-16 code units consumed
  at index - confirm against the implementation exported under that name. }
function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; external name 'FPC_UTF16TOUTF32';
{ Converts the WideString s to a UCS4String in which every 0 value taken
  from the source is stored as 32, and which ends with an explicit 0
  element.
  NOTE(review): this span comes from a diff rendered without +/- markers.
  Statements of the earlier implementation (based on utf16toutf32) and of
  the inlined replacement appear interleaved, and the line starting with
  "@" is a hunk header that hides the statements between
  "slen:=length(s);" and "destindex:=0;". Confirm against the real file
  before relying on the notes below. }
function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
var
i, slen,
destindex : SizeInt;
len : longint;
uch : UCS4Char;
begin
slen:=length(s);
@ -659,16 +655,28 @@ function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
destindex:=0;
while (i<=slen) do
begin
{ NOTE(review): the next four statements look like the earlier variant:
  decode through utf16toutf32 and replace a 0 result by 32 }
uch:=utf16toutf32(s,i,len);
if (uch=UCS4Char(0)) then
uch:=UCS4Char(32);
result[destindex]:=uch;
{ NOTE(review): from here on looks like the inlined variant, which reads
  the code unit directly }
uch:=UCS4Char(s[i]);
if (uch=0) then
{ a 0 code unit is stored as 32 }
result[destindex]:=32
else if (uch<=$d7ff) or (uch>=$e000) then
{ not a surrogate: copied unchanged }
result[destindex]:=uch
else if (uch<=$dbff) and
(i<slen) and
(s[i+1]>=#$dc00) and
(s[i+1]<=#$dfff) then
begin
{ high surrogate followed by a low surrogate: $d7c0 = $d800 - ($10000 shr 10),
  so the shift both removes the high-surrogate base and adds the $10000
  offset; xor $dc00 keeps the low 10 bits of the second code unit }
result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
{ skip the second code unit of the pair }
inc(i);
end
else { invalid surrogate pair: the code unit is stored unchanged }
result[destindex]:=uch;
inc(i);
inc(destindex);
{ NOTE(review): advances i by the len returned from utf16toutf32, so it
  looks like part of the earlier variant }
inc(i,len);
end;
{ explicit terminating element }
result[destindex]:=UCS4Char(0);
{ destindex <= slen }
setlength(result,destindex+1);
{ Trimming the length in this particular case is just a waste of time,
because the result will be interpreted as null-terminated and discarded
almost immediately }
end;